Is there a way to convert CSV columns into hierarchical relationships?
To create the exact nested object you want, we'll use a mix of pure JavaScript and a D3 method named `d3.stratify`. However, keep in mind that 7 million rows (please see the postscript below) is a lot to compute.
It's very important to mention that, for this proposed solution, you'll have to separate the Kingdoms into different data arrays (for instance, using `Array.prototype.filter`). This restriction occurs because we need a single root node, and in the Linnaean taxonomy there is no relationship between Kingdoms (unless you create "Domain" as a top rank, which will be the root for all eukaryotes, but then you'll have the same problem for Archaea and Bacteria).
So, suppose you have this CSV (I added some more rows) with just one Kingdom:
RecordID,kingdom,phylum,class,order,family,genus,species
1,Animalia,Chordata,Mammalia,Primates,Hominidae,Homo,Homo sapiens
2,Animalia,Chordata,Mammalia,Carnivora,Canidae,Canis,Canis latrans
3,Animalia,Chordata,Mammalia,Cetacea,Delphinidae,Tursiops,Tursiops truncatus
1,Animalia,Chordata,Mammalia,Primates,Hominidae,Pan,Pan paniscus
Based on that CSV, we'll create an array, here named `tableOfRelationships`, which, as the name implies, holds the relationships between the ranks:
// Parse the CSV text into an array of row objects; d3.csvParse also exposes
// the header names as `data.columns`.
const data = d3.csvParse(csv);

// Every column except the record identifier is treated as a taxonomic rank,
// in header order (highest rank first).
const taxonomicRanks = data.columns.filter((column) => column !== "RecordID");

// Flat list of { name, parent } records, one per distinct taxon name.
const tableOfRelationships = [];

// Names already emitted. A Set keeps the membership test constant-time,
// instead of re-scanning `tableOfRelationships` for every cell of every row.
const seenNames = new Set();

data.forEach((row) => {
  taxonomicRanks.forEach((rank, i) => {
    const name = row[rank];

    // De-duplication is by name only: the first occurrence decides the parent,
    // and a later row that reuses the same name (at any rank) is ignored.
    if (seenNames.has(name)) return;
    seenNames.add(name);

    tableOfRelationships.push({
      name,
      // The first rank has no previous column, so the lookup yields undefined
      // and the parent becomes null (the root). An empty parent cell is also
      // mapped to null by the `||`.
      parent: row[taxonomicRanks[i - 1]] || null,
    });
  });
});
For the data above, this is the `tableOfRelationships`:
+---------+----------------------+---------------+
| (Index) | name                 | parent        |
+---------+----------------------+---------------+
| 0       | "Animalia"           | null          |
| 1       | "Chordata"           | "Animalia"    |
| 2       | "Mammalia"           | "Chordata"    |
| 3       | "Primates"           | "Mammalia"    |
| 4       | "Hominidae"          | "Primates"    |
| 5       | "Homo"               | "Hominidae"   |
| 6       | "Homo sapiens"       | "Homo"        |
| 7       | "Carnivora"          | "Mammalia"    |
| 8       | "Canidae"            | "Carnivora"   |
| 9       | "Canis"              | "Canidae"     |
| 10      | "Canis latrans"      | "Canis"       |
| 11      | "Cetacea"            | "Mammalia"    |
| 12      | "Delphinidae"        | "Cetacea"     |
| 13      | "Tursiops"           | "Delphinidae" |
| 14      | "Tursiops truncatus" | "Tursiops"    |
| 15      | "Pan"                | "Hominidae"   |
| 16      | "Pan paniscus"       | "Pan"         |
+---------+----------------------+---------------+
Note the `null` as the parent of `Animalia`: that's why I told you that you need to separate your dataset by Kingdom. There can be only one `null` value in the whole table.
Finally, based on that table, we create the hierarchy using `d3.stratify()`:
// Configure a stratify operator: each record is identified by its `name`
// and attached to the record whose id equals its `parent`.
const stratify = d3
  .stratify()
  .id(({ name }) => name)
  .parentId(({ parent }) => parent);

// Convert the flat relationship table into a nested hierarchy.
const hierarchicalData = stratify(tableOfRelationships);
And here is the demo. Open your browser's console (the snippet's console is not very good for this task) and inspect the several levels (`children`) of the object:
PS: I don't know what kind of dataviz you'll create, but you really should avoid taxonomic ranks. The whole Linnaean taxonomy is outdated and we don't use ranks anymore: since phylogenetic systematics was developed in the mid-1960s, we use only taxa, without any taxonomic rank (evolutionary biology teacher here). Also, I'm quite curious about these 7 million rows, since we have described just over 1 million species!
It is easy to do exactly what you need using Python and the `python-benedict` library (it is open source on GitHub; note: I am the author).
Installation: `pip install python-benedict`
from benedict import benedict as bdict

# data source can be a filepath or an url
data_source = """RecordID,kingdom,phylum,class,order,family,genus,species
1,Animalia,Chordata,Mammalia,Primates,Hominidae,Homo,Homo sapiens
2,Animalia,Chordata,Mammalia,Carnivora,Canidae,Canis,Canis
3,Plantae,nan,Magnoliopsida,Brassicales,Brassicaceae,Arabidopsis,Arabidopsis thaliana
4,Plantae,nan,Magnoliopsida,Fabales,Fabaceae,Phaseoulus,Phaseolus vulgaris"""

# Parsed rows are read from the 'values' key below.
data_input = bdict.from_csv(data_source)
data_output = bdict()

# Column names ordered from the top rank down to the leaf.
ancestors_hierarchy = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Join each row's rank values with '.' and assign an empty dict at that key,
# which yields one nested branch per row (see the first printed output).
# NOTE(review): a rank value that itself contains '.' would presumably be
# split into extra levels - confirm against the real data.
for value in data_input['values']:
    data_output['.'.join([value[ancestor] for ancestor in ancestors_hierarchy])] = bdict()

print(data_output.dump())

# if this output is ok for your needs, you don't need the following code

# Collect the keypaths before any 'name'/'children' keys are added, deepest
# first, so each node is moved only after its own descendants were moved.
keypaths = sorted(data_output.keypaths(), key=lambda item: len(item.split('.')), reverse=True)

# Top-level list that receives the root nodes (the kingdoms).
data_output['children'] = []


def transform_data(d, key, value):
    """Tag every dict node with its own key as 'name' and an empty 'children' list."""
    if isinstance(value, dict):
        value.update({ 'name':key, 'children':[] })


data_output.traverse(transform_data)

# Move every node out of its parent's keyed slot and into the parent's
# 'children' list; for a top-level key the target is data_output['children'].
for keypath in keypaths:
    target_keypath = '.'.join(keypath.split('.')[:-1] + ['children'])
    data_output[target_keypath].append(data_output.pop(keypath))

print(data_output.dump())
The first print output will be:
{ "Animalia": { "Chordata": { "Mammalia": { "Carnivora": { "Canidae": { "Canis": { "Canis": {} } } }, "Primates": { "Hominidae": { "Homo": { "Homo sapiens": {} } } } } } }, "Plantae": { "nan": { "Magnoliopsida": { "Brassicales": { "Brassicaceae": { "Arabidopsis": { "Arabidopsis thaliana": {} } } }, "Fabales": { "Fabaceae": { "Phaseoulus": { "Phaseolus vulgaris": {} } } } } } }}
The second printed output will be:
{ "children": [ { "name": "Animalia", "children": [ { "name": "Chordata", "children": [ { "name": "Mammalia", "children": [ { "name": "Carnivora", "children": [ { "name": "Canidae", "children": [ { "name": "Canis", "children": [ { "name": "Canis", "children": [] } ] } ] } ] }, { "name": "Primates", "children": [ { "name": "Hominidae", "children": [ { "name": "Homo", "children": [ { "name": "Homo sapiens", "children": [] } ] } ] } ] } ] } ] } ] }, { "name": "Plantae", "children": [ { "name": "nan", "children": [ { "name": "Magnoliopsida", "children": [ { "name": "Brassicales", "children": [ { "name": "Brassicaceae", "children": [ { "name": "Arabidopsis", "children": [ { "name": "Arabidopsis thaliana", "children": [] } ] } ] } ] }, { "name": "Fabales", "children": [ { "name": "Fabaceae", "children": [ { "name": "Phaseoulus", "children": [ { "name": "Phaseolus vulgaris", "children": [] } ] } ] } ] } ] } ] } ] } ]}
const log = console.log;

// Sample rows: a record id followed by the rank values, one record per line.
let data = `1,Animalia,Chordata,Mammalia,Primates,Hominidae,Homo,Homo sapiens
2,Animalia,Chordata,Mammalia,Carnivora,Canidae,Canis,Canis
3,Plantae,nan,Magnoliopsida,Brassicales,Brassicaceae,Arabidopsis,Arabidopsis thaliana
4,Plantae,nan,Magnoliopsida,Fabales,Fabaceae,Phaseoulus,Phaseolus vulgaris`;

// Make an array of rows, each an array of cell values.
data = data.split("\n").map((line) => line.split(","));

// Nested object keyed by rank value; the leaf of each row stores the row id.
const tree = {};

data.forEach((row) => {
  // The first cell is the record id, not a rank: keep it for the leaf.
  const [id, ...ranks] = row;

  // Every row starts again from the root of the tree.
  let cur = tree;

  ranks.forEach((value, i) => {
    const isLast = i === ranks.length - 1;

    // Create the branch if it does not exist yet; the last value of the row
    // stores the record id instead of a nested object.
    if (!cur[value]) cur[value] = isLast ? id : {};

    // Move one level down the hierarchy.
    cur = cur[value];
  });
});

log("Tree:");
log(JSON.stringify(tree, null, " "));

// Now the hierarchy is in `tree` and can be converted to any other shape.

/**
 * Convert a nested key/value tree into an array of `{ name, children }` nodes.
 *
 * @param {Object} obj - Tree level whose keys are node names.
 * @returns {Array<Object>} One node per key, in key order. Only values that
 *   are objects produce a `children` array; other values (the record ids at
 *   the leaves) produce a node with a `name` only.
 */
function toStruct(obj) {
  return Object.entries(obj).map(([name, child]) =>
    typeof child === "object" ? { name, children: toStruct(child) } : { name }
  );
}

const struct = toStruct(tree);
console.log("Struct:");
console.log(struct);