python nested json to csv/xlsx with specified headers python nested json to csv/xlsx with specified headers json json

python nested json to csv/xlsx with specified headers


Following code is able to parse the provided data as per expected format.

from typing import Listdef parse_recursive(dat)->List[List]:    ret=[]    if type(dat) is list:        for item in dat:            if type(item)==dict:                for k in item:                    #print(k, item[k], sep=" # ")#debug print                    if item[k]==[]: #empty list                        ret.append([k])                    else:                        for l in parse_recursive(item[k]):                            #print(k,l,sep=" : ") #debug print                            ret.append([k]+l) #always returns List of List            else: #Right now only possibility is string eg. "one", "two"                return [[",".join(dat)]]    else: #can be int or string eg. 97, "23"        return [[dat]]    return retdef write_to_csv(file_name:str, fields:List, row_data:List[List]):    import csv    with open(file_name, 'w') as csvfile:          # creating a csv writer object          csvwriter = csv.writer(csvfile)          # writing the fields          csvwriter.writerow(fields)          # writing the data rows          csvwriter.writerows(row_data)if __name__=="__main__":    org_data = [{"a": [        {"a1": [            {"id0": [                {                    "aa": [                        {"aaa": 97},                        {"aab": "one"}],                    "ab": [                        {"aba": 97},                        {"abb": ["one", "two"]}                        ]                }            ]            },            {"id1": [                {"aa": [                    {"aaa": 23}]}]}            ]        },        {"a2": []}        ]},        {"b": [{"b1": [{"Common": [{"bb": [{"value": 4}]}]}]}]}]    print(parse_recursive(org_data)) #Debug    file_name="data_file.csv"    fields=['Section', 'Subsection', 'pId', 'Group', 'Parameter', 'Value']    write_to_csv(file_name, fields, parse_recursive(org_data))

parse_recursive tries to parse arbitrary depth dictionary as per rule i tried deducing from your input and output formats.

Following is the output of parse_recursive for your provided input -

mahorir@mahorir-Vostro-3446:~/Desktop$ python3 so.py [['a', 'a1', 'id0', 'aa', 'aaa', 97], ['a', 'a1', 'id0', 'aa', 'aab', 'one'], ['a', 'a1', 'id0', 'ab', 'aba', 97], ['a', 'a1', 'id0', 'ab', 'abb', 'one,two'], ['a', 'a1', 'id1', 'aa', 'aaa', 23], ['a', 'a2'], ['b', 'b1', 'Common', 'bb', 'value', 4]]

write_to_csv is a trivial function that write to a csv file.


This was kind-of a fun problem...There are really two problems with the formatting here:

  1. The data is lists of dicts, where they really just wanted dictionaries. e.g. they wanted {"foo": 1, "bar": 2} but instead formatted it as [{"foo": 1}, {"bar": 2}].

    a. I'm not judging here. There may be reasons why they did this. It just makes it a bit annoying for us to parse.

  2. The data is sometimes truncated; if there are usually 5 levels deep, sometimes if they don't have data beyond a point, they just omit it. e.g. 'a2' in your example.

So I'll show two possible approaches to solving these problems.

The Pandas Way

This solution is a bit different from the other one mentioned here. Let me know what you think:

import pandas as pdfrom copy import deepcopyhdrs = ['Section', 'Subsection', 'pId', 'Group', 'Parameter', 'Value']js = [{"a": [{"a1": [{"id0": [{"aa": [{"aaa": 97}, {"aab": "one"}],                               "ab": [{"aba": 98}, {"abb": ["one", "two"]}]}]},                     {"id1": [{"aa": [{"aaa": 23}]}]}                    ]},             {"a2": []}            ]},      {"b": [{"b1": [{"Common": [{"bb": [{"value": 4}]}]}]}]}]def list_to_dict(lst):    """convert a list of dicts as you have to a single dict    The idea here is that you have a bunch of structures that look    like [{x: ...}, {y: ...}] that should probably have been stored as    {x:..., y:...}. So this function does that (but just one level in).        Note:    If there is a duplicate key in one of your dicts (meaning you have    something like [{x:...},...,{x:...}]), then this function will overwrite    it without warning!    """    d = {}    for new_d in lst:        d.update(new_d)    return ddef recursive_parse(lst, levels):    "Parse the nested json into a single pandas dataframe"    name = levels.pop(0)  # I should have used a counter instead    d = list_to_dict(lst)  # get a sensible dict instead of the list of dicts    if len(levels) <= 1: # meaning there are no more levels to be parsed.        if len(d) == 0:            d = {'': ''} # to handle the uneven depths (e.g. think 'a2')        return pd.Series(d, name=levels[-1])    if len(d) == 0: # again to handle the uneven depths of json        d = {'': []}    # below is a list-comprehension to recursively parse the thing.    d = {k: recursive_parse(v, deepcopy(levels)) for k, v in d.items()}    return pd.concat(d)def json_to_df(js, headers):    "calls recursive_parse, and then adds the column names and whatnot"    df = recursive_parse(js, deepcopy(headers))    df.index.names = headers[:-1]    df = df.reset_index()    return dfdf = json_to_df(js, hdrs)display(df)

And the output is exactly the dataframe you want (but with an index column you may not want). If you write it to csv after, do so like this:

df.to_csv('path/to/desired/file.csv', index=False)

Does that make sense?

The minimalist way

Better version (not using pandas)...

import csvhdrs = ['Section', 'Subsection', 'pId', 'Group', 'Parameter', 'Value']js = [{"a": [{"a1": [{"id0": [{"aa": [{"aaa": 97}, {"aab": "one"}],                               "ab": [{"aba": 98}, {"abb": ["one", "two"]}]}]},                     {"id1": [{"aa": [{"aaa": 23}]}]}                    ]},             {"a2": []}            ]},      {"b": [{"b1": [{"Common": [{"bb": [{"value": 4}]}]}]}]}]def list_of_dicts_to_lists(lst, n_levels=len(hdrs)):    if n_levels == 1:        if isinstance(lst, list):            if len(lst) == 0: # we fill the shorter ones with empty lists                lst = None # replacing them back to None            else: # [1, 2] => "1,2"                lst = ','.join(str(x) for x in lst if x is not None)        return [[lst]] # the later ones are going to be lists of lists so let's start out that way to keep everything consistent.    if len(lst) == 0:        lst = [{None: []}] # filling with an empty list    output = []    for d in lst:        for k, v in d.items():            tmp = list_of_dicts_to_lists(v, n_levels - 1)            for x in tmp:                output.append([k] + x)    return outputdef to_csv(values, header, outfile):    with open(outfile, 'w', newline='') as csv_file:        # pretty much straight from the docs @        # https://docs.python.org/3.7/library/csv.html        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)        csv_writer.writerow(header)        for line in values:            csv_writer.writerow(line)    return Truerows = list_of_dicts_to_lists(js)to_csv(rows, hdrs, 'tmp.csv')

I see now that this solution is super similar to the other answer here... My bad.