Writing Nested JSON Dictionary List To CSV
I'm sure someone will come by with a much more elegant solution. That being said:
You have a few problems.
- You have inconsistent entries with the fields you want to align.
- Even if you pad your data you have intermediate
list
s that need flattened out. - Then you still have separated data that needs to be merged together.
DictWriter
AFAIK expects it's data in the format of[{'column': 'entry'},{'column': 'entry'}
so even if you do all the previous steps you're still not in the right format.
So let's get started.
For the first two parts we can combine.
def pad_list(lst, size, padding=None): # we wouldn't have to make a copy but I prefer to # avoid the possibility of getting bitten by mutability _lst = lst[:] for _ in range(len(lst), size): _lst.append(padding) return _lst# this expects already parsed json datadef flatten(json_data): lst = [] for dct in json_data: # here we're just setting a max size of all dict entries # this is in case the shorter entry is in the first iteration max_size = 0 # we initialize a dict for each of the list entries # this is in case you have inconsistent lengths between lists flattened = dict() for k, v in dct.items(): entries = list(next(iter(v), dict()).values()) flattened[k] = entries max_size = max(len(entries), max_size) # here we append the padded version of the keys for the dict lst.append({k: pad_list(v, max_size) for k, v in flattened.items()}) return lst
So now we have a flattened, list of dict
s whos values are list
s of consistent length. Essentially:
[ { "Basic_Information_Source": [ "image1.png", "PNG", "RGB", 574, 262, 277274 ], "Basic_Information_Destination": [ "image1_dst.png", "PNG", "RGB", 574, 262, 277539 ], "Values": [ 75.05045463635267, 0.006097560975609756, 0.045083481733371615, 0.008639858263904898, None, None ] }]
But this list
has multiple dict
s that need to be merged, not just one.
So we need to merge.
# this should be self explanatorydef merge(flattened): merged = dict() for dct in flattened: for k, v in dct.items(): if k not in merged: merged[k] = [] merged[k].extend(v) return merged
This gives us something close to this:
{ "Basic_Information_Source": [ "image1.png", "PNG", "RGB", 574, 262, 277274, "image2.png", "PNG", "RGB", 1600, 1066, 1786254 ], "Basic_Information_Destination": [ "image1_dst.png", "PNG", "RGB", 574, 262, 277539, "image2_dst.png", "PNG", "RGB", 1600, 1066, 1782197 ], "Values": [ 75.05045463635267, 0.006097560975609756, 0.045083481733371615, 0.008639858263904898, None, None, 85.52662890580055, 0.0005464352720450282, 0.013496113910369758, 0.003800236380811839, None, None ]}
But wait, we still need to format it for the writer.
Our data needs to be in the format of [{'column_1': 'entry', column_2: 'entry'},{'column_1': 'entry', column_2: 'entry'}
So we format:
def format_for_writer(merged): formatted = [] for k, v in merged.items(): for i, item in enumerate(v): # on the first pass this will append an empty dict # on subsequent passes it will be ignored # and add keys into the existing dict if i >= len(formatted): formatted.append(dict()) formatted[i][k] = item return formatted
So finally, we have a nice clean formatted data structure we can just hand to our writer function.
def convert_csv(formatted): keys = formatted[0].keys() with open('test.csv', 'w', encoding='utf8', newline='') as output_file: dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(formatted)
Full code with json
string:
import jsonimport csvjson_raw = """\[ { "Basic_Information_Source": [ { "Image": "image1.png", "Image_Format": "PNG", "Image_Mode": "RGB", "Image_Width": 574, "Image_Height": 262, "Image_Size": 277274 } ], "Basic_Information_Destination": [ { "Image": "image1_dst.png", "Image_Format": "PNG", "Image_Mode": "RGB", "Image_Width": 574, "Image_Height": 262, "Image_Size": 277539 } ], "Values": [ { "Value1": 75.05045463635267, "Value2": 0.006097560975609756, "Value3": 0.045083481733371615, "Value4": 0.008639858263904898 } ] }, { "Basic_Information_Source": [ { "Image": "image2.png", "Image_Format": "PNG", "Image_Mode": "RGB", "Image_Width": 1600, "Image_Height": 1066, "Image_Size": 1786254 } ], "Basic_Information_Destination": [ { "Image": "image2_dst.png", "Image_Format": "PNG", "Image_Mode": "RGB", "Image_Width": 1600, "Image_Height": 1066, "Image_Size": 1782197 } ], "Values": [ { "Value1": 85.52662890580055, "Value2": 0.0005464352720450282, "Value3": 0.013496113910369758, "Value4": 0.003800236380811839 } ] }]"""def pad_list(lst, size, padding=None): _lst = lst[:] for _ in range(len(lst), size): _lst.append(padding) return _lstdef flatten(json_data): lst = [] for dct in json_data: max_size = 0 flattened = dict() for k, v in dct.items(): entries = list(next(iter(v), dict()).values()) flattened[k] = entries max_size = max(len(entries), max_size) lst.append({k: pad_list(v, max_size) for k, v in flattened.items()}) return lstdef merge(flattened): merged = dict() for dct in flattened: for k, v in dct.items(): if k not in merged: merged[k] = [] merged[k].extend(v) return mergeddef format_for_writer(merged): formatted = [] for k, v in merged.items(): for i, item in enumerate(v): if i >= len(formatted): formatted.append(dict()) formatted[i][k] = item return formatteddef convert_csv(formatted): keys = formatted[0].keys() with open('test.csv', 'w', encoding='utf8', newline='') as output_file: dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(formatted)def main(): json_data = json.loads(json_raw) flattened = flatten(json_data) merged = merge(flattened) formatted = format_for_writer(merged) convert_csv(formatted)if __name__ == '__main__': main()