How to parse complex text files using Python? How to parse complex text files using Python? pandas pandas

How to parse complex text files using Python?


Update 2019 (PEG parser):

This answer has received quite some attention so I felt to add another possibility, namely a parsing option. Here we could use a PEG parser instead (e.g. parsimonious) in combination with a NodeVisitor class:

from parsimonious.grammar import Grammarfrom parsimonious.nodes import NodeVisitorimport pandas as pdgrammar = Grammar(    r"""    schools         = (school_block / ws)+    school_block    = school_header ws grade_block+     grade_block     = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws?     school_header   = ~"^School = (.*)"m    grade_header    = ~"^Grade = (\d+)"m    name_header     = "Student number, Name"    score_header    = "Student number, Score"    number_name     = index comma name ws    number_score    = index comma score ws    comma           = ws? "," ws?    index           = number+    score           = number+    number          = ~"\d+"    name            = ~"[A-Z]\w+"    ws              = ~"\s*"    """)tree = grammar.parse(data)class SchoolVisitor(NodeVisitor):    output, names = ([], [])    current_school, current_grade = None, None    def _getName(self, idx):        for index, name in self.names:            if index == idx:                return name    def generic_visit(self, node, visited_children):        return node.text or visited_children    def visit_school_header(self, node, children):        self.current_school = node.match.group(1)    def visit_grade_header(self, node, children):        self.current_grade = node.match.group(1)        self.names = []    def visit_number_name(self, node, children):        index, name = None, None        for child in node.children:            if child.expr.name == 'name':                name = child.text            elif child.expr.name == 'index':                index = child.text        self.names.append((index, name))    def visit_number_score(self, node, children):        index, score = None, None        for child in node.children:            if child.expr.name == 'index':                index = child.text            elif child.expr.name == 'score':                score = child.text        name = self._getName(index)        # build the entire entry        entry = (self.current_school, self.current_grade, index, name, score)        self.output.append(entry)sv = SchoolVisitor()sv.visit(tree)df = pd.DataFrame.from_records(sv.output, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])print(df)

Regex option (original answer)

Well then, watching Lord of the Rings the xth time, I had to bridge some time to the very finale:


Broken down, the idea is to split the problem up into several smaller problems:

  1. Separate each school
  2. ... each grade
  3. ... student and scores
  4. ... bind them together in a dataframe afterwards


The school part (see a demo on regex101.com)

^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)


The grade part (another demo on regex101.com)

^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)


The student/score part (last demo on regex101.com):

^Student\ number,\ Name[\n\r](?P<student_names>(?:^\d+.+[\n\r])+)\s*^Student\ number,\ Score[\n\r](?P<student_scores>(?:^\d+.+[\n\r])+)

The rest is a generator expression which is then fed into the DataFrame constructor (along with the column names).


The code:

import pandas as pd, rerx_school = re.compile(r'''    ^    School\s*=\s*(?P<school_name>.+)    (?P<school_content>[\s\S]+?)    (?=^School|\Z)''', re.MULTILINE | re.VERBOSE)rx_grade = re.compile(r'''    ^    Grade\s*=\s*(?P<grade>.+)    (?P<students>[\s\S]+?)    (?=^Grade|\Z)''', re.MULTILINE | re.VERBOSE)rx_student_score = re.compile(r'''    ^    Student\ number,\ Name[\n\r]    (?P<student_names>(?:^\d+.+[\n\r])+)    \s*    ^    Student\ number,\ Score[\n\r]    (?P<student_scores>(?:^\d+.+[\n\r])+)''', re.MULTILINE | re.VERBOSE)result = ((school.group('school_name'), grade.group('grade'), student_number, name, score)    for school in rx_school.finditer(string)    for grade in rx_grade.finditer(school.group('school_content'))    for student_score in rx_student_score.finditer(grade.group('students'))    for student in zip(student_score.group('student_names')[:-1].split("\n"), student_score.group('student_scores')[:-1].split("\n"))    for student_number in [student[0].split(", ")[0]]    for name in [student[0].split(", ")[1]]    for score in [student[1].split(", ")[1]])df = pd.DataFrame(result, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])print(df)


Condensed:

rx_school = re.compile(r'^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)', re.MULTILINE)rx_grade = re.compile(r'^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)', re.MULTILINE)rx_student_score = re.compile(r'^Student number, Name[\n\r](?P<student_names>(?:^\d+.+[\n\r])+)\s*^Student number, Score[\n\r](?P<student_scores>(?:^\d+.+[\n\r])+)', re.MULTILINE)


This yields

            School Grade Student number      Name Score0   Riverdale High     1              0    Phoebe     31   Riverdale High     1              1    Rachel     72   Riverdale High     2              0    Angela     63   Riverdale High     2              1   Tristan     34   Riverdale High     2              2    Aurora     95         Hogwarts     1              0     Ginny     86         Hogwarts     1              1      Luna     77         Hogwarts     2              0     Harry     58         Hogwarts     2              1  Hermione    109         Hogwarts     3              0      Fred     010        Hogwarts     3              1    George     0


As for timing, this is the result running it a ten thousand times:

import timeitprint(timeit.timeit(makedf, number=10**4))# 11.918397722000009 s


here is my suggestion using split and pd.concat ("txt" stands for a copy of the original text in the question),basicly the idea is to split by the group words and then concat into data frames, the most inner parsing takes advantage of the fact that the names and grades are in a csv like format.here goes:

import pandas as pdfrom io import StringIOschools = txt.lower().split('school = ')schools_dfs = []for school in schools[1:]:    grades = school.split('grade = ')     grades_dfs = []    for grade in grades[1:]:        features = grade.split('student number,')        feature_dfs = []        for feature in features[1:]:            feature_dfs.append(pd.read_csv(StringIO(feature)))        feature_df = pd.concat(feature_dfs, axis=1)        feature_df['grade'] = features[0].replace('\n','')        grades_dfs.append(feature_df)    grades_df = pd.concat(grades_dfs)    grades_df['school'] = grades[0].replace('\n','')    schools_dfs.append(grades_df)schools_df = pd.concat(schools_dfs)schools_df.set_index(['school', 'grade'])

enter image description here


I would suggest using a parser combinator library like parsy. Compared to using regexes, the result will not be as concise, but it will be much more readable and robust, while still being relatively light-weight.

Parsing is in general quite a hard task, and an approach that is good for people at beginner level for general programming might be hard to find.

EDIT:Some actual example code that does minimal parsing of your supplied example. It does not pass to pandas, or even match up names to scores, or students to grades etc. - it just returns a hierarchy of objects starting with School at the top, with the relevant attributes as you would expect:

from parsy import string, regex, seqimport attr@attr.sclass Student():    name = attr.ib()    number = attr.ib()@attr.sclass Score():    score = attr.ib()    number = attr.ib()@attr.sclass Grade():    grade = attr.ib()    students = attr.ib()    scores = attr.ib()@attr.sclass School():    name = attr.ib()    grades = attr.ib()integer = regex(r"\d+").map(int)student_number = integerscore = integerstudent_name = regex(r"[^\n]+")student_def = seq(student_number.tag('number') << string(", "),                  student_name.tag('name') << string("\n")).combine_dict(Student)student_def_list = string("Student number, Name\n") >> student_def.many()score_def = seq(student_number.tag('number') << string(", "),                score.tag('score') << string("\n")).combine_dict(Score)score_def_list = string("Student number, Score\n") >> score_def.many()grade_value = integergrade_def = string("Grade = ") >> grade_value << string("\n")school_grade = seq(grade_def.tag('grade'),                   student_def_list.tag('students') << regex(r"\n*"),                   score_def_list.tag('scores') << regex(r"\n*")                   ).combine_dict(Grade)school_name = regex(r"[^\n]+")school_def = string("School = ") >> school_name << string("\n")school = seq(school_def.tag('name'),             school_grade.many().tag('grades')             ).combine_dict(School)def parse(text):    return school.many().parse(text)

This is much more verbose than a regex solution, but much closer to a declarative definition of your file format.