Python module for converting PDF to text [closed]

python pdf text-extraction pdf-scraping

Try PDFMiner. It can extract text from PDF files as HTML, SGML or "Tagged PDF" format.

The Tagged PDF format seems to be the cleanest, and stripping out the XML tags leaves just the bare text.

A Python 3 version is available under:

https://github.com/pdfminer/pdfminer.six

python pdf text-extraction pdf-scraping

The PDFMiner package has changed since codeape posted.

EDIT (again):

PDFMiner has been updated again in version 20100213

You can check the version you have installed with the following:

>>> import pdfminer
>>> pdfminer.__version__
'20100213'

Here's the updated version (with comments on what I changed/added):

def pdf_to_csv(filename):
    """Extract the text of a PDF as ';'-separated rows, page by page.

    Written for PDFMiner 20100213 on Python 2.  For every page the output
    contains a ``START PAGE n`` line, the page's rows, and an ``END PAGE n``
    line.  Text items on a page are bucketed into rows by ``int(-y)`` (``y``
    being the last value of the item's bbox), rows are emitted in ascending
    key order, and the items of a row are ordered by ``x`` and joined
    with ``;``.

    :param filename: path of the PDF file to read.
    :returns: everything written to the in-memory output buffer, as a string.
    """
    from collections import defaultdict
    from cStringIO import StringIO  #<-- added so you can copy/paste this to try it
    from pdfminer.converter import LTTextItem, TextConverter
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each finished page as ';'-joined rows."""

        def end_page(self, i):
            # row key -> {x: encoded text}; items sharing a row key and x
            # overwrite each other, exactly as in the original snippet.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, LTTextItem):
                    (_, _, x, y) = child.bbox                   #<-- changed
                    line = lines[int(-y)]
                    line[x] = child.text.encode(self.codec)  #<-- changed

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8")  #<-- changed
        # because my test documents are utf-8 (note: utf-8 is the default codec)

    # The context manager and the finally clause guarantee that the file and
    # the device are released even when parsing raises part-way through.
    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)       #<-- changed
            parser.set_document(doc)     #<-- added
            doc.set_parser(parser)       #<-- added
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()

Edit (yet again):

Here is an update for the latest version in pypi, 20100619p1. In short, I replaced LTTextItem with LTChar and passed an instance of LAParams to the CsvConverter constructor.

def pdf_to_csv(filename):
    """Extract the text of a PDF as ';'-separated rows, page by page.

    Written for PDFMiner 20100619p1 on Python 2.  For every page the output
    contains a ``START PAGE n`` line, the page's rows, and an ``END PAGE n``
    line.  ``LTChar`` items on a page are bucketed into rows by ``int(-y)``
    (``y`` being the last value of the item's bbox), rows are emitted in
    ascending key order, and the items of a row are ordered by ``x`` and
    joined with ``;``.

    :param filename: path of the PDF file to read.
    :returns: everything written to the in-memory output buffer, as a string.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter    #<-- changed
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each finished page as ';'-joined rows."""

        def end_page(self, i):
            # row key -> {x: encoded text}; items sharing a row key and x
            # overwrite each other, exactly as in the original snippet.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, LTChar):               #<-- changed
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child.text.encode(self.codec)

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())  #<-- changed
        # because my test documents are utf-8 (note: utf-8 is the default codec)

    # The context manager and the finally clause guarantee that the file and
    # the device are released even when parsing raises part-way through.
    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()

EDIT (one more time):

Updated for version 20110515 (thanks to Oeufcoque Penteano!):

def pdf_to_csv(filename):
    """Extract the text of a PDF as ';'-separated rows, page by page.

    Written for PDFMiner 20110515 on Python 2.  For every page the output
    contains a ``START PAGE n`` line, the page's rows, and an ``END PAGE n``
    line.  ``LTChar`` items on a page are bucketed into rows by ``int(-y)``
    (``y`` being the last value of the item's bbox), rows are emitted in
    ascending key order, and the items of a row are ordered by ``x`` and
    joined with ``;``.

    :param filename: path of the PDF file to read.
    :returns: everything written to the in-memory output buffer, as a string.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each finished page as ';'-joined rows."""

        def end_page(self, i):
            # row key -> {x: encoded text}; items sharing a row key and x
            # overwrite each other, exactly as in the original snippet.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:                #<-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec) #<-- changed

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
        # because my test documents are utf-8 (note: utf-8 is the default codec)

    # The context manager and the finally clause guarantee that the file and
    # the device are released even when parsing raises part-way through.
    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()

python pdf text-extraction pdf-scraping

Since none of these solutions supports the latest version of PDFMiner, I wrote a simple solution that returns the text of a PDF using PDFMiner. This will work for those who are getting import errors with process_pdf.

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO


def pdfparser(data):
    """Print the text extracted from a PDF file (Python 2 version).

    :param data: path of the PDF file to read.

    Every page is run through a ``TextConverter`` that writes into an
    in-memory buffer; the buffer's contents are printed once all pages have
    been processed.  Nothing is returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager and the finally clause release the file and the
    # device even if a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
            # Read the buffer once, after the loop: a document without pages
            # then prints empty text rather than the input path, and the
            # buffer is not re-read on every page.
            text = retstr.getvalue()
        finally:
            device.close()

    print(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])

See below code that works for Python 3:

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io


def pdfparser(data):
    """Print the text extracted from a PDF file (Python 3 version).

    :param data: path of the PDF file to read.

    Every page is run through a ``TextConverter`` that writes into an
    in-memory ``io.StringIO`` buffer; the buffer's contents are printed once
    all pages have been processed.  Nothing is returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager and the finally clause release the file and the
    # device even if a page fails to parse.
    with open(data, 'rb') as fp:
        try:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
            # Read the buffer once, after the loop: a document without pages
            # then prints empty text rather than the input path, and the
            # buffer is not re-read on every page.
            text = retstr.getvalue()
        finally:
            device.close()

    print(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])

CodeHunter

Python module for converting PDF to text [closed]

Recent Posts

How can I color dots in a xy scatterplot according to column value?

How to update a claim in ASP.NET Identity?

What does {0} mean when initializing an object?

Accessing members of items in a JSONArray with Java

How to log SQL statements in Spring Boot?

Powershell Get-WebSite name parameter is ignored

How to detect scroll to bottom of html element

Java synchronized method

How to test controllers with CodeIgniter?

Detect Visual Composer

Matplotlib: Specify format of floats for tick labels

Rails join a list of strings with commas and "and" before the last