How do I use pdfminer as a library? (python)

How do I use pdfminer as a library


Here is a new solution that works with the latest version:

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3: cStringIO no longer exists

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path):
    """Return the text extracted from the PDF file at ``path``.

    Every page yielded by ``PDFPage.get_pages`` is fed through a
    ``PDFPageInterpreter`` whose output device is a ``TextConverter``
    writing into an in-memory buffer; the buffer contents are returned.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The accumulated contents of the in-memory output buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        try:
            # ``open`` instead of the Python-2-only ``file`` builtin; the
            # ``with`` block closes the PDF even if parsing raises.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # NOTE(review): empty page set and maxpages=0 presumably mean
                # "all pages" -- confirm against the pdfminer documentation.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer, as the original did.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


Here is a cleaned-up version I finally produced that worked for me. Given a filename, it simply returns the text of the PDF as a string. I hope this saves someone time.

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3: cStringIO no longer exists

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf


def convert_pdf(path):
    """Return the text extracted from the PDF file at ``path``.

    Uses pdfminer's ``process_pdf`` helper with a ``TextConverter`` that
    writes into an in-memory buffer, then returns the buffer contents.

    NOTE(review): this relies on ``process_pdf``; the surrounding text says
    this approach stopped working after pdfminer API changes in November
    2013, so it presumably requires an older pdfminer release -- confirm.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The accumulated contents of the in-memory output buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        try:
            # ``open`` instead of the Python-2-only ``file`` builtin; the
            # ``with`` block closes the PDF even if parsing raises.
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            # Close the device before reading the buffer, as the original did.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()

This solution was valid until API changes in November 2013.


I know it is poor taste to answer your own question, but I think I may have figured this out and I don't want anyone else to waste their time looking for a solution to my problem.

I followed the suggestion in one of the links posted in my question and re-purposed the current pdf2txt.py script included with pdfminer. Here is the function in case it is useful to anyone else. Thanks to the user skyl for posting that answer; all I had to do was make a couple of changes to make it work with the current version of pdfminer.

This function takes a PDF and creates a .txt file in the same directory with the same name.

def convert_pdf(path, outtype='txt', opts=None):
    """Extract the text of the PDF at ``path`` into a sibling output file.

    By default the output is written next to the input, with the input's
    extension replaced by ``outtype`` (``report.pdf`` -> ``report.txt``).
    The logic is adapted from pdfminer's ``pdf2txt.py`` script and relies
    on pdfminer's ``process_pdf`` helper.

    Args:
        path: Filesystem path of the PDF to read.
        outtype: Extension (without the dot) used for the default output
            file name.
        opts: Optional pdf2txt.py-style options, either as a mapping
            (``{'-m': '3'}``) or as getopt-style ``(flag, value)`` pairs.
            Flags acted on: ``-d`` (debug level), ``-p`` (comma-separated
            1-based page numbers), ``-m`` (max pages), ``-P`` (password),
            ``-o`` (output file), ``-n`` (disable layout analysis),
            ``-A``/``-V``/``-M``/``-L``/``-W``/``-F`` (layout parameters)
            and ``-c`` (codec). ``-Y``, ``-O``, ``-t`` and ``-s`` are
            accepted for compatibility but have no effect on text output.

    Returns:
        None. The extracted text is written to the output file (or to
        ``sys.stdout`` when ``-o`` is given an empty value).
    """
    import os
    import sys

    from pdfminer.cmapdb import CMapDB
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.pdfinterp import (
        PDFPageInterpreter,
        PDFResourceManager,
        process_pdf,
    )

    # Normalise ``opts`` to a list of (flag, value) pairs. Iterating a dict
    # directly yields only its keys, which previously caused mapping-style
    # options to be silently ignored.
    if opts is None:
        option_pairs = []
    elif hasattr(opts, 'items'):
        option_pairs = list(opts.items())
    else:
        option_pairs = list(opts)

    # splitext rather than path[:-3], so extensions of any length work.
    outfile = os.path.splitext(path)[0] + '.' + outtype

    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    laparams = LAParams()

    for flag, value in option_pairs:
        if flag == '-d':
            debug += 1
        elif flag == '-p':
            # Page numbers are given 1-based; the page set is 0-based.
            pagenos.update(int(x) - 1 for x in value.split(','))
        elif flag == '-m':
            maxpages = int(value)
        elif flag == '-P':
            password = value
        elif flag == '-o':
            outfile = value
        elif flag == '-n':
            laparams = None
        elif flag == '-A':
            laparams.all_texts = True
        elif flag == '-V':
            laparams.detect_vertical = True
        elif flag == '-M':
            laparams.char_margin = float(value)
        elif flag == '-L':
            laparams.line_margin = float(value)
        elif flag == '-W':
            laparams.word_margin = float(value)
        elif flag == '-F':
            laparams.boxes_flow = float(value)
        elif flag == '-c':
            codec = value
        # '-Y', '-O', '-t' and '-s' only set values the text conversion
        # never read, so they are accepted and ignored.

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    with open(path, 'rb') as fp:
        outfp = open(outfile, 'w') if outfile else sys.stdout
        try:
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, check_extractable=True)
            finally:
                device.close()
        finally:
            # Never close the interpreter's own stdout.
            if outfp is not sys.stdout:
                outfp.close()
    return