How do I use pdfminer as a library
Here is a new solution that works with the latest version:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


def convert_pdf_to_txt(path):
    """Return the text extracted from the PDF file at ``path``.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` that feeds a ``TextConverter`` writing into an
    in-memory buffer; the buffer's contents are returned.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the buffer the converter wrote to (text encoded
        with the ``utf-8`` codec handed to ``TextConverter``).

    Raises:
        IOError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        file, the converter and the buffer are still closed in that case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty password, no page limit and no page filter: these are
            # the same values the original snippet passed to get_pages.
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as the original
        # did, and before the buffer itself is released.
        return retstr.getvalue()
    finally:
        retstr.close()
Here is a cleaned-up version I finally produced that worked for me. The following simply returns the text of a PDF as a string, given its filename. I hope this saves someone time.
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO


def convert_pdf(path):
    """Return the text extracted from the PDF file at ``path``.

    The file is handed to pdfminer's ``process_pdf`` together with a
    ``TextConverter`` that writes into an in-memory buffer; the buffer's
    contents are returned.

    NOTE(review): this depends on ``pdfminer.pdfinterp.process_pdf``, which
    is reportedly only available in pdfminer releases that predate the
    November 2013 API changes -- confirm against the installed version.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the buffer the converter wrote to (text encoded
        with the ``utf-8`` codec handed to ``TextConverter``).

    Raises:
        IOError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        file, the converter and the buffer are still closed in that case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as the original
        # did, and before the buffer itself is released.
        return retstr.getvalue()
    finally:
        retstr.close()
This solution was valid until API changes in November 2013.
I know it is poor taste to answer your own question, but I think I may have figured this out and I don't want anyone else to waste their time looking for a solution to my problem.
I followed the suggestion in one of the links posted in my question and repurposed the current pdf2txt.py script included with pdfminer. Here is the function in case it is useful to anyone else. Thanks to the user skyl for posting that answer; all I had to do was make a couple of changes to make it work with the current version of pdfminer.
This function takes a PDF and creates a .txt file in the same directory with the same name.
def convert_pdf(path, outtype='txt', opts=None):
    """Extract the text of the PDF at ``path`` and write it to a file.

    Unless overridden with the ``-o`` option, the output file sits next to
    the input and has the same name with its extension replaced by
    ``outtype`` (``report.pdf`` -> ``report.txt``). The output is always
    produced by pdfminer's ``TextConverter``, whatever ``outtype`` is.

    NOTE(review): this depends on ``pdfminer.pdfinterp.process_pdf``, so it
    presumably requires a pdfminer release that still ships that function
    -- confirm against the installed version.

    Args:
        path: Filesystem path of the PDF to read.
        outtype: Extension (without the dot) used to build the default
            output file name.
        opts: Optional pdf2txt.py-style options, either as an iterable of
            ``(flag, value)`` pairs (the shape ``getopt.getopt`` returns)
            or as a dict mapping flag to value. Recognised flags:
            ``-d`` (increase debug level), ``-p`` (comma-separated 1-based
            page numbers), ``-m`` (max pages), ``-P`` (password),
            ``-o`` (output file; an empty value selects ``sys.stdout``),
            ``-n`` (disable layout analysis), ``-A``, ``-V``, ``-M``,
            ``-L``, ``-W``, ``-F`` (``LAParams`` tweaks) and ``-c``
            (codec). Any other flag is ignored.

    Returns:
        None. The extracted text is written to the output file.

    Raises:
        IOError: If the input or output file cannot be opened.
        ValueError: If a numeric option value cannot be converted.
        Any exception raised by pdfminer while parsing is propagated; the
        files and the converter are still closed in that case.
    """
    import os
    import sys
    from pdfminer.pdfinterp import (PDFResourceManager, PDFPageInterpreter,
                                    process_pdf)
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams

    # Swap the real extension, whatever its length, instead of blindly
    # chopping the last three characters off the path.
    outfile = os.path.splitext(path)[0] + '.' + outtype

    # A None default avoids sharing one mutable dict between calls; a dict
    # must be iterated through items(), otherwise each two-character key
    # such as '-d' would itself be unpacked into ('-', 'd').
    if opts is None:
        opts = ()
    elif isinstance(opts, dict):
        opts = opts.items()

    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    laparams = LAParams()

    # Flags whose value is stored as a float attribute on LAParams.
    float_laparams = {
        '-M': 'char_margin',
        '-L': 'line_margin',
        '-W': 'word_margin',
        '-F': 'boxes_flow',
    }

    for k, v in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are given 1-based but stored 0-based.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k in float_laparams:
            setattr(laparams, float_laparams[k], float(v))
        elif k == '-c':
            codec = v
        # '-Y', '-O', '-t' and '-s' only set values that the text
        # converter below never reads, so they fall through here together
        # with every unknown flag.

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos,
                            maxpages=maxpages, password=password,
                            check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the interpreter's own stdout, only files opened here.
        if outfp is not sys.stdout:
            outfp.close()