Batch fill PDF forms from python or bash
For Python you'll need the fdfgen lib and pdftk
@Hugh Bothwell's comment is 100% correct so I'll extend that answer with a working implementation.
If you're on Windows, you'll also need to make sure both Python and pdftk are on the system PATH (unless you want to type out their full folder paths every time).
Here's the code to auto-batch-fill a collection of PDF forms from a CSV data file:
"""Batch-fill a PDF form template from the rows of a CSV file.

Each data row of the CSV produces one filled PDF, generated by writing a
temporary FDF file with fdfgen and merging it into the template with the
external ``pdftk`` program.
"""
import csv
import os
import subprocess
import sys

sys.path.insert(0, os.getcwd())

# --- Configuration: edit these to match your own files ---------------------
filename_prefix = "NVC"      # prefix of every generated PDF file name
csv_file = "NVC.csv"         # data source; first row holds the field names
pdf_file = "NVC.pdf"         # PDF template containing the editable fields
tmp_file = "tmp.fdf"         # scratch FDF file, deleted after each form
output_folder = './output/'  # destination folder (keep the trailing slash)


def process_csv(file):
    """Read a CSV file into a list of records.

    The first row is taken as the header row. Every following non-blank
    row becomes one record: a list of ``(header, value)`` tuples, one per
    header column, in column order.

    Rows shorter than the header row are padded with empty strings, cells
    beyond the last header column are dropped, and completely blank lines
    are skipped.

    :param file: path of the CSV file to read.
    :return: list of records; empty if the file has no data rows.
    """
    # newline='' is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(file, newline='') as handle:
        reader = csv.reader(handle)
        headers = next(reader, [])
        records = []
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            padded = row + [''] * (len(headers) - len(row))
            # zip() stops at the shorter sequence, which drops extra cells.
            records.append(list(zip(headers, padded)))
        return records


def form_fill(fields):
    """Fill the PDF template with one record and write the result.

    The output file is named ``<output_folder><filename_prefix> <value>.pdf``
    where ``<value>`` is the value of the record's second column.

    :param fields: one record as returned by :func:`process_csv`; it must
        contain at least two ``(name, value)`` pairs.
    """
    # Imported here so the CSV handling stays usable without fdfgen.
    from fdfgen import forge_fdf

    fdf = forge_fdf("", fields, [], [], [])
    output_file = '{0}{1} {2}.pdf'.format(
        output_folder, filename_prefix, fields[1][1])
    try:
        # forge_fdf returns bytes, so the file must be opened in binary mode.
        with open(tmp_file, "wb") as fdf_file:
            fdf_file.write(fdf)
        # An argument list (no shell) keeps quotes or shell metacharacters
        # in CSV values from breaking or altering the command.
        status = subprocess.call([
            'pdftk', pdf_file, 'fill_form', tmp_file,
            'output', output_file, 'dont_ask',
        ])
        if status != 0:
            sys.stderr.write(
                'pdftk exited with status {0} for "{1}"\n'.format(
                    status, output_file))
    finally:
        # Always clean up the scratch file, even when a step above failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


def main():
    """Generate one filled PDF for every eligible row of ``csv_file``."""
    # pdftk cannot write into a folder that does not exist.
    os.makedirs(output_folder, exist_ok=True)

    data = process_csv(csv_file)
    print('Generating Forms:')
    print('-----------------------')
    for record in data:
        # Rows whose first column is exactly 'Yes' are skipped.
        # NOTE(review): presumably an "already done" flag - confirm.
        if record[0][1] == 'Yes':
            continue
        print('{0} {1} created...'.format(filename_prefix, record[1][1]))
        form_fill(record)


if __name__ == '__main__':
    main()
Note: It shouldn't be rocket-surgery to figure out how to customize this. The initial variable declarations contain the custom configuration.
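In the CSV, each column of the first row must contain the name of the corresponding field in the PDF file. Any columns that don't have corresponding fields in the template will be ignored. Note that the script names each output file after the value in the second column and skips any row whose first column is 'Yes'.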
In the PDF template, just create editable fields where you want your data to fill and make sure the names match up with the CSV data.
For this specific configuration, just put this file in the same folder as your NVC.csv, NVC.pdf, and a folder named 'output'. Run it and it automagically does the rest.
Much faster version that needs neither pdftk nor fdfgen, just Python 3.6+ and the PyPDF2 package:
# -*- coding: utf-8 -*-
"""Read and fill interactive PDF form fields with PyPDF2."""
from collections import OrderedDict

from PyPDF2 import PdfFileWriter, PdfFileReader


def _getFields(obj, tree=None, retval=None, fileobj=None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    :param obj: the reader object whose ``trailer``, ``_checkKids`` and
        ``_buildField`` members are used to walk the form tree.
    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent',
                       '/T': 'Field Name', '/TU': 'Alternate Field Name',
                       '/TM': 'Mapping Name', '/Ff': 'Field Flags',
                       '/V': 'Value', '/DV': 'Default Value'}
    if retval is None:
        # An ordered mapping keeps the fields in the order they are found.
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" in catalog:
            tree = catalog["/AcroForm"]
        else:
            return None
    if tree is None:
        return retval

    obj._checkKids(tree, retval, fileobj)
    for attr in fieldAttributes:
        if attr in tree:
            # Tree is a field
            obj._buildField(tree, retval, fileobj, fieldAttributes)
            break

    if "/Fields" in tree:
        fields = tree["/Fields"]
        for f in fields:
            field = f.getObject()
            obj._buildField(field, retval, fileobj, fieldAttributes)

    return retval


def get_form_fields(infile):
    """Return the form fields of a PDF as an ordered name -> value mapping.

    :param infile: path of the PDF file to inspect.
    :return: ``OrderedDict`` mapping each field name to its ``/V`` entry,
        or ``''`` when the field has none. The mapping is empty when the
        document has no ``/AcroForm`` entry.
    """
    # Build the result while the file is still open, then close it.
    with open(infile, 'rb') as handle:
        fields = _getFields(PdfFileReader(handle))
        if fields is None:
            # No /AcroForm in the document: report "no fields".
            return OrderedDict()
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())


def update_form_values(infile, outfile, newvals=None):
    """Copy ``infile`` to ``outfile`` with its form fields filled in.

    :param infile: path of the source PDF.
    :param outfile: path of the PDF to write.
    :param newvals: mapping of field name -> new value. When omitted or
        empty, every field is filled with ``'#<index> <name>=<value>'``
        built from its own position, name and current value, which is
        handy for discovering the field names of an unknown form.

    A page whose field update raises is still copied to the output; the
    exception is printed with ``repr`` and processing continues.
    """
    if not newvals:
        # Computed once up front: it depends only on the file, not the page.
        newvals = {
            k: f'#{i} {k}={v}'
            for i, (k, v) in enumerate(get_form_fields(infile).items())
        }

    writer = PdfFileWriter()
    # The source stays open until the output has been written, because the
    # writer still reads page data from it at write time.
    with open(infile, 'rb') as source:
        pdf = PdfFileReader(source)
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i)
            try:
                writer.updatePageFormFieldValues(page, newvals)
            except Exception as e:
                # Best effort: report the problem and keep the page as is.
                print(repr(e))
            # Added exactly once, whether or not the update succeeded.
            writer.addPage(page)

        with open(outfile, 'wb') as out:
            writer.write(out)


if __name__ == '__main__':
    from pprint import pprint

    pdf_file_name = '2PagesFormExample.pdf'

    pprint(get_form_fields(pdf_file_name))

    # enumerate & fill the fields with their own names
    update_form_values(pdf_file_name, 'out-' + pdf_file_name)

    # update the form fields
    update_form_values(pdf_file_name, 'out2-' + pdf_file_name,
                       {'my_fieldname_1': 'My Value',
                        'my_fieldname_2': 'My Another 💎alue'})
Replace Original File
import os
import subprocess

# Fill the form into a separate output file first. check_call raises
# CalledProcessError when pdftk exits with a non-zero status, so the original
# PDF is never touched unless the filled copy was produced successfully.
try:
    subprocess.check_call([
        'pdftk', 'original.pdf', 'fill_form', 'data.fdf',
        'output', 'output.pdf',
    ])
finally:
    # The FDF data file is scratch data and is removed either way.
    os.remove("data.fdf")

# os.replace overwrites the destination in one step, so there is no moment
# at which "original.pdf" has been deleted but not yet replaced.
os.replace("output.pdf", "original.pdf")