pdfminer.six/tools/pdf2txt.py

119 lines
4.2 KiB
Python
Raw Normal View History

2013-10-17 14:05:27 +00:00
#!/usr/bin/env python
import sys
2013-10-10 09:29:30 +00:00
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# main
def main(argv):
import getopt
def usage():
2013-11-25 09:21:19 +00:00
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
2013-11-25 09:21:19 +00:00
' [-t text|html|xml|tag] [-c codec] [-s scale]'
' file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
# debug option
debug = 0
# input option
2014-09-03 13:26:08 +00:00
password = b''
pagenos = set()
maxpages = 0
# output option
outfile = None
outtype = None
imagewriter = None
2013-11-25 09:21:19 +00:00
rotation = 0
stripcontrol = False
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
2011-03-02 15:04:43 +00:00
caching = True
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
2011-03-02 15:04:43 +00:00
elif k == '-C': caching = False
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
2010-12-26 08:26:39 +00:00
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': imagewriter = ImageWriter(v)
2013-11-25 09:21:19 +00:00
elif k == '-R': rotation = int(v)
elif k == '-S': stripcontrol = True
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
#
2011-04-21 13:07:52 +00:00
PDFDocument.debug = debug
PDFParser.debug = debug
CMapDB.debug = debug
PDFPageInterpreter.debug = debug
#
2011-03-02 15:04:43 +00:00
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'text'
if outfile:
if outfile.endswith('.htm') or outfile.endswith('.html'):
outtype = 'html'
elif outfile.endswith('.xml'):
outtype = 'xml'
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
2014-09-03 13:26:08 +00:00
outfp = open(outfile, 'wb')
else:
outfp = sys.stdout
2014-09-07 16:34:11 +00:00
if outfp.encoding is not None:
codec = None
if outtype == 'text':
2011-11-06 15:15:10 +00:00
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter,
stripcontrol=stripcontrol)
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter, debug=debug)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
else:
return usage()
for fname in args:
2014-09-03 13:26:08 +00:00
fp = open(fname, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp, pagenos,
maxpages=maxpages, password=password,
caching=caching, check_extractable=True):
2013-11-25 09:21:19 +00:00
page.rotate = (page.rotate+rotation) % 360
interpreter.process_page(page)
fp.close()
device.close()
outfp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))