pdfminer.six/tools/pdf2txt.py

226 lines
9.7 KiB
Python
Raw Normal View History

2013-10-17 14:05:27 +00:00
#!/usr/bin/env python
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys
import logging
import six
2013-10-10 09:29:30 +00:00
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# main
def main(argv):
    """Parse command-line options and convert the given PDF files.

    Args:
        argv: Full argument vector with the program name first
            (``sys.argv`` style). When empty or None, argparse falls back
            to ``sys.argv[1:]``.

    Returns:
        None on success, or 100 when the requested output type is not one
        of text/html/xml/tag. Malformed arguments make argparse raise
        SystemExit.
    """
    import argparse

    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("files", type=str, nargs="+", help="Files to process.")
    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
    P.add_argument("--page-numbers", type=int, nargs="+", help="Alternative to --pagenos with space-separated numbers; supersedes --pagenos where it is used.")
    P.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
    P.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF")
    # The output file is taken as a plain path (not argparse.FileType) so it
    # can be opened in binary mode below.
    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
    P.add_argument("-t", "--output_type", type=str, default="text", help="Output type: text|html|xml|tag (default is text)")
    P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    P.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
    P.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
    P.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
    P.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    # Honour the argv handed to us instead of silently re-reading sys.argv.
    A = P.parse_args(argv[1:] if argv else None)

    if A.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # Resolve the output type first so an invalid value is rejected before
    # any output file is created or truncated.
    to_stdout = A.outfile == "-"
    output_type = A.output_type
    if output_type == "text" and not to_stdout:
        # With the default type, infer the format from the file extension.
        for suffix, alttype in ((".htm", "html"),
                                (".html", "html"),
                                (".xml", "xml"),
                                (".tag", "tag")):
            if A.outfile.endswith(suffix):
                output_type = alttype
    if output_type not in ("text", "xml", "html", "tag"):
        sys.stderr.write("invalid output type: %r\n" % (output_type,))
        P.print_usage(sys.stderr)
        return 100

    if A.no_laparams:
        laparams = None
    else:
        laparams = LAParams()
        # Only override the LAParams attributes the user explicitly supplied.
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            param_arg = getattr(A, param, None)
            if param_arg is not None:
                setattr(laparams, param, param_arg)

    # Command-line page numbers are 1-based; they are shifted to 0-based
    # here. --page-numbers takes precedence over the legacy --pagenos,
    # as the help text states.
    page_numbers = None
    if A.page_numbers:
        page_numbers = set(x - 1 for x in A.page_numbers)
    elif A.pagenos:
        page_numbers = set(int(x) - 1 for x in A.pagenos.split(","))

    imagewriter = None
    if A.output_dir:
        imagewriter = ImageWriter(A.output_dir)

    password = A.password
    if six.PY2 and sys.stdin.encoding:
        password = password.decode(sys.stdin.encoding)

    codec = A.codec
    if to_stdout:
        outfp = sys.stdout
        if outfp.encoding is not None:
            # Kept from the original: when stdout reports an encoding the
            # codec is forced to utf-8 regardless of --codec.
            codec = 'utf-8'
        if six.PY3 and output_type != 'text':
            # xml/html/tag output is sent to the binary stdout on Python 3.
            outfp = sys.stdout.buffer
    else:
        outfp = open(A.outfile, "wb")

    try:
        rsrcmgr = PDFResourceManager(caching=not A.disable_caching)
        if output_type == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif output_type == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=A.strip_control)
        elif output_type == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=A.scale,
                                   layoutmode=A.layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag' -- the type was validated above
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in A.files:
                # The context manager closes the input even if a page fails.
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, page_numbers,
                                                  maxpages=A.maxpages, password=password,
                                                  caching=not A.disable_caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + A.rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        if to_stdout:
            # Leave the process-wide stdout open for the caller; just flush.
            outfp.flush()
        else:
            outfp.close()
    return
def main_old(argv):
    """Legacy getopt-based entry point for converting PDF files.

    Args:
        argv: Full argument vector with the program name first
            (``sys.argv`` style).

    Returns:
        None on success, or 100 (after printing the usage line) when the
        options cannot be parsed, no input file is given, or the output
        type is not one of text/html/xml/tag.
    """
    import getopt

    def usage():
        # Print the usage line and return the error status used by callers.
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded in a flag and applied after the loop, so that layout
    # options following -n do not dereference a None laparams.
    use_laparams = True
    for (k, v) in opts:
        if k == '-d':
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == '-p':
            # Page numbers on the command line are 1-based.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_laparams = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_laparams:
        laparams = None

    if six.PY2 and sys.stdin.encoding:
        password = password.decode(sys.stdin.encoding)

    if not outtype:
        # No explicit type: default to text, or infer from the extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    to_stdout = not outfile
    if to_stdout:
        outfp = sys.stdout
        if outfp.encoding is not None:
            # Kept from the original: no codec is passed to the converter
            # when stdout reports its own encoding.
            codec = None
        if six.PY3 and outtype == 'tag':
            outfp = sys.stdout.buffer
    else:
        outfp = open(outfile, 'wb')

    try:
        rsrcmgr = PDFResourceManager(caching=caching)
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag' -- the type was validated above
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                # The context manager closes the input even if a page fails.
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        if to_stdout:
            # Leave the process-wide stdout open for the caller; just flush.
            outfp.flush()
        else:
            outfp.close()
    return
# Script entry point: run the argparse-based main() and use its return value
# as the exit status. The getopt-based main_old() is the legacy alternative.
if __name__ == '__main__':
    sys.exit(main(sys.argv))