Cleaning up pdf2txt.py after the partition/move.

pull/5/head
Cathal Garvey 2015-05-30 17:03:55 +01:00
parent cbe270a4bf
commit b3553cef10
1 changed files with 12 additions and 101 deletions

View File

@ -5,16 +5,8 @@ Converts PDF text content (though not images containing text) to plain text, htm
import sys
import logging
import six
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
import pdfminer.high_level
import pdfminer.layout
def _check_arg():
@ -32,95 +24,23 @@ def _check_arg():
).format(arg_name, contains_permitted, type(contained))
def extract_text_to_fp(inf, outfp,
output_type='text', codec='utf-8', laparams = None,
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
layoutmode='normal', output_dir=None, strip_control=False,
debug=False, disable_caching=False, **other):
"""
Parses text from inf-file and writes to outfp file-like object.
Takes loads of optional arguments but the defaults are somewhat sane.
Beware laparams: Including an empty LAParams is not the same as passing None!
Returns nothing, acting as it does on two streams. Use StringIO to get strings.
"""
if six.PY2 and sys.stdin.encoding:
password = password.decode(sys.stdin.encoding)
imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)
rsrcmgr = PDFResourceManager(caching=not disable_caching)
if output_type == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
if six.PY3 and outfp == sys.stdout:
outfp = sys.stdout.buffer
if output_type == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter,
stripcontrol=strip_control)
elif output_type == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
elif output_type == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(inf,
page_numbers,
maxpages=maxpages,
password=password,
caching=not disable_caching,
check_extractable=True):
page.rotate = (page.rotate + rotation) % 360
interpreter.process_page(page)
def extract_text(files=[], outfile='-',
_py2_no_more_posargs=None, # Bloody Python2 users need a shim for mandatory keyword args..
output_type='text', codec='utf-8', maxpages=0, page_numbers=None, password="", scale=1.0,
all_texts=None, detect_vertical=None, word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
debug=False, layoutmode='normal', no_laparams=False, rotation=0, output_dir=None,
disable_caching=False, strip_control=False, pagenos=None):
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
output_type='text', codec='utf-8', strip_control=False,
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
layoutmode='normal', output_dir=None, debug=False,
disable_caching=False, **other):
if _py2_no_more_posargs is not None:
raise ValueError("Too many positional arguments passed.")
if not files:
raise ValueError("Must provide files to work upon!")
# == Typechecking ==
# You can be sure for this many arguments that typechecking will catch errors.
# Yet more Py2 stupidity, should be able to use argument annotations to do
# type-checking cleanly, but can't. Not bothering to typecheck everything here.
if debug:
for arg_name, arg_permitted, contains_permitted in (
("files", list, str),
("outfile", str, None),
("password", str, None),
("scale", float, None),
("output_type", str, None),
("codec", str, None),
("maxpages", int, None),
("page_numbers", (type(None), list, set), int)
):
arg = locals()[arg_name]
assert isinstance(arg, arg_permitted), ("Argument '{}' should be of type(s)"
" '{}' but is type '{}'").format(arg_name, arg_permitted, type(arg))
if contains_permitted is not None and arg:
for contained in arg:
assert isinstance(contained, contains_permitted), ("Value within"
" argument '{}' should be of type '{}' but is '{}'"
).format(arg_name, contains_permitted, type(contained))
# == Typechecking over ==
# If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams = LAParams()
laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
@ -150,7 +70,7 @@ def extract_text(files=[], outfile='-',
for fname in files:
with open(fname, "rb") as fp:
extract_text_to_fp(fp, **locals())
pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp
# main
@ -181,15 +101,6 @@ def main(args=None):
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args)
if A.no_laparams:
laparams = None
else:
laparams = LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
param_arg = getattr(A, param, None)
if param_arg is not None:
setattr(laparams, param, param_arg)
if A.page_numbers:
A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos:
@ -221,7 +132,7 @@ def main(args=None):
## Test Code
outfp = extract_text(**vars(A))
outfp.close()
return None
return 0
if __name__ == '__main__': sys.exit(main())