Cleaning up pdf2txt.py after the partition/move.
parent
cbe270a4bf
commit
b3553cef10
113
tools/pdf2txt.py
113
tools/pdf2txt.py
|
@ -5,16 +5,8 @@ Converts PDF text content (though not images containing text) to plain text, htm
|
|||
import sys
|
||||
import logging
|
||||
import six
|
||||
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfdevice import PDFDevice, TagExtractor
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
||||
from pdfminer.cmapdb import CMapDB
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.image import ImageWriter
|
||||
import pdfminer.high_level
|
||||
import pdfminer.layout
|
||||
|
||||
|
||||
def _check_arg():
|
||||
|
@ -32,95 +24,23 @@ def _check_arg():
|
|||
).format(arg_name, contains_permitted, type(contained))
|
||||
|
||||
|
||||
def extract_text_to_fp(inf, outfp,
|
||||
output_type='text', codec='utf-8', laparams = None,
|
||||
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
|
||||
layoutmode='normal', output_dir=None, strip_control=False,
|
||||
debug=False, disable_caching=False, **other):
|
||||
"""
|
||||
Parses text from inf-file and writes to outfp file-like object.
|
||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
||||
Beware laparams: Including an empty LAParams is not the same as passing None!
|
||||
Returns nothing, acting as it does on two streams. Use StringIO to get strings.
|
||||
"""
|
||||
if six.PY2 and sys.stdin.encoding:
|
||||
password = password.decode(sys.stdin.encoding)
|
||||
|
||||
imagewriter = None
|
||||
if output_dir:
|
||||
imagewriter = ImageWriter(output_dir)
|
||||
|
||||
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
||||
|
||||
if output_type == 'text':
|
||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||
imagewriter=imagewriter)
|
||||
|
||||
if six.PY3 and outfp == sys.stdout:
|
||||
outfp = sys.stdout.buffer
|
||||
|
||||
if output_type == 'xml':
|
||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||
imagewriter=imagewriter,
|
||||
stripcontrol=strip_control)
|
||||
elif output_type == 'html':
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||
layoutmode=layoutmode, laparams=laparams,
|
||||
imagewriter=imagewriter)
|
||||
elif output_type == 'tag':
|
||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
for page in PDFPage.get_pages(inf,
|
||||
page_numbers,
|
||||
maxpages=maxpages,
|
||||
password=password,
|
||||
caching=not disable_caching,
|
||||
check_extractable=True):
|
||||
page.rotate = (page.rotate + rotation) % 360
|
||||
interpreter.process_page(page)
|
||||
|
||||
|
||||
def extract_text(files=[], outfile='-',
|
||||
_py2_no_more_posargs=None, # Bloody Python2 users need a shim for mandatory keyword args..
|
||||
output_type='text', codec='utf-8', maxpages=0, page_numbers=None, password="", scale=1.0,
|
||||
all_texts=None, detect_vertical=None, word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
|
||||
debug=False, layoutmode='normal', no_laparams=False, rotation=0, output_dir=None,
|
||||
disable_caching=False, strip_control=False, pagenos=None):
|
||||
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
|
||||
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
|
||||
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
|
||||
output_type='text', codec='utf-8', strip_control=False,
|
||||
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
|
||||
layoutmode='normal', output_dir=None, debug=False,
|
||||
disable_caching=False, **other):
|
||||
if _py2_no_more_posargs is not None:
|
||||
raise ValueError("Too many positional arguments passed.")
|
||||
if not files:
|
||||
raise ValueError("Must provide files to work upon!")
|
||||
|
||||
# == Typechecking ==
|
||||
# You can be sure for this many arguments that typechecking will catch errors.
|
||||
# Yet more Py2 stupidity, should be able to use argument annotations to do
|
||||
# type-checking cleanly, but can't. Not bothering to typecheck everything here.
|
||||
if debug:
|
||||
for arg_name, arg_permitted, contains_permitted in (
|
||||
("files", list, str),
|
||||
("outfile", str, None),
|
||||
("password", str, None),
|
||||
("scale", float, None),
|
||||
("output_type", str, None),
|
||||
("codec", str, None),
|
||||
("maxpages", int, None),
|
||||
("page_numbers", (type(None), list, set), int)
|
||||
):
|
||||
arg = locals()[arg_name]
|
||||
assert isinstance(arg, arg_permitted), ("Argument '{}' should be of type(s)"
|
||||
" '{}' but is type '{}'").format(arg_name, arg_permitted, type(arg))
|
||||
if contains_permitted is not None and arg:
|
||||
for contained in arg:
|
||||
assert isinstance(contained, contains_permitted), ("Value within"
|
||||
" argument '{}' should be of type '{}' but is '{}'"
|
||||
).format(arg_name, contains_permitted, type(contained))
|
||||
# == Typechecking over ==
|
||||
|
||||
# If any LAParams group arguments were passed, create an LAParams object and
|
||||
# populate with given args. Otherwise, set it to None.
|
||||
if not no_laparams:
|
||||
laparams = LAParams()
|
||||
laparams = pdfminer.layout.LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||
paramv = locals().get(param, None)
|
||||
if paramv is not None:
|
||||
|
@ -150,7 +70,7 @@ def extract_text(files=[], outfile='-',
|
|||
|
||||
for fname in files:
|
||||
with open(fname, "rb") as fp:
|
||||
extract_text_to_fp(fp, **locals())
|
||||
pdfminer.high_level.extract_text_to_fp(fp, **locals())
|
||||
return outfp
|
||||
|
||||
# main
|
||||
|
@ -181,15 +101,6 @@ def main(args=None):
|
|||
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
|
||||
A = P.parse_args(args=args)
|
||||
|
||||
if A.no_laparams:
|
||||
laparams = None
|
||||
else:
|
||||
laparams = LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||
param_arg = getattr(A, param, None)
|
||||
if param_arg is not None:
|
||||
setattr(laparams, param, param_arg)
|
||||
|
||||
if A.page_numbers:
|
||||
A.page_numbers = set([x-1 for x in A.page_numbers])
|
||||
if A.pagenos:
|
||||
|
@ -221,7 +132,7 @@ def main(args=None):
|
|||
## Test Code
|
||||
outfp = extract_text(**vars(A))
|
||||
outfp.close()
|
||||
return None
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__': sys.exit(main())
|
||||
|
|
Loading…
Reference in New Issue