Cleaning up pdf2txt.py after the partition/move.
parent
cbe270a4bf
commit
b3553cef10
113
tools/pdf2txt.py
113
tools/pdf2txt.py
|
@@ -5,16 +5,8 @@ Converts PDF text content (though not images containing text) to plain text, htm
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
import six
|
import six
|
||||||
|
import pdfminer.high_level
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
import pdfminer.layout
|
||||||
from pdfminer.pdfparser import PDFParser
|
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
||||||
from pdfminer.pdfdevice import PDFDevice, TagExtractor
|
|
||||||
from pdfminer.pdfpage import PDFPage
|
|
||||||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
|
||||||
from pdfminer.cmapdb import CMapDB
|
|
||||||
from pdfminer.layout import LAParams
|
|
||||||
from pdfminer.image import ImageWriter
|
|
||||||
|
|
||||||
|
|
||||||
def _check_arg():
|
def _check_arg():
|
||||||
|
@@ -32,95 +24,23 @@ def _check_arg():
|
||||||
).format(arg_name, contains_permitted, type(contained))
|
).format(arg_name, contains_permitted, type(contained))
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_fp(inf, outfp,
|
|
||||||
output_type='text', codec='utf-8', laparams = None,
|
|
||||||
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
|
|
||||||
layoutmode='normal', output_dir=None, strip_control=False,
|
|
||||||
debug=False, disable_caching=False, **other):
|
|
||||||
"""
|
|
||||||
Parses text from inf-file and writes to outfp file-like object.
|
|
||||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
|
||||||
Beware laparams: Including an empty LAParams is not the same as passing None!
|
|
||||||
Returns nothing, acting as it does on two streams. Use StringIO to get strings.
|
|
||||||
"""
|
|
||||||
if six.PY2 and sys.stdin.encoding:
|
|
||||||
password = password.decode(sys.stdin.encoding)
|
|
||||||
|
|
||||||
imagewriter = None
|
|
||||||
if output_dir:
|
|
||||||
imagewriter = ImageWriter(output_dir)
|
|
||||||
|
|
||||||
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
|
||||||
|
|
||||||
if output_type == 'text':
|
|
||||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
|
||||||
imagewriter=imagewriter)
|
|
||||||
|
|
||||||
if six.PY3 and outfp == sys.stdout:
|
|
||||||
outfp = sys.stdout.buffer
|
|
||||||
|
|
||||||
if output_type == 'xml':
|
|
||||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
|
||||||
imagewriter=imagewriter,
|
|
||||||
stripcontrol=strip_control)
|
|
||||||
elif output_type == 'html':
|
|
||||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
|
||||||
layoutmode=layoutmode, laparams=laparams,
|
|
||||||
imagewriter=imagewriter)
|
|
||||||
elif output_type == 'tag':
|
|
||||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
|
||||||
|
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
|
||||||
for page in PDFPage.get_pages(inf,
|
|
||||||
page_numbers,
|
|
||||||
maxpages=maxpages,
|
|
||||||
password=password,
|
|
||||||
caching=not disable_caching,
|
|
||||||
check_extractable=True):
|
|
||||||
page.rotate = (page.rotate + rotation) % 360
|
|
||||||
interpreter.process_page(page)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_text(files=[], outfile='-',
|
def extract_text(files=[], outfile='-',
|
||||||
_py2_no_more_posargs=None, # Bloody Python2 users need a shim for mandatory keyword args..
|
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
|
||||||
output_type='text', codec='utf-8', maxpages=0, page_numbers=None, password="", scale=1.0,
|
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
|
||||||
all_texts=None, detect_vertical=None, word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
|
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
|
||||||
debug=False, layoutmode='normal', no_laparams=False, rotation=0, output_dir=None,
|
output_type='text', codec='utf-8', strip_control=False,
|
||||||
disable_caching=False, strip_control=False, pagenos=None):
|
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
|
||||||
|
layoutmode='normal', output_dir=None, debug=False,
|
||||||
|
disable_caching=False, **other):
|
||||||
if _py2_no_more_posargs is not None:
|
if _py2_no_more_posargs is not None:
|
||||||
raise ValueError("Too many positional arguments passed.")
|
raise ValueError("Too many positional arguments passed.")
|
||||||
if not files:
|
if not files:
|
||||||
raise ValueError("Must provide files to work upon!")
|
raise ValueError("Must provide files to work upon!")
|
||||||
|
|
||||||
# == Typechecking ==
|
|
||||||
# You can be sure for this many arguments that typechecking will catch errors.
|
|
||||||
# Yet more Py2 stupidity, should be able to use argument annotations to do
|
|
||||||
# type-checking cleanly, but can't. Not bothering to typecheck everything here.
|
|
||||||
if debug:
|
|
||||||
for arg_name, arg_permitted, contains_permitted in (
|
|
||||||
("files", list, str),
|
|
||||||
("outfile", str, None),
|
|
||||||
("password", str, None),
|
|
||||||
("scale", float, None),
|
|
||||||
("output_type", str, None),
|
|
||||||
("codec", str, None),
|
|
||||||
("maxpages", int, None),
|
|
||||||
("page_numbers", (type(None), list, set), int)
|
|
||||||
):
|
|
||||||
arg = locals()[arg_name]
|
|
||||||
assert isinstance(arg, arg_permitted), ("Argument '{}' should be of type(s)"
|
|
||||||
" '{}' but is type '{}'").format(arg_name, arg_permitted, type(arg))
|
|
||||||
if contains_permitted is not None and arg:
|
|
||||||
for contained in arg:
|
|
||||||
assert isinstance(contained, contains_permitted), ("Value within"
|
|
||||||
" argument '{}' should be of type '{}' but is '{}'"
|
|
||||||
).format(arg_name, contains_permitted, type(contained))
|
|
||||||
# == Typechecking over ==
|
|
||||||
|
|
||||||
# If any LAParams group arguments were passed, create an LAParams object and
|
# If any LAParams group arguments were passed, create an LAParams object and
|
||||||
# populate with given args. Otherwise, set it to None.
|
# populate with given args. Otherwise, set it to None.
|
||||||
if not no_laparams:
|
if not no_laparams:
|
||||||
laparams = LAParams()
|
laparams = pdfminer.layout.LAParams()
|
||||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||||
paramv = locals().get(param, None)
|
paramv = locals().get(param, None)
|
||||||
if paramv is not None:
|
if paramv is not None:
|
||||||
|
@@ -150,7 +70,7 @@ def extract_text(files=[], outfile='-',
|
||||||
|
|
||||||
for fname in files:
|
for fname in files:
|
||||||
with open(fname, "rb") as fp:
|
with open(fname, "rb") as fp:
|
||||||
extract_text_to_fp(fp, **locals())
|
pdfminer.high_level.extract_text_to_fp(fp, **locals())
|
||||||
return outfp
|
return outfp
|
||||||
|
|
||||||
# main
|
# main
|
||||||
|
@@ -181,15 +101,6 @@ def main(args=None):
|
||||||
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
|
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
|
||||||
A = P.parse_args(args=args)
|
A = P.parse_args(args=args)
|
||||||
|
|
||||||
if A.no_laparams:
|
|
||||||
laparams = None
|
|
||||||
else:
|
|
||||||
laparams = LAParams()
|
|
||||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
|
||||||
param_arg = getattr(A, param, None)
|
|
||||||
if param_arg is not None:
|
|
||||||
setattr(laparams, param, param_arg)
|
|
||||||
|
|
||||||
if A.page_numbers:
|
if A.page_numbers:
|
||||||
A.page_numbers = set([x-1 for x in A.page_numbers])
|
A.page_numbers = set([x-1 for x in A.page_numbers])
|
||||||
if A.pagenos:
|
if A.pagenos:
|
||||||
|
@@ -221,7 +132,7 @@ def main(args=None):
|
||||||
## Test Code
|
## Test Code
|
||||||
outfp = extract_text(**vars(A))
|
outfp = extract_text(**vars(A))
|
||||||
outfp.close()
|
outfp.close()
|
||||||
return None
|
return 0
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main())
|
if __name__ == '__main__': sys.exit(main())
|
||||||
|
|
Loading…
Reference in New Issue