diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index ab855e9..7e4547e 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -5,16 +5,8 @@ Converts PDF text content (though not images containing text) to plain text, htm import sys import logging import six - -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfminer.pdfdevice import PDFDevice, TagExtractor -from pdfminer.pdfpage import PDFPage -from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter -from pdfminer.cmapdb import CMapDB -from pdfminer.layout import LAParams -from pdfminer.image import ImageWriter +import pdfminer.high_level +import pdfminer.layout def _check_arg(): @@ -32,95 +24,23 @@ def _check_arg(): ).format(arg_name, contains_permitted, type(contained)) -def extract_text_to_fp(inf, outfp, - output_type='text', codec='utf-8', laparams = None, - maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0, - layoutmode='normal', output_dir=None, strip_control=False, - debug=False, disable_caching=False, **other): - """ - Parses text from inf-file and writes to outfp file-like object. - Takes loads of optional arguments but the defaults are somewhat sane. - Beware laparams: Including an empty LAParams is not the same as passing None! - Returns nothing, acting as it does on two streams. Use StringIO to get strings. - """ - if six.PY2 and sys.stdin.encoding: - password = password.decode(sys.stdin.encoding) - - imagewriter = None - if output_dir: - imagewriter = ImageWriter(output_dir) - - rsrcmgr = PDFResourceManager(caching=not disable_caching) - - if output_type == 'text': - device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, - imagewriter=imagewriter) - - if six.PY3 and outfp == sys.stdout: - outfp = sys.stdout.buffer - - if output_type == 'xml': - device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, - imagewriter=imagewriter, - stripcontrol=strip_control) - elif output_type == 'html': - device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, - layoutmode=layoutmode, laparams=laparams, - imagewriter=imagewriter) - elif output_type == 'tag': - device = TagExtractor(rsrcmgr, outfp, codec=codec) - - interpreter = PDFPageInterpreter(rsrcmgr, device) - for page in PDFPage.get_pages(inf, - page_numbers, - maxpages=maxpages, - password=password, - caching=not disable_caching, - check_extractable=True): - page.rotate = (page.rotate + rotation) % 360 - interpreter.process_page(page) - - def extract_text(files=[], outfile='-', - _py2_no_more_posargs=None, # Bloody Python2 users need a shim for mandatory keyword args.. - output_type='text', codec='utf-8', maxpages=0, page_numbers=None, password="", scale=1.0, - all_texts=None, detect_vertical=None, word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams - debug=False, layoutmode='normal', no_laparams=False, rotation=0, output_dir=None, - disable_caching=False, strip_control=False, pagenos=None): + _py2_no_more_posargs=None, # Bloody Python2 needs a shim + no_laparams=False, all_texts=None, detect_vertical=None, # LAParams + word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams + output_type='text', codec='utf-8', strip_control=False, + maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0, + layoutmode='normal', output_dir=None, debug=False, + disable_caching=False, **other): if _py2_no_more_posargs is not None: raise ValueError("Too many positional arguments passed.") if not files: raise ValueError("Must provide files to work upon!") - # == Typechecking == - # You can be sure for this many arguments that typechecking will catch errors. - # Yet more Py2 stupidity, should be able to use argument annotations to do - # type-checking cleanly, but can't. Not bothering to typecheck everything here. - if debug: - for arg_name, arg_permitted, contains_permitted in ( - ("files", list, str), - ("outfile", str, None), - ("password", str, None), - ("scale", float, None), - ("output_type", str, None), - ("codec", str, None), - ("maxpages", int, None), - ("page_numbers", (type(None), list, set), int) - ): - arg = locals()[arg_name] - assert isinstance(arg, arg_permitted), ("Argument '{}' should be of type(s)" - " '{}' but is type '{}'").format(arg_name, arg_permitted, type(arg)) - if contains_permitted is not None and arg: - for contained in arg: - assert isinstance(contained, contains_permitted), ("Value within" - " argument '{}' should be of type '{}' but is '{}'" - ).format(arg_name, contains_permitted, type(contained)) - # == Typechecking over == - # If any LAParams group arguments were passed, create an LAParams object and # populate with given args. Otherwise, set it to None. if not no_laparams: - laparams = LAParams() + laparams = pdfminer.layout.LAParams() for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): paramv = locals().get(param, None) if paramv is not None: @@ -150,7 +70,7 @@ def extract_text(files=[], outfile='-', for fname in files: with open(fname, "rb") as fp: - extract_text_to_fp(fp, **locals()) + pdfminer.high_level.extract_text_to_fp(fp, **locals()) return outfp # main @@ -181,15 +101,6 @@ def main(args=None): P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode") A = P.parse_args(args=args) - if A.no_laparams: - laparams = None - else: - laparams = LAParams() - for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): - param_arg = getattr(A, param, None) - if param_arg is not None: - setattr(laparams, param, param_arg) - if A.page_numbers: A.page_numbers = set([x-1 for x in A.page_numbers]) if A.pagenos: @@ -221,7 +132,7 @@ def main(args=None): ## Test Code outfp = extract_text(**vars(A)) outfp.close() - return None + return 0 if __name__ == '__main__': sys.exit(main())