#!/usr/bin/env python3
"""
Functions that encapsulate "usual" use-cases for pdfminer, for use making
bundled scripts and for using pdfminer as a module for routine tasks.
"""

import six
import sys

from .pdfdocument import PDFDocument
from .pdfparser import PDFParser
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfdevice import PDFDevice, TagExtractor
from .pdfpage import PDFPage
from .converter import XMLConverter, HTMLConverter, TextConverter
from .cmapdb import CMapDB
from .image import ImageWriter


def extract_text_to_fp(inf, outfp,
                       _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
                       output_type='text', codec='utf-8', laparams=None,
                       maxpages=0, page_numbers=None, password="", scale=1.0,
                       rotation=0, layoutmode='normal', output_dir=None,
                       strip_control=False, debug=False,
                       disable_caching=False, **other):
    """
    Parse the PDF read from ``inf`` and write the result to ``outfp``.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!
    Returns nothing, acting as it does on two streams. Use StringIO to get
    strings.

    :param inf: input file object, handed to ``PDFPage.get_pages``.
    :param outfp: output file-like object. On Python 3, if this is
        ``sys.stdout`` and ``output_type`` is not ``'text'``,
        ``sys.stdout.buffer`` is written to instead.
    :param _py2_no_more_posargs: placeholder that absorbs a stray third
        positional argument (Python 2 has no keyword-only syntax); unused.
    :param output_type: one of ``'text'``, ``'xml'``, ``'html'``, ``'tag'``.
    :param codec: output codec name, forwarded to the selected device.
    :param laparams: layout parameters forwarded to the text, XML and HTML
        converters (not used for ``'tag'``).
    :param maxpages: forwarded to ``PDFPage.get_pages``.
    :param page_numbers: forwarded to ``PDFPage.get_pages``.
    :param password: forwarded to ``PDFPage.get_pages``. On Python 2 it is
        first decoded with ``sys.stdin.encoding`` when that is set.
    :param scale: forwarded to ``HTMLConverter`` only.
    :param rotation: added to each page's ``rotate`` value, modulo 360.
    :param layoutmode: forwarded to ``HTMLConverter`` only.
    :param output_dir: if truthy, an ``ImageWriter(output_dir)`` is created
        and passed to the text, XML and HTML converters.
    :param strip_control: forwarded to ``XMLConverter`` as ``stripcontrol``.
    :param debug: accepted for interface compatibility; not used here.
    :param disable_caching: when true, ``caching=False`` is passed to both
        ``PDFResourceManager`` and ``PDFPage.get_pages``.
    :param other: extra keyword arguments are accepted and ignored.
    :raises ValueError: if ``output_type`` is not a supported value.
    """
    # Fail fast with a clear message; previously an unknown output_type left
    # ``device`` unassigned and surfaced later as an UnboundLocalError.
    if output_type not in ('text', 'xml', 'html', 'tag'):
        raise ValueError(
            "Output type can be text, html, xml or tag but is %r"
            % (output_type,))

    if six.PY2 and sys.stdin.encoding:
        password = password.decode(sys.stdin.encoding)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    if output_type == 'text':
        # The text converter is deliberately given ``outfp`` as passed in,
        # i.e. before any swap to the binary stdout buffer below.
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    else:
        # Every other device gets the underlying binary buffer when the
        # target is the interpreter's stdout on Python 3.
        if six.PY3 and outfp == sys.stdout:
            outfp = sys.stdout.buffer

        if output_type == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=strip_control)
        elif output_type == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag' -- guaranteed by the validation above
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(inf,
                                  page_numbers,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=not disable_caching,
                                  check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    # Let the device finalize its output. ``outfp`` itself is left open for
    # the caller.
    # NOTE(review): the XML/HTML converters are expected to emit their
    # closing markup in close() -- confirm against converter.py.
    device.close()