Forgot to add the actual compartmentalised function..

2015-05-30 17:04:28 +01:00 · 2015-05-30 17:04:28 +01:00 · 3b7edba48c
parent b3553cef10
commit 3b7edba48c
1 changed files with 68 additions and 0 deletions
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""
+Functions that encapsulate "usual" use-cases for pdfminer, for use making
+bundled scripts and for using pdfminer as a module for routine tasks.
+"""
+
+import six
+import sys
+
+from .pdfdocument import PDFDocument
+from .pdfparser import PDFParser
+from .pdfinterp import PDFResourceManager, PDFPageInterpreter
+from .pdfdevice import PDFDevice, TagExtractor
+from .pdfpage import PDFPage
+from .converter import XMLConverter, HTMLConverter, TextConverter
+from .cmapdb import CMapDB
+from .image import ImageWriter
+
+
+def extract_text_to_fp(inf, outfp,
+                    _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
+                    output_type='text', codec='utf-8', laparams = None,
+                    maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                    layoutmode='normal', output_dir=None, strip_control=False,
+                    debug=False, disable_caching=False, **other):
+    """
+    Parses text from inf-file and writes to outfp file-like object.
+    Takes loads of optional arguments but the defaults are somewhat sane.
+    Beware laparams: Including an empty LAParams is not the same as passing None!
+    Returns nothing, acting as it does on two streams. Use StringIO to get strings.
+    """
+    if six.PY2 and sys.stdin.encoding:
+        password = password.decode(sys.stdin.encoding)
+
+    imagewriter = None
+    if output_dir:
+        imagewriter = ImageWriter(output_dir)
+    
+    rsrcmgr = PDFResourceManager(caching=not disable_caching)
+
+    if output_type == 'text':
+        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                               imagewriter=imagewriter)
+
+    if six.PY3 and outfp == sys.stdout:
+        outfp = sys.stdout.buffer
+
+    if output_type == 'xml':
+        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                              imagewriter=imagewriter,
+                              stripcontrol=strip_control)
+    elif output_type == 'html':
+        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
+                               layoutmode=layoutmode, laparams=laparams,
+                               imagewriter=imagewriter)
+    elif output_type == 'tag':
+        device = TagExtractor(rsrcmgr, outfp, codec=codec)
+
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    for page in PDFPage.get_pages(inf,
+                                  page_numbers,
+                                  maxpages=maxpages,
+                                  password=password,
+                                  caching=not disable_caching,
+                                  check_extractable=True):
+        page.rotate = (page.rotate + rotation) % 360
+        interpreter.process_page(page)    
+