test/pdfminer/high_level.py

214 lines
7.1 KiB
Python

"""Functions that can be used for the most common use-cases for pdfminer.six"""
import logging
import sys
from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
from .converter import (
XMLConverter,
HTMLConverter,
TextConverter,
PDFPageAggregator,
HOCRConverter,
)
from .image import ImageWriter
from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
from .utils import open_filename, FileOrName, AnyIO
def extract_text_to_fp(
inf: BinaryIO,
outfp: AnyIO,
output_type: str = "text",
codec: str = "utf-8",
laparams: Optional[LAParams] = None,
maxpages: int = 0,
page_numbers: Optional[Container[int]] = None,
password: str = "",
scale: float = 1.0,
rotation: int = 0,
layoutmode: str = "normal",
output_dir: Optional[str] = None,
strip_control: bool = False,
debug: bool = False,
disable_caching: bool = False,
**kwargs: Any,
) -> None:
"""Parses text from inf-file and writes to outfp file-like object.
Takes loads of optional arguments but the defaults are somewhat sane.
Beware laparams: Including an empty LAParams is not the same as passing
None!
:param inf: a file-like object to read PDF structure from, such as a
file handler (using the builtin `open()` function) or a `BytesIO`.
:param outfp: a file-like object to write the text to.
:param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
Only 'text' works properly.
:param codec: Text decoding codec
:param laparams: An LAParams object from pdfminer.layout. Default is None
but may not layout correctly.
:param maxpages: How many pages to stop parsing after
:param page_numbers: zero-indexed page numbers to operate on.
:param password: For encrypted PDFs, the password to decrypt.
:param scale: Scale factor
:param rotation: Rotation factor
:param layoutmode: Default is 'normal', see
pdfminer.converter.HTMLConverter
:param output_dir: If given, creates an ImageWriter for extracted images.
:param strip_control: Does what it says on the tin
:param debug: Output more logging data
:param disable_caching: Does what it says on the tin
:param other:
:return: nothing, acting as it does on two streams. Use StringIO to get
strings.
"""
if debug:
logging.getLogger().setLevel(logging.DEBUG)
imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)
rsrcmgr = PDFResourceManager(caching=not disable_caching)
device: Optional[PDFDevice] = None
if output_type != "text" and outfp == sys.stdout:
outfp = sys.stdout.buffer
if output_type == "text":
device = TextConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter
)
elif output_type == "xml":
device = XMLConverter(
rsrcmgr,
outfp,
codec=codec,
laparams=laparams,
imagewriter=imagewriter,
stripcontrol=strip_control,
)
elif output_type == "html":
device = HTMLConverter(
rsrcmgr,
outfp,
codec=codec,
scale=scale,
layoutmode=layoutmode,
laparams=laparams,
imagewriter=imagewriter,
)
elif output_type == "hocr":
device = HOCRConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
)
elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
else:
msg = f"Output type can be text, html, xml or tag but is " f"{output_type}"
raise ValueError(msg)
assert device is not None
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(
inf,
page_numbers,
maxpages=maxpages,
password=password,
caching=not disable_caching,
):
page.rotate = (page.rotate + rotation) % 360
interpreter.process_page(page)
device.close()
def extract_text(
pdf_file: FileOrName,
password: str = "",
page_numbers: Optional[Container[int]] = None,
maxpages: int = 0,
caching: bool = True,
codec: str = "utf-8",
laparams: Optional[LAParams] = None,
) -> str:
"""Parse and return the text contained in a PDF file.
:param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse
:param caching: If resources should be cached
:param codec: Text decoding codec
:param laparams: An LAParams object from pdfminer.layout. If None, uses
some default settings that often work well.
:return: a string containing all of the text extracted.
"""
if laparams is None:
laparams = LAParams()
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
fp = cast(BinaryIO, fp) # we opened in binary mode
rsrcmgr = PDFResourceManager(caching=caching)
device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(
fp,
page_numbers,
maxpages=maxpages,
password=password,
caching=caching,
):
interpreter.process_page(page)
return output_string.getvalue()
def extract_pages(
pdf_file: FileOrName,
password: str = "",
page_numbers: Optional[Container[int]] = None,
maxpages: int = 0,
caching: bool = True,
laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
"""Extract and yield LTPage objects
:param pdf_file: Either a file path or a file-like object for the PDF file
to be worked on.
:param password: For encrypted PDFs, the password to decrypt.
:param page_numbers: List of zero-indexed page numbers to extract.
:param maxpages: The maximum number of pages to parse
:param caching: If resources should be cached
:param laparams: An LAParams object from pdfminer.layout. If None, uses
some default settings that often work well.
:return: LTPage objects
"""
if laparams is None:
laparams = LAParams()
with open_filename(pdf_file, "rb") as fp:
fp = cast(BinaryIO, fp) # we opened in binary mode
resource_manager = PDFResourceManager(caching=caching)
device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.get_pages(
fp, page_numbers, maxpages=maxpages, password=password, caching=caching
):
interpreter.process_page(page)
layout = device.get_result()
yield layout