"""Functions that can be used for the most common use-cases for pdfminer.six"""

import logging
import sys
from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast

from .converter import (
    XMLConverter,
    HTMLConverter,
    TextConverter,
    PDFPageAggregator,
    HOCRConverter,
)
from .image import ImageWriter
from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
from .utils import open_filename, FileOrName, AnyIO


def extract_text_to_fp(
    inf: BinaryIO,
    outfp: AnyIO,
    output_type: str = "text",
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
    maxpages: int = 0,
    page_numbers: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    strip_control: bool = False,
    debug: bool = False,
    disable_caching: bool = False,
    **kwargs: Any,
) -> None:
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. If it is
        `sys.stdout` and output_type is not 'text', `sys.stdout.buffer` is
        used instead.
    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly. Not used for output_type 'tag'.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor. Only passed on for output_type 'html'.
    :param rotation: Value added to each page's own rotation (the sum is
        taken modulo 360) before the page is processed.
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter. Only passed on for output_type
        'html'.
    :param output_dir: If given, creates an ImageWriter for extracted images.
        The writer is handed to the 'text', 'xml' and 'html' converters.
    :param strip_control: Passed as `stripcontrol` to the 'xml' and 'hocr'
        converters.
    :param debug: Output more logging data (sets the root logger to DEBUG).
    :param disable_caching: If True, resource caching is turned off.
    :param kwargs: Accepted for call compatibility; not used by this function.
    :raises ValueError: If output_type is not one of the supported values.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    """
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    rsrcmgr = PDFResourceManager(caching=not disable_caching)
    device: Optional[PDFDevice] = None

    # Every output type other than plain text is written to the binary
    # buffer behind stdout rather than to the text wrapper.
    if output_type != "text" and outfp == sys.stdout:
        outfp = sys.stdout.buffer

    if output_type == "text":
        device = TextConverter(
            rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter
        )

    elif output_type == "xml":
        device = XMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            laparams=laparams,
            imagewriter=imagewriter,
            stripcontrol=strip_control,
        )

    elif output_type == "html":
        device = HTMLConverter(
            rsrcmgr,
            outfp,
            codec=codec,
            scale=scale,
            layoutmode=layoutmode,
            laparams=laparams,
            imagewriter=imagewriter,
        )

    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
        )

    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

    else:
        # Name every accepted value, including 'hocr', so the message
        # matches the branches above.
        msg = (
            "Output type can be text, html, xml, hocr or tag "
            f"but is {output_type}"
        )
        raise ValueError(msg)

    assert device is not None
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(
        inf,
        page_numbers,
        maxpages=maxpages,
        password=password,
        caching=not disable_caching,
    ):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()


def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
) -> str:
    """Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        fp = cast(BinaryIO, fp)  # we opened in binary mode
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        ):
            interpreter.process_page(page)

        # Read the buffer before the `with` block closes the StringIO.
        return output_string.getvalue()


def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
    """Extract and yield LTPage objects

    This is a generator: the file stays open while pages are being consumed,
    and each page is processed only when the next item is requested.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, a
        default-constructed LAParams() is used.
    :return: LTPage objects
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp:
        fp = cast(BinaryIO, fp)  # we opened in binary mode
        resource_manager = PDFResourceManager(caching=caching)
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(
            fp, page_numbers, maxpages=maxpages, password=password, caching=caching
        ):
            interpreter.process_page(page)
            layout = device.get_result()
            yield layout