"""Functions that can be used for the most common use-cases for pdfminer.six""" import logging import sys from io import StringIO from typing import Any, BinaryIO, Container, Iterator, Optional, cast from .converter import XMLConverter, HTMLConverter, TextConverter, \ PDFPageAggregator from .image import ImageWriter from .layout import LAParams, LTPage from .pdfdevice import PDFDevice, TagExtractor from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .pdfpage import PDFPage from .utils import open_filename, FileOrName, AnyIO def extract_text_to_fp( inf: BinaryIO, outfp: AnyIO, output_type: str = 'text', codec: str = 'utf-8', laparams: Optional[LAParams] = None, maxpages: int = 0, page_numbers: Optional[Container[int]] = None, password: str = "", scale: float = 1.0, rotation: int = 0, layoutmode: str = 'normal', output_dir: Optional[str] = None, strip_control: bool = False, debug: bool = False, disable_caching: bool = False, **kwargs: Any ) -> None: """Parses text from inf-file and writes to outfp file-like object. Takes loads of optional arguments but the defaults are somewhat sane. Beware laparams: Including an empty LAParams is not the same as passing None! :param inf: a file-like object to read PDF structure from, such as a file handler (using the builtin `open()` function) or a `BytesIO`. :param outfp: a file-like object to write the text to. :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly. :param codec: Text decoding codec :param laparams: An LAParams object from pdfminer.layout. Default is None but may not layout correctly. :param maxpages: How many pages to stop parsing after :param page_numbers: zero-indexed page numbers to operate on. :param password: For encrypted PDFs, the password to decrypt. :param scale: Scale factor :param rotation: Rotation factor :param layoutmode: Default is 'normal', see pdfminer.converter.HTMLConverter :param output_dir: If given, creates an ImageWriter for extracted images. :param strip_control: Does what it says on the tin :param debug: Output more logging data :param disable_caching: Does what it says on the tin :param other: :return: nothing, acting as it does on two streams. Use StringIO to get strings. """ if debug: logging.getLogger().setLevel(logging.DEBUG) imagewriter = None if output_dir: imagewriter = ImageWriter(output_dir) rsrcmgr = PDFResourceManager(caching=not disable_caching) device: Optional[PDFDevice] = None if output_type != 'text' and outfp == sys.stdout: outfp = sys.stdout.buffer if output_type == 'text': device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter) elif output_type == 'xml': device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter, stripcontrol=strip_control) elif output_type == 'html': device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, imagewriter=imagewriter) elif output_type == 'tag': # Binary I/O is required, but we have no good way to test it here. device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) else: msg = f"Output type can be text, html, xml or tag but is " \ f"{output_type}" raise ValueError(msg) assert device is not None interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages(inf, page_numbers, maxpages=maxpages, password=password, caching=not disable_caching): page.rotate = (page.rotate + rotation) % 360 interpreter.process_page(page) device.close() def extract_text( pdf_file: FileOrName, password: str = '', page_numbers: Optional[Container[int]] = None, maxpages: int = 0, caching: bool = True, codec: str = 'utf-8', laparams: Optional[LAParams] = None ) -> str: """Parse and return the text contained in a PDF file. :param pdf_file: Either a file path or a file-like object for the PDF file to be worked on. :param password: For encrypted PDFs, the password to decrypt. :param page_numbers: List of zero-indexed page numbers to extract. :param maxpages: The maximum number of pages to parse :param caching: If resources should be cached :param codec: Text decoding codec :param laparams: An LAParams object from pdfminer.layout. If None, uses some default settings that often work well. :return: a string containing all of the text extracted. """ if laparams is None: laparams = LAParams() with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: fp = cast(BinaryIO, fp) # we opened in binary mode rsrcmgr = PDFResourceManager(caching=caching) device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages( fp, page_numbers, maxpages=maxpages, password=password, caching=caching, ): interpreter.process_page(page) return output_string.getvalue() def extract_pages( pdf_file: FileOrName, password: str = '', page_numbers: Optional[Container[int]] = None, maxpages: int = 0, caching: bool = True, laparams: Optional[LAParams] = None ) -> Iterator[LTPage]: """Extract and yield LTPage objects :param pdf_file: Either a file path or a file-like object for the PDF file to be worked on. :param password: For encrypted PDFs, the password to decrypt. :param page_numbers: List of zero-indexed page numbers to extract. :param maxpages: The maximum number of pages to parse :param caching: If resources should be cached :param laparams: An LAParams object from pdfminer.layout. If None, uses some default settings that often work well. :return: """ if laparams is None: laparams = LAParams() with open_filename(pdf_file, "rb") as fp: fp = cast(BinaryIO, fp) # we opened in binary mode resource_manager = PDFResourceManager(caching=caching) device = PDFPageAggregator(resource_manager, laparams=laparams) interpreter = PDFPageInterpreter(resource_manager, device) for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages, password=password, caching=caching): interpreter.process_page(page) layout = device.get_result() yield layout