From d87bd025dda25d1e3a9211bc5f251d14f2f5bb1f Mon Sep 17 00:00:00 2001 From: Andrew Baumann <0xabu@users.noreply.github.com> Date: Tue, 25 Jan 2022 13:06:06 -0800 Subject: [PATCH] pdf2txt: clean up construction of LAParams from arguments (#682) * Fix pdf2txt --boxes-flow=disabled Fixes: ``` $ pdf2txt.py --boxes-flow=disabled test.pdf Traceback (most recent call last): File "tools/pdf2txt.py", line 204, in sys.exit(main()) File "tools/pdf2txt.py", line 198, in main outfp = extract_text(**vars(A)) File "tools/pdf2txt.py", line 66, in extract_text pdfminer.high_level.extract_text_to_fp(fp, **locals()) File "pdfminer/high_level.py", line 85, in extract_text_to_fp interpreter.process_page(page) File "pdfminer/pdfinterp.py", line 896, in process_page self.device.end_page(page) File "pdfminer/converter.py", line 51, in end_page self.cur_item.analyze(self.laparams) File "pdfminer/layout.py", line 822, in analyze group.analyze(laparams) File "pdfminer/layout.py", line 575, in analyze LTTextGroup.analyze(self, laparams) File "pdfminer/layout.py", line 362, in analyze obj.analyze(laparams) File "pdfminer/layout.py", line 575, in analyze LTTextGroup.analyze(self, laparams) File "pdfminer/layout.py", line 362, in analyze obj.analyze(laparams) File "pdfminer/layout.py", line 575, in analyze LTTextGroup.analyze(self, laparams) File "pdfminer/layout.py", line 362, in analyze obj.analyze(laparams) File "pdfminer/layout.py", line 577, in analyze self._objs.sort( File "pdfminer/layout.py", line 578, in key=lambda obj: (1 - laparams.boxes_flow) * obj.x0 TypeError: unsupported operand type(s) for -: 'int' and 'str' ``` Related: Issue #477, PR #479 * update CHANGELOG * merge CHANGELOG * pdf2txt: clean up handling of layout parameter arguments * avoid specifying default values twice * construct LAParams earlier, rather than passing its components around * fix crash with --boxes_flow=disabled * update CHANGELOG * construct new LAParams, so _validate runs * Improve readability of setting LAParams by explicitly copying them from parsed_args into init of LAParams. And move all parsed_args post processing to the parse_args() method. * Add cli argument for line_overlap * Also use default values from LAParams for --detect-vertical and --all-texts Co-authored-by: Pieter Marsman --- CHANGELOG.md | 1 + tools/pdf2txt.py | 115 +++++++++++++++++++++++------------------------ 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0a17ed..99a134f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645)) - Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653)) +- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682)) ## [20211012] diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 8f7896c..f42ea50 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -4,7 +4,7 @@ output it to plain text, html, xml or tags.""" import argparse import logging import sys -from typing import Any, Container, Iterable, List, Optional, Union +from typing import Any, Container, Iterable, List, Optional import pdfminer.high_level from pdfminer.layout import LAParams @@ -17,12 +17,10 @@ OUTPUT_TYPES = ((".htm", "html"), (".xml", "xml"), (".tag", "tag")) -FloatOrDisabled = Union[float, str] # Union[float, Literal["disabled"]] - -def float_or_disabled(x: str) -> FloatOrDisabled: +def float_or_disabled(x: str) -> Optional[float]: if x.lower().strip() == "disabled": - return "disabled" + return None try: return float(x) except ValueError: @@ -32,13 +30,7 @@ def float_or_disabled(x: str) -> FloatOrDisabled: def extract_text( files: Iterable[str] = [], outfile: str = '-', - no_laparams: bool = False, - all_texts: Optional[bool] = None, - detect_vertical: Optional[bool] = None, - word_margin: Optional[float] = None, - char_margin: Optional[float] = None, - line_margin: Optional[float] = None, - boxes_flow: Optional[FloatOrDisabled] = None, + laparams: Optional[LAParams] = None, output_type: str = 'text', codec: str = 'utf-8', strip_control: bool = False, @@ -56,19 +48,6 @@ def extract_text( if not files: raise ValueError("Must provide files to work upon!") - # If any LAParams group arguments were passed, - # create an LAParams object and - # populate with given args. Otherwise, set it to None. - if not no_laparams: - laparams: Optional[LAParams] = LAParams() - for param in ("all_texts", "detect_vertical", "word_margin", - "char_margin", "line_margin", "boxes_flow"): - paramv = locals().get(param, None) - if paramv is not None: - setattr(laparams, param, paramv) - else: - laparams = None - if output_type == "text" and outfile != "-": for override, alttype in OUTPUT_TYPES: if outfile.endswith(override): @@ -87,7 +66,7 @@ def extract_text( return outfp -def maketheparser() -> argparse.ArgumentParser: +def parse_args(args: Optional[List[str]]) -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", type=str, default=None, nargs="+", @@ -124,32 +103,40 @@ def maketheparser() -> argparse.ArgumentParser: help="The number of degrees to rotate the PDF " "before other types of processing.") - la_params = parser.add_argument_group( + la_params = LAParams() # will be used for defaults + la_param_group = parser.add_argument_group( 'Layout analysis', description='Used during layout analysis.') - la_params.add_argument( + la_param_group.add_argument( "--no-laparams", "-n", default=False, action="store_true", help="If layout analysis parameters should be ignored.") - la_params.add_argument( - "--detect-vertical", "-V", default=False, action="store_true", + la_param_group.add_argument( + "--detect-vertical", "-V", default=la_params.detect_vertical, + action="store_true", help="If vertical text should be considered during layout analysis") - la_params.add_argument( - "--char-margin", "-M", type=float, default=2.0, + la_param_group.add_argument( + "--line-overlap", type=float, default=la_params.line_overlap, + help='If two characters have more overlap than this they ' + 'are considered to be on the same line. The overlap is specified ' + 'relative to the minimum height of both characters.') + la_param_group.add_argument( + "--char-margin", "-M", type=float, default=la_params.char_margin, help="If two characters are closer together than this margin they " "are considered to be part of the same line. The margin is " "specified relative to the width of the character.") - la_params.add_argument( - "--word-margin", "-W", type=float, default=0.1, + la_param_group.add_argument( + "--word-margin", "-W", type=float, default=la_params.word_margin, help="If two characters on the same line are further apart than this " "margin then they are considered to be two separate words, and " "an intermediate space will be added for readability. The margin " "is specified relative to the width of the character.") - la_params.add_argument( - "--line-margin", "-L", type=float, default=0.5, - help="If two lines are are close together they are considered to " + la_param_group.add_argument( + "--line-margin", "-L", type=float, default=la_params.line_margin, + help="If two lines are close together they are considered to " "be part of the same paragraph. The margin is specified " "relative to the height of a line.") - la_params.add_argument( - "--boxes-flow", "-F", type=float_or_disabled, default=0.5, + la_param_group.add_argument( + "--boxes-flow", "-F", type=float_or_disabled, + default=la_params.boxes_flow, help="Specifies how much a horizontal and vertical position of a " "text matters when determining the order of lines. The value " "should be within the range of -1.0 (only horizontal position " @@ -157,8 +144,8 @@ def maketheparser() -> argparse.ArgumentParser: "pass `disabled` to disable advanced layout analysis, and " "instead return text based on the position of the bottom left " "corner of the text box.") - la_params.add_argument( - "--all-texts", "-A", default=False, action="store_true", + la_param_group.add_argument( + "--all-texts", "-A", default=la_params.all_texts, action="store_true", help="If layout analysis should be performed on text in figures.") output_params = parser.add_argument_group( @@ -194,28 +181,40 @@ def maketheparser() -> argparse.ArgumentParser: "--strip-control", "-S", default=False, action="store_true", help="Remove control statement from text. " "Only used when output_type is xml.") - return parser + parsed_args = parser.parse_args(args=args) -# main + # Propagate parsed layout parameters to LAParams object + if parsed_args.no_laparams: + parsed_args.laparams = None + else: + parsed_args.laparams = LAParams( + line_overlap=parsed_args.line_overlap, + char_margin=parsed_args.char_margin, + line_margin=parsed_args.line_margin, + word_margin=parsed_args.word_margin, + boxes_flow=parsed_args.boxes_flow, + detect_vertical=parsed_args.detect_vertical, + all_texts=parsed_args.all_texts, + ) + + if parsed_args.page_numbers: + parsed_args.page_numbers = {x-1 for x in parsed_args.page_numbers} + + if parsed_args.pagenos: + parsed_args.page_numbers = {int(x)-1 for x in parsed_args.pagenos.split(",")} + + if parsed_args.output_type == "text" and parsed_args.outfile != "-": + for override, alttype in OUTPUT_TYPES: + if parsed_args.outfile.endswith(override): + parsed_args.output_type = alttype + + return parsed_args def main(args: Optional[List[str]] = None) -> int: - - P = maketheparser() - A = P.parse_args(args=args) - - if A.page_numbers: - A.page_numbers = {x-1 for x in A.page_numbers} - if A.pagenos: - A.page_numbers = {int(x)-1 for x in A.pagenos.split(",")} - - if A.output_type == "text" and A.outfile != "-": - for override, alttype in OUTPUT_TYPES: - if A.outfile.endswith(override): - A.output_type = alttype - - outfp = extract_text(**vars(A)) + parsed_args = parse_args(args) + outfp = extract_text(**vars(parsed_args)) outfp.close() return 0