pdf2txt: clean up construction of LAParams from arguments (#682)
* Fix pdf2txt --boxes-flow=disabled Fixes: ``` $ pdf2txt.py --boxes-flow=disabled test.pdf Traceback (most recent call last): File "tools/pdf2txt.py", line 204, in <module> sys.exit(main()) File "tools/pdf2txt.py", line 198, in main outfp = extract_text(**vars(A)) File "tools/pdf2txt.py", line 66, in extract_text pdfminer.high_level.extract_text_to_fp(fp, **locals()) File "pdfminer/high_level.py", line 85, in extract_text_to_fp interpreter.process_page(page) File "pdfminer/pdfinterp.py", line 896, in process_page self.device.end_page(page) File "pdfminer/converter.py", line 51, in end_page self.cur_item.analyze(self.laparams) File "pdfminer/layout.py", line 822, in analyze group.analyze(laparams) File "pdfminer/layout.py", line 575, in analyze LTTextGroup.analyze(self, laparams) File "pdfminer/layout.py", line 362, in analyze obj.analyze(laparams) File "pdfminer/layout.py", line 575, in analyze LTTextGroup.analyze(self, laparams) File "pdfminer/layout.py", line 362, in analyze obj.analyze(laparams) File "pdfminer/layout.py", line 575, in analyze LTTextGroup.analyze(self, laparams) File "pdfminer/layout.py", line 362, in analyze obj.analyze(laparams) File "pdfminer/layout.py", line 577, in analyze self._objs.sort( File "pdfminer/layout.py", line 578, in <lambda> key=lambda obj: (1 - laparams.boxes_flow) * obj.x0 TypeError: unsupported operand type(s) for -: 'int' and 'str' ``` Related: Issue #477, PR #479 * update CHANGELOG * merge CHANGELOG * pdf2txt: clean up handling of layout parameter arguments * avoid specifying default values twice * construct LAParams earlier, rather than passing its components around * fix crash with --boxes_flow=disabled * update CHANGELOG * construct new LAParams, so _validate runs * Improve readability of setting LAParams by explicitly copying them from parsed_args into init of LAParams. And move all parsed_args post processing to the parse_args() method. * Add cli argument for line_overlap * Also use default values from LAParams for --detect-vertical and --all-texts Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>pull/679/head^2
parent
aa5dec252f
commit
d87bd025dd
|
@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
- Hande decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
||||||
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
|
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
|
||||||
- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
|
- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
|
||||||
|
- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
|
||||||
|
|
||||||
## [20211012]
|
## [20211012]
|
||||||
|
|
||||||
|
|
115
tools/pdf2txt.py
115
tools/pdf2txt.py
|
@ -4,7 +4,7 @@ output it to plain text, html, xml or tags."""
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Container, Iterable, List, Optional, Union
|
from typing import Any, Container, Iterable, List, Optional
|
||||||
|
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
@ -17,12 +17,10 @@ OUTPUT_TYPES = ((".htm", "html"),
|
||||||
(".xml", "xml"),
|
(".xml", "xml"),
|
||||||
(".tag", "tag"))
|
(".tag", "tag"))
|
||||||
|
|
||||||
FloatOrDisabled = Union[float, str] # Union[float, Literal["disabled"]]
|
|
||||||
|
|
||||||
|
def float_or_disabled(x: str) -> Optional[float]:
|
||||||
def float_or_disabled(x: str) -> FloatOrDisabled:
|
|
||||||
if x.lower().strip() == "disabled":
|
if x.lower().strip() == "disabled":
|
||||||
return "disabled"
|
return None
|
||||||
try:
|
try:
|
||||||
return float(x)
|
return float(x)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -32,13 +30,7 @@ def float_or_disabled(x: str) -> FloatOrDisabled:
|
||||||
def extract_text(
|
def extract_text(
|
||||||
files: Iterable[str] = [],
|
files: Iterable[str] = [],
|
||||||
outfile: str = '-',
|
outfile: str = '-',
|
||||||
no_laparams: bool = False,
|
laparams: Optional[LAParams] = None,
|
||||||
all_texts: Optional[bool] = None,
|
|
||||||
detect_vertical: Optional[bool] = None,
|
|
||||||
word_margin: Optional[float] = None,
|
|
||||||
char_margin: Optional[float] = None,
|
|
||||||
line_margin: Optional[float] = None,
|
|
||||||
boxes_flow: Optional[FloatOrDisabled] = None,
|
|
||||||
output_type: str = 'text',
|
output_type: str = 'text',
|
||||||
codec: str = 'utf-8',
|
codec: str = 'utf-8',
|
||||||
strip_control: bool = False,
|
strip_control: bool = False,
|
||||||
|
@ -56,19 +48,6 @@ def extract_text(
|
||||||
if not files:
|
if not files:
|
||||||
raise ValueError("Must provide files to work upon!")
|
raise ValueError("Must provide files to work upon!")
|
||||||
|
|
||||||
# If any LAParams group arguments were passed,
|
|
||||||
# create an LAParams object and
|
|
||||||
# populate with given args. Otherwise, set it to None.
|
|
||||||
if not no_laparams:
|
|
||||||
laparams: Optional[LAParams] = LAParams()
|
|
||||||
for param in ("all_texts", "detect_vertical", "word_margin",
|
|
||||||
"char_margin", "line_margin", "boxes_flow"):
|
|
||||||
paramv = locals().get(param, None)
|
|
||||||
if paramv is not None:
|
|
||||||
setattr(laparams, param, paramv)
|
|
||||||
else:
|
|
||||||
laparams = None
|
|
||||||
|
|
||||||
if output_type == "text" and outfile != "-":
|
if output_type == "text" and outfile != "-":
|
||||||
for override, alttype in OUTPUT_TYPES:
|
for override, alttype in OUTPUT_TYPES:
|
||||||
if outfile.endswith(override):
|
if outfile.endswith(override):
|
||||||
|
@ -87,7 +66,7 @@ def extract_text(
|
||||||
return outfp
|
return outfp
|
||||||
|
|
||||||
|
|
||||||
def maketheparser() -> argparse.ArgumentParser:
|
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"files", type=str, default=None, nargs="+",
|
"files", type=str, default=None, nargs="+",
|
||||||
|
@ -124,32 +103,40 @@ def maketheparser() -> argparse.ArgumentParser:
|
||||||
help="The number of degrees to rotate the PDF "
|
help="The number of degrees to rotate the PDF "
|
||||||
"before other types of processing.")
|
"before other types of processing.")
|
||||||
|
|
||||||
la_params = parser.add_argument_group(
|
la_params = LAParams() # will be used for defaults
|
||||||
|
la_param_group = parser.add_argument_group(
|
||||||
'Layout analysis', description='Used during layout analysis.')
|
'Layout analysis', description='Used during layout analysis.')
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--no-laparams", "-n", default=False, action="store_true",
|
"--no-laparams", "-n", default=False, action="store_true",
|
||||||
help="If layout analysis parameters should be ignored.")
|
help="If layout analysis parameters should be ignored.")
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--detect-vertical", "-V", default=False, action="store_true",
|
"--detect-vertical", "-V", default=la_params.detect_vertical,
|
||||||
|
action="store_true",
|
||||||
help="If vertical text should be considered during layout analysis")
|
help="If vertical text should be considered during layout analysis")
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--char-margin", "-M", type=float, default=2.0,
|
"--line-overlap", type=float, default=la_params.line_overlap,
|
||||||
|
help='If two characters have more overlap than this they '
|
||||||
|
'are considered to be on the same line. The overlap is specified '
|
||||||
|
'relative to the minimum height of both characters.')
|
||||||
|
la_param_group.add_argument(
|
||||||
|
"--char-margin", "-M", type=float, default=la_params.char_margin,
|
||||||
help="If two characters are closer together than this margin they "
|
help="If two characters are closer together than this margin they "
|
||||||
"are considered to be part of the same line. The margin is "
|
"are considered to be part of the same line. The margin is "
|
||||||
"specified relative to the width of the character.")
|
"specified relative to the width of the character.")
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--word-margin", "-W", type=float, default=0.1,
|
"--word-margin", "-W", type=float, default=la_params.word_margin,
|
||||||
help="If two characters on the same line are further apart than this "
|
help="If two characters on the same line are further apart than this "
|
||||||
"margin then they are considered to be two separate words, and "
|
"margin then they are considered to be two separate words, and "
|
||||||
"an intermediate space will be added for readability. The margin "
|
"an intermediate space will be added for readability. The margin "
|
||||||
"is specified relative to the width of the character.")
|
"is specified relative to the width of the character.")
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--line-margin", "-L", type=float, default=0.5,
|
"--line-margin", "-L", type=float, default=la_params.line_margin,
|
||||||
help="If two lines are are close together they are considered to "
|
help="If two lines are close together they are considered to "
|
||||||
"be part of the same paragraph. The margin is specified "
|
"be part of the same paragraph. The margin is specified "
|
||||||
"relative to the height of a line.")
|
"relative to the height of a line.")
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--boxes-flow", "-F", type=float_or_disabled, default=0.5,
|
"--boxes-flow", "-F", type=float_or_disabled,
|
||||||
|
default=la_params.boxes_flow,
|
||||||
help="Specifies how much a horizontal and vertical position of a "
|
help="Specifies how much a horizontal and vertical position of a "
|
||||||
"text matters when determining the order of lines. The value "
|
"text matters when determining the order of lines. The value "
|
||||||
"should be within the range of -1.0 (only horizontal position "
|
"should be within the range of -1.0 (only horizontal position "
|
||||||
|
@ -157,8 +144,8 @@ def maketheparser() -> argparse.ArgumentParser:
|
||||||
"pass `disabled` to disable advanced layout analysis, and "
|
"pass `disabled` to disable advanced layout analysis, and "
|
||||||
"instead return text based on the position of the bottom left "
|
"instead return text based on the position of the bottom left "
|
||||||
"corner of the text box.")
|
"corner of the text box.")
|
||||||
la_params.add_argument(
|
la_param_group.add_argument(
|
||||||
"--all-texts", "-A", default=False, action="store_true",
|
"--all-texts", "-A", default=la_params.all_texts, action="store_true",
|
||||||
help="If layout analysis should be performed on text in figures.")
|
help="If layout analysis should be performed on text in figures.")
|
||||||
|
|
||||||
output_params = parser.add_argument_group(
|
output_params = parser.add_argument_group(
|
||||||
|
@ -194,28 +181,40 @@ def maketheparser() -> argparse.ArgumentParser:
|
||||||
"--strip-control", "-S", default=False, action="store_true",
|
"--strip-control", "-S", default=False, action="store_true",
|
||||||
help="Remove control statement from text. "
|
help="Remove control statement from text. "
|
||||||
"Only used when output_type is xml.")
|
"Only used when output_type is xml.")
|
||||||
return parser
|
|
||||||
|
|
||||||
|
parsed_args = parser.parse_args(args=args)
|
||||||
|
|
||||||
# main
|
# Propagate parsed layout parameters to LAParams object
|
||||||
|
if parsed_args.no_laparams:
|
||||||
|
parsed_args.laparams = None
|
||||||
|
else:
|
||||||
|
parsed_args.laparams = LAParams(
|
||||||
|
line_overlap=parsed_args.line_overlap,
|
||||||
|
char_margin=parsed_args.char_margin,
|
||||||
|
line_margin=parsed_args.line_margin,
|
||||||
|
word_margin=parsed_args.word_margin,
|
||||||
|
boxes_flow=parsed_args.boxes_flow,
|
||||||
|
detect_vertical=parsed_args.detect_vertical,
|
||||||
|
all_texts=parsed_args.all_texts,
|
||||||
|
)
|
||||||
|
|
||||||
|
if parsed_args.page_numbers:
|
||||||
|
parsed_args.page_numbers = {x-1 for x in parsed_args.page_numbers}
|
||||||
|
|
||||||
|
if parsed_args.pagenos:
|
||||||
|
parsed_args.page_numbers = {int(x)-1 for x in parsed_args.pagenos.split(",")}
|
||||||
|
|
||||||
|
if parsed_args.output_type == "text" and parsed_args.outfile != "-":
|
||||||
|
for override, alttype in OUTPUT_TYPES:
|
||||||
|
if parsed_args.outfile.endswith(override):
|
||||||
|
parsed_args.output_type = alttype
|
||||||
|
|
||||||
|
return parsed_args
|
||||||
|
|
||||||
|
|
||||||
def main(args: Optional[List[str]] = None) -> int:
|
def main(args: Optional[List[str]] = None) -> int:
|
||||||
|
parsed_args = parse_args(args)
|
||||||
P = maketheparser()
|
outfp = extract_text(**vars(parsed_args))
|
||||||
A = P.parse_args(args=args)
|
|
||||||
|
|
||||||
if A.page_numbers:
|
|
||||||
A.page_numbers = {x-1 for x in A.page_numbers}
|
|
||||||
if A.pagenos:
|
|
||||||
A.page_numbers = {int(x)-1 for x in A.pagenos.split(",")}
|
|
||||||
|
|
||||||
if A.output_type == "text" and A.outfile != "-":
|
|
||||||
for override, alttype in OUTPUT_TYPES:
|
|
||||||
if A.outfile.endswith(override):
|
|
||||||
A.output_type = alttype
|
|
||||||
|
|
||||||
outfp = extract_text(**vars(A))
|
|
||||||
outfp.close()
|
outfp.close()
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue