Fix not being able to pass boxes flow as None to pdf2txt (#479)
* Fix not being able to pass boxes flow as None to pdf2txt
* Changes from code review
* Update CHANGELOG.md

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/522/head
parent
f03657e5c4
commit
ef4787d8ad
|
@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
||||||
|
- Option to disable boxes flow layout analysis when using pdf2txt ([#479](https://github.com/pdfminer/pdfminer.six/pull/479))
|
||||||
- Support for `pathlib.PurePath` in `open_filename` ([#491](https://github.com/pdfminer/pdfminer.six/issues/491))
|
- Support for `pathlib.PurePath` in `open_filename` ([#491](https://github.com/pdfminer/pdfminer.six/issues/491))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
|
@ -16,6 +16,15 @@ OUTPUT_TYPES = ((".htm", "html"),
|
||||||
(".tag", "tag"))
|
(".tag", "tag"))
|
||||||
|
|
||||||
|
|
||||||
|
def float_or_disabled(x):
    """Parse the value given to the ``--boxes-flow`` command-line option.

    This function is used as an argparse ``type`` callable, and argparse
    stores whatever the callable returns as the parsed argument. The
    converted float must therefore be returned explicitly; assigning it to
    a local and falling off the end would yield ``None`` for every number.

    :param x: raw argument string from the command line.
    :return: ``x`` unchanged when it spells "disabled" (case-insensitive,
        surrounding whitespace ignored), otherwise ``x`` converted to a
        float.
    :raises argparse.ArgumentTypeError: if ``x`` is neither "disabled" nor
        a valid float literal.
    """
    if x.lower().strip() == "disabled":
        # Hand the keyword back untouched; the caller decides what
        # "disabled" means.
        return x
    try:
        return float(x)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid float value: {}".format(x))
|
||||||
|
|
||||||
|
|
||||||
def extract_text(files=[], outfile='-',
|
def extract_text(files=[], outfile='-',
|
||||||
no_laparams=False, all_texts=None, detect_vertical=None,
|
no_laparams=False, all_texts=None, detect_vertical=None,
|
||||||
word_margin=None, char_margin=None, line_margin=None,
|
word_margin=None, char_margin=None, line_margin=None,
|
||||||
|
@ -120,14 +129,14 @@ def maketheparser():
|
||||||
"be part of the same paragraph. The margin is specified "
|
"be part of the same paragraph. The margin is specified "
|
||||||
"relative to the height of a line.")
|
"relative to the height of a line.")
|
||||||
la_params.add_argument(
|
la_params.add_argument(
|
||||||
"--boxes-flow", "-F", type=float, default=0.5,
|
"--boxes-flow", "-F", type=float_or_disabled, default=0.5,
|
||||||
help="Specifies how much a horizontal and vertical position of a "
|
help="Specifies how much a horizontal and vertical position of a "
|
||||||
"text matters when determining the order of lines. The value "
|
"text matters when determining the order of lines. The value "
|
||||||
"should be within the range of -1.0 (only horizontal position "
|
"should be within the range of -1.0 (only horizontal position "
|
||||||
"matters) to +1.0 (only vertical position matters). You can also "
|
"matters) to +1.0 (only vertical position matters). You can also "
|
||||||
"pass `None` to disable advanced layout analysis, and instead "
|
"pass `disabled` to disable advanced layout analysis, and "
|
||||||
"return text based on the position of the bottom left corner of "
|
"instead return text based on the position of the bottom left "
|
||||||
"the text box.")
|
"corner of the text box.")
|
||||||
la_params.add_argument(
|
la_params.add_argument(
|
||||||
"--all-texts", "-A", default=False, action="store_true",
|
"--all-texts", "-A", default=False, action="store_true",
|
||||||
help="If layout analysis should be performed on text in figures.")
|
help="If layout analysis should be performed on text in figures.")
|
||||||
|
|
Loading…
Reference in New Issue