Fix #395: Update documentation for boxes_flow, allow None (#396)

* Update documentation for boxes_flow, allow None

* Apply comments from code review

* Small wording changes, remove unnecessary comment

* Update boxes_flow documentation for pdf2text

* Pin version of tox to ensure Python 3.4 support
pull/399/head
Jake Stockwin 2020-03-26 22:03:49 +00:00 committed by GitHub
parent 518b5d6efc
commit e55560f858
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 31 additions and 12 deletions

View File

@ -6,6 +6,6 @@ python:
- "3.7"
- "3.8"
install:
- pip install tox-travis flake8
- pip install tox==3.14.0 tox-travis flake8
script:
- tox -r

View File

@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
### Added
- Allow `boxes_flow` LAParam to be passed as `None`, validate the input, and update documentation ([#396](https://github.com/pdfminer/pdfminer.six/pull/396))
### Fixed
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))

View File

@ -48,7 +48,9 @@ class LAParams:
:param boxes_flow: Specifies how much a horizontal and vertical position
of a text matters when determining the order of text boxes. The value
should be within the range of -1.0 (only horizontal position
matters) to +1.0 (only vertical position matters).
matters) to +1.0 (only vertical position matters). You can also pass
`None` to disable advanced layout analysis, and instead return text
based on the position of the bottom left corner of the text box.
:param detect_vertical: If vertical text should be considered during
layout analysis
:param all_texts: If layout analysis should be performed on text in
@ -70,8 +72,20 @@ class LAParams:
self.boxes_flow = boxes_flow
self.detect_vertical = detect_vertical
self.all_texts = all_texts
self._validate()
return
def _validate(self):
    """Check that ``boxes_flow`` holds an acceptable value.

    ``None`` is accepted as-is. Any other value must be an ``int`` or a
    ``float`` within the closed range [-1, 1].

    :raises TypeError: if ``boxes_flow`` is neither ``None`` nor an
        ``int``/``float``
    :raises ValueError: if ``boxes_flow`` is a number outside [-1, 1]
    """
    if self.boxes_flow is None:
        return
    boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
                          "number between -1 and +1")
    # bool is a subclass of int, so True/False pass the type check.
    if not isinstance(self.boxes_flow, (int, float)):
        raise TypeError(boxes_flow_err_msg)
    # NaN fails both comparisons and is therefore rejected here as well.
    if not -1 <= self.boxes_flow <= 1:
        raise ValueError(boxes_flow_err_msg)
def __repr__(self):
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
'word_margin=%.1f all_texts=%r>' % \
@ -783,21 +797,20 @@ class LTLayoutContainer(LTContainer):
for obj in empties:
obj.analyze(laparams)
textboxes = list(self.group_textlines(laparams, textlines))
if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 \
and textboxes:
self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner()
for group in self.groups:
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box: box.index)
else:
if laparams.boxes_flow is None:
def getkey(box):
if isinstance(box, LTTextBoxVertical):
return (0, -box.x1, box.y0)
else:
return (1, box.y0, box.x0)
textboxes.sort(key=getkey)
else:
self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner()
for group in self.groups:
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box: box.index)
self._objs = textboxes + otherobjs + empties
return

View File

@ -120,7 +120,10 @@ def maketheparser():
help="Specifies how much a horizontal and vertical position of a "
"text matters when determining the order of lines. The value "
"should be within the range of -1.0 (only horizontal position "
"matters) to +1.0 (only vertical position matters).")
"matters) to +1.0 (only vertical position matters). You can also "
"pass `None` to disable advanced layout analysis, and instead "
"return text based on the position of the bottom left corner of "
"the text box.")
la_params.add_argument(
"--all-texts", "-A", default=False, action="store_true",
help="If layout analysis should be performed on text in figures.")