* Update documentation for boxes_flow, allow None * Apply comments from code review * Small wording changes, remove unnecessary comment * Update boxes_flow documentation for pdf2text * Pin version of tox to ensure python 3.4 support
pull/399/head
parent
518b5d6efc
commit
e55560f858
|
@ -6,6 +6,6 @@ python:
|
||||||
- "3.7"
|
- "3.7"
|
||||||
- "3.8"
|
- "3.8"
|
||||||
install:
|
install:
|
||||||
- pip install tox-travis flake8
|
- pip install tox==3.14.0 tox-travis flake8
|
||||||
script:
|
script:
|
||||||
- tox -r
|
- tox -r
|
||||||
|
|
|
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
|
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
|
||||||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||||
|
|
|
@ -48,7 +48,9 @@ class LAParams:
|
||||||
:param boxes_flow: Specifies how much a horizontal and vertical position
|
:param boxes_flow: Specifies how much a horizontal and vertical position
|
||||||
of a text matters when determining the order of text boxes. The value
|
of a text matters when determining the order of text boxes. The value
|
||||||
should be within the range of -1.0 (only horizontal position
|
should be within the range of -1.0 (only horizontal position
|
||||||
matters) to +1.0 (only vertical position matters).
|
matters) to +1.0 (only vertical position matters). You can also pass
|
||||||
|
`None` to disable advanced layout analysis, and instead return text
|
||||||
|
based on the position of the bottom left corner of the text box.
|
||||||
:param detect_vertical: If vertical text should be considered during
|
:param detect_vertical: If vertical text should be considered during
|
||||||
layout analysis
|
layout analysis
|
||||||
:param all_texts: If layout analysis should be performed on text in
|
:param all_texts: If layout analysis should be performed on text in
|
||||||
|
@ -70,8 +72,20 @@ class LAParams:
|
||||||
self.boxes_flow = boxes_flow
|
self.boxes_flow = boxes_flow
|
||||||
self.detect_vertical = detect_vertical
|
self.detect_vertical = detect_vertical
|
||||||
self.all_texts = all_texts
|
self.all_texts = all_texts
|
||||||
|
|
||||||
|
self._validate()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _validate(self):
|
||||||
|
if self.boxes_flow is not None:
|
||||||
|
boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
|
||||||
|
"number between -1 and +1")
|
||||||
|
if not (isinstance(self.boxes_flow, int) or
|
||||||
|
isinstance(self.boxes_flow, float)):
|
||||||
|
raise TypeError(boxes_flow_err_msg)
|
||||||
|
if not -1 <= self.boxes_flow <= 1:
|
||||||
|
raise ValueError(boxes_flow_err_msg)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
||||||
'word_margin=%.1f all_texts=%r>' % \
|
'word_margin=%.1f all_texts=%r>' % \
|
||||||
|
@ -783,21 +797,20 @@ class LTLayoutContainer(LTContainer):
|
||||||
for obj in empties:
|
for obj in empties:
|
||||||
obj.analyze(laparams)
|
obj.analyze(laparams)
|
||||||
textboxes = list(self.group_textlines(laparams, textlines))
|
textboxes = list(self.group_textlines(laparams, textlines))
|
||||||
if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 \
|
if laparams.boxes_flow is None:
|
||||||
and textboxes:
|
|
||||||
self.groups = self.group_textboxes(laparams, textboxes)
|
|
||||||
assigner = IndexAssigner()
|
|
||||||
for group in self.groups:
|
|
||||||
group.analyze(laparams)
|
|
||||||
assigner.run(group)
|
|
||||||
textboxes.sort(key=lambda box: box.index)
|
|
||||||
else:
|
|
||||||
def getkey(box):
|
def getkey(box):
|
||||||
if isinstance(box, LTTextBoxVertical):
|
if isinstance(box, LTTextBoxVertical):
|
||||||
return (0, -box.x1, box.y0)
|
return (0, -box.x1, box.y0)
|
||||||
else:
|
else:
|
||||||
return (1, box.y0, box.x0)
|
return (1, box.y0, box.x0)
|
||||||
textboxes.sort(key=getkey)
|
textboxes.sort(key=getkey)
|
||||||
|
else:
|
||||||
|
self.groups = self.group_textboxes(laparams, textboxes)
|
||||||
|
assigner = IndexAssigner()
|
||||||
|
for group in self.groups:
|
||||||
|
group.analyze(laparams)
|
||||||
|
assigner.run(group)
|
||||||
|
textboxes.sort(key=lambda box: box.index)
|
||||||
self._objs = textboxes + otherobjs + empties
|
self._objs = textboxes + otherobjs + empties
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -120,7 +120,10 @@ def maketheparser():
|
||||||
help="Specifies how much a horizontal and vertical position of a "
|
help="Specifies how much a horizontal and vertical position of a "
|
||||||
"text matters when determining the order of lines. The value "
|
"text matters when determining the order of lines. The value "
|
||||||
"should be within the range of -1.0 (only horizontal position "
|
"should be within the range of -1.0 (only horizontal position "
|
||||||
"matters) to +1.0 (only vertical position matters).")
|
"matters) to +1.0 (only vertical position matters). You can also "
|
||||||
|
"pass `None` to disable advanced layout analysis, and instead "
|
||||||
|
"return text based on the position of the bottom left corner of "
|
||||||
|
"the text box.")
|
||||||
la_params.add_argument(
|
la_params.add_argument(
|
||||||
"--all-texts", "-A", default=False, action="store_true",
|
"--all-texts", "-A", default=False, action="store_true",
|
||||||
help="If layout analysis should be performed on text in figures.")
|
help="If layout analysis should be performed on text in figures.")
|
||||||
|
|
Loading…
Reference in New Issue