* Update documentation for boxes_flow, allow None * Apply comments from code review * Small wording changes, remove unnecessary comment * Update boxes_flow documentation for pdf2text * Pin version of tox to ensure python 3.4 support
pull/399/head
parent
518b5d6efc
commit
e55560f858
|
@ -6,6 +6,6 @@ python:
|
|||
- "3.7"
|
||||
- "3.8"
|
||||
install:
|
||||
- pip install tox-travis flake8
|
||||
- pip install tox==3.14.0 tox-travis flake8
|
||||
script:
|
||||
- tox -r
|
||||
|
|
|
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395))
|
||||
|
||||
### Fixed
|
||||
- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
|
||||
- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
|
||||
|
|
|
@ -48,7 +48,9 @@ class LAParams:
|
|||
:param boxes_flow: Specifies how much a horizontal and vertical position
|
||||
of a text matters when determining the order of text boxes. The value
|
||||
should be within the range of -1.0 (only horizontal position
|
||||
matters) to +1.0 (only vertical position matters).
|
||||
matters) to +1.0 (only vertical position matters). You can also pass
|
||||
`None` to disable advanced layout analysis, and instead return text
|
||||
based on the position of the bottom left corner of the text box.
|
||||
:param detect_vertical: If vertical text should be considered during
|
||||
layout analysis
|
||||
:param all_texts: If layout analysis should be performed on text in
|
||||
|
@ -70,8 +72,20 @@ class LAParams:
|
|||
self.boxes_flow = boxes_flow
|
||||
self.detect_vertical = detect_vertical
|
||||
self.all_texts = all_texts
|
||||
|
||||
self._validate()
|
||||
return
|
||||
|
||||
def _validate(self):
    """Check that ``self.boxes_flow`` holds an acceptable value.

    Accepted values are ``None`` or an ``int``/``float`` within the closed
    range -1 to +1.

    :raises TypeError: if ``boxes_flow`` is neither ``None`` nor an
        ``int``/``float``
    :raises ValueError: if ``boxes_flow`` is a number outside of -1 to +1
    """
    flow = self.boxes_flow
    if flow is None:
        # None is accepted as-is; there is nothing further to check.
        return

    message = ("LAParam boxes_flow should be None, or a "
               "number between -1 and +1")

    if not isinstance(flow, (int, float)):
        raise TypeError(message)

    # Written as a negated chained comparison so that values which compare
    # false on both sides (e.g. NaN) are rejected as well.
    in_range = -1 <= flow <= 1
    if not in_range:
        raise ValueError(message)
|
||||
|
||||
def __repr__(self):
|
||||
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
||||
'word_margin=%.1f all_texts=%r>' % \
|
||||
|
@ -783,21 +797,20 @@ class LTLayoutContainer(LTContainer):
|
|||
for obj in empties:
|
||||
obj.analyze(laparams)
|
||||
textboxes = list(self.group_textlines(laparams, textlines))
|
||||
if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 \
|
||||
and textboxes:
|
||||
self.groups = self.group_textboxes(laparams, textboxes)
|
||||
assigner = IndexAssigner()
|
||||
for group in self.groups:
|
||||
group.analyze(laparams)
|
||||
assigner.run(group)
|
||||
textboxes.sort(key=lambda box: box.index)
|
||||
else:
|
||||
if laparams.boxes_flow is None:
|
||||
def getkey(box):
|
||||
if isinstance(box, LTTextBoxVertical):
|
||||
return (0, -box.x1, box.y0)
|
||||
else:
|
||||
return (1, box.y0, box.x0)
|
||||
textboxes.sort(key=getkey)
|
||||
else:
|
||||
self.groups = self.group_textboxes(laparams, textboxes)
|
||||
assigner = IndexAssigner()
|
||||
for group in self.groups:
|
||||
group.analyze(laparams)
|
||||
assigner.run(group)
|
||||
textboxes.sort(key=lambda box: box.index)
|
||||
self._objs = textboxes + otherobjs + empties
|
||||
return
|
||||
|
||||
|
|
|
@ -120,7 +120,10 @@ def maketheparser():
|
|||
help="Specifies how much a horizontal and vertical position of a "
|
||||
"text matters when determining the order of lines. The value "
|
||||
"should be within the range of -1.0 (only horizontal position "
|
||||
"matters) to +1.0 (only vertical position matters).")
|
||||
"matters) to +1.0 (only vertical position matters). You can also "
|
||||
"pass `None` to disable advanced layout analysis, and instead "
|
||||
"return text based on the position of the bottom left corner of "
|
||||
"the text box.")
|
||||
la_params.add_argument(
|
||||
"--all-texts", "-A", default=False, action="store_true",
|
||||
help="If layout analysis should be performed on text in figures.")
|
||||
|
|
Loading…
Reference in New Issue