diff --git a/.travis.yml b/.travis.yml index 807650e..f227974 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,6 @@ python: - "3.7" - "3.8" install: - - pip install tox-travis flake8 + - pip install tox==3.14.0 tox-travis flake8 script: - tox -r diff --git a/CHANGELOG.md b/CHANGELOG.md index 950e9ea..7130f21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +### Added +- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#395](https://github.com/pdfminer/pdfminer.six/pull/395)) + ### Fixed - Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407)) - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389)) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index ca17f10..100bf22 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -48,7 +48,9 @@ class LAParams: :param boxes_flow: Specifies how much a horizontal and vertical position of a text matters when determining the order of text boxes. The value should be within the range of -1.0 (only horizontal position - matters) to +1.0 (only vertical position matters). + matters) to +1.0 (only vertical position matters). You can also pass + `None` to disable advanced layout analysis, and instead return text + based on the position of the bottom left corner of the text box. :param detect_vertical: If vertical text should be considered during layout analysis :param all_texts: If layout analysis should be performed on text in @@ -70,8 +72,20 @@ class LAParams: self.boxes_flow = boxes_flow self.detect_vertical = detect_vertical self.all_texts = all_texts + + self._validate() return + def _validate(self): + if self.boxes_flow is not None: + boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a " + "number between -1 and +1") + if not (isinstance(self.boxes_flow, int) or + isinstance(self.boxes_flow, float)): + raise TypeError(boxes_flow_err_msg) + if not -1 <= self.boxes_flow <= 1: + raise ValueError(boxes_flow_err_msg) + def __repr__(self): return '' % \ @@ -783,21 +797,20 @@ class LTLayoutContainer(LTContainer): for obj in empties: obj.analyze(laparams) textboxes = list(self.group_textlines(laparams, textlines)) - if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 \ - and textboxes: - self.groups = self.group_textboxes(laparams, textboxes) - assigner = IndexAssigner() - for group in self.groups: - group.analyze(laparams) - assigner.run(group) - textboxes.sort(key=lambda box: box.index) - else: + if laparams.boxes_flow is None: def getkey(box): if isinstance(box, LTTextBoxVertical): return (0, -box.x1, box.y0) else: return (1, box.y0, box.x0) textboxes.sort(key=getkey) + else: + self.groups = self.group_textboxes(laparams, textboxes) + assigner = IndexAssigner() + for group in self.groups: + group.analyze(laparams) + assigner.run(group) + textboxes.sort(key=lambda box: box.index) self._objs = textboxes + otherobjs + empties return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 4313f15..7e6f9d7 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -120,7 +120,10 @@ def maketheparser(): help="Specifies how much a horizontal and vertical position of a " "text matters when determining the order of lines. The value " "should be within the range of -1.0 (only horizontal position " - "matters) to +1.0 (only vertical position matters).") + "matters) to +1.0 (only vertical position matters). You can also " + "pass `None` to disable advanced layout analysis, and instead " + "return text based on the position of the bottom left corner of " + "the text box.") la_params.add_argument( "--all-texts", "-A", default=False, action="store_true", help="If layout analysis should be performed on text in figures.")