From f2c967f5000ac26f731936303979fd365f5b56b0 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Fri, 6 May 2022 16:15:00 -0400 Subject: [PATCH] Ignore path constructors that do not begin with m (#749) * Ignore path constructors that do not begin with m Per PDF Reference Section 4.4.1, "path construction operators may be invoked in any sequence, but the first one invoked must be m or re to begin a new subpath." Since pdfminer.six already converts all `re` (rectangle) operators to their equivalent `mlllh` representation, paths ingested by `.paint_path(...)` that do not begin with the `m` operator are invalid. In addition to the advantage of hewing to the PDF Reference, this change also avoids the `ValueError: not enough values to unpack (expected 2, got 1)` error raised by the ` pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]` line in `converter.py` when parsing PDFs that (erroneously) include `("h",)` paths. * Update CHANGELOG.md Co-authored-by: Pieter Marsman --- CHANGELOG.md | 6 ++++++ pdfminer/converter.py | 11 ++++++++++- tests/test_converter.py | 9 +++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eedc8fd..76c8dbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Fixed +- Ignoring (invalid) path constructors that do not begin with `m` ([#749](https://github.com/pdfminer/pdfminer.six/pull/749)) + +## [20220506] + +### Fixed + - `IndexError` when handling invalid bfrange code map in CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731)) - `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index a414799..3da2fcb 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -109,7 +109,16 @@ class PDFLayoutAnalyzer(PDFTextDevice): """Paint paths described in section 4.4 of the PDF reference manual""" shape = "".join(x[0] for x in path) - if shape.count("m") > 1: + if shape[:1] != "m": + # Per PDF Reference Section 4.4.1, "path construction operators may + # be invoked in any sequence, but the first one invoked must be m + # or re to begin a new subpath." Since pdfminer.six already + # converts all `re` (rectangle) operators to their equivalent + # `mlllh` representation, paths ingested by `.paint_path(...)` that + # do not begin with the `m` operator are invalid. + pass + + elif shape.count("m") > 1: # recurse if there are multiple m's in this shape for m in re.finditer(r"m[^m]+", shape): subpath = path[m.start(0) : m.end(0)] diff --git a/tests/test_converter.py b/tests/test_converter.py index e9d18e8..bae442f 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -215,6 +215,15 @@ class TestPaintPath: (71.41, 434.89), ] + def test_paint_path_without_starting_m(self): + gs = PDFGraphicState() + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 100, 0, 100]) + paths = [[("h",)], [("l", 72.41, 433.89), ("l", 82.41, 433.89), ("h",)]] + for path in paths: + analyzer.paint_path(gs, False, False, False, path) + assert len(analyzer.cur_item._objs) == 0 + class TestBinaryDetector: def test_stringio(self):