diff --git a/CHANGELOG.md b/CHANGELOG.md index 97ccbbe..be5daf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +### Added +- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371)) + ### Fixed - Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 3aa2e2a..dbe034f 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -27,6 +27,9 @@ log = logging.getLogger(__name__) class PDFLayoutAnalyzer(PDFTextDevice): + + RECTS = re.compile('^(mlllh)+$') + def __init__(self, rsrcmgr, pageno=1, laparams=None): PDFTextDevice.__init__(self, rsrcmgr) self.pageno = pageno @@ -72,6 +75,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): return def paint_path(self, gstate, stroke, fill, evenodd, path): + """Paint paths described in section 4.4 of the PDF reference manual""" shape = ''.join(x[0] for x in path) if shape == 'ml': # horizontal/vertical line @@ -80,11 +84,11 @@ class PDFLayoutAnalyzer(PDFTextDevice): (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0)) (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1)) if x0 == x1 or y0 == y1: - self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1), - stroke, fill, evenodd, gstate.scolor, - gstate.ncolor)) - return - if shape == 'mlllh': + line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke, + fill, evenodd, gstate.scolor, gstate.ncolor) + self.cur_item.add(line) + + elif shape == 'mlllh': # rectangle (_, x0, y0) = path[0] (_, x1, y1) = path[1] @@ -96,18 +100,22 @@ class PDFLayoutAnalyzer(PDFTextDevice): (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3)) if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \ (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0): - self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2), - stroke, fill, evenodd, gstate.scolor, - gstate.ncolor)) - return - # other shapes - pts = [] - for p in path: - for i in range(1, len(p), 2): - pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) - self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, - gstate.scolor, gstate.ncolor)) - return + rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke, + fill, evenodd, gstate.scolor, gstate.ncolor) + self.cur_item.add(rect) + + elif self.RECTS.match(shape): + for paths in zip(*(iter(path),) * 5): + self.paint_path(gstate, stroke, fill, evenodd, list(paths)) + + else: + pts = [] + for p in path: + for i in range(1, len(p), 2): + pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) + curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, + gstate.scolor, gstate.ncolor) + self.cur_item.add(curve) def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): diff --git a/samples/contrib/issue-00369-excel.pdf b/samples/contrib/issue-00369-excel.pdf new file mode 100644 index 0000000..2b633e0 --- /dev/null +++ b/samples/contrib/issue-00369-excel.pdf @@ -0,0 +1,250 @@ +%PDF-1.7 +%µ¶ + +1 0 obj +<> +endobj + +2 0 obj +<> +endobj + +3 0 obj +<>>>/Rotate 0/Type/Page>> +endobj + +4 0 obj +(Identity) +endobj + +5 0 obj +(Adobe) +endobj + +6 0 obj +[-506 -268 506 952] +endobj + +7 0 obj +506 +endobj + +8 0 obj +[1005 1005 506 1006 1006 506 1007 1007 506 1008 1008 506 1009 1009 506 1010 1010 506 1011 1011 506 1012 1012 506 1013 1013 506] +endobj + +9 0 obj +<> +stream +/CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def /CMapName /Adobe-Identity-UCS def /CMapType 2 def 1 begincodespacerange <0000> endcodespacerange 9 beginbfchar <03ED> <0031> <03EE> <0032> <03EF> <0033> <03F0> <0034> <03F1> <0035> <03F2> <0036> <03F3> <0037> <03F4> <0038> <03F5> <0039> endbfchar endcmap CMapName currentdict /CMap defineresource pop end end +endstream +endobj + +10 0 obj +<>/CIDToGIDMap/Identity/FontDescriptor<>/Subtype/CIDFontType2/Type/Font/W 8 0 R>>]/Encoding/Identity-H/Subtype/Type0/ToUnicode 9 0 R/Type/Font>> +endobj + +11 0 obj +<> +endobj + +12 0 obj +<> +stream +q +q +.75 0 0 -.75 0 792 cm +126.56 73.28 m +134.08 73.28 l +134.08 90.88 l +126.56 90.88 l +h +W* +n +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 126.56 86.88 Tm +[<03ED>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +195.36 73.28 m +202.88 73.28 l +202.88 90.88 l +195.36 90.88 l +h +W* +n +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 195.36 86.88 Tm +[<03EE>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +264.16 73.28 m +271.68 73.28 l +271.68 90.88 l +264.16 90.88 l +h +W* +n +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 264.16 86.88 Tm +[<03EF>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 126.56 108.48 Tm +[<03F0>-4174.0007<03F1>-4184.0007<03F2>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +126.56 113.92 m +134.08 113.92 l +134.08 131.84 l +126.56 131.84 l +h +W* +n +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 126.56 127.84 Tm +[<03F3>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +195.36 113.92 m +202.88 113.92 l +202.88 131.84 l +195.36 131.84 l +h +W* +n +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 195.36 127.84 Tm +[<03F4>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +264.16 113.92 m +271.68 113.92 l +271.68 131.84 l +264.16 131.84 l +h +W* +n +0 0 0 rg +BT +/F1 14.6921 Tf +1 0 0 -1 264.16 127.84 Tm +[<03F5>] TJ +ET +Q +q +.75 0 0 -.75 0 792 cm +136 73.28 m +137.28 73.28 l +137.28 91.04 l +136 91.04 l +h +204.8 73.28 m +206.08 73.28 l +206.08 91.04 l +204.8 91.04 l +h +67.2 72 m +68.48 72 l +68.48 133.28 l +67.2 133.28 l +h +136 94.88 m +137.28 94.88 l +137.28 133.28 l +136 133.28 l +h +204.8 94.88 m +206.08 94.88 l +206.08 133.28 l +204.8 133.28 l +h +273.6 73.28 m +274.88 73.28 l +274.88 133.28 l +273.6 133.28 l +h +0 0 0 rg +f* +Q +q +.75 0 0 -.75 0 792 cm +68.48 72 m +274.88 72 l +274.88 73.28 l +68.48 73.28 l +h +68.48 91.04 m +273.6 91.04 l +273.6 92.32 l +68.48 92.32 l +h +68.48 93.6 m +273.6 93.6 l +273.6 94.88 l +68.48 94.88 l +h +68.48 112.64 m +274.88 112.64 l +274.88 113.92 l +68.48 113.92 l +h +68.48 132 m +274.88 132 l +274.88 133.28 l +68.48 133.28 l +h +0 0 0 rg +f* +Q +Q + +endstream +endobj + +xref +0 13 +0000000000 65536 f +0000000016 00000 n +0000000062 00000 n +0000000114 00000 n +0000000259 00000 n +0000000286 00000 n +0000000310 00000 n +0000000346 00000 n +0000000366 00000 n +0000000510 00000 n +0000001010 00000 n +0000001421 00000 n +0000001576 00000 n + +trailer +<> +startxref +3449 +%%EOF diff --git a/tests/test_converter.py b/tests/test_converter.py new file mode 100644 index 0000000..6cbdfbc --- /dev/null +++ b/tests/test_converter.py @@ -0,0 +1,38 @@ +from nose.tools import assert_equal + +from pdfminer.converter import PDFLayoutAnalyzer +from pdfminer.layout import LTContainer +from pdfminer.pdfinterp import PDFGraphicState + + +class TestPaintPath(): + def test_paint_path(self): + path = [('m', 6, 7), ('l', 7, 7)] + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 100, 0, 100]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + assert_equal(len(analyzer.cur_item._objs), 1) + + def test_paint_path_mlllh(self): + path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)] + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 100, 0, 100]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + assert_equal(len(analyzer.cur_item), 1) + + def test_paint_path_multiple_mlllh(self): + """Path from samples/contrib/issue-00369-excel.pdf""" + path = [ + ('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',), + ('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',), + ('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',) + ] + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 100, 0, 100]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + assert_equal(len(analyzer.cur_item._objs), 3) + + def _get_analyzer(self): + analyzer = PDFLayoutAnalyzer(None) + analyzer.set_ctm([1, 0, 0, 1, 0, 0]) + return analyzer diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index bb9545d..dd1aecf 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -66,7 +66,7 @@ class TestPdf2Txt(): run('contrib/issue-00352-asw-oct96-p41.pdf') def test_scancode_patchelf(self): - """Regression test for # https://github.com/euske/pdfminer/issues/96""" + """Regression test for https://github.com/euske/pdfminer/issues/96""" run('scancode/patchelf.pdf') def test_contrib_hash_two_complement(self): @@ -76,6 +76,12 @@ class TestPdf2Txt(): """ run('contrib/issue-00352-hash-twos-complement.pdf') + def test_contrib_excel(self): + """Regression test for + https://github.com/pdfminer/pdfminer.six/issues/369 + """ + run('contrib/issue-00369-excel.pdf', '-t html') + class TestDumpImages: