Fix converting path to multiple rectangles (#371)

* Fix converting path to multiple rectangles

For a path that consists of a series of rectangles
(shape is 'mlllhmlllh...'), call paint_path again with each group of
5 path elements ('m', 'l', 'l', 'l', 'h'). The result is multiple rects
instead of a single curve.

fixes #369

* Reduce pdf size by removing font

* Add unittest for PDFLayoutAnalyzer.paint_path()

* Add line to CHANGELOG.md

* Add reference to pdf reference manual

* Cleanup function paint_path a bit

* Reduce line length of tests

* Reduce line length of tests

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
Kwok-kuen Cheung 2020-07-11 23:34:38 +08:00 committed by GitHub
parent 6a9269b432
commit 60863cfd55
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 323 additions and 18 deletions

View File

@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased] ## [Unreleased]
### Added
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
### Fixed ### Fixed
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451)) - Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))

View File

@ -27,6 +27,9 @@ log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice): class PDFLayoutAnalyzer(PDFTextDevice):
RECTS = re.compile('^(mlllh)+$')
def __init__(self, rsrcmgr, pageno=1, laparams=None): def __init__(self, rsrcmgr, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrcmgr) PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno self.pageno = pageno
@ -72,6 +75,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return return
def paint_path(self, gstate, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
"""Paint paths described in section 4.4 of the PDF reference manual"""
shape = ''.join(x[0] for x in path) shape = ''.join(x[0] for x in path)
if shape == 'ml': if shape == 'ml':
# horizontal/vertical line # horizontal/vertical line
@ -80,11 +84,11 @@ class PDFLayoutAnalyzer(PDFTextDevice):
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0)) (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1)) (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
if x0 == x1 or y0 == y1: if x0 == x1 or y0 == y1:
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1), line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
stroke, fill, evenodd, gstate.scolor, fill, evenodd, gstate.scolor, gstate.ncolor)
gstate.ncolor)) self.cur_item.add(line)
return
if shape == 'mlllh': elif shape == 'mlllh':
# rectangle # rectangle
(_, x0, y0) = path[0] (_, x0, y0) = path[0]
(_, x1, y1) = path[1] (_, x1, y1) = path[1]
@ -96,18 +100,22 @@ class PDFLayoutAnalyzer(PDFTextDevice):
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3)) (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \ if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0): (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2), rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
stroke, fill, evenodd, gstate.scolor, fill, evenodd, gstate.scolor, gstate.ncolor)
gstate.ncolor)) self.cur_item.add(rect)
return
# other shapes elif self.RECTS.match(shape):
pts = [] for paths in zip(*(iter(path),) * 5):
for p in path: self.paint_path(gstate, stroke, fill, evenodd, list(paths))
for i in range(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) else:
self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, pts = []
gstate.scolor, gstate.ncolor)) for p in path:
return for i in range(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
gstate.scolor, gstate.ncolor)
self.cur_item.add(curve)
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
graphicstate): graphicstate):

View File

@ -0,0 +1,250 @@
%PDF-1.7
%µ¶
1 0 obj
<</Pages 2 0 R/Type/Catalog>>
endobj
2 0 obj
<</Count 1/Kids[3 0 R]/Type/Pages>>
endobj
3 0 obj
<</Contents 12 0 R/CropBox[0 0 612 792]/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 10 0 R>>>>/Rotate 0/Type/Page>>
endobj
4 0 obj
(Identity)
endobj
5 0 obj
(Adobe)
endobj
6 0 obj
[-506 -268 506 952]
endobj
7 0 obj
506
endobj
8 0 obj
[1005 1005 506 1006 1006 506 1007 1007 506 1008 1008 506 1009 1009 506 1010 1010 506 1011 1011 506 1012 1012 506 1013 1013 506]
endobj
9 0 obj
<</Length 450>>
stream
/CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def /CMapName /Adobe-Identity-UCS def /CMapType 2 def 1 begincodespacerange <0000> <FFFF> endcodespacerange 9 beginbfchar <03ED> <0031> <03EE> <0032> <03EF> <0033> <03F0> <0034> <03F1> <0035> <03F2> <0036> <03F3> <0037> <03F4> <0038> <03F5> <0039> endbfchar endcmap CMapName currentdict /CMap defineresource pop end end
endstream
endobj
10 0 obj
<</BaseFont/CIDFont+F1/DescendantFonts[<</BaseFont/CIDFont+F1/CIDSystemInfo<</Ordering 4 0 R/Registry 5 0 R/Supplement 0>>/CIDToGIDMap/Identity/FontDescriptor<</Ascent 952/CapHeight 631/Descent -268/Flags 6/FontBBox 6 0 R/FontName/CIDFont+F1/ItalicAngle 0/StemV 7 0 R/Type/FontDescriptor>>/Subtype/CIDFontType2/Type/Font/W 8 0 R>>]/Encoding/Identity-H/Subtype/Type0/ToUnicode 9 0 R/Type/Font>>
endobj
11 0 obj
<</Author(IEUser)/CreationDate(D:20200205092701-08'00')/ModDate(D:20200205092701-08'00')/Producer(Microsoft: Print To PDF)/Title(Book1)>>
endobj
12 0 obj
<</Length 1821>>
stream
q
q
.75 0 0 -.75 0 792 cm
126.56 73.28 m
134.08 73.28 l
134.08 90.88 l
126.56 90.88 l
h
W*
n
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 126.56 86.88 Tm
[<03ED>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
195.36 73.28 m
202.88 73.28 l
202.88 90.88 l
195.36 90.88 l
h
W*
n
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 195.36 86.88 Tm
[<03EE>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
264.16 73.28 m
271.68 73.28 l
271.68 90.88 l
264.16 90.88 l
h
W*
n
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 264.16 86.88 Tm
[<03EF>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 126.56 108.48 Tm
[<03F0>-4174.0007<03F1>-4184.0007<03F2>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
126.56 113.92 m
134.08 113.92 l
134.08 131.84 l
126.56 131.84 l
h
W*
n
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 126.56 127.84 Tm
[<03F3>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
195.36 113.92 m
202.88 113.92 l
202.88 131.84 l
195.36 131.84 l
h
W*
n
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 195.36 127.84 Tm
[<03F4>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
264.16 113.92 m
271.68 113.92 l
271.68 131.84 l
264.16 131.84 l
h
W*
n
0 0 0 rg
BT
/F1 14.6921 Tf
1 0 0 -1 264.16 127.84 Tm
[<03F5>] TJ
ET
Q
q
.75 0 0 -.75 0 792 cm
136 73.28 m
137.28 73.28 l
137.28 91.04 l
136 91.04 l
h
204.8 73.28 m
206.08 73.28 l
206.08 91.04 l
204.8 91.04 l
h
67.2 72 m
68.48 72 l
68.48 133.28 l
67.2 133.28 l
h
136 94.88 m
137.28 94.88 l
137.28 133.28 l
136 133.28 l
h
204.8 94.88 m
206.08 94.88 l
206.08 133.28 l
204.8 133.28 l
h
273.6 73.28 m
274.88 73.28 l
274.88 133.28 l
273.6 133.28 l
h
0 0 0 rg
f*
Q
q
.75 0 0 -.75 0 792 cm
68.48 72 m
274.88 72 l
274.88 73.28 l
68.48 73.28 l
h
68.48 91.04 m
273.6 91.04 l
273.6 92.32 l
68.48 92.32 l
h
68.48 93.6 m
273.6 93.6 l
273.6 94.88 l
68.48 94.88 l
h
68.48 112.64 m
274.88 112.64 l
274.88 113.92 l
68.48 113.92 l
h
68.48 132 m
274.88 132 l
274.88 133.28 l
68.48 133.28 l
h
0 0 0 rg
f*
Q
Q
endstream
endobj
xref
0 13
0000000000 65536 f
0000000016 00000 n
0000000062 00000 n
0000000114 00000 n
0000000259 00000 n
0000000286 00000 n
0000000310 00000 n
0000000346 00000 n
0000000366 00000 n
0000000510 00000 n
0000001010 00000 n
0000001421 00000 n
0000001576 00000 n
trailer
<</Size 13/Info 11 0 R/Root 1 0 R>>
startxref
3449
%%EOF

38
tests/test_converter.py Normal file
View File

@ -0,0 +1,38 @@
from nose.tools import assert_equal
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import LTContainer
from pdfminer.pdfinterp import PDFGraphicState
class TestPaintPath:
    """Unit tests for PDFLayoutAnalyzer.paint_path().

    Every test paints one path onto an empty LTContainer and checks how
    many layout objects the analyzer added to that container.
    """

    def test_paint_path(self):
        """An 'ml' path whose two points share a y value adds one object."""
        path = [('m', 6, 7), ('l', 7, 7)]
        assert_equal(len(self._paint(path)), 1)

    def test_paint_path_mlllh(self):
        """A single axis-aligned 'mlllh' path adds one object."""
        path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)]
        assert_equal(len(self._paint(path)), 1)

    def test_paint_path_multiple_mlllh(self):
        """Path from samples/contrib/issue-00369-excel.pdf

        Three consecutive 'mlllh' groups in one path must add three
        objects instead of a single one.
        """
        path = [
            ('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',),
            ('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',),
            ('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',)
        ]
        assert_equal(len(self._paint(path)), 3)

    def _paint(self, path):
        """Paint *path* onto a fresh container and return that container.

        The container is returned (rather than a count) so callers use the
        public ``len(container)`` instead of reaching into ``_objs``.
        """
        analyzer = self._get_analyzer()
        # NOTE(review): the bbox values are kept exactly as in the original
        # tests; only the number of added objects is checked, not geometry.
        analyzer.cur_item = LTContainer([0, 100, 0, 100])
        # stroke, fill and evenodd are all False: these tests only care
        # about which kind of object the path shape is turned into.
        analyzer.paint_path(PDFGraphicState(), False, False, False, path)
        return analyzer.cur_item

    def _get_analyzer(self):
        """Return a PDFLayoutAnalyzer built without a resource manager."""
        analyzer = PDFLayoutAnalyzer(None)
        # Identity matrix, so path coordinates are used as given.
        analyzer.set_ctm([1, 0, 0, 1, 0, 0])
        return analyzer

View File

@ -66,7 +66,7 @@ class TestPdf2Txt():
run('contrib/issue-00352-asw-oct96-p41.pdf') run('contrib/issue-00352-asw-oct96-p41.pdf')
def test_scancode_patchelf(self): def test_scancode_patchelf(self):
"""Regression test for # https://github.com/euske/pdfminer/issues/96""" """Regression test for https://github.com/euske/pdfminer/issues/96"""
run('scancode/patchelf.pdf') run('scancode/patchelf.pdf')
def test_contrib_hash_two_complement(self): def test_contrib_hash_two_complement(self):
@ -76,6 +76,12 @@ class TestPdf2Txt():
""" """
run('contrib/issue-00352-hash-twos-complement.pdf') run('contrib/issue-00352-hash-twos-complement.pdf')
def test_contrib_excel(self):
    """Regression test for
    https://github.com/pdfminer/pdfminer.six/issues/369

    Passes the sample 'contrib/issue-00369-excel.pdf' to run() together
    with the option string '-t html'.
    """
    # NOTE(review): run() is defined outside this view; presumably it
    # invokes pdf2txt on the sample file -- confirm against the helper.
    run('contrib/issue-00369-excel.pdf', '-t html')
class TestDumpImages: class TestDumpImages: