Fix converting path to multiple rectangles (#371)
* Fix converting path to multiple rectangles

  For a path that consists of a series of rectangles (shape is 'mlllhmlllh...'), call paint_path again with each group of 5 points. The result is multiple rects instead of a single curve.

  Fixes #369
* Reduce pdf size by removing font
* Add unittest for PDFLayoutAnalyzer.paint_path()
* Add line to CHANGELOG.md
* Add reference to pdf reference manual
* Cleanup function paint_path a bit
* Reduce line length of tests
* Reduce line length of tests

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
parent
6a9269b432
commit
60863cfd55
|
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
|
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,9 @@ log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PDFLayoutAnalyzer(PDFTextDevice):
|
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
|
|
||||||
|
RECTS = re.compile('^(mlllh)+$')
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||||
PDFTextDevice.__init__(self, rsrcmgr)
|
PDFTextDevice.__init__(self, rsrcmgr)
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
|
@ -72,6 +75,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||||
|
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
||||||
shape = ''.join(x[0] for x in path)
|
shape = ''.join(x[0] for x in path)
|
||||||
if shape == 'ml':
|
if shape == 'ml':
|
||||||
# horizontal/vertical line
|
# horizontal/vertical line
|
||||||
|
@ -80,11 +84,11 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||||
if x0 == x1 or y0 == y1:
|
if x0 == x1 or y0 == y1:
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1),
|
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
|
||||||
stroke, fill, evenodd, gstate.scolor,
|
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||||
gstate.ncolor))
|
self.cur_item.add(line)
|
||||||
return
|
|
||||||
if shape == 'mlllh':
|
elif shape == 'mlllh':
|
||||||
# rectangle
|
# rectangle
|
||||||
(_, x0, y0) = path[0]
|
(_, x0, y0) = path[0]
|
||||||
(_, x1, y1) = path[1]
|
(_, x1, y1) = path[1]
|
||||||
|
@ -96,18 +100,22 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
||||||
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
||||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
||||||
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2),
|
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
|
||||||
stroke, fill, evenodd, gstate.scolor,
|
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||||
gstate.ncolor))
|
self.cur_item.add(rect)
|
||||||
return
|
|
||||||
# other shapes
|
elif self.RECTS.match(shape):
|
||||||
|
for paths in zip(*(iter(path),) * 5):
|
||||||
|
self.paint_path(gstate, stroke, fill, evenodd, list(paths))
|
||||||
|
|
||||||
|
else:
|
||||||
pts = []
|
pts = []
|
||||||
for p in path:
|
for p in path:
|
||||||
for i in range(1, len(p), 2):
|
for i in range(1, len(p), 2):
|
||||||
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
||||||
self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||||
gstate.scolor, gstate.ncolor))
|
gstate.scolor, gstate.ncolor)
|
||||||
return
|
self.cur_item.add(curve)
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||||
graphicstate):
|
graphicstate):
|
||||||
|
|
|
@ -0,0 +1,250 @@
|
||||||
|
%PDF-1.7
|
||||||
|
%µ¶
|
||||||
|
|
||||||
|
1 0 obj
|
||||||
|
<</Pages 2 0 R/Type/Catalog>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
2 0 obj
|
||||||
|
<</Count 1/Kids[3 0 R]/Type/Pages>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
3 0 obj
|
||||||
|
<</Contents 12 0 R/CropBox[0 0 612 792]/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 10 0 R>>>>/Rotate 0/Type/Page>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
4 0 obj
|
||||||
|
(Identity)
|
||||||
|
endobj
|
||||||
|
|
||||||
|
5 0 obj
|
||||||
|
(Adobe)
|
||||||
|
endobj
|
||||||
|
|
||||||
|
6 0 obj
|
||||||
|
[-506 -268 506 952]
|
||||||
|
endobj
|
||||||
|
|
||||||
|
7 0 obj
|
||||||
|
506
|
||||||
|
endobj
|
||||||
|
|
||||||
|
8 0 obj
|
||||||
|
[1005 1005 506 1006 1006 506 1007 1007 506 1008 1008 506 1009 1009 506 1010 1010 506 1011 1011 506 1012 1012 506 1013 1013 506]
|
||||||
|
endobj
|
||||||
|
|
||||||
|
9 0 obj
|
||||||
|
<</Length 450>>
|
||||||
|
stream
|
||||||
|
/CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def /CMapName /Adobe-Identity-UCS def /CMapType 2 def 1 begincodespacerange <0000> <FFFF> endcodespacerange 9 beginbfchar <03ED> <0031> <03EE> <0032> <03EF> <0033> <03F0> <0034> <03F1> <0035> <03F2> <0036> <03F3> <0037> <03F4> <0038> <03F5> <0039> endbfchar endcmap CMapName currentdict /CMap defineresource pop end end
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
10 0 obj
|
||||||
|
<</BaseFont/CIDFont+F1/DescendantFonts[<</BaseFont/CIDFont+F1/CIDSystemInfo<</Ordering 4 0 R/Registry 5 0 R/Supplement 0>>/CIDToGIDMap/Identity/FontDescriptor<</Ascent 952/CapHeight 631/Descent -268/Flags 6/FontBBox 6 0 R/FontName/CIDFont+F1/ItalicAngle 0/StemV 7 0 R/Type/FontDescriptor>>/Subtype/CIDFontType2/Type/Font/W 8 0 R>>]/Encoding/Identity-H/Subtype/Type0/ToUnicode 9 0 R/Type/Font>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
11 0 obj
|
||||||
|
<</Author(IEUser)/CreationDate(D:20200205092701-08'00')/ModDate(D:20200205092701-08'00')/Producer(Microsoft: Print To PDF)/Title(Book1)>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
12 0 obj
|
||||||
|
<</Length 1821>>
|
||||||
|
stream
|
||||||
|
q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
126.56 73.28 m
|
||||||
|
134.08 73.28 l
|
||||||
|
134.08 90.88 l
|
||||||
|
126.56 90.88 l
|
||||||
|
h
|
||||||
|
W*
|
||||||
|
n
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 126.56 86.88 Tm
|
||||||
|
[<03ED>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
195.36 73.28 m
|
||||||
|
202.88 73.28 l
|
||||||
|
202.88 90.88 l
|
||||||
|
195.36 90.88 l
|
||||||
|
h
|
||||||
|
W*
|
||||||
|
n
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 195.36 86.88 Tm
|
||||||
|
[<03EE>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
264.16 73.28 m
|
||||||
|
271.68 73.28 l
|
||||||
|
271.68 90.88 l
|
||||||
|
264.16 90.88 l
|
||||||
|
h
|
||||||
|
W*
|
||||||
|
n
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 264.16 86.88 Tm
|
||||||
|
[<03EF>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 126.56 108.48 Tm
|
||||||
|
[<03F0>-4174.0007<03F1>-4184.0007<03F2>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
126.56 113.92 m
|
||||||
|
134.08 113.92 l
|
||||||
|
134.08 131.84 l
|
||||||
|
126.56 131.84 l
|
||||||
|
h
|
||||||
|
W*
|
||||||
|
n
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 126.56 127.84 Tm
|
||||||
|
[<03F3>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
195.36 113.92 m
|
||||||
|
202.88 113.92 l
|
||||||
|
202.88 131.84 l
|
||||||
|
195.36 131.84 l
|
||||||
|
h
|
||||||
|
W*
|
||||||
|
n
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 195.36 127.84 Tm
|
||||||
|
[<03F4>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
264.16 113.92 m
|
||||||
|
271.68 113.92 l
|
||||||
|
271.68 131.84 l
|
||||||
|
264.16 131.84 l
|
||||||
|
h
|
||||||
|
W*
|
||||||
|
n
|
||||||
|
0 0 0 rg
|
||||||
|
BT
|
||||||
|
/F1 14.6921 Tf
|
||||||
|
1 0 0 -1 264.16 127.84 Tm
|
||||||
|
[<03F5>] TJ
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
136 73.28 m
|
||||||
|
137.28 73.28 l
|
||||||
|
137.28 91.04 l
|
||||||
|
136 91.04 l
|
||||||
|
h
|
||||||
|
204.8 73.28 m
|
||||||
|
206.08 73.28 l
|
||||||
|
206.08 91.04 l
|
||||||
|
204.8 91.04 l
|
||||||
|
h
|
||||||
|
67.2 72 m
|
||||||
|
68.48 72 l
|
||||||
|
68.48 133.28 l
|
||||||
|
67.2 133.28 l
|
||||||
|
h
|
||||||
|
136 94.88 m
|
||||||
|
137.28 94.88 l
|
||||||
|
137.28 133.28 l
|
||||||
|
136 133.28 l
|
||||||
|
h
|
||||||
|
204.8 94.88 m
|
||||||
|
206.08 94.88 l
|
||||||
|
206.08 133.28 l
|
||||||
|
204.8 133.28 l
|
||||||
|
h
|
||||||
|
273.6 73.28 m
|
||||||
|
274.88 73.28 l
|
||||||
|
274.88 133.28 l
|
||||||
|
273.6 133.28 l
|
||||||
|
h
|
||||||
|
0 0 0 rg
|
||||||
|
f*
|
||||||
|
Q
|
||||||
|
q
|
||||||
|
.75 0 0 -.75 0 792 cm
|
||||||
|
68.48 72 m
|
||||||
|
274.88 72 l
|
||||||
|
274.88 73.28 l
|
||||||
|
68.48 73.28 l
|
||||||
|
h
|
||||||
|
68.48 91.04 m
|
||||||
|
273.6 91.04 l
|
||||||
|
273.6 92.32 l
|
||||||
|
68.48 92.32 l
|
||||||
|
h
|
||||||
|
68.48 93.6 m
|
||||||
|
273.6 93.6 l
|
||||||
|
273.6 94.88 l
|
||||||
|
68.48 94.88 l
|
||||||
|
h
|
||||||
|
68.48 112.64 m
|
||||||
|
274.88 112.64 l
|
||||||
|
274.88 113.92 l
|
||||||
|
68.48 113.92 l
|
||||||
|
h
|
||||||
|
68.48 132 m
|
||||||
|
274.88 132 l
|
||||||
|
274.88 133.28 l
|
||||||
|
68.48 133.28 l
|
||||||
|
h
|
||||||
|
0 0 0 rg
|
||||||
|
f*
|
||||||
|
Q
|
||||||
|
Q
|
||||||
|
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
xref
|
||||||
|
0 13
|
||||||
|
0000000000 65536 f
|
||||||
|
0000000016 00000 n
|
||||||
|
0000000062 00000 n
|
||||||
|
0000000114 00000 n
|
||||||
|
0000000259 00000 n
|
||||||
|
0000000286 00000 n
|
||||||
|
0000000310 00000 n
|
||||||
|
0000000346 00000 n
|
||||||
|
0000000366 00000 n
|
||||||
|
0000000510 00000 n
|
||||||
|
0000001010 00000 n
|
||||||
|
0000001421 00000 n
|
||||||
|
0000001576 00000 n
|
||||||
|
|
||||||
|
trailer
|
||||||
|
<</Size 13/Info 11 0 R/Root 1 0 R>>
|
||||||
|
startxref
|
||||||
|
3449
|
||||||
|
%%EOF
|
|
@ -0,0 +1,38 @@
|
||||||
|
from nose.tools import assert_equal
|
||||||
|
|
||||||
|
from pdfminer.converter import PDFLayoutAnalyzer
|
||||||
|
from pdfminer.layout import LTContainer
|
||||||
|
from pdfminer.pdfinterp import PDFGraphicState
|
||||||
|
|
||||||
|
|
||||||
|
class TestPaintPath:
    """Unit tests for PDFLayoutAnalyzer.paint_path().

    Every test paints a single path into a fresh container and checks how
    many layout objects the analyzer added to that container.
    """

    def test_paint_path(self):
        """An 'ml' path along a horizontal line adds exactly one object."""
        path = [('m', 6, 7), ('l', 7, 7)]
        assert_equal(self._count_painted(path), 1)

    def test_paint_path_mlllh(self):
        """A single 'mlllh' rectangle path adds exactly one object."""
        path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)]
        assert_equal(self._count_painted(path), 1)

    def test_paint_path_multiple_mlllh(self):
        """Path from samples/contrib/issue-00369-excel.pdf"""
        path = [
            ('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',),
            ('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',),
            ('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',),
        ]
        # Three consecutive 'mlllh' groups must yield three separate objects.
        assert_equal(self._count_painted(path), 3)

    def _get_analyzer(self):
        """Return an analyzer built without a resource manager, using the
        identity transformation matrix so path coordinates stay unchanged.
        """
        analyzer = PDFLayoutAnalyzer(None)
        analyzer.set_ctm([1, 0, 0, 1, 0, 0])
        return analyzer

    def _count_painted(self, path):
        """Paint ``path`` into a fresh container and return how many objects
        the container holds afterwards.

        The path is painted with stroke, fill and evenodd all False and a
        default PDFGraphicState.
        """
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer([0, 100, 0, 100])
        analyzer.paint_path(PDFGraphicState(), False, False, False, path)
        # Use the container's own length rather than its private _objs list.
        return len(analyzer.cur_item)
|
|
@ -66,7 +66,7 @@ class TestPdf2Txt():
|
||||||
run('contrib/issue-00352-asw-oct96-p41.pdf')
|
run('contrib/issue-00352-asw-oct96-p41.pdf')
|
||||||
|
|
||||||
def test_scancode_patchelf(self):
|
def test_scancode_patchelf(self):
|
||||||
"""Regression test for # https://github.com/euske/pdfminer/issues/96"""
|
"""Regression test for https://github.com/euske/pdfminer/issues/96"""
|
||||||
run('scancode/patchelf.pdf')
|
run('scancode/patchelf.pdf')
|
||||||
|
|
||||||
def test_contrib_hash_two_complement(self):
|
def test_contrib_hash_two_complement(self):
|
||||||
|
@ -76,6 +76,12 @@ class TestPdf2Txt():
|
||||||
"""
|
"""
|
||||||
run('contrib/issue-00352-hash-twos-complement.pdf')
|
run('contrib/issue-00352-hash-twos-complement.pdf')
|
||||||
|
|
||||||
|
def test_contrib_excel(self):
|
||||||
|
"""Regression test for
|
||||||
|
https://github.com/pdfminer/pdfminer.six/issues/369
|
||||||
|
"""
|
||||||
|
run('contrib/issue-00369-excel.pdf', '-t html')
|
||||||
|
|
||||||
|
|
||||||
class TestDumpImages:
|
class TestDumpImages:
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue