Fix converting path to multiple rectangles (#371)
* Fix converting path to multiple rectangles. For a path that consists of a series of rectangles (shape is 'mlllhmlllh...'), call paint_path again with each group of 5 path segments ('m', 'l', 'l', 'l', 'h'). The result is multiple rects instead of a single curve. Fixes #369.
* Reduce pdf size by removing font
* Add unittest for PDFLayoutAnalyzer.paint_path()
* Add line to CHANGELOG.md
* Add reference to pdf reference manual
* Cleanup function paint_path a bit
* Reduce line length of tests
* Reduce line length of tests
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/461/head
parent
6a9269b432
commit
60863cfd55
|
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
|
||||
|
||||
### Fixed
|
||||
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
|
||||
|
||||
|
|
|
@ -27,6 +27,9 @@ log = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||
|
||||
RECTS = re.compile('^(mlllh)+$')
|
||||
|
||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||
PDFTextDevice.__init__(self, rsrcmgr)
|
||||
self.pageno = pageno
|
||||
|
@ -72,6 +75,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
return
|
||||
|
||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
||||
shape = ''.join(x[0] for x in path)
|
||||
if shape == 'ml':
|
||||
# horizontal/vertical line
|
||||
|
@ -80,11 +84,11 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
if x0 == x1 or y0 == y1:
|
||||
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1),
|
||||
stroke, fill, evenodd, gstate.scolor,
|
||||
gstate.ncolor))
|
||||
return
|
||||
if shape == 'mlllh':
|
||||
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(line)
|
||||
|
||||
elif shape == 'mlllh':
|
||||
# rectangle
|
||||
(_, x0, y0) = path[0]
|
||||
(_, x1, y1) = path[1]
|
||||
|
@ -96,18 +100,22 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
||||
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
||||
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2),
|
||||
stroke, fill, evenodd, gstate.scolor,
|
||||
gstate.ncolor))
|
||||
return
|
||||
# other shapes
|
||||
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(rect)
|
||||
|
||||
elif self.RECTS.match(shape):
|
||||
for paths in zip(*(iter(path),) * 5):
|
||||
self.paint_path(gstate, stroke, fill, evenodd, list(paths))
|
||||
|
||||
else:
|
||||
pts = []
|
||||
for p in path:
|
||||
for i in range(1, len(p), 2):
|
||||
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
||||
self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||
gstate.scolor, gstate.ncolor))
|
||||
return
|
||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||
gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||
graphicstate):
|
||||
|
|
|
@ -0,0 +1,250 @@
|
|||
%PDF-1.7
|
||||
%µ¶
|
||||
|
||||
1 0 obj
|
||||
<</Pages 2 0 R/Type/Catalog>>
|
||||
endobj
|
||||
|
||||
2 0 obj
|
||||
<</Count 1/Kids[3 0 R]/Type/Pages>>
|
||||
endobj
|
||||
|
||||
3 0 obj
|
||||
<</Contents 12 0 R/CropBox[0 0 612 792]/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 10 0 R>>>>/Rotate 0/Type/Page>>
|
||||
endobj
|
||||
|
||||
4 0 obj
|
||||
(Identity)
|
||||
endobj
|
||||
|
||||
5 0 obj
|
||||
(Adobe)
|
||||
endobj
|
||||
|
||||
6 0 obj
|
||||
[-506 -268 506 952]
|
||||
endobj
|
||||
|
||||
7 0 obj
|
||||
506
|
||||
endobj
|
||||
|
||||
8 0 obj
|
||||
[1005 1005 506 1006 1006 506 1007 1007 506 1008 1008 506 1009 1009 506 1010 1010 506 1011 1011 506 1012 1012 506 1013 1013 506]
|
||||
endobj
|
||||
|
||||
9 0 obj
|
||||
<</Length 450>>
|
||||
stream
|
||||
/CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def /CMapName /Adobe-Identity-UCS def /CMapType 2 def 1 begincodespacerange <0000> <FFFF> endcodespacerange 9 beginbfchar <03ED> <0031> <03EE> <0032> <03EF> <0033> <03F0> <0034> <03F1> <0035> <03F2> <0036> <03F3> <0037> <03F4> <0038> <03F5> <0039> endbfchar endcmap CMapName currentdict /CMap defineresource pop end end
|
||||
endstream
|
||||
endobj
|
||||
|
||||
10 0 obj
|
||||
<</BaseFont/CIDFont+F1/DescendantFonts[<</BaseFont/CIDFont+F1/CIDSystemInfo<</Ordering 4 0 R/Registry 5 0 R/Supplement 0>>/CIDToGIDMap/Identity/FontDescriptor<</Ascent 952/CapHeight 631/Descent -268/Flags 6/FontBBox 6 0 R/FontName/CIDFont+F1/ItalicAngle 0/StemV 7 0 R/Type/FontDescriptor>>/Subtype/CIDFontType2/Type/Font/W 8 0 R>>]/Encoding/Identity-H/Subtype/Type0/ToUnicode 9 0 R/Type/Font>>
|
||||
endobj
|
||||
|
||||
11 0 obj
|
||||
<</Author(IEUser)/CreationDate(D:20200205092701-08'00')/ModDate(D:20200205092701-08'00')/Producer(Microsoft: Print To PDF)/Title(Book1)>>
|
||||
endobj
|
||||
|
||||
12 0 obj
|
||||
<</Length 1821>>
|
||||
stream
|
||||
q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
126.56 73.28 m
|
||||
134.08 73.28 l
|
||||
134.08 90.88 l
|
||||
126.56 90.88 l
|
||||
h
|
||||
W*
|
||||
n
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 126.56 86.88 Tm
|
||||
[<03ED>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
195.36 73.28 m
|
||||
202.88 73.28 l
|
||||
202.88 90.88 l
|
||||
195.36 90.88 l
|
||||
h
|
||||
W*
|
||||
n
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 195.36 86.88 Tm
|
||||
[<03EE>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
264.16 73.28 m
|
||||
271.68 73.28 l
|
||||
271.68 90.88 l
|
||||
264.16 90.88 l
|
||||
h
|
||||
W*
|
||||
n
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 264.16 86.88 Tm
|
||||
[<03EF>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 126.56 108.48 Tm
|
||||
[<03F0>-4174.0007<03F1>-4184.0007<03F2>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
126.56 113.92 m
|
||||
134.08 113.92 l
|
||||
134.08 131.84 l
|
||||
126.56 131.84 l
|
||||
h
|
||||
W*
|
||||
n
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 126.56 127.84 Tm
|
||||
[<03F3>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
195.36 113.92 m
|
||||
202.88 113.92 l
|
||||
202.88 131.84 l
|
||||
195.36 131.84 l
|
||||
h
|
||||
W*
|
||||
n
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 195.36 127.84 Tm
|
||||
[<03F4>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
264.16 113.92 m
|
||||
271.68 113.92 l
|
||||
271.68 131.84 l
|
||||
264.16 131.84 l
|
||||
h
|
||||
W*
|
||||
n
|
||||
0 0 0 rg
|
||||
BT
|
||||
/F1 14.6921 Tf
|
||||
1 0 0 -1 264.16 127.84 Tm
|
||||
[<03F5>] TJ
|
||||
ET
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
136 73.28 m
|
||||
137.28 73.28 l
|
||||
137.28 91.04 l
|
||||
136 91.04 l
|
||||
h
|
||||
204.8 73.28 m
|
||||
206.08 73.28 l
|
||||
206.08 91.04 l
|
||||
204.8 91.04 l
|
||||
h
|
||||
67.2 72 m
|
||||
68.48 72 l
|
||||
68.48 133.28 l
|
||||
67.2 133.28 l
|
||||
h
|
||||
136 94.88 m
|
||||
137.28 94.88 l
|
||||
137.28 133.28 l
|
||||
136 133.28 l
|
||||
h
|
||||
204.8 94.88 m
|
||||
206.08 94.88 l
|
||||
206.08 133.28 l
|
||||
204.8 133.28 l
|
||||
h
|
||||
273.6 73.28 m
|
||||
274.88 73.28 l
|
||||
274.88 133.28 l
|
||||
273.6 133.28 l
|
||||
h
|
||||
0 0 0 rg
|
||||
f*
|
||||
Q
|
||||
q
|
||||
.75 0 0 -.75 0 792 cm
|
||||
68.48 72 m
|
||||
274.88 72 l
|
||||
274.88 73.28 l
|
||||
68.48 73.28 l
|
||||
h
|
||||
68.48 91.04 m
|
||||
273.6 91.04 l
|
||||
273.6 92.32 l
|
||||
68.48 92.32 l
|
||||
h
|
||||
68.48 93.6 m
|
||||
273.6 93.6 l
|
||||
273.6 94.88 l
|
||||
68.48 94.88 l
|
||||
h
|
||||
68.48 112.64 m
|
||||
274.88 112.64 l
|
||||
274.88 113.92 l
|
||||
68.48 113.92 l
|
||||
h
|
||||
68.48 132 m
|
||||
274.88 132 l
|
||||
274.88 133.28 l
|
||||
68.48 133.28 l
|
||||
h
|
||||
0 0 0 rg
|
||||
f*
|
||||
Q
|
||||
Q
|
||||
|
||||
endstream
|
||||
endobj
|
||||
|
||||
xref
|
||||
0 13
|
||||
0000000000 65536 f
|
||||
0000000016 00000 n
|
||||
0000000062 00000 n
|
||||
0000000114 00000 n
|
||||
0000000259 00000 n
|
||||
0000000286 00000 n
|
||||
0000000310 00000 n
|
||||
0000000346 00000 n
|
||||
0000000366 00000 n
|
||||
0000000510 00000 n
|
||||
0000001010 00000 n
|
||||
0000001421 00000 n
|
||||
0000001576 00000 n
|
||||
|
||||
trailer
|
||||
<</Size 13/Info 11 0 R/Root 1 0 R>>
|
||||
startxref
|
||||
3449
|
||||
%%EOF
|
|
@ -0,0 +1,38 @@
|
|||
from nose.tools import assert_equal
|
||||
|
||||
from pdfminer.converter import PDFLayoutAnalyzer
|
||||
from pdfminer.layout import LTContainer
|
||||
from pdfminer.pdfinterp import PDFGraphicState
|
||||
|
||||
|
||||
class TestPaintPath():
    """Unit tests for PDFLayoutAnalyzer.paint_path().

    Each test paints one path into a fresh container and checks how many
    layout objects were added to it.
    """

    def test_paint_path(self):
        """A single horizontal 'ml' segment adds exactly one object."""
        path = [('m', 6, 7), ('l', 7, 7)]
        assert_equal(self._count_painted(path), 1)

    def test_paint_path_mlllh(self):
        """A single axis-aligned 'mlllh' rectangle adds exactly one object."""
        path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)]
        assert_equal(self._count_painted(path), 1)

    def test_paint_path_multiple_mlllh(self):
        """Path from samples/contrib/issue-00369-excel.pdf"""
        path = [
            ('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',),
            ('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',),
            ('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',)
        ]
        # Three consecutive 'mlllh' groups must be painted one by one,
        # giving three objects rather than one merged shape.
        assert_equal(self._count_painted(path), 3)

    def _count_painted(self, path):
        """Paint ``path`` into a fresh container and return its object count.

        The path is painted with stroke, fill and evenodd all False and a
        default PDFGraphicState, matching what every test in this class
        needs.
        """
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer([0, 100, 0, 100])
        analyzer.paint_path(PDFGraphicState(), False, False, False, path)
        # Use the container's public length consistently instead of
        # reaching into its private ``_objs`` list in some tests only.
        return len(analyzer.cur_item)

    def _get_analyzer(self):
        """Return a PDFLayoutAnalyzer with an identity CTM and no rsrcmgr."""
        analyzer = PDFLayoutAnalyzer(None)
        # Identity matrix: path coordinates pass through unchanged.
        analyzer.set_ctm([1, 0, 0, 1, 0, 0])
        return analyzer
|
|
@ -66,7 +66,7 @@ class TestPdf2Txt():
|
|||
run('contrib/issue-00352-asw-oct96-p41.pdf')
|
||||
|
||||
def test_scancode_patchelf(self):
|
||||
"""Regression test for # https://github.com/euske/pdfminer/issues/96"""
|
||||
"""Regression test for https://github.com/euske/pdfminer/issues/96"""
|
||||
run('scancode/patchelf.pdf')
|
||||
|
||||
def test_contrib_hash_two_complement(self):
|
||||
|
@ -76,6 +76,12 @@ class TestPdf2Txt():
|
|||
"""
|
||||
run('contrib/issue-00352-hash-twos-complement.pdf')
|
||||
|
||||
def test_contrib_excel(self):
|
||||
"""Regression test for
|
||||
https://github.com/pdfminer/pdfminer.six/issues/369
|
||||
"""
|
||||
run('contrib/issue-00369-excel.pdf', '-t html')
|
||||
|
||||
|
||||
class TestDumpImages:
|
||||
|
||||
|
|
Loading…
Reference in New Issue