Storing Bezier path and dashing style of line in LTCurve (#801)
* Fixes #672 and #630 Add raw points to get full information from bezier segments and dashing style * Use intermediate variables for constructing transformed_path Co-authored-by: Pieter Marsman <pietermarsman@gmail.com> pull/825/merge
parent
ebf7bcdb98
commit
5114acdda6
|
@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
|
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
|
||||||
- Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790))
|
- Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790))
|
||||||
- Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829))
|
- Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829))
|
||||||
|
- Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
|
|
|
@ -138,6 +138,19 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
]
|
]
|
||||||
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
||||||
|
|
||||||
|
operators = [str(operation[0]) for operation in path]
|
||||||
|
transformed_points = [
|
||||||
|
[
|
||||||
|
apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
|
||||||
|
for operand1, operand2 in zip(operation[1::2], operation[2::2])
|
||||||
|
]
|
||||||
|
for operation in path
|
||||||
|
]
|
||||||
|
transformed_path = [
|
||||||
|
cast(PathSegment, (o, *p))
|
||||||
|
for o, p in zip(operators, transformed_points)
|
||||||
|
]
|
||||||
|
|
||||||
if shape in {"mlh", "ml"}:
|
if shape in {"mlh", "ml"}:
|
||||||
# single line segment
|
# single line segment
|
||||||
#
|
#
|
||||||
|
@ -152,6 +165,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
evenodd,
|
evenodd,
|
||||||
gstate.scolor,
|
gstate.scolor,
|
||||||
gstate.ncolor,
|
gstate.ncolor,
|
||||||
|
original_path=transformed_path,
|
||||||
|
dashing_style=gstate.dash,
|
||||||
)
|
)
|
||||||
self.cur_item.add(line)
|
self.cur_item.add(line)
|
||||||
|
|
||||||
|
@ -171,6 +186,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
evenodd,
|
evenodd,
|
||||||
gstate.scolor,
|
gstate.scolor,
|
||||||
gstate.ncolor,
|
gstate.ncolor,
|
||||||
|
transformed_path,
|
||||||
|
gstate.dash,
|
||||||
)
|
)
|
||||||
self.cur_item.add(rect)
|
self.cur_item.add(rect)
|
||||||
else:
|
else:
|
||||||
|
@ -182,9 +199,10 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
evenodd,
|
evenodd,
|
||||||
gstate.scolor,
|
gstate.scolor,
|
||||||
gstate.ncolor,
|
gstate.ncolor,
|
||||||
|
transformed_path,
|
||||||
|
gstate.dash,
|
||||||
)
|
)
|
||||||
self.cur_item.add(curve)
|
self.cur_item.add(curve)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
curve = LTCurve(
|
curve = LTCurve(
|
||||||
gstate.linewidth,
|
gstate.linewidth,
|
||||||
|
@ -194,6 +212,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
evenodd,
|
evenodd,
|
||||||
gstate.scolor,
|
gstate.scolor,
|
||||||
gstate.ncolor,
|
gstate.ncolor,
|
||||||
|
transformed_path,
|
||||||
|
gstate.dash,
|
||||||
)
|
)
|
||||||
self.cur_item.add(curve)
|
self.cur_item.add(curve)
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ from .pdffont import PDFFont
|
||||||
from .pdfinterp import Color
|
from .pdfinterp import Color
|
||||||
from .pdfinterp import PDFGraphicState
|
from .pdfinterp import PDFGraphicState
|
||||||
from .pdftypes import PDFStream
|
from .pdftypes import PDFStream
|
||||||
from .utils import INF
|
from .utils import INF, PathSegment
|
||||||
from .utils import LTComponentT
|
from .utils import LTComponentT
|
||||||
from .utils import Matrix
|
from .utils import Matrix
|
||||||
from .utils import Plane
|
from .utils import Plane
|
||||||
|
@ -210,7 +210,14 @@ class LTComponent(LTItem):
|
||||||
|
|
||||||
|
|
||||||
class LTCurve(LTComponent):
|
class LTCurve(LTComponent):
|
||||||
"""A generic Bezier curve"""
|
"""
|
||||||
|
A generic Bezier curve
|
||||||
|
|
||||||
|
The parameter `original_path` contains the original
|
||||||
|
pathing information from the pdf (e.g. for reconstructing Bezier Curves).
|
||||||
|
|
||||||
|
`dashing_style` contains the Dashing information if any.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -221,6 +228,8 @@ class LTCurve(LTComponent):
|
||||||
evenodd: bool = False,
|
evenodd: bool = False,
|
||||||
stroking_color: Optional[Color] = None,
|
stroking_color: Optional[Color] = None,
|
||||||
non_stroking_color: Optional[Color] = None,
|
non_stroking_color: Optional[Color] = None,
|
||||||
|
original_path: Optional[List[PathSegment]] = None,
|
||||||
|
dashing_style: Optional[Tuple[object, object]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
LTComponent.__init__(self, get_bound(pts))
|
LTComponent.__init__(self, get_bound(pts))
|
||||||
self.pts = pts
|
self.pts = pts
|
||||||
|
@ -230,6 +239,8 @@ class LTCurve(LTComponent):
|
||||||
self.evenodd = evenodd
|
self.evenodd = evenodd
|
||||||
self.stroking_color = stroking_color
|
self.stroking_color = stroking_color
|
||||||
self.non_stroking_color = non_stroking_color
|
self.non_stroking_color = non_stroking_color
|
||||||
|
self.original_path = original_path
|
||||||
|
self.dashing_style = dashing_style
|
||||||
|
|
||||||
def get_pts(self) -> str:
|
def get_pts(self) -> str:
|
||||||
return ",".join("%.3f,%.3f" % p for p in self.pts)
|
return ",".join("%.3f,%.3f" % p for p in self.pts)
|
||||||
|
@ -251,6 +262,8 @@ class LTLine(LTCurve):
|
||||||
evenodd: bool = False,
|
evenodd: bool = False,
|
||||||
stroking_color: Optional[Color] = None,
|
stroking_color: Optional[Color] = None,
|
||||||
non_stroking_color: Optional[Color] = None,
|
non_stroking_color: Optional[Color] = None,
|
||||||
|
original_path: Optional[List[PathSegment]] = None,
|
||||||
|
dashing_style: Optional[Tuple[object, object]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
LTCurve.__init__(
|
LTCurve.__init__(
|
||||||
self,
|
self,
|
||||||
|
@ -261,6 +274,8 @@ class LTLine(LTCurve):
|
||||||
evenodd,
|
evenodd,
|
||||||
stroking_color,
|
stroking_color,
|
||||||
non_stroking_color,
|
non_stroking_color,
|
||||||
|
original_path,
|
||||||
|
dashing_style,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -279,6 +294,8 @@ class LTRect(LTCurve):
|
||||||
evenodd: bool = False,
|
evenodd: bool = False,
|
||||||
stroking_color: Optional[Color] = None,
|
stroking_color: Optional[Color] = None,
|
||||||
non_stroking_color: Optional[Color] = None,
|
non_stroking_color: Optional[Color] = None,
|
||||||
|
original_path: Optional[List[PathSegment]] = None,
|
||||||
|
dashing_style: Optional[Tuple[object, object]] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
LTCurve.__init__(
|
LTCurve.__init__(
|
||||||
|
@ -290,6 +307,8 @@ class LTRect(LTCurve):
|
||||||
evenodd,
|
evenodd,
|
||||||
stroking_color,
|
stroking_color,
|
||||||
non_stroking_color,
|
non_stroking_color,
|
||||||
|
original_path,
|
||||||
|
dashing_style,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -216,6 +216,45 @@ class TestPaintPath:
|
||||||
(71.41, 434.89),
|
(71.41, 434.89),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def test_paint_path_beziers_check_raw(self):
|
||||||
|
"""See section 4.4, table 4.9 of the PDF reference manual"""
|
||||||
|
|
||||||
|
def parse(path):
|
||||||
|
analyzer = self._get_analyzer()
|
||||||
|
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||||
|
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||||
|
return analyzer.cur_item._objs
|
||||||
|
|
||||||
|
# "c" operator
|
||||||
|
assert parse(
|
||||||
|
[
|
||||||
|
("m", 72.41, 433.89),
|
||||||
|
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
|
||||||
|
]
|
||||||
|
)[0].original_path == [
|
||||||
|
("m", (72.41, 433.89)),
|
||||||
|
("c", (72.41, 434.45), (71.96, 434.89), (71.41, 434.89)),
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_paint_path_dashed(self):
|
||||||
|
"""See section 4.4, table 4.9 of the PDF reference manual"""
|
||||||
|
|
||||||
|
def parse(path):
|
||||||
|
analyzer = self._get_analyzer()
|
||||||
|
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||||
|
graphicstate = PDFGraphicState()
|
||||||
|
graphicstate.dash = ([1, 1], 0)
|
||||||
|
analyzer.paint_path(graphicstate, False, False, False, path)
|
||||||
|
return analyzer.cur_item._objs
|
||||||
|
|
||||||
|
# "c" operator
|
||||||
|
assert parse(
|
||||||
|
[
|
||||||
|
("m", 72.41, 433.89),
|
||||||
|
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
|
||||||
|
]
|
||||||
|
)[0].dashing_style == ([1, 1], 0)
|
||||||
|
|
||||||
def test_paint_path_without_starting_m(self):
|
def test_paint_path_without_starting_m(self):
|
||||||
gs = PDFGraphicState()
|
gs = PDFGraphicState()
|
||||||
analyzer = self._get_analyzer()
|
analyzer = self._get_analyzer()
|
||||||
|
|
Loading…
Reference in New Issue