2020-10-25 13:37:12 +00:00
|
|
|
import io
|
|
|
|
from tempfile import TemporaryFile
|
2020-07-11 15:34:38 +00:00
|
|
|
|
Fix to set color space from color convenience ops (#794)
Section 4.5 of the PDF reference says: "Color values are interpreted
according to the current color space, another parameter of the graphics
state. A PDF content stream first selects a color space by invoking the
CS operator (for the stroking color) or the cs operator (for the
non-stroking color). It then selects color values within that color
space with the SC operator (stroking) or the sc operator (nonstroking).
There are also convenience operators—G, g, RG, rg, K, and k—that select
both a color space and a color value within it in a single step."
Previously, those convenience operators did *not* set the color space.
This commit, following on filed issue #779, fixes this. It also adds a
test to demonstrate that, at least for the do_rg method, the fix works
as intended.
2022-08-18 18:38:51 +00:00
|
|
|
from helpers import absolute_sample_path
|
2020-10-25 13:37:12 +00:00
|
|
|
from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter
|
2021-07-27 16:27:32 +00:00
|
|
|
from pdfminer.high_level import extract_pages
|
Fix to set color space from color convenience ops (#794)
Section 4.5 of the PDF reference says: "Color values are interpreted
according to the current color space, another parameter of the graphics
state. A PDF content stream first selects a color space by invoking the
CS operator (for the stroking color) or the cs operator (for the
non-stroking color). It then selects color values within that color
space with the SC operator (stroking) or the sc operator (nonstroking).
There are also convenience operators—G, g, RG, rg, K, and k—that select
both a color space and a color value within it in a single step."
Previously, those convenience operators did *not* set the color space.
This commit, following on filed issue #779, fixes this. It also adds a
test to demonstrate that, at least for the do_rg method, the fix works
as intended.
2022-08-18 18:38:51 +00:00
|
|
|
from pdfminer.layout import LTChar, LTContainer, LTRect, LTLine, LTCurve
|
2020-07-11 15:34:38 +00:00
|
|
|
from pdfminer.pdfinterp import PDFGraphicState
|
|
|
|
|
|
|
|
|
2022-02-02 21:24:32 +00:00
|
|
|
class TestPaintPath:
    """Tests for PDFLayoutAnalyzer.paint_path.

    Each test feeds a hand-built list of path operators to ``paint_path`` and
    checks the objects that end up in the analyzer's current layout container.
    """

    def test_paint_path(self):
        """A single move-to/line-to pair adds exactly one object."""
        path = [("m", 6, 7), ("l", 7, 7)]
        container = self._paint(path, [0, 100, 0, 100])
        assert len(container._objs) == 1

    def test_paint_path_mlllh(self):
        """A closed four-point 'mlllh' path adds exactly one object."""
        path = [("m", 6, 7), ("l", 7, 7), ("l", 7, 91), ("l", 6, 91), ("h",)]
        container = self._paint(path, [0, 100, 0, 100])
        # Deliberately uses len() on the container itself rather than on _objs,
        # as the original test did.
        assert len(container) == 1

    def test_paint_path_multiple_mlllh(self):
        """Path from samples/contrib/issue-00369-excel.pdf"""
        path = [
            ("m", 6, 7),
            ("l", 7, 7),
            ("l", 7, 91),
            ("l", 6, 91),
            ("h",),
            ("m", 4, 7),
            ("l", 6, 7),
            ("l", 6, 91),
            ("l", 4, 91),
            ("h",),
            ("m", 67, 2),
            ("l", 68, 2),
            ("l", 68, 3),
            ("l", 67, 3),
            ("h",),
        ]
        container = self._paint(path, [0, 100, 0, 100])
        # Three subpaths are expected to produce three objects.
        assert len(container._objs) == 3

    def test_paint_path_quadrilaterals(self):
        """via https://github.com/pdfminer/pdfminer.six/issues/473"""

        def get_types(path):
            # Exact classes of the painted objects, in insertion order.
            painted = self._paint(path, [0, 1000, 0, 1000])._objs
            return [type(obj) for obj in painted]

        # Standard rect
        assert get_types(
            [
                ("m", 10, 90),
                ("l", 90, 90),
                ("l", 90, 10),
                ("l", 10, 10),
                ("h",),
            ]
        ) == [LTRect]

        # Same but mllll variation
        assert get_types(
            [
                ("m", 10, 90),
                ("l", 90, 90),
                ("l", 90, 10),
                ("l", 10, 10),
                ("l", 10, 90),
            ]
        ) == [LTRect]

        # Bowtie shape
        assert get_types(
            [
                ("m", 110, 90),
                ("l", 190, 10),
                ("l", 190, 90),
                ("l", 110, 10),
                ("h",),
            ]
        ) == [LTCurve]

        # Quadrilateral with one slanted side
        assert get_types(
            [
                ("m", 210, 90),
                ("l", 290, 60),
                ("l", 290, 10),
                ("l", 210, 10),
                ("h",),
            ]
        ) == [LTCurve]

        # Path with two rect subpaths
        assert get_types(
            [
                ("m", 310, 90),
                ("l", 350, 90),
                ("l", 350, 10),
                ("l", 310, 10),
                ("h",),
                ("m", 350, 90),
                ("l", 390, 90),
                ("l", 390, 10),
                ("l", 350, 10),
                ("h",),
            ]
        ) == [LTRect, LTRect]

        # Path with one rect subpath and one pentagon
        assert get_types(
            [
                ("m", 410, 90),
                ("l", 445, 90),
                ("l", 445, 10),
                ("l", 410, 10),
                ("h",),
                ("m", 455, 70),
                ("l", 475, 90),
                ("l", 490, 70),
                ("l", 490, 10),
                ("l", 455, 10),
                ("h",),
            ]
        ) == [LTRect, LTCurve]

        # Three types of simple lines
        assert get_types(
            [
                # Vertical line
                ("m", 10, 30),
                ("l", 10, 40),
                ("h",),
                # Horizontal line
                ("m", 10, 50),
                ("l", 70, 50),
                ("h",),
                # Diagonal line
                ("m", 10, 10),
                ("l", 30, 30),
                ("h",),
            ]
        ) == [LTLine, LTLine, LTLine]

        # Same as above, but 'ml' variation
        assert get_types(
            [
                # Vertical line
                ("m", 10, 30),
                ("l", 10, 40),
                # Horizontal line
                ("m", 10, 50),
                ("l", 70, 50),
                # Diagonal line
                ("m", 10, 10),
                ("l", 30, 30),
            ]
        ) == [LTLine, LTLine, LTLine]

        # There are six lines in this one-page PDF;
        # they all have shape 'ml' not 'mlh'
        # Resolve the sample through the shared helper so the test does not
        # depend on the directory pytest happens to be started from.
        ml_path = absolute_sample_path("contrib/pr-00530-ml-lines.pdf")
        ml_pdf = extract_pages(ml_path)
        ml_pdf_page = list(ml_pdf)[0]
        # Exact type match on purpose: subclasses of LTLine are not counted.
        assert sum(type(item) is LTLine for item in ml_pdf_page) == 6

    def _get_analyzer(self):
        """Return a PDFLayoutAnalyzer built with ``None`` and CTM [1, 0, 0, 1, 0, 0].

        That matrix is the identity transform in PDF matrix notation.
        """
        analyzer = PDFLayoutAnalyzer(None)
        analyzer.set_ctm([1, 0, 0, 1, 0, 0])
        return analyzer

    def _paint(self, path, bbox):
        """Paint *path* into a fresh LTContainer and return that container.

        :param path: list of path-operator tuples, e.g. ``("m", x, y)``.
        :param bbox: four-element bounding box passed straight to LTContainer.
        :return: the container that was set as the analyzer's ``cur_item``.
        """
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer(bbox)
        # All three paint flags are passed as False, as in the original tests.
        analyzer.paint_path(PDFGraphicState(), False, False, False, path)
        return analyzer.cur_item

    def test_paint_path_beziers(self):
        """See section 4.4, table 4.9 of the PDF reference manual"""

        def parse(path):
            return self._paint(path, [0, 1000, 0, 1000])._objs

        # Every curve below starts and ends at the same two points, and only
        # those endpoints are expected in ``pts``.
        expected_pts = [
            (72.41, 433.89),
            (71.41, 434.89),
        ]

        # "c" operator
        c_path = [
            ("m", 72.41, 433.89),
            ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
        ]
        assert parse(c_path)[0].pts == expected_pts

        # "v" operator
        v_path = [("m", 72.41, 433.89), ("v", 71.96, 434.89, 71.41, 434.89)]
        assert parse(v_path)[0].pts == expected_pts

        # "y" operator
        y_path = [("m", 72.41, 433.89), ("y", 72.41, 434.45, 71.41, 434.89)]
        assert parse(y_path)[0].pts == expected_pts

    def test_paint_path_without_starting_m(self):
        """Paths that do not begin with an 'm' operator add no objects."""
        gs = PDFGraphicState()
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer([0, 100, 0, 100])
        paths = [[("h",)], [("l", 72.41, 433.89), ("l", 82.41, 433.89), ("h",)]]
        # Both paths are painted into the same container with the same state.
        for path in paths:
            analyzer.paint_path(gs, False, False, False, path)
        assert len(analyzer.cur_item._objs) == 0
|
|
|
|
|
2020-10-25 13:37:12 +00:00
|
|
|
|
Fix to set color space from color convenience ops (#794)
Section 4.5 of the PDF reference says: "Color values are interpreted
according to the current color space, another parameter of the graphics
state. A PDF content stream first selects a color space by invoking the
CS operator (for the stroking color) or the cs operator (for the
non-stroking color). It then selects color values within that color
space with the SC operator (stroking) or the sc operator (nonstroking).
There are also convenience operators—G, g, RG, rg, K, and k—that select
both a color space and a color value within it in a single step."
Previously, those convenience operators did *not* set the color space.
This commit, following on filed issue #779, fixes this. It also adds a
test to demonstrate that, at least for the do_rg method, the fix works
as intended.
2022-08-18 18:38:51 +00:00
|
|
|
def get_chars(el):
    """Yield every LTChar reachable from *el*.

    LTContainer instances are walked recursively, LTChar instances are
    yielded as-is, and any other element yields nothing.
    """
    if isinstance(el, LTContainer):
        for child in el:
            yield from get_chars(child)
        return
    if isinstance(el, LTChar):
        yield el
|
|
|
|
|
|
|
|
|
|
|
|
class TestColorSpace:
    """Checks that each character's non-stroking color fits its color space."""

    def test_do_rg(self):
        """Color values must have the shape implied by the color space name.

        Characters whose color space is none of the three names checked
        below are not asserted on.
        """
        sample = absolute_sample_path("contrib/issue-00352-hash-twos-complement.pdf")
        all_chars = (
            char for page in extract_pages(sample) for char in get_chars(page)
        )
        for char in all_chars:
            space_name = char.ncs.name
            ncolor = char.graphicstate.ncolor
            if space_name == "DeviceGray":
                # Gray is a single number.
                assert isinstance(ncolor, (float, int))
                continue
            if space_name == "DeviceRGB":
                assert len(ncolor) == 3
                continue
            if space_name == "DeviceCMYK":
                assert len(ncolor) == 4
|
|
|
|
|
|
|
|
|
2022-02-11 21:46:51 +00:00
|
|
|
class TestBinaryDetector:
    """Checks PDFConverter._is_binary_stream against several output objects."""

    def test_stringio(self):
        """An in-memory text buffer is not binary."""
        stream = io.StringIO()
        assert not PDFConverter._is_binary_stream(stream)

    def test_bytesio(self):
        """An in-memory bytes buffer is binary."""
        stream = io.BytesIO()
        assert PDFConverter._is_binary_stream(stream)

    def test_tmpfile(self):
        """A temporary file opened with mode "w" is not binary."""
        with TemporaryFile(mode="w") as text_file:
            assert not PDFConverter._is_binary_stream(text_file)

    def test_binary_tmpfile(self):
        """A temporary file opened with mode "wb" is binary."""
        with TemporaryFile(mode="wb") as binary_file:
            assert PDFConverter._is_binary_stream(binary_file)

    def test_non_file_like_object_defaults_to_binary(self):
        """A plain object that is not a stream is reported as binary."""
        not_a_stream = object()
        assert PDFConverter._is_binary_stream(not_a_stream)

    def test_textiowrapper(self):
        """A bare io.TextIOBase instance is not binary."""
        text_base = io.TextIOBase()
        assert not PDFConverter._is_binary_stream(text_base)
|