Log warning and continue gracefully if errors in cmap (#731)
* Log warning and continue gracefully if errors in cmap * Fix nox testing * Also log warning if cid range is larger than actual code * Format with black * Add docstring * Add CHANGELOG.md * Restore running cmapdb.py directly
pull/747/head
parent
13021c9875
commit
894dabf264
|
@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
|
- `IndexError` when handling invalid bfrange code map in
|
||||||
|
CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731))
|
||||||
- `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
|
- `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
|
||||||
- `TypeError` in encodingdb.py when name of unicode is not
|
- `TypeError` in encodingdb.py when name of unicode is not
|
||||||
str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
|
str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
|
||||||
|
|
|
@ -29,6 +29,7 @@ from typing import (
|
||||||
Tuple,
|
Tuple,
|
||||||
Union,
|
Union,
|
||||||
cast,
|
cast,
|
||||||
|
Set,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .encodingdb import name2unicode
|
from .encodingdb import name2unicode
|
||||||
|
@ -285,6 +286,7 @@ class CMapParser(PSStackParser[PSKeyword]):
|
||||||
self.cmap = cmap
|
self.cmap = cmap
|
||||||
# some ToUnicode maps don't have "begincmap" keyword.
|
# some ToUnicode maps don't have "begincmap" keyword.
|
||||||
self._in_cmap = True
|
self._in_cmap = True
|
||||||
|
self._warnings: Set[str] = set()
|
||||||
return
|
return
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
|
@ -312,16 +314,22 @@ class CMapParser(PSStackParser[PSKeyword]):
|
||||||
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
|
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
|
||||||
|
|
||||||
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
|
"""ToUnicode CMaps
|
||||||
|
|
||||||
|
See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
|
||||||
|
"""
|
||||||
if token is self.KEYWORD_BEGINCMAP:
|
if token is self.KEYWORD_BEGINCMAP:
|
||||||
self._in_cmap = True
|
self._in_cmap = True
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
elif token is self.KEYWORD_ENDCMAP:
|
elif token is self.KEYWORD_ENDCMAP:
|
||||||
self._in_cmap = False
|
self._in_cmap = False
|
||||||
return
|
return
|
||||||
|
|
||||||
if not self._in_cmap:
|
if not self._in_cmap:
|
||||||
return
|
return
|
||||||
#
|
|
||||||
if token is self.KEYWORD_DEF:
|
if token is self.KEYWORD_DEF:
|
||||||
try:
|
try:
|
||||||
((_, k), (_, v)) = self.pop(2)
|
((_, k), (_, v)) = self.pop(2)
|
||||||
|
@ -350,33 +358,47 @@ class CMapParser(PSStackParser[PSKeyword]):
|
||||||
if token is self.KEYWORD_BEGINCIDRANGE:
|
if token is self.KEYWORD_BEGINCIDRANGE:
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_ENDCIDRANGE:
|
if token is self.KEYWORD_ENDCIDRANGE:
|
||||||
objs = [obj for (__, obj) in self.popall()]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (s, e, cid) in choplist(3, objs):
|
for (start_byte, end_byte, cid) in choplist(3, objs):
|
||||||
if (
|
if not isinstance(start_byte, bytes):
|
||||||
not isinstance(s, bytes)
|
self._warn_once("The start object of begincidrange is not a byte.")
|
||||||
or not isinstance(e, bytes)
|
|
||||||
or not isinstance(cid, int)
|
|
||||||
or len(s) != len(e)
|
|
||||||
):
|
|
||||||
continue
|
continue
|
||||||
sprefix = s[:-4]
|
if not isinstance(end_byte, bytes):
|
||||||
eprefix = e[:-4]
|
self._warn_once("The end object of begincidrange is not a byte.")
|
||||||
if sprefix != eprefix:
|
|
||||||
continue
|
continue
|
||||||
svar = s[-4:]
|
if not isinstance(cid, int):
|
||||||
evar = e[-4:]
|
self._warn_once("The cid object of begincidrange is not a byte.")
|
||||||
s1 = nunpack(svar)
|
continue
|
||||||
e1 = nunpack(evar)
|
if len(start_byte) != len(end_byte):
|
||||||
|
self._warn_once(
|
||||||
|
"The start and end byte of begincidrange have "
|
||||||
|
"different lengths."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
start_prefix = start_byte[:-4]
|
||||||
|
end_prefix = end_byte[:-4]
|
||||||
|
if start_prefix != end_prefix:
|
||||||
|
self._warn_once(
|
||||||
|
"The prefix of the start and end byte of "
|
||||||
|
"begincidrange are not the same."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
svar = start_byte[-4:]
|
||||||
|
evar = end_byte[-4:]
|
||||||
|
start = nunpack(svar)
|
||||||
|
end = nunpack(evar)
|
||||||
vlen = len(svar)
|
vlen = len(svar)
|
||||||
for i in range(e1 - s1 + 1):
|
for i in range(end - start + 1):
|
||||||
x = sprefix + struct.pack(">L", s1 + i)[-vlen:]
|
x = start_prefix + struct.pack(">L", start + i)[-vlen:]
|
||||||
self.cmap.add_cid2unichr(cid + i, x)
|
self.cmap.add_cid2unichr(cid + i, x)
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_BEGINCIDCHAR:
|
if token is self.KEYWORD_BEGINCIDCHAR:
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_ENDCIDCHAR:
|
if token is self.KEYWORD_ENDCIDCHAR:
|
||||||
objs = [obj for (__, obj) in self.popall()]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (cid, code) in choplist(2, objs):
|
for (cid, code) in choplist(2, objs):
|
||||||
|
@ -387,34 +409,44 @@ class CMapParser(PSStackParser[PSKeyword]):
|
||||||
if token is self.KEYWORD_BEGINBFRANGE:
|
if token is self.KEYWORD_BEGINBFRANGE:
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_ENDBFRANGE:
|
if token is self.KEYWORD_ENDBFRANGE:
|
||||||
objs = [obj for (__, obj) in self.popall()]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (s, e, code) in choplist(3, objs):
|
for (start_byte, end_byte, code) in choplist(3, objs):
|
||||||
if (
|
if not isinstance(start_byte, bytes):
|
||||||
not isinstance(s, bytes)
|
self._warn_once("The start object is not a byte.")
|
||||||
or not isinstance(e, bytes)
|
|
||||||
or len(s) != len(e)
|
|
||||||
):
|
|
||||||
continue
|
continue
|
||||||
s1 = nunpack(s)
|
if not isinstance(end_byte, bytes):
|
||||||
e1 = nunpack(e)
|
self._warn_once("The end object is not a byte.")
|
||||||
|
continue
|
||||||
|
if len(start_byte) != len(end_byte):
|
||||||
|
self._warn_once("The start and end byte have different lengths.")
|
||||||
|
continue
|
||||||
|
start = nunpack(start_byte)
|
||||||
|
end = nunpack(end_byte)
|
||||||
if isinstance(code, list):
|
if isinstance(code, list):
|
||||||
for i in range(e1 - s1 + 1):
|
if len(code) != end - start + 1:
|
||||||
self.cmap.add_cid2unichr(s1 + i, code[i])
|
self._warn_once(
|
||||||
|
"The difference between the start and end "
|
||||||
|
"offsets does not match the code length."
|
||||||
|
)
|
||||||
|
for cid, unicode_value in zip(range(start, end + 1), code):
|
||||||
|
self.cmap.add_cid2unichr(cid, unicode_value)
|
||||||
else:
|
else:
|
||||||
assert isinstance(code, bytes)
|
assert isinstance(code, bytes)
|
||||||
var = code[-4:]
|
var = code[-4:]
|
||||||
base = nunpack(var)
|
base = nunpack(var)
|
||||||
prefix = code[:-4]
|
prefix = code[:-4]
|
||||||
vlen = len(var)
|
vlen = len(var)
|
||||||
for i in range(e1 - s1 + 1):
|
for i in range(end - start + 1):
|
||||||
x = prefix + struct.pack(">L", base + i)[-vlen:]
|
x = prefix + struct.pack(">L", base + i)[-vlen:]
|
||||||
self.cmap.add_cid2unichr(s1 + i, x)
|
self.cmap.add_cid2unichr(start + i, x)
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_BEGINBFCHAR:
|
if token is self.KEYWORD_BEGINBFCHAR:
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_ENDBFCHAR:
|
if token is self.KEYWORD_ENDBFCHAR:
|
||||||
objs = [obj for (__, obj) in self.popall()]
|
objs = [obj for (__, obj) in self.popall()]
|
||||||
for (cid, code) in choplist(2, objs):
|
for (cid, code) in choplist(2, objs):
|
||||||
|
@ -425,12 +457,23 @@ class CMapParser(PSStackParser[PSKeyword]):
|
||||||
if token is self.KEYWORD_BEGINNOTDEFRANGE:
|
if token is self.KEYWORD_BEGINNOTDEFRANGE:
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_ENDNOTDEFRANGE:
|
if token is self.KEYWORD_ENDNOTDEFRANGE:
|
||||||
self.popall()
|
self.popall()
|
||||||
return
|
return
|
||||||
|
|
||||||
self.push((pos, token))
|
self.push((pos, token))
|
||||||
return
|
|
||||||
|
def _warn_once(self, msg: str) -> None:
|
||||||
|
"""Warn once for each unique message"""
|
||||||
|
if msg not in self._warnings:
|
||||||
|
self._warnings.add(msg)
|
||||||
|
base_msg = (
|
||||||
|
"Ignoring (part of) ToUnicode map because the PDF data "
|
||||||
|
"does not conform to the format. This could result in "
|
||||||
|
"(cid) values in the output. "
|
||||||
|
)
|
||||||
|
log.warning(base_msg + msg)
|
||||||
|
|
||||||
|
|
||||||
def main(argv: List[str]) -> None:
|
def main(argv: List[str]) -> None:
|
||||||
|
|
Loading…
Reference in New Issue