Log warning and continue gracefully if errors in cmap (#731)

* Log warning and continue gracefully if errors in cmap

* Fix nox testing

* Also log warning if cid range is larger than actual code

* Format with black

* Add docstring

* Add CHANGELOG.md

* Restore running cmapdb.py directly
pull/747/head
Pieter Marsman 2022-03-21 19:39:53 +01:00 committed by GitHub
parent 13021c9875
commit 894dabf264
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 75 additions and 30 deletions

View File

@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- `IndexError` when handling invalid bfrange code map in
CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731))
- `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732)) - `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
- `TypeError` in encodingdb.py when name of unicode is not - `TypeError` in encodingdb.py when name of unicode is not
str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733)) str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))

View File

@ -29,6 +29,7 @@ from typing import (
Tuple, Tuple,
Union, Union,
cast, cast,
Set,
) )
from .encodingdb import name2unicode from .encodingdb import name2unicode
@ -285,6 +286,7 @@ class CMapParser(PSStackParser[PSKeyword]):
self.cmap = cmap self.cmap = cmap
# some ToUnicode maps don't have "begincmap" keyword. # some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True self._in_cmap = True
self._warnings: Set[str] = set()
return return
def run(self) -> None: def run(self) -> None:
@ -312,16 +314,22 @@ class CMapParser(PSStackParser[PSKeyword]):
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange") KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
def do_keyword(self, pos: int, token: PSKeyword) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None:
"""ToUnicode CMaps
See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
"""
if token is self.KEYWORD_BEGINCMAP: if token is self.KEYWORD_BEGINCMAP:
self._in_cmap = True self._in_cmap = True
self.popall() self.popall()
return return
elif token is self.KEYWORD_ENDCMAP: elif token is self.KEYWORD_ENDCMAP:
self._in_cmap = False self._in_cmap = False
return return
if not self._in_cmap: if not self._in_cmap:
return return
#
if token is self.KEYWORD_DEF: if token is self.KEYWORD_DEF:
try: try:
((_, k), (_, v)) = self.pop(2) ((_, k), (_, v)) = self.pop(2)
@ -350,33 +358,47 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_BEGINCIDRANGE: if token is self.KEYWORD_BEGINCIDRANGE:
self.popall() self.popall()
return return
if token is self.KEYWORD_ENDCIDRANGE: if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs): for (start_byte, end_byte, cid) in choplist(3, objs):
if ( if not isinstance(start_byte, bytes):
not isinstance(s, bytes) self._warn_once("The start object of begincidrange is not a byte.")
or not isinstance(e, bytes)
or not isinstance(cid, int)
or len(s) != len(e)
):
continue continue
sprefix = s[:-4] if not isinstance(end_byte, bytes):
eprefix = e[:-4] self._warn_once("The end object of begincidrange is not a byte.")
if sprefix != eprefix:
continue continue
svar = s[-4:] if not isinstance(cid, int):
evar = e[-4:] self._warn_once("The cid object of begincidrange is not a byte.")
s1 = nunpack(svar) continue
e1 = nunpack(evar) if len(start_byte) != len(end_byte):
self._warn_once(
"The start and end byte of begincidrange have "
"different lengths."
)
continue
start_prefix = start_byte[:-4]
end_prefix = end_byte[:-4]
if start_prefix != end_prefix:
self._warn_once(
"The prefix of the start and end byte of "
"begincidrange are not the same."
)
continue
svar = start_byte[-4:]
evar = end_byte[-4:]
start = nunpack(svar)
end = nunpack(evar)
vlen = len(svar) vlen = len(svar)
for i in range(e1 - s1 + 1): for i in range(end - start + 1):
x = sprefix + struct.pack(">L", s1 + i)[-vlen:] x = start_prefix + struct.pack(">L", start + i)[-vlen:]
self.cmap.add_cid2unichr(cid + i, x) self.cmap.add_cid2unichr(cid + i, x)
return return
if token is self.KEYWORD_BEGINCIDCHAR: if token is self.KEYWORD_BEGINCIDCHAR:
self.popall() self.popall()
return return
if token is self.KEYWORD_ENDCIDCHAR: if token is self.KEYWORD_ENDCIDCHAR:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
@ -387,34 +409,44 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_BEGINBFRANGE: if token is self.KEYWORD_BEGINBFRANGE:
self.popall() self.popall()
return return
if token is self.KEYWORD_ENDBFRANGE: if token is self.KEYWORD_ENDBFRANGE:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs): for (start_byte, end_byte, code) in choplist(3, objs):
if ( if not isinstance(start_byte, bytes):
not isinstance(s, bytes) self._warn_once("The start object is not a byte.")
or not isinstance(e, bytes)
or len(s) != len(e)
):
continue continue
s1 = nunpack(s) if not isinstance(end_byte, bytes):
e1 = nunpack(e) self._warn_once("The end object is not a byte.")
continue
if len(start_byte) != len(end_byte):
self._warn_once("The start and end byte have different lengths.")
continue
start = nunpack(start_byte)
end = nunpack(end_byte)
if isinstance(code, list): if isinstance(code, list):
for i in range(e1 - s1 + 1): if len(code) != end - start + 1:
self.cmap.add_cid2unichr(s1 + i, code[i]) self._warn_once(
"The difference between the start and end "
"offsets does not match the code length."
)
for cid, unicode_value in zip(range(start, end + 1), code):
self.cmap.add_cid2unichr(cid, unicode_value)
else: else:
assert isinstance(code, bytes) assert isinstance(code, bytes)
var = code[-4:] var = code[-4:]
base = nunpack(var) base = nunpack(var)
prefix = code[:-4] prefix = code[:-4]
vlen = len(var) vlen = len(var)
for i in range(e1 - s1 + 1): for i in range(end - start + 1):
x = prefix + struct.pack(">L", base + i)[-vlen:] x = prefix + struct.pack(">L", base + i)[-vlen:]
self.cmap.add_cid2unichr(s1 + i, x) self.cmap.add_cid2unichr(start + i, x)
return return
if token is self.KEYWORD_BEGINBFCHAR: if token is self.KEYWORD_BEGINBFCHAR:
self.popall() self.popall()
return return
if token is self.KEYWORD_ENDBFCHAR: if token is self.KEYWORD_ENDBFCHAR:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
@ -425,12 +457,23 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_BEGINNOTDEFRANGE: if token is self.KEYWORD_BEGINNOTDEFRANGE:
self.popall() self.popall()
return return
if token is self.KEYWORD_ENDNOTDEFRANGE: if token is self.KEYWORD_ENDNOTDEFRANGE:
self.popall() self.popall()
return return
self.push((pos, token)) self.push((pos, token))
return
def _warn_once(self, msg: str) -> None:
    """Log a ToUnicode-map warning, at most once per distinct message.

    Messages that were already reported are remembered in
    ``self._warnings`` and are silently skipped on later calls.

    :param msg: specific reason that is appended to the generic warning text.
    """
    # Guard clause: this exact message has been logged before.
    if msg in self._warnings:
        return
    self._warnings.add(msg)
    preamble = (
        "Ignoring (part of) ToUnicode map because the PDF data "
        "does not conform to the format. This could result in "
        "(cid) values in the output. "
    )
    log.warning(preamble + msg)
def main(argv: List[str]) -> None: def main(argv: List[str]) -> None: