From 894dabf2648523f792cbbb3878dba82a4c570287 Mon Sep 17 00:00:00 2001
From: Pieter Marsman
Date: Mon, 21 Mar 2022 19:39:53 +0100
Subject: [PATCH] Log warning and continue gracefully if errors in cmap (#731)

* Log warning and continue gracefully if errors in cmap

* Fix nox testing

* Also log warning if cid range is larger than actual code

* Format with black

* Add docstring

* Add CHANGELOG.md

* Restore running cmapdb.py directly
---
Review notes (ignored by git am):
- Line breaks were restored from a whitespace-collapsed copy of this patch;
  hunk line counts were re-checked against the @@ headers and the diffstat.
- The warning emitted when the cid of a begincidrange entry fails the
  isinstance(cid, int) check now says "not an int" instead of "not a byte".

 CHANGELOG.md       |   2 +
 pdfminer/cmapdb.py | 103 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5c8fd3..07beb6d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- `IndexError` when handling invalid bfrange code map in
+  CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731))
 - `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
 - `TypeError` in encodingdb.py when name of unicode is not str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
 
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
index b84d86b..704a9d3 100644
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@@ -29,6 +29,7 @@ from typing import (
     Tuple,
     Union,
     cast,
+    Set,
 )
 
 from .encodingdb import name2unicode
@@ -285,6 +286,7 @@ class CMapParser(PSStackParser[PSKeyword]):
         self.cmap = cmap
         # some ToUnicode maps don't have "begincmap" keyword.
         self._in_cmap = True
+        self._warnings: Set[str] = set()
         return
 
     def run(self) -> None:
@@ -312,16 +314,22 @@ class CMapParser(PSStackParser[PSKeyword]):
     KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
 
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
+        """ToUnicode CMaps
+
+        See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
+        """
         if token is self.KEYWORD_BEGINCMAP:
             self._in_cmap = True
             self.popall()
             return
+
         elif token is self.KEYWORD_ENDCMAP:
             self._in_cmap = False
             return
+
         if not self._in_cmap:
             return
-        #
+
         if token is self.KEYWORD_DEF:
             try:
                 ((_, k), (_, v)) = self.pop(2)
@@ -350,33 +358,47 @@ class CMapParser(PSStackParser[PSKeyword]):
         if token is self.KEYWORD_BEGINCIDRANGE:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDCIDRANGE:
             objs = [obj for (__, obj) in self.popall()]
-            for (s, e, cid) in choplist(3, objs):
-                if (
-                    not isinstance(s, bytes)
-                    or not isinstance(e, bytes)
-                    or not isinstance(cid, int)
-                    or len(s) != len(e)
-                ):
+            for (start_byte, end_byte, cid) in choplist(3, objs):
+                if not isinstance(start_byte, bytes):
+                    self._warn_once("The start object of begincidrange is not a byte.")
                     continue
-                sprefix = s[:-4]
-                eprefix = e[:-4]
-                if sprefix != eprefix:
+                if not isinstance(end_byte, bytes):
+                    self._warn_once("The end object of begincidrange is not a byte.")
                     continue
-                svar = s[-4:]
-                evar = e[-4:]
-                s1 = nunpack(svar)
-                e1 = nunpack(evar)
+                if not isinstance(cid, int):
+                    self._warn_once("The cid object of begincidrange is not an int.")
+                    continue
+                if len(start_byte) != len(end_byte):
+                    self._warn_once(
+                        "The start and end byte of begincidrange have "
+                        "different lengths."
+                    )
+                    continue
+                start_prefix = start_byte[:-4]
+                end_prefix = end_byte[:-4]
+                if start_prefix != end_prefix:
+                    self._warn_once(
+                        "The prefix of the start and end byte of "
+                        "begincidrange are not the same."
+                    )
+                    continue
+                svar = start_byte[-4:]
+                evar = end_byte[-4:]
+                start = nunpack(svar)
+                end = nunpack(evar)
                 vlen = len(svar)
-                for i in range(e1 - s1 + 1):
-                    x = sprefix + struct.pack(">L", s1 + i)[-vlen:]
+                for i in range(end - start + 1):
+                    x = start_prefix + struct.pack(">L", start + i)[-vlen:]
                     self.cmap.add_cid2unichr(cid + i, x)
             return
 
         if token is self.KEYWORD_BEGINCIDCHAR:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDCIDCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
@@ -387,34 +409,44 @@ class CMapParser(PSStackParser[PSKeyword]):
         if token is self.KEYWORD_BEGINBFRANGE:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDBFRANGE:
             objs = [obj for (__, obj) in self.popall()]
-            for (s, e, code) in choplist(3, objs):
-                if (
-                    not isinstance(s, bytes)
-                    or not isinstance(e, bytes)
-                    or len(s) != len(e)
-                ):
+            for (start_byte, end_byte, code) in choplist(3, objs):
+                if not isinstance(start_byte, bytes):
+                    self._warn_once("The start object is not a byte.")
                     continue
-                s1 = nunpack(s)
-                e1 = nunpack(e)
+                if not isinstance(end_byte, bytes):
+                    self._warn_once("The end object is not a byte.")
+                    continue
+                if len(start_byte) != len(end_byte):
+                    self._warn_once("The start and end byte have different lengths.")
+                    continue
+                start = nunpack(start_byte)
+                end = nunpack(end_byte)
                 if isinstance(code, list):
-                    for i in range(e1 - s1 + 1):
-                        self.cmap.add_cid2unichr(s1 + i, code[i])
+                    if len(code) != end - start + 1:
+                        self._warn_once(
+                            "The difference between the start and end "
+                            "offsets does not match the code length."
+                        )
+                    for cid, unicode_value in zip(range(start, end + 1), code):
+                        self.cmap.add_cid2unichr(cid, unicode_value)
                 else:
                     assert isinstance(code, bytes)
                     var = code[-4:]
                     base = nunpack(var)
                     prefix = code[:-4]
                     vlen = len(var)
-                    for i in range(e1 - s1 + 1):
+                    for i in range(end - start + 1):
                         x = prefix + struct.pack(">L", base + i)[-vlen:]
-                        self.cmap.add_cid2unichr(s1 + i, x)
+                        self.cmap.add_cid2unichr(start + i, x)
             return
 
         if token is self.KEYWORD_BEGINBFCHAR:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDBFCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
@@ -425,12 +457,23 @@ class CMapParser(PSStackParser[PSKeyword]):
         if token is self.KEYWORD_BEGINNOTDEFRANGE:
             self.popall()
             return
+
         if token is self.KEYWORD_ENDNOTDEFRANGE:
             self.popall()
             return
 
         self.push((pos, token))
-        return
+
+    def _warn_once(self, msg: str) -> None:
+        """Warn once for each unique message"""
+        if msg not in self._warnings:
+            self._warnings.add(msg)
+            base_msg = (
+                "Ignoring (part of) ToUnicode map because the PDF data "
+                "does not conform to the format. This could result in "
+                "(cid) values in the output. "
+            )
+            log.warning(base_msg + msg)
 
 
 def main(argv: List[str]) -> None: