diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c84e69..4833fcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 - Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
 
 ### Fixed
+- Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
 - Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
 
 ### Changed
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index 35c574f..e1f5a5d 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -3,7 +3,6 @@
 import struct
 import sys
 from io import BytesIO
-
 from . import settings
 from .cmapdb import CMap
 from .cmapdb import CMapDB
@@ -133,16 +132,12 @@ class Type1FontHeaderParser(PSStackParser):
 NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
            None, '-')
 
-# Note: DLIdent-* isn't found in PDF Reference but is been kept as
-# it is harmless and have possibility of been a type.
-# (induced from bug report/PR)
-IDENTITY_ENCODER = {'Identity-H': 'Identity-H',
-                    'Identity-V': 'Identity-V',
-                    'DLIdent-H': 'Identity-H',
-                    'DLIdent-V': 'Identity-V',
-                    'OneByteIdentityH': 'OneByteIdentityH',
-                    'OneByteIdentityV': 'OneByteIdentityV',
-                    }
+# Mapping of cmap names. Original cmap name is kept if not in the mapping.
+# (missing reference for why DLIdent is mapped to Identity)
+IDENTITY_ENCODER = {
+    'DLIdent-H': 'Identity-H',
+    'DLIdent-V': 'Identity-V',
+}
 
 
 def getdict(data):
@@ -725,13 +720,28 @@ class PDFCIDFont(PDFFont):
         return
 
     def get_cmap_from_spec(self, spec, strict):
-        """
+        """Get cmap from font specification
+
         For certain PDFs, Encoding Type isn't mentioned as an attribute of
         Encoding but as an attribute of CMapName, where CMapName is an
         attribute of spec['Encoding'].
         The horizontal/vertical modes are mentioned with different name
         such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
         """
+        cmap_name = self._get_cmap_name(spec, strict)
+
+        try:
+            return CMapDB.get_cmap(cmap_name)
+        except CMapDB.CMapNotFound as e:
+            if strict:
+                raise PDFFontError(e)
+            return CMap()
+
+    @staticmethod
+    def _get_cmap_name(spec, strict):
+        """Get cmap name from font specification"""
+        cmap_name = 'unknown'  # default value
+
         try:
             spec_encoding = spec['Encoding']
             if hasattr(spec_encoding, 'name'):
@@ -741,18 +751,17 @@ class PDFCIDFont(PDFFont):
         except KeyError:
             if strict:
                 raise PDFFontError('Encoding is unspecified')
-            cmap_name = 'unknown'
-        if type(cmap_name) is PDFStream:
+
+        if isinstance(cmap_name, PDFStream):
             if 'CMapName' in cmap_name:
                 cmap_name = cmap_name.get('CMapName').name
             else:
                 if strict:
                     raise PDFFontError('CMapName unspecified for encoding')
                 cmap_name = 'unknown'
-        if cmap_name in IDENTITY_ENCODER:
-            return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name])
-        else:
-            return CMap()
+
+        cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
+        return cmap_name
 
     def __repr__(self):
         return ''\
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
index 86788a0..47496e6 100644
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@@ -28,7 +28,8 @@ test_strings = {
                    "H e l l o \n\nW o r l d\n\n"
                    "H e l l o \n\nW o r l d\n\n\f",
     "simple2.pdf": "\f",
-    "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
+    "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
+                   "World\n\nWorld\n\n\f",
     "simple4.pdf": "Text1\nText2\nText3\n\n\f"
 }
 
diff --git a/tests/test_pdffont.py b/tests/test_pdffont.py
new file mode 100644
index 0000000..4044afd
--- /dev/null
+++ b/tests/test_pdffont.py
@@ -0,0 +1,21 @@
+from nose.tools import assert_equal, assert_greater
+
+from pdfminer.pdffont import PDFCIDFont
+from pdfminer.pdfinterp import PDFResourceManager
+from pdfminer.psparser import PSLiteral
+
+
+def test_get_cmap_from_pickle():
+    """Test if cmap file is read from pdfminer/cmap
+
+    Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
+    """
+    cmap_name = 'UniGB-UCS2-H'
+    spec = {'Encoding': PSLiteral(cmap_name)}
+    resource_manager = PDFResourceManager()
+    font = PDFCIDFont(resource_manager, spec)
+
+    cmap = font.get_cmap_from_spec(spec, False)
+
+    assert_equal(cmap.attrs.get('CMapName'), cmap_name)
+    assert_greater(len(cmap.code2cid), 0)