Always try to get CMap, even if name is not recognized (#438)

* Try to get the cmap from the pickle file, and clean up a bit.

* Don't use keyword argument for dict.get

* Add docs

* Make _get_cmap_name static

* Add test

* Add CHANGELOG.md

* Remove identity mappings from IDENTITY_ENCODER because that's now the default if the key is not in there

* Add CJK characters to expected output of simple3.pdf

* Fix line length

* Add comment
pull/461/head
Pieter Marsman 2020-07-23 20:27:38 +02:00 committed by GitHub
parent 3cebf5ef66
commit 4f65242750
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 51 additions and 20 deletions

View File

@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added ### Added
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371)) - Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
### Fixed ## Fixed
- Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451)) - Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
### Changed ### Changed

View File

@ -3,7 +3,6 @@ import struct
import sys import sys
from io import BytesIO from io import BytesIO
from . import settings from . import settings
from .cmapdb import CMap from .cmapdb import CMap
from .cmapdb import CMapDB from .cmapdb import CMapDB
@ -133,16 +132,12 @@ class Type1FontHeaderParser(PSStackParser):
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
None, '-') None, '-')
# Note: DLIdent-* isn't found in PDF Reference but is been kept as # Mapping of cmap names. Original cmap name is kept if not in the mapping.
# it is harmless and have possibility of been a type. # (missing reference for why DLIdent is mapped to Identity)
# (induced from bug report/PR) IDENTITY_ENCODER = {
IDENTITY_ENCODER = {'Identity-H': 'Identity-H',
'Identity-V': 'Identity-V',
'DLIdent-H': 'Identity-H', 'DLIdent-H': 'Identity-H',
'DLIdent-V': 'Identity-V', 'DLIdent-V': 'Identity-V',
'OneByteIdentityH': 'OneByteIdentityH', }
'OneByteIdentityV': 'OneByteIdentityV',
}
def getdict(data): def getdict(data):
@ -725,13 +720,28 @@ class PDFCIDFont(PDFFont):
return return
def get_cmap_from_spec(self, spec, strict): def get_cmap_from_spec(self, spec, strict):
""" """Get cmap from font specification
For certain PDFs, Encoding Type isn't mentioned as an attribute of For certain PDFs, Encoding Type isn't mentioned as an attribute of
Encoding but as an attribute of CMapName, where CMapName is an Encoding but as an attribute of CMapName, where CMapName is an
attribute of spec['Encoding']. attribute of spec['Encoding'].
The horizontal/vertical modes are mentioned with different name The horizontal/vertical modes are mentioned with different name
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
""" """
cmap_name = self._get_cmap_name(spec, strict)
try:
return CMapDB.get_cmap(cmap_name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
return CMap()
@staticmethod
def _get_cmap_name(spec, strict):
"""Get cmap name from font specification"""
cmap_name = 'unknown' # default value
try: try:
spec_encoding = spec['Encoding'] spec_encoding = spec['Encoding']
if hasattr(spec_encoding, 'name'): if hasattr(spec_encoding, 'name'):
@ -741,18 +751,16 @@ class PDFCIDFont(PDFFont):
except KeyError: except KeyError:
if strict: if strict:
raise PDFFontError('Encoding is unspecified') raise PDFFontError('Encoding is unspecified')
cmap_name = 'unknown'
if type(cmap_name) is PDFStream: if type(cmap_name) is PDFStream:
if 'CMapName' in cmap_name: if 'CMapName' in cmap_name:
cmap_name = cmap_name.get('CMapName').name cmap_name = cmap_name.get('CMapName').name
else: else:
if strict: if strict:
raise PDFFontError('CMapName unspecified for encoding') raise PDFFontError('CMapName unspecified for encoding')
cmap_name = 'unknown'
if cmap_name in IDENTITY_ENCODER: cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name]) return cmap_name
else:
return CMap()
def __repr__(self): def __repr__(self):
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\ return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\

View File

@ -28,7 +28,8 @@ test_strings = {
"H e l l o \n\nW o r l d\n\n" "H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f", "H e l l o \n\nW o r l d\n\n\f",
"simple2.pdf": "\f", "simple2.pdf": "\f",
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f", "simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
"World\n\nWorld\n\n\f",
"simple4.pdf": "Text1\nText2\nText3\n\n\f" "simple4.pdf": "Text1\nText2\nText3\n\n\f"
} }

21
tests/test_pdffont.py Normal file
View File

@ -0,0 +1,21 @@
from nose.tools import assert_equal, assert_greater
from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.psparser import PSLiteral
def test_get_cmap_from_pickle():
    """Check that a CMap requested by name is read from pdfminer/cmap.

    Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
    """
    expected_name = 'UniGB-UCS2-H'
    # The encoding is given directly as a literal name in the font spec.
    font_spec = {'Encoding': PSLiteral(expected_name)}
    cid_font = PDFCIDFont(PDFResourceManager(), font_spec)

    # Second positional argument is `strict`; it is passed as False here.
    loaded_cmap = cid_font.get_cmap_from_spec(font_spec, False)

    # The returned CMap must carry the requested name and must contain at
    # least one code-to-CID mapping.
    assert_equal(loaded_cmap.attrs.get('CMapName'), expected_name)
    assert_greater(len(loaded_cmap.code2cid), 0)