Always try to get CMap, even if name is not recognized (#438)
* Add trying to get cmap from pickle file. And cleaning up a bit. * Don't use keyword argument for dict.get * Add docs * Make _get_cmap_name static * Add test * Add CHANGELOG.md * Remove identity mappings from IDENTITY_ENCODER because that's now the default if the key is not in there * Add CJK characters to expected output of simple3.pdf * Fix line length * Add comment
pull/461/head
parent
3cebf5ef66
commit
4f65242750
|
@ -8,7 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Added
|
### Added
|
||||||
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
|
- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
- Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
|
||||||
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
|
- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
|
@ -3,7 +3,6 @@ import struct
|
||||||
import sys
|
import sys
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
from . import settings
|
from . import settings
|
||||||
from .cmapdb import CMap
|
from .cmapdb import CMap
|
||||||
from .cmapdb import CMapDB
|
from .cmapdb import CMapDB
|
||||||
|
@ -133,15 +132,11 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
|
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
|
||||||
None, '-')
|
None, '-')
|
||||||
|
|
||||||
# Note: DLIdent-* isn't found in the PDF Reference but has been kept as
|
# Mapping of cmap names. Original cmap name is kept if not in the mapping.
|
||||||
# it is harmless and has the possibility of being a type.
|
# (missing reference for why DLIdent is mapped to Identity)
|
||||||
# (induced from bug report/PR)
|
IDENTITY_ENCODER = {
|
||||||
IDENTITY_ENCODER = {'Identity-H': 'Identity-H',
|
|
||||||
'Identity-V': 'Identity-V',
|
|
||||||
'DLIdent-H': 'Identity-H',
|
'DLIdent-H': 'Identity-H',
|
||||||
'DLIdent-V': 'Identity-V',
|
'DLIdent-V': 'Identity-V',
|
||||||
'OneByteIdentityH': 'OneByteIdentityH',
|
|
||||||
'OneByteIdentityV': 'OneByteIdentityV',
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -725,13 +720,28 @@ class PDFCIDFont(PDFFont):
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cmap_from_spec(self, spec, strict):
|
def get_cmap_from_spec(self, spec, strict):
|
||||||
"""
|
"""Get cmap from font specification
|
||||||
|
|
||||||
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
||||||
Encoding but as an attribute of CMapName, where CMapName is an
|
Encoding but as an attribute of CMapName, where CMapName is an
|
||||||
attribute of spec['Encoding'].
|
attribute of spec['Encoding'].
|
||||||
The horizontal/vertical modes are mentioned with different names
|
The horizontal/vertical modes are mentioned with different names
|
||||||
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
|
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
|
||||||
"""
|
"""
|
||||||
|
cmap_name = self._get_cmap_name(spec, strict)
|
||||||
|
|
||||||
|
try:
|
||||||
|
return CMapDB.get_cmap(cmap_name)
|
||||||
|
except CMapDB.CMapNotFound as e:
|
||||||
|
if strict:
|
||||||
|
raise PDFFontError(e)
|
||||||
|
return CMap()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_cmap_name(spec, strict):
|
||||||
|
"""Get cmap name from font specification"""
|
||||||
|
cmap_name = 'unknown' # default value
|
||||||
|
|
||||||
try:
|
try:
|
||||||
spec_encoding = spec['Encoding']
|
spec_encoding = spec['Encoding']
|
||||||
if hasattr(spec_encoding, 'name'):
|
if hasattr(spec_encoding, 'name'):
|
||||||
|
@ -741,18 +751,16 @@ class PDFCIDFont(PDFFont):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if strict:
|
if strict:
|
||||||
raise PDFFontError('Encoding is unspecified')
|
raise PDFFontError('Encoding is unspecified')
|
||||||
cmap_name = 'unknown'
|
|
||||||
if type(cmap_name) is PDFStream:
|
if type(cmap_name) is PDFStream:
|
||||||
if 'CMapName' in cmap_name:
|
if 'CMapName' in cmap_name:
|
||||||
cmap_name = cmap_name.get('CMapName').name
|
cmap_name = cmap_name.get('CMapName').name
|
||||||
else:
|
else:
|
||||||
if strict:
|
if strict:
|
||||||
raise PDFFontError('CMapName unspecified for encoding')
|
raise PDFFontError('CMapName unspecified for encoding')
|
||||||
cmap_name = 'unknown'
|
|
||||||
if cmap_name in IDENTITY_ENCODER:
|
cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
|
||||||
return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name])
|
return cmap_name
|
||||||
else:
|
|
||||||
return CMap()
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
|
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
|
||||||
|
|
|
@ -28,7 +28,8 @@ test_strings = {
|
||||||
"H e l l o \n\nW o r l d\n\n"
|
"H e l l o \n\nW o r l d\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n\f",
|
"H e l l o \n\nW o r l d\n\n\f",
|
||||||
"simple2.pdf": "\f",
|
"simple2.pdf": "\f",
|
||||||
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
"simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
|
||||||
|
"World\n\nWorld\n\n\f",
|
||||||
"simple4.pdf": "Text1\nText2\nText3\n\n\f"
|
"simple4.pdf": "Text1\nText2\nText3\n\n\f"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
from nose.tools import assert_equal, assert_greater
|
||||||
|
|
||||||
|
from pdfminer.pdffont import PDFCIDFont
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager
|
||||||
|
from pdfminer.psparser import PSLiteral
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_cmap_from_pickle():
|
||||||
|
"""Test if cmap file is read from pdfminer/cmap
|
||||||
|
|
||||||
|
Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
|
||||||
|
"""
|
||||||
|
cmap_name = 'UniGB-UCS2-H'
|
||||||
|
spec = {'Encoding': PSLiteral(cmap_name)}
|
||||||
|
resource_manager = PDFResourceManager()
|
||||||
|
font = PDFCIDFont(resource_manager, spec)
|
||||||
|
|
||||||
|
cmap = font.get_cmap_from_spec(spec, False)
|
||||||
|
|
||||||
|
assert_equal(cmap.attrs.get('CMapName'), cmap_name)
|
||||||
|
assert_greater(len(cmap.code2cid), 0)
|
Loading…
Reference in New Issue