Merge pull request #264 from fakabbir/pdfstream-as-cmap

Pdfstream as cmap
pull/287/head
Tata Ganesh 2019-07-31 22:20:37 +05:30 committed by GitHub
commit 48b25939c2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 111 additions and 13 deletions

View File

@ -16,6 +16,7 @@ from . import settings
from .psparser import PSLiteral from .psparser import PSLiteral
from .psparser import literal_name from .psparser import literal_name
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import resolve1 from .pdftypes import resolve1
from .pdftypes import int_value from .pdftypes import int_value
from .pdftypes import num_value from .pdftypes import num_value
@ -127,7 +128,7 @@ class Type1FontHeaderParser(PSStackParser):
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
IDENTITY_ENCODER = ('Identity-H', 'Identity-V')
## CFFFont ## CFFFont
## (Format specified in Adobe Technical Note: #5176 ## (Format specified in Adobe Technical Note: #5176
@ -648,18 +649,8 @@ class PDFCIDFont(PDFFont):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try: self.cmap = self.get_cmap_from_spec(spec, strict)
name = literal_name(spec['Encoding'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
self.cmap = CMap()
try: try:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
except KeyError: except KeyError:
@ -706,6 +697,36 @@ class PDFCIDFont(PDFFont):
PDFFont.__init__(self, descriptor, widths, default_width=default_width) PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return return
def get_cmap_from_spec(self, spec, strict):
    """Return the CMap selected by the ``Encoding`` entry of a font spec.

    For certain PDFs, the encoding name isn't given directly as the value
    of spec['Encoding'] but as the 'CMapName' attribute of that entry
    (for instance when the entry is a stream).
    The horizontal/vertical modes are mentioned with different names
    such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

    :param spec: font dictionary whose 'Encoding' entry is inspected.
    :param strict: if true, raise PDFFontError instead of falling back
        to a default when the encoding or its CMap cannot be determined.
    :return: the object returned by CMapDB.get_cmap() for the resolved
        name, or an empty CMap() when the lookup fails and strict is false.
    :raises PDFFontError: in strict mode, when 'Encoding' (or its
        'CMapName') is missing or when CMapDB does not know the name.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            # The entry itself is a name-like object (e.g. a PSLiteral).
            cmap_name = literal_name(spec_encoding)
        else:
            # Otherwise the name is expected under the 'CMapName' key.
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'

    # The resolved value may itself be a stream carrying a 'CMapName'.
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            # literal_name is used (rather than ``.name``) so this branch
            # treats the value the same way as the branches above.
            cmap_name = literal_name(cmap_name.get('CMapName'))
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'

    # Ask CMapDB for every name, not only the Identity ones, so that
    # named CMaps known to CMapDB are still loaded; fall back to an empty
    # CMap only when the name is unknown to CMapDB.
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()
def __repr__(self): def __repr__(self):
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding) return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

77
tests/test_pdfencoding.py Normal file
View File

@ -0,0 +1,77 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nose, logging, os
from pdfminer.cmapdb import IdentityCMap, CMap
from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdftypes import PDFStream
from pdfminer.psparser import PSLiteral
class TestPDFEncoding():
    """Checks which cmap class PDFCIDFont ends up with for a given spec."""

    # -- helpers (names deliberately avoid the word "test") ---------------

    def _cmap_for_encoding(self, encoding):
        # Build a font whose spec carries only an 'Encoding' entry.
        font = PDFCIDFont(None, {'Encoding': encoding})
        return font.cmap

    def _cmap_for_stream(self, cmap_name):
        # Wrap the name in a PDFStream under the 'CMapName' attribute.
        return self._cmap_for_encoding(PDFStream({'CMapName': cmap_name}, ''))

    # -- CMapName values that are not Identity-H / Identity-V -------------

    def test_cmapname_onebyteidentityV(self):
        cmap = self._cmap_for_stream(PSLiteral('OneByteIdentityV'))
        assert isinstance(cmap, CMap)

    def test_cmapname_onebyteidentityH(self):
        cmap = self._cmap_for_stream(PSLiteral('OneByteIdentityH'))
        assert isinstance(cmap, CMap)

    def test_cmapname_V(self):
        cmap = self._cmap_for_stream(PSLiteral('V'))
        assert isinstance(cmap, CMap)

    def test_cmapname_H(self):
        cmap = self._cmap_for_stream(PSLiteral('H'))
        assert isinstance(cmap, CMap)

    # -- Identity encodings given directly as a literal -------------------

    def test_encoding_identityH(self):
        cmap = self._cmap_for_encoding(PSLiteral('Identity-H'))
        assert isinstance(cmap, IdentityCMap)

    def test_encoding_identityV(self):
        cmap = self._cmap_for_encoding(PSLiteral('Identity-V'))
        assert isinstance(cmap, IdentityCMap)

    # -- Identity encodings given as a stream with a literal CMapName -----

    def test_encoding_identityH_as_PSLiteral_stream(self):
        cmap = self._cmap_for_stream(PSLiteral('Identity-H'))
        assert isinstance(cmap, IdentityCMap)

    def test_encoding_identityV_as_PSLiteral_stream(self):
        cmap = self._cmap_for_stream(PSLiteral('Identity-V'))
        assert isinstance(cmap, IdentityCMap)

    # -- Identity encodings given as a stream with a plain-string name ----

    def test_encoding_identityH_as_stream(self):
        cmap = self._cmap_for_stream('Identity-H')
        assert isinstance(cmap, IdentityCMap)

    def test_encoding_identityV_as_stream(self):
        cmap = self._cmap_for_stream('Identity-V')
        assert isinstance(cmap, IdentityCMap)

    # -- Spec with no 'Encoding' entry at all -----------------------------

    def test_font_without_spec(self):
        font = PDFCIDFont(None, {})
        assert isinstance(font.cmap, CMap)
# When this file is executed directly (rather than imported by a test
# runner), hand control to nose so it runs the tests in this module.
if __name__ == '__main__':
    nose.runmodule()