Merge pull request #283 from fakabbir/pdfstream-as-cmap

Pdfstream as cmap
pull/298/head
Tata Ganesh 2019-10-12 21:22:52 +05:30 committed by GitHub
commit f53fbd98b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 172 additions and 70 deletions

View File

@ -1,5 +1,3 @@
""" Adobe character mapping (CMap) support.
CMaps provide the mapping between character codes and Unicode
@ -40,8 +38,6 @@ class CMapError(Exception):
pass
## CMapBase
##
class CMapBase(object):
debug = 0
@ -67,8 +63,6 @@ class CMapBase(object):
return
## CMap
##
class CMap(CMapBase):
def __init__(self, **kwargs):
@ -119,8 +113,6 @@ class CMap(CMapBase):
return
## IdentityCMap
##
class IdentityCMap(CMapBase):
def decode(self, code):
@ -131,8 +123,16 @@ class IdentityCMap(CMapBase):
return ()
## UnicodeMap
##
class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant that reads one unsigned byte per code."""

    def decode(self, code):
        """Unpack ``code`` into a tuple holding one integer per byte.

        An empty ``code`` yields an empty tuple.
        """
        length = len(code)
        if not length:
            return ()
        # '>%dB' expands to e.g. '>3B': ``length`` unsigned bytes.
        return struct.unpack('>%dB' % length, code)
class UnicodeMap(CMapBase):
def __init__(self, **kwargs):
@ -153,8 +153,6 @@ class UnicodeMap(CMapBase):
return
## FileCMap
##
class FileCMap(CMap):
def add_code2cid(self, code, cid):
@ -173,8 +171,6 @@ class FileCMap(CMap):
return
## FileUnicodeMap
##
class FileUnicodeMap(UnicodeMap):
def add_cid2unichr(self, cid, code):
@ -192,8 +188,6 @@ class FileUnicodeMap(UnicodeMap):
return
## PyCMap
##
class PyCMap(CMap):
def __init__(self, name, module):
@ -204,8 +198,6 @@ class PyCMap(CMap):
return
## PyUnicodeMap
##
class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical):
@ -218,8 +210,6 @@ class PyUnicodeMap(UnicodeMap):
return
## CMapDB
##
class CMapDB(object):
_cmap_cache = {}
@ -252,6 +242,10 @@ class CMapDB(object):
return IdentityCMap(WMode=0)
elif name == 'Identity-V':
return IdentityCMap(WMode=1)
elif name == 'OneByteIdentityH':
return IdentityCMapByte(WMode=0)
elif name == 'OneByteIdentityV':
return IdentityCMapByte(WMode=1)
try:
return klass._cmap_cache[name]
except KeyError:
@ -271,8 +265,6 @@ class CMapDB(object):
return umaps[vertical]
## CMapParser
##
class CMapParser(PSStackParser):
def __init__(self, cmap, fp):
@ -360,7 +352,6 @@ class CMapParser(PSStackParser):
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
#assert s1 <= e1, str((s1, e1))
for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
@ -387,7 +378,6 @@ class CMapParser(PSStackParser):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1, str((s1, e1))
if isinstance(code, list):
for i in range(e1-s1+1):
self.cmap.add_cid2unichr(s1+i, code[i])
@ -422,17 +412,16 @@ class CMapParser(PSStackParser):
return
# test
def main(argv):
    """Parse each file named in ``argv[1:]`` as a CMap and dump the result.

    Every file is opened in binary mode and fed through ``CMapParser`` into
    a fresh ``FileUnicodeMap``, whose ``dump()`` is then called.

    Returns None, so ``sys.exit(main(...))`` exits with status 0.
    """
    for fname in argv[1:]:
        cmap = FileUnicodeMap()
        # The context manager closes the file even when parsing raises.
        with open(fname, 'rb') as fp:
            CMapParser(cmap, fp).run()
        cmap.dump()
    return
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -14,6 +14,8 @@ from .encodingdb import EncodingDB
from .encodingdb import name2unicode
from .fontmetrics import FONT_METRICS
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import resolve1
from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
@ -33,7 +35,6 @@ from .utils import nunpack
log = logging.getLogger(__name__)
def get_widths(seq):
widths = {}
r = []
@ -52,10 +53,6 @@ def get_widths(seq):
widths[i] = w
r = []
return widths
#assert get_widths([1]) == {}
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
def get_widths2(seq):
widths = {}
@ -75,13 +72,8 @@ def get_widths2(seq):
widths[i] = (w, (vx, vy))
r = []
return widths
#assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
## FontMetricsDB
##
class FontMetricsDB(object):
@classmethod
@ -89,8 +81,6 @@ class FontMetricsDB(object):
return FONT_METRICS[fontname]
## Type1FontHeaderParser
##
class Type1FontHeaderParser(PSStackParser):
KEYWORD_BEGIN = KWD(b'begin')
@ -141,11 +131,16 @@ class Type1FontHeaderParser(PSStackParser):
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
#Note: DLIdent-* isn't found in the PDF Reference, but it is kept because
#it is harmless and may be a real encoding type. (induced from bug report/PR)
# Maps an encoding / CMap name to the identity CMap name used for lookup.
IDENTITY_ENCODER = {
    # Identity-* names map to themselves.
    'Identity-H': 'Identity-H',
    'Identity-V': 'Identity-V',
    # DLIdent-* names are aliases of the corresponding Identity-* names.
    'DLIdent-H': 'Identity-H',
    'DLIdent-V': 'Identity-V',
    # One-byte identity names map to themselves.
    'OneByteIdentityH': 'OneByteIdentityH',
    'OneByteIdentityV': 'OneByteIdentityV',
}
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
def getdict(data):
d = {}
fp = BytesIO(data)
@ -273,6 +268,7 @@ class CFFFont(object):
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
)
class INDEX(object):
def __init__(self, fp):
@ -373,9 +369,6 @@ class CFFFont(object):
assert False, str(('Unhandled', format))
else:
raise ValueError('unsupported charset format: %r' % format)
#print self.code2gid
#print self.name2gid
#assert 0
return
def getstr(self, sid):
@ -384,8 +377,6 @@ class CFFFont(object):
return self.string_index[sid-len(self.STANDARD_STRINGS)]
## TrueTypeFont
##
class TrueTypeFont(object):
class CMapNotFound(Exception):
@ -471,8 +462,6 @@ class TrueTypeFont(object):
return unicode_map
## Fonts
##
class PDFFontError(PDFException):
pass
@ -484,7 +473,6 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')
# PDFFont
class PDFFont(object):
def __init__(self, descriptor, widths, default_width=None):
@ -549,7 +537,6 @@ class PDFFont(object):
return sum(self.char_width(cid) for cid in self.decode(s))
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
def __init__(self, descriptor, widths, spec):
@ -586,7 +573,6 @@ class PDFSimpleFont(PDFFont):
raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
def __init__(self, rsrcmgr, spec):
@ -618,14 +604,12 @@ class PDFType1Font(PDFSimpleFont):
return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
def __repr__(self):
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
def __init__(self, rsrcmgr, spec):
@ -648,7 +632,6 @@ class PDFType3Font(PDFSimpleFont):
return '<PDFType3Font>'
# PDFCIDFont
class PDFCIDFont(PDFFont):
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
@ -661,18 +644,8 @@ class PDFCIDFont(PDFFont):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
self.cmap = CMap()
self.cmap = self.get_cmap_from_spec(spec, strict)
try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
@ -719,6 +692,36 @@ class PDFCIDFont(PDFFont):
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return
def get_cmap_from_spec(self, spec, strict):
    """Return the CMap selected by the ``Encoding`` entry of ``spec``.

    For certain PDFs, Encoding Type isn't mentioned as an attribute of
    Encoding but as an attribute of CMapName, where CMapName is an
    attribute of spec['Encoding'].
    The horizontal/vertical modes are mentioned with different names
    such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

    :param spec: font specification dictionary.
    :param strict: when true, a missing Encoding, a missing CMapName or
        an unknown CMap name raises PDFFontError instead of falling back.
    :return: the CMap found by CMapDB for the resolved name, or an empty
        CMap when the name cannot be resolved and ``strict`` is false.
    :raises PDFFontError: only when ``strict`` is true (see above).
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            # Encoding is itself a name object.
            cmap_name = literal_name(spec_encoding)
        else:
            # Encoding is a container (e.g. a stream) carrying CMapName.
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'
    # Normalise identity aliases (e.g. 'DLIdent-H' -> 'Identity-H'); any
    # other name is passed to CMapDB unchanged, so that named CMaps
    # outside IDENTITY_ENCODER are still looked up rather than silently
    # replaced by an empty CMap.
    # NOTE(review): cmap_name originates from the document being parsed;
    # confirm CMapDB.get_cmap resolves arbitrary names safely.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()
def __repr__(self):
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
@ -743,16 +746,14 @@ class PDFCIDFont(PDFFont):
except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
# main
def main(argv):
    """Load each file named in ``argv[1:]`` as a CFF font and print it.

    Every file is opened in binary mode and handed to ``CFFFont`` together
    with its file name; the resulting object is printed.

    Returns None, so ``sys.exit(main(...))`` exits with status 0.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when parsing raises.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return
if __name__ == '__main__':
sys.exit(main(sys.argv))

Binary file not shown.

111
tests/test_pdfencoding.py Normal file
View File

@ -0,0 +1,111 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nose, logging, os
from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdftypes import PDFStream
from pdfminer.psparser import PSLiteral
class TestPDFEncoding():
    """Check which CMap class PDFCIDFont picks for a given font spec.

    Each test builds a minimal spec whose 'Encoding' is either a name
    (PSLiteral) or a PDFStream carrying a 'CMapName' attribute, constructs
    a PDFCIDFont from it and asserts the type of ``font.cmap``.
    """

    # --- OneByteIdentity names, given through a stream's CMapName ---

    def test_cmapname_onebyteidentityV(self):
        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMapByte)

    def test_cmapname_onebyteidentityH(self):
        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMapByte)

    # --- Non-identity names, given through a stream's CMapName ---

    def test_cmapname_V(self):
        stream = PDFStream({'CMapName': PSLiteral('V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, CMap)

    def test_cmapname_H(self):
        stream = PDFStream({'CMapName': PSLiteral('H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, CMap)

    # --- Identity-H / Identity-V ---

    def test_encoding_identityH(self):
        spec = {'Encoding': PSLiteral('Identity-H')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV(self):
        spec = {'Encoding': PSLiteral('Identity-V')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityH_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName':PSLiteral('Identity-H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName':PSLiteral('Identity-V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityH_as_stream(self):
        stream = PDFStream({'CMapName':'Identity-H'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV_as_stream(self):
        stream = PDFStream({'CMapName':'Identity-V'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # --- DLIdent-H / DLIdent-V ---

    def test_encoding_DLIdentH(self):
        spec = {'Encoding': PSLiteral('DLIdent-H')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentV(self):
        spec = {'Encoding': PSLiteral('DLIdent-V')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName':PSLiteral('DLIdent-H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # This test was previously also named ..._DLIdentH_..., which replaced
    # the method above in the class body so the DLIdent-H stream case was
    # never run.
    def test_encoding_DLIdentV_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName':PSLiteral('DLIdent-V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH_as_stream(self):
        stream = PDFStream({'CMapName':'DLIdent-H'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentV_as_stream(self):
        stream = PDFStream({'CMapName':'DLIdent-V'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # --- Spec without any Encoding entry ---

    def test_font_without_spec(self):
        font = PDFCIDFont(None, {})
        assert isinstance(font.cmap, CMap)
if __name__ == '__main__':
nose.runmodule()

View File

@ -24,6 +24,7 @@ class TestDumpPDF():
run('../samples/','simple1')
run('../samples/','simple2')
run('../samples/','simple3')
run('../samples/','sampleOneByteIdentityEncode')
def test_2(self):
run('../samples/nonfree/','dmca')