commit
f53fbd98b1
|
@ -1,5 +1,3 @@
|
|||
|
||||
|
||||
""" Adobe character mapping (CMap) support.
|
||||
|
||||
CMaps provide the mapping between character codes and Unicode
|
||||
|
@ -40,8 +38,6 @@ class CMapError(Exception):
|
|||
pass
|
||||
|
||||
|
||||
## CMapBase
|
||||
##
|
||||
class CMapBase(object):
|
||||
|
||||
debug = 0
|
||||
|
@ -67,8 +63,6 @@ class CMapBase(object):
|
|||
return
|
||||
|
||||
|
||||
## CMap
|
||||
##
|
||||
class CMap(CMapBase):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
@ -119,8 +113,6 @@ class CMap(CMapBase):
|
|||
return
|
||||
|
||||
|
||||
## IdentityCMap
|
||||
##
|
||||
class IdentityCMap(CMapBase):
|
||||
|
||||
def decode(self, code):
|
||||
|
@ -131,8 +123,16 @@ class IdentityCMap(CMapBase):
|
|||
return ()
|
||||
|
||||
|
||||
## UnicodeMap
|
||||
##
|
||||
class IdentityCMapByte(IdentityCMap):
    """IdentityCMap variant that decodes input one byte at a time."""

    def decode(self, code):
        """Return every byte of `code` as a tuple of unsigned ints.

        An empty `code` yields an empty tuple.
        """
        length = len(code)
        if not length:
            return ()
        # '>%dB' unpacks `length` unsigned bytes, one value per byte.
        return struct.unpack('>%dB' % length, code)
|
||||
|
||||
|
||||
class UnicodeMap(CMapBase):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
@ -153,8 +153,6 @@ class UnicodeMap(CMapBase):
|
|||
return
|
||||
|
||||
|
||||
## FileCMap
|
||||
##
|
||||
class FileCMap(CMap):
|
||||
|
||||
def add_code2cid(self, code, cid):
|
||||
|
@ -173,8 +171,6 @@ class FileCMap(CMap):
|
|||
return
|
||||
|
||||
|
||||
## FileUnicodeMap
|
||||
##
|
||||
class FileUnicodeMap(UnicodeMap):
|
||||
|
||||
def add_cid2unichr(self, cid, code):
|
||||
|
@ -192,8 +188,6 @@ class FileUnicodeMap(UnicodeMap):
|
|||
return
|
||||
|
||||
|
||||
## PyCMap
|
||||
##
|
||||
class PyCMap(CMap):
|
||||
|
||||
def __init__(self, name, module):
|
||||
|
@ -204,8 +198,6 @@ class PyCMap(CMap):
|
|||
return
|
||||
|
||||
|
||||
## PyUnicodeMap
|
||||
##
|
||||
class PyUnicodeMap(UnicodeMap):
|
||||
|
||||
def __init__(self, name, module, vertical):
|
||||
|
@ -218,8 +210,6 @@ class PyUnicodeMap(UnicodeMap):
|
|||
return
|
||||
|
||||
|
||||
## CMapDB
|
||||
##
|
||||
class CMapDB(object):
|
||||
|
||||
_cmap_cache = {}
|
||||
|
@ -252,6 +242,10 @@ class CMapDB(object):
|
|||
return IdentityCMap(WMode=0)
|
||||
elif name == 'Identity-V':
|
||||
return IdentityCMap(WMode=1)
|
||||
elif name == 'OneByteIdentityH':
|
||||
return IdentityCMapByte(WMode=0)
|
||||
elif name == 'OneByteIdentityV':
|
||||
return IdentityCMapByte(WMode=1)
|
||||
try:
|
||||
return klass._cmap_cache[name]
|
||||
except KeyError:
|
||||
|
@ -271,8 +265,6 @@ class CMapDB(object):
|
|||
return umaps[vertical]
|
||||
|
||||
|
||||
## CMapParser
|
||||
##
|
||||
class CMapParser(PSStackParser):
|
||||
|
||||
def __init__(self, cmap, fp):
|
||||
|
@ -360,7 +352,6 @@ class CMapParser(PSStackParser):
|
|||
s1 = nunpack(svar)
|
||||
e1 = nunpack(evar)
|
||||
vlen = len(svar)
|
||||
#assert s1 <= e1, str((s1, e1))
|
||||
for i in range(e1-s1+1):
|
||||
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||
self.cmap.add_code2cid(x, cid+i)
|
||||
|
@ -387,7 +378,6 @@ class CMapParser(PSStackParser):
|
|||
continue
|
||||
s1 = nunpack(s)
|
||||
e1 = nunpack(e)
|
||||
#assert s1 <= e1, str((s1, e1))
|
||||
if isinstance(code, list):
|
||||
for i in range(e1-s1+1):
|
||||
self.cmap.add_cid2unichr(s1+i, code[i])
|
||||
|
@ -422,17 +412,16 @@ class CMapParser(PSStackParser):
|
|||
return
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
    """Parse each CMap file named in argv[1:] and dump the resulting map.

    :param argv: argument vector; argv[0] (the program name) is skipped.
    :return: None.
    """
    for fname in argv[1:]:
        # Use FileCMap() here instead to build a code-to-CID map.
        cmap = FileUnicodeMap()
        # The context manager closes the file even when parsing raises.
        with open(fname, 'rb') as fp:
            CMapParser(cmap, fp).run()
        cmap.dump()
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -14,6 +14,8 @@ from .encodingdb import EncodingDB
|
|||
from .encodingdb import name2unicode
|
||||
from .fontmetrics import FONT_METRICS
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import list_value
|
||||
|
@ -33,7 +35,6 @@ from .utils import nunpack
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_widths(seq):
|
||||
widths = {}
|
||||
r = []
|
||||
|
@ -52,10 +53,6 @@ def get_widths(seq):
|
|||
widths[i] = w
|
||||
r = []
|
||||
return widths
|
||||
#assert get_widths([1]) == {}
|
||||
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
||||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
||||
|
||||
|
||||
def get_widths2(seq):
|
||||
widths = {}
|
||||
|
@ -75,13 +72,8 @@ def get_widths2(seq):
|
|||
widths[i] = (w, (vx, vy))
|
||||
r = []
|
||||
return widths
|
||||
#assert get_widths2([1]) == {}
|
||||
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
|
||||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
|
||||
|
||||
|
||||
## FontMetricsDB
|
||||
##
|
||||
class FontMetricsDB(object):
|
||||
|
||||
@classmethod
|
||||
|
@ -89,8 +81,6 @@ class FontMetricsDB(object):
|
|||
return FONT_METRICS[fontname]
|
||||
|
||||
|
||||
## Type1FontHeaderParser
|
||||
##
|
||||
class Type1FontHeaderParser(PSStackParser):
|
||||
|
||||
KEYWORD_BEGIN = KWD(b'begin')
|
||||
|
@ -141,11 +131,16 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
|
||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||
|
||||
#Note: DLIdent-* isn't found in PDF Reference but has been kept as
|
||||
#it is harmless and has the possibility of being a type. (induced from bug report/PR)
|
||||
IDENTITY_ENCODER = {'Identity-H':'Identity-H',
|
||||
'Identity-V':'Identity-V',
|
||||
'DLIdent-H':'Identity-H',
|
||||
'DLIdent-V':'Identity-V',
|
||||
'OneByteIdentityH':'OneByteIdentityH',
|
||||
'OneByteIdentityV':'OneByteIdentityV',
|
||||
}
|
||||
|
||||
## CFFFont
|
||||
## (Format specified in Adobe Technical Note: #5176
|
||||
## "The Compact Font Format Specification")
|
||||
##
|
||||
def getdict(data):
|
||||
d = {}
|
||||
fp = BytesIO(data)
|
||||
|
@ -273,6 +268,7 @@ class CFFFont(object):
|
|||
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
||||
)
|
||||
|
||||
|
||||
class INDEX(object):
|
||||
|
||||
def __init__(self, fp):
|
||||
|
@ -373,9 +369,6 @@ class CFFFont(object):
|
|||
assert False, str(('Unhandled', format))
|
||||
else:
|
||||
raise ValueError('unsupported charset format: %r' % format)
|
||||
#print self.code2gid
|
||||
#print self.name2gid
|
||||
#assert 0
|
||||
return
|
||||
|
||||
def getstr(self, sid):
|
||||
|
@ -384,8 +377,6 @@ class CFFFont(object):
|
|||
return self.string_index[sid-len(self.STANDARD_STRINGS)]
|
||||
|
||||
|
||||
## TrueTypeFont
|
||||
##
|
||||
class TrueTypeFont(object):
|
||||
|
||||
class CMapNotFound(Exception):
|
||||
|
@ -471,8 +462,6 @@ class TrueTypeFont(object):
|
|||
return unicode_map
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
class PDFFontError(PDFException):
    """PDFException subclass used for font-related errors."""
|
||||
|
||||
|
@ -484,7 +473,6 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
|||
LITERAL_TYPE1C = LIT('Type1C')
|
||||
|
||||
|
||||
# PDFFont
|
||||
class PDFFont(object):
|
||||
|
||||
def __init__(self, descriptor, widths, default_width=None):
|
||||
|
@ -549,7 +537,6 @@ class PDFFont(object):
|
|||
return sum(self.char_width(cid) for cid in self.decode(s))
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
|
||||
|
||||
def __init__(self, descriptor, widths, spec):
|
||||
|
@ -586,7 +573,6 @@ class PDFSimpleFont(PDFFont):
|
|||
raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
|
@ -618,14 +604,12 @@ class PDFType1Font(PDFSimpleFont):
|
|||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
    """PDFType1Font subclass that only overrides the repr."""

    def __repr__(self):
        """Return a debug string naming the base font."""
        template = '<PDFTrueTypeFont: basefont=%r>'
        return template % self.basefont
|
||||
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
|
@ -648,7 +632,6 @@ class PDFType3Font(PDFSimpleFont):
|
|||
return '<PDFType3Font>'
|
||||
|
||||
|
||||
# PDFCIDFont
|
||||
class PDFCIDFont(PDFFont):
|
||||
|
||||
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
|
||||
|
@ -661,18 +644,8 @@ class PDFCIDFont(PDFFont):
|
|||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
|
||||
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
|
||||
try:
|
||||
name = literal_name(spec['Encoding'])
|
||||
except KeyError:
|
||||
if strict:
|
||||
raise PDFFontError('Encoding is unspecified')
|
||||
name = 'unknown'
|
||||
try:
|
||||
self.cmap = CMapDB.get_cmap(name)
|
||||
except CMapDB.CMapNotFound as e:
|
||||
if strict:
|
||||
raise PDFFontError(e)
|
||||
self.cmap = CMap()
|
||||
self.cmap = self.get_cmap_from_spec(spec, strict)
|
||||
|
||||
try:
|
||||
descriptor = dict_value(spec['FontDescriptor'])
|
||||
except KeyError:
|
||||
|
@ -719,6 +692,36 @@ class PDFCIDFont(PDFFont):
|
|||
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
|
||||
return
|
||||
|
||||
def get_cmap_from_spec(self, spec, strict):
    """Return the CMap selected by the font's Encoding entry.

    For certain PDFs, the encoding type isn't given directly as the
    name stored in spec['Encoding'] but as the CMapName attribute of
    the object stored there.  The horizontal/vertical identity modes
    also appear under several names ('DLIdent-H/V',
    'OneByteIdentityH/V', 'Identity-H/V'); those are normalized
    through IDENTITY_ENCODER.

    Every other name is looked up through CMapDB.get_cmap so that
    named, non-identity CMaps are still loaded; CMap() is returned
    only when that lookup raises CMapDB.CMapNotFound.

    :param spec: font dictionary; may contain an 'Encoding' key.
    :param strict: when true, raise PDFFontError instead of falling
        back to a default.
    :return: the CMap object to use for this font.
    :raises PDFFontError: in strict mode, when the encoding or its
        CMapName is missing, or the named CMap cannot be found.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            cmap_name = literal_name(spec_encoding)
        else:
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'
    # Map the identity aliases onto the names CMapDB.get_cmap knows.
    if cmap_name in IDENTITY_ENCODER:
        cmap_name = IDENTITY_ENCODER[cmap_name]
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()
|
||||
|
||||
def __repr__(self):
    """Return a debug string with the base font and the CID coding."""
    fields = (self.basefont, self.cidcoding)
    return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % fields
|
||||
|
||||
|
@ -743,16 +746,14 @@ class PDFCIDFont(PDFFont):
|
|||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
    """Build a CFFFont from each file named in argv[1:] and print it.

    :param argv: argument vector; argv[0] (the program name) is skipped.
    :return: None.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when CFFFont raises.
        with open(fname, 'rb') as fp:
            # Use TrueTypeFont(fname, fp) here to inspect a TrueType file.
            font = CFFFont(fname, fp)
            print(font)
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,111 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import nose, logging, os
|
||||
from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
|
||||
from pdfminer.pdffont import PDFCIDFont
|
||||
from pdfminer.pdftypes import PDFStream
|
||||
from pdfminer.psparser import PSLiteral
|
||||
|
||||
class TestPDFEncoding():
|
||||
|
||||
def test_cmapname_onebyteidentityV(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMapByte)
|
||||
|
||||
def test_cmapname_onebyteidentityH(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMapByte)
|
||||
|
||||
def test_cmapname_V(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, CMap)
|
||||
|
||||
def test_cmapname_H(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('H')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, CMap)
|
||||
|
||||
def test_encoding_identityH(self):
|
||||
spec = {'Encoding': PSLiteral('Identity-H')}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityV(self):
|
||||
spec = {'Encoding': PSLiteral('Identity-V')}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityH_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName':PSLiteral('Identity-H')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityV_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName':PSLiteral('Identity-V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityH_as_stream(self):
|
||||
stream = PDFStream({'CMapName':'Identity-H'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityV_as_stream(self):
|
||||
stream = PDFStream({'CMapName':'Identity-V'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH(self):
|
||||
spec = {'Encoding': PSLiteral('DLIdent-H')}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV(self):
|
||||
spec = {'Encoding': PSLiteral('DLIdent-V')}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName':PSLiteral('DLIdent-H')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName':PSLiteral('DLIdent-V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_stream(self):
|
||||
stream = PDFStream({'CMapName':'DLIdent-H'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV_as_stream(self):
|
||||
stream = PDFStream({'CMapName':'DLIdent-V'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_font_without_spec(self):
|
||||
font = PDFCIDFont(None, {})
|
||||
assert isinstance(font.cmap, CMap)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
nose.runmodule()
|
|
@ -24,6 +24,7 @@ class TestDumpPDF():
|
|||
run('../samples/','simple1')
|
||||
run('../samples/','simple2')
|
||||
run('../samples/','simple3')
|
||||
run('../samples/','sampleOneByteIdentityEncode')
|
||||
|
||||
def test_2(self):
|
||||
run('../samples/nonfree/','dmca')
|
||||
|
|
Loading…
Reference in New Issue