class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant that decodes input one byte at a time."""

    def decode(self, code):
        """Return the bytes of ``code`` as a tuple of unsigned integers.

        Each byte is unpacked as an unsigned 8-bit value (``B`` in
        ``struct`` notation). Empty input yields an empty tuple.
        """
        count = len(code)
        if not count:
            return ()
        return struct.unpack('>%dB' % count, code)
def main(argv):
    """Parse each CMap file named on the command line and dump it.

    ``argv[0]`` is the program name and is ignored; every remaining
    element is opened in binary mode, parsed into a fresh
    ``FileUnicodeMap`` and dumped.

    Returns ``None`` so that ``sys.exit(main(sys.argv))`` exits with
    status 0.
    """
    for fname in argv[1:]:
        cmap = FileUnicodeMap()
        # The context manager closes the file even if parsing raises;
        # the previous explicit close() was skipped on error.
        with open(fname, 'rb') as fp:
            CMapParser(cmap, fp).run()
        cmap.dump()
    return
# Maps every accepted spelling of an identity encoding name to the name
# that is handed to CMapDB.get_cmap().
# Note: DLIdent-* is not found in the PDF Reference. It is kept because it
# is harmless and is possibly just a typo seen in real files (inferred
# from a bug report/PR).
IDENTITY_ENCODER = {
    # Names that map to themselves.
    'Identity-H': 'Identity-H',
    'Identity-V': 'Identity-V',
    'OneByteIdentityH': 'OneByteIdentityH',
    'OneByteIdentityV': 'OneByteIdentityV',
    # Aliases treated as the two-byte identity encodings.
    'DLIdent-H': 'Identity-H',
    'DLIdent-V': 'Identity-V',
}
def get_cmap_from_spec(self, spec, strict):
    """Return the CMap selected by the ``Encoding`` entry of ``spec``.

    For certain PDFs the encoding is not given directly as the value of
    ``Encoding`` but as the ``CMapName`` attribute of the object stored
    under ``spec['Encoding']``. Both forms are accepted here.

    The identity encodings appear under several names ('Identity-H/V',
    'DLIdent-H/V', 'OneByteIdentityH/V'); these are normalised through
    ``IDENTITY_ENCODER``. Any other name is looked up in ``CMapDB``
    unchanged, so that non-identity CMaps are still loaded instead of
    being replaced by an empty ``CMap``.

    :param spec: font specification dictionary.
    :param strict: when true, a missing encoding, a missing ``CMapName``
        or an unknown CMap raises ``PDFFontError``; otherwise an empty
        ``CMap`` is returned for an unknown name.
    :return: the CMap object to use for decoding.
    :raises PDFFontError: in strict mode, as described above.
    """
    cmap_name = 'unknown'
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            # Encoding is itself a name literal.
            cmap_name = literal_name(spec_encoding)
        else:
            # Encoding is a container carrying the name under CMapName.
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')

    # NOTE(review): this branch is only reached when the value obtained
    # above is itself a stream; kept from the previous implementation.
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'

    # Translate identity aliases; leave every other name untouched.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()
PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMapByte) + + def test_cmapname_V(self): + stream = PDFStream({'CMapName': PSLiteral('V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_H(self): + stream = PDFStream({'CMapName': PSLiteral('H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_encoding_identityH(self): + spec = {'Encoding': PSLiteral('Identity-H')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV(self): + spec = {'Encoding': PSLiteral('Identity-V')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('Identity-H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('Identity-V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityH_as_stream(self): + stream = PDFStream({'CMapName':'Identity-H'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV_as_stream(self): + stream = PDFStream({'CMapName':'Identity-V'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH(self): + spec = {'Encoding': PSLiteral('DLIdent-H')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentV(self): + spec = {'Encoding': PSLiteral('DLIdent-V')} + font = 
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
    """A stream whose CMapName literal is DLIdent-H gives an IdentityCMap."""
    stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
    spec = {'Encoding': stream}
    font = PDFCIDFont(None, spec)
    assert isinstance(font.cmap, IdentityCMap)

# Previously this test reused the DLIdentH name above, which shadowed
# that method so the DLIdent-H stream case was never executed.
def test_encoding_DLIdentV_as_PSLiteral_stream(self):
    """A stream whose CMapName literal is DLIdent-V gives an IdentityCMap."""
    stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
    spec = {'Encoding': stream}
    font = PDFCIDFont(None, spec)
    assert isinstance(font.cmap, IdentityCMap)