Merge pull request #283 from fakabbir/pdfstream-as-cmap

Pdfstream as cmap
2019-10-12 21:22:52 +05:30 · 2019-10-12 21:22:52 +05:30 · f53fbd98b1
parent 42e2c8143b 7c03d96d25
commit f53fbd98b1
5 changed files with 172 additions and 70 deletions
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -1,5 +1,3 @@
-
-
 """ Adobe character mapping (CMap) support.

 CMaps provide the mapping between character codes and Unicode
@ -40,8 +38,6 @@ class CMapError(Exception):
    pass


-##  CMapBase
-##
 class CMapBase(object):

    debug = 0
@ -67,8 +63,6 @@ class CMapBase(object):
        return


-##  CMap
-##
 class CMap(CMapBase):

    def __init__(self, **kwargs):
@ -119,8 +113,6 @@ class CMap(CMapBase):
        return


-##  IdentityCMap
-##
 class IdentityCMap(CMapBase):

    def decode(self, code):
@ -131,8 +123,16 @@ class IdentityCMap(CMapBase):
            return ()


-##  UnicodeMap
-##
+class IdentityCMapByte(IdentityCMap):
+    """Identity CMap variant whose decode() reads one byte per code."""
+
+    def decode(self, code):
+        """Return every byte of *code* as an unsigned integer, in order.
+
+        An empty *code* yields an empty tuple.
+        """
+        length = len(code)
+        if not length:
+            return ()
+        # '>%dB' unpacks `length` unsigned 8-bit values.
+        return struct.unpack('>%dB' % length, code)
+
 class UnicodeMap(CMapBase):

    def __init__(self, **kwargs):
@ -153,8 +153,6 @@ class UnicodeMap(CMapBase):
        return


-##  FileCMap
-##
 class FileCMap(CMap):

    def add_code2cid(self, code, cid):
@ -173,8 +171,6 @@ class FileCMap(CMap):
        return


-##  FileUnicodeMap
-##
 class FileUnicodeMap(UnicodeMap):

    def add_cid2unichr(self, cid, code):
@ -192,8 +188,6 @@ class FileUnicodeMap(UnicodeMap):
        return


-##  PyCMap
-##
 class PyCMap(CMap):

    def __init__(self, name, module):
@ -204,8 +198,6 @@ class PyCMap(CMap):
        return


-##  PyUnicodeMap
-##
 class PyUnicodeMap(UnicodeMap):

    def __init__(self, name, module, vertical):
@ -218,8 +210,6 @@ class PyUnicodeMap(UnicodeMap):
        return


-##  CMapDB
-##
 class CMapDB(object):

    _cmap_cache = {}
@ -252,6 +242,10 @@ class CMapDB(object):
            return IdentityCMap(WMode=0)
        elif name == 'Identity-V':
            return IdentityCMap(WMode=1)
+        elif name == 'OneByteIdentityH':
+            return IdentityCMapByte(WMode=0)
+        elif name == 'OneByteIdentityV':
+            return IdentityCMapByte(WMode=1)
        try:
            return klass._cmap_cache[name]
        except KeyError:
@ -271,8 +265,6 @@ class CMapDB(object):
        return umaps[vertical]


-##  CMapParser
-##
 class CMapParser(PSStackParser):

    def __init__(self, cmap, fp):
@ -360,7 +352,6 @@ class CMapParser(PSStackParser):
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
-                #assert s1 <= e1, str((s1, e1))
                for i in range(e1-s1+1):
                    x = sprefix+struct.pack('>L', s1+i)[-vlen:]
                    self.cmap.add_code2cid(x, cid+i)
@ -387,7 +378,6 @@ class CMapParser(PSStackParser):
                        continue
                s1 = nunpack(s)
                e1 = nunpack(e)
-                #assert s1 <= e1, str((s1, e1))
                if isinstance(code, list):
                    for i in range(e1-s1+1):
                        self.cmap.add_cid2unichr(s1+i, code[i])
@ -422,17 +412,16 @@ class CMapParser(PSStackParser):
        return


-# test
 def main(argv):
    args = argv[1:]
    for fname in args:
        fp = open(fname, 'rb')
        cmap = FileUnicodeMap()
-        #cmap = FileCMap()
        CMapParser(cmap, fp).run()
        fp.close()
        cmap.dump()
    return

+
 if __name__ == '__main__':
    sys.exit(main(sys.argv))
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -14,6 +14,8 @@ from .encodingdb import EncodingDB
 from .encodingdb import name2unicode
 from .fontmetrics import FONT_METRICS
 from .pdftypes import PDFException
+from .pdftypes import PDFStream
+from .pdftypes import resolve1
 from .pdftypes import dict_value
 from .pdftypes import int_value
 from .pdftypes import list_value
@ -33,7 +35,6 @@ from .utils import nunpack

 log = logging.getLogger(__name__)

-
 def get_widths(seq):
    widths = {}
    r = []
@ -52,10 +53,6 @@ def get_widths(seq):
                    widths[i] = w
                r = []
    return widths
-#assert get_widths([1]) == {}
-#assert get_widths([1,2,3]) == {1:3, 2:3}
-#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
-

 def get_widths2(seq):
    widths = {}
@ -75,13 +72,8 @@ def get_widths2(seq):
                    widths[i] = (w, (vx, vy))
                r = []
    return widths
-#assert get_widths2([1]) == {}
-#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
-#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}


-##  FontMetricsDB
-##
 class FontMetricsDB(object):

    @classmethod
@ -89,8 +81,6 @@ class FontMetricsDB(object):
        return FONT_METRICS[fontname]


-##  Type1FontHeaderParser
-##
 class Type1FontHeaderParser(PSStackParser):

    KEYWORD_BEGIN = KWD(b'begin')
@ -141,11 +131,16 @@ class Type1FontHeaderParser(PSStackParser):

 NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')

+# Note: DLIdent-* isn't found in the PDF Reference but has been kept because
+# it is harmless and may possibly be a type in use (induced from a bug report/PR).
+IDENTITY_ENCODER = {'Identity-H':'Identity-H',
+                    'Identity-V':'Identity-V',
+                    'DLIdent-H':'Identity-H',
+                    'DLIdent-V':'Identity-V',
+                    'OneByteIdentityH':'OneByteIdentityH',
+                    'OneByteIdentityV':'OneByteIdentityV',
+                    }

-##  CFFFont
-##  (Format specified in Adobe Technical Note: #5176
-##   "The Compact Font Format Specification")
-##
 def getdict(data):
    d = {}
    fp = BytesIO(data)
@ -273,6 +268,7 @@ class CFFFont(object):
      'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
    )

+
    class INDEX(object):

        def __init__(self, fp):
@ -373,9 +369,6 @@ class CFFFont(object):
            assert False, str(('Unhandled', format))
        else:
            raise ValueError('unsupported charset format: %r' % format)
-        #print self.code2gid
-        #print self.name2gid
-        #assert 0
        return

    def getstr(self, sid):
@ -384,8 +377,6 @@ class CFFFont(object):
        return self.string_index[sid-len(self.STANDARD_STRINGS)]


-##  TrueTypeFont
-##
 class TrueTypeFont(object):

    class CMapNotFound(Exception):
@ -471,8 +462,6 @@ class TrueTypeFont(object):
        return unicode_map


-##  Fonts
-##
 class PDFFontError(PDFException):
    pass

@ -484,7 +473,6 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
 LITERAL_TYPE1C = LIT('Type1C')


-# PDFFont
 class PDFFont(object):

    def __init__(self, descriptor, widths, default_width=None):
@ -549,7 +537,6 @@ class PDFFont(object):
        return sum(self.char_width(cid) for cid in self.decode(s))


-# PDFSimpleFont
 class PDFSimpleFont(PDFFont):

    def __init__(self, descriptor, widths, spec):
@ -586,7 +573,6 @@ class PDFSimpleFont(PDFFont):
            raise PDFUnicodeNotDefined(None, cid)


-# PDFType1Font
 class PDFType1Font(PDFSimpleFont):

    def __init__(self, rsrcmgr, spec):
@ -618,14 +604,12 @@ class PDFType1Font(PDFSimpleFont):
        return '<PDFType1Font: basefont=%r>' % self.basefont


-# PDFTrueTypeFont
 class PDFTrueTypeFont(PDFType1Font):

    def __repr__(self):
        return '<PDFTrueTypeFont: basefont=%r>' % self.basefont


-# PDFType3Font
 class PDFType3Font(PDFSimpleFont):

    def __init__(self, rsrcmgr, spec):
@ -648,7 +632,6 @@ class PDFType3Font(PDFSimpleFont):
        return '<PDFType3Font>'


-# PDFCIDFont
 class PDFCIDFont(PDFFont):

    def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
@ -661,18 +644,8 @@ class PDFCIDFont(PDFFont):
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
                                    resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
-        try:
-            name = literal_name(spec['Encoding'])
-        except KeyError:
-            if strict:
-                raise PDFFontError('Encoding is unspecified')
-            name = 'unknown'
-        try:
-            self.cmap = CMapDB.get_cmap(name)
-        except CMapDB.CMapNotFound as e:
-            if strict:
-                raise PDFFontError(e)
-            self.cmap = CMap()
+        self.cmap = self.get_cmap_from_spec(spec, strict)
+
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
@ -719,6 +692,36 @@ class PDFCIDFont(PDFFont):
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)
        return

+    def get_cmap_from_spec(self, spec, strict):
+        """Return the CMap selected by the 'Encoding' entry of a font spec.
+
+        The Encoding entry is either a name (e.g. 'Identity-H') or a
+        stream whose dictionary carries the name under 'CMapName'.
+        Identity encodings appear under several spellings
+        ('Identity-H/V', 'DLIdent-H/V', 'OneByteIdentityH/V'); these are
+        normalized through IDENTITY_ENCODER. Any other name is looked up
+        in CMapDB, as was done before this lookup was factored out.
+
+        :param spec: font dictionary; only its 'Encoding' entry is read.
+        :param strict: when true, a missing Encoding/CMapName or a CMap
+            that CMapDB cannot find raises PDFFontError; otherwise an
+            empty CMap is returned as a fallback.
+        :return: the CMap object to use for decoding.
+        """
+        try:
+            # The entry may be an indirect reference; resolve it before
+            # probing whether it is a name or a stream.
+            spec_encoding = resolve1(spec['Encoding'])
+            if hasattr(spec_encoding, 'name'):
+                cmap_name = literal_name(spec_encoding)
+            else:
+                cmap_name = literal_name(spec_encoding['CMapName'])
+        except KeyError:
+            if strict:
+                raise PDFFontError('Encoding is unspecified')
+            cmap_name = 'unknown'
+        if isinstance(cmap_name, PDFStream):
+            if 'CMapName' in cmap_name:
+                cmap_name = cmap_name.get('CMapName').name
+            else:
+                if strict:
+                    raise PDFFontError('CMapName unspecified for encoding')
+                cmap_name = 'unknown'
+        # Map identity aliases to their canonical name; other names pass
+        # through unchanged so that named CMaps are still loaded.
+        cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
+        try:
+            return CMapDB.get_cmap(cmap_name)
+        except CMapDB.CMapNotFound as e:
+            if strict:
+                raise PDFFontError(e)
+            return CMap()
+
+
    def __repr__(self):
        return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

@ -743,16 +746,14 @@ class PDFCIDFont(PDFFont):
        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)

-
-# main
 def main(argv):
    for fname in argv[1:]:
        fp = open(fname, 'rb')
-        #font = TrueTypeFont(fname, fp)
        font = CFFFont(fname, fp)
        print (font)
        fp.close()
    return

+
 if __name__ == '__main__':
    sys.exit(main(sys.argv))
--- a/samples/sampleOneByteIdentityEncode.pdf
+++ b/samples/sampleOneByteIdentityEncode.pdf
--- a/tests/test_pdfencoding.py
+++ b/tests/test_pdfencoding.py
@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+# -*- coding: utf-8 -*-
+
+import nose, logging, os
+from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
+from pdfminer.pdffont import PDFCIDFont
+from pdfminer.pdftypes import PDFStream
+from pdfminer.psparser import PSLiteral
+
+class TestPDFEncoding():
+    """Check which CMap class PDFCIDFont selects for an Encoding entry.
+
+    Each case builds a minimal font spec whose 'Encoding' is either a
+    PSLiteral name or a PDFStream carrying 'CMapName', then asserts on
+    the type of ``font.cmap``.
+    """
+
+    def test_cmapname_onebyteidentityV(self):
+        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMapByte)
+
+    def test_cmapname_onebyteidentityH(self):
+        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMapByte)
+
+    def test_cmapname_V(self):
+        stream = PDFStream({'CMapName': PSLiteral('V')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, CMap)
+
+    def test_cmapname_H(self):
+        stream = PDFStream({'CMapName': PSLiteral('H')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, CMap)
+
+    def test_encoding_identityH(self):
+        spec = {'Encoding': PSLiteral('Identity-H')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityV(self):
+        spec = {'Encoding': PSLiteral('Identity-V')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityH_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName':PSLiteral('Identity-H')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityV_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName':PSLiteral('Identity-V')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityH_as_stream(self):
+        stream = PDFStream({'CMapName':'Identity-H'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityV_as_stream(self):
+        stream = PDFStream({'CMapName':'Identity-V'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentH(self):
+        spec = {'Encoding': PSLiteral('DLIdent-H')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentV(self):
+        spec = {'Encoding': PSLiteral('DLIdent-V')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentH_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName':PSLiteral('DLIdent-H')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    # Named ...V...: this case uses 'DLIdent-V'. Under the duplicated
+    # ...H... name it replaced the DLIdent-H case above in the class.
+    def test_encoding_DLIdentV_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName':PSLiteral('DLIdent-V')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentH_as_stream(self):
+        stream = PDFStream({'CMapName':'DLIdent-H'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentV_as_stream(self):
+        stream = PDFStream({'CMapName':'DLIdent-V'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_font_without_spec(self):
+        font = PDFCIDFont(None, {})
+        assert isinstance(font.cmap, CMap)
+
+
+if __name__ == '__main__':
+    nose.runmodule()
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@ -24,6 +24,7 @@ class TestDumpPDF():
        run('../samples/','simple1')
        run('../samples/','simple2')
        run('../samples/','simple3')
+        run('../samples/','sampleOneByteIdentityEncode')

    def test_2(self):
        run('../samples/nonfree/','dmca')