Merge branch 'pdfstream-as-cmap' of https://github.com/fakabbir/pdfminer.six into pdfstream-as-cmap

2019-08-10 11:04:10 +05:30 · 2019-08-10 11:04:10 +05:30 · 3f0f05def6
parent 3125d3634a fe38695739
commit 3f0f05def6
10 changed files with 260 additions and 45 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,32 @@
+# Contributing guidelines
+
+Any contribution is appreciated! You might want to:
+
+* Fix spelling errors
+* Improve documentation
+* Add tests for untested code
+* Add new features
+* Fix bugs
+
+## How can I contribute?
+
+* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and request features
+    - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
+     issue. 
+* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request)
+* Help others by giving your thoughts on open issues and pull requests.
+
+## General guidelines for creating issues and pull requests
+
+* Search previous issues, as yours might be a duplicate.
+* When creating a new issue for a bug, include a minimal reproducible example.
+* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This
+  will help others to see the importance of your feature request. 
+* Link each pull request to a single issue.
+* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.  
+* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case 
+  of features, this will show that your code works correctly.
+* Code should work for Python 2.7 and Python 3.x (for now).
+* Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is. 
+* New features should be well documented using docstrings.
+* Check spelling and grammar.
--- a/README.md
+++ b/README.md
@ -35,7 +35,7 @@ Features
 How to Install
 --------------

- * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
+ * Install Python 2.7 or newer.
 * Install

    `pip install pdfminer.six`
@ -81,6 +81,12 @@ TODO
 * Performance improvements.


+Contributing
+------------
+
+Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). 
+
+
 Terms and Conditions
 --------------------

--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -83,7 +83,7 @@ class CMap(CMapBase):
        assert isinstance(cmap, CMap), str(type(cmap))

        def copy(dst, src):
-            for (k, v) in src.iteritems():
+            for (k, v) in six.iteritems(src):
                if isinstance(v, dict):
                    d = {}
                    dst[k] = d
@ -110,7 +110,7 @@ class CMap(CMapBase):
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
-        for (k, v) in sorted(code2cid.iteritems()):
+        for (k, v) in sorted(six.iteritems(code2cid)):
            c = code+(k,)
            if isinstance(v, int):
                out.write('code %r = cid %d\n' % (c, v))
@ -157,7 +157,7 @@ class UnicodeMap(CMapBase):
        return self.cid2unichr[cid]

    def dump(self, out=sys.stdout):
-        for (k, v) in sorted(self.cid2unichr.iteritems()):
+        for (k, v) in sorted(six.iteritems(self.cid2unichr)):
            out.write('cid %d = unicode %r\n' % (k, v))
        return

--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@ -1,28 +1,67 @@
-
+import logging
 import re
-from .psparser import PSLiteral
+
+import six  # Python 2+3 compatibility
+
 from .glyphlist import glyphname2unicode
 from .latin_enc import ENCODING
+from .psparser import PSLiteral

-import six # Python 2+3 compatibility
+HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')

-STRIP_NAME = re.compile(r'[0-9]+')
+log = logging.getLogger(__name__)


-##  name2unicode
-##
 def name2unicode(name):
-    """Converts Adobe glyph names to Unicode numbers."""
-    if name in glyphname2unicode:
-        return glyphname2unicode[name]
-    m = STRIP_NAME.search(name)
-    if not m:
-        raise KeyError(name)
-    return six.unichr(int(m.group(0)))
+    """Converts Adobe glyph names to Unicode numbers.
+
+    In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is
+    unknown. This way the caller must explicitly define what to do when there is not a match.
+
+    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
+
+    :returns unicode character(s) that the glyph name maps to; raises a KeyError if the name cannot be converted
+    """
+    name = name.split('.')[0]
+    components = name.split('_')
+
+    if len(components) > 1:
+        return ''.join(map(name2unicode, components))
+
+    else:
+        if name in glyphname2unicode:
+            return glyphname2unicode.get(name)
+
+        elif name.startswith('uni'):
+            name_without_uni = name.strip('uni')
+
+            if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
+                unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
+                for digit in unicode_digits:
+                    raise_key_error_for_invalid_unicode(digit)
+                characters = map(six.unichr, unicode_digits)
+                return ''.join(characters)
+
+        elif name.startswith('u'):
+            name_without_u = name.strip('u')
+
+            if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
+                unicode_digit = int(name_without_u, base=16)
+                raise_key_error_for_invalid_unicode(unicode_digit)
+                return six.unichr(unicode_digit)
+
+    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
+
+
+def raise_key_error_for_invalid_unicode(unicode_digit):
+    """Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16
+
+    :raises KeyError if unicode digit is invalid
+    """
+    if 55295 < unicode_digit < 57344:
+        raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)


-##  EncodingDB
-##
 class EncodingDB(object):

    std2unicode = {}
@ -59,7 +98,7 @@ class EncodingDB(object):
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(x.name)
-                    except KeyError:
-                        pass
+                    except KeyError as e:
+                        log.debug(str(e))
                    cid += 1
        return cid2unicode
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -178,7 +178,7 @@ class TagExtractor(PDFDevice):
        s = ''
        if isinstance(props, dict):
            s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
-                        in sorted(props.iteritems()))
+                        in sorted(six.iteritems(props)))
        out_s = '<%s%s>' % (utils.enc(tag.name), s)
        self.outfp.write(utils.make_compat_bytes(out_s))
        self._stack.append(tag)
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -1,35 +1,39 @@
-
-import sys
+import logging
 import struct
+import sys
 from io import BytesIO
+
+import six  # Python 2+3 compatibility
+
+from . import settings
+from .cmapdb import CMap
 from .cmapdb import CMapDB
 from .cmapdb import CMapParser
 from .cmapdb import FileUnicodeMap
-from .cmapdb import CMap
 from .encodingdb import EncodingDB
 from .encodingdb import name2unicode
-from .psparser import PSStackParser
-from .psparser import PSEOF
-from .psparser import LIT
-from .psparser import KWD
-from . import settings
-from .psparser import PSLiteral
-from .psparser import literal_name
+from .fontmetrics import FONT_METRICS
 from .pdftypes import PDFException
 from .pdftypes import PDFStream
 from .pdftypes import resolve1
-from .pdftypes import int_value
-from .pdftypes import num_value
-from .pdftypes import list_value
 from .pdftypes import dict_value
+from .pdftypes import int_value
+from .pdftypes import list_value
+from .pdftypes import num_value
+from .pdftypes import resolve1
 from .pdftypes import stream_value
-from .fontmetrics import FONT_METRICS
+from .psparser import KWD
+from .psparser import LIT
+from .psparser import PSEOF
+from .psparser import PSLiteral
+from .psparser import PSStackParser
+from .psparser import literal_name
 from .utils import apply_matrix_norm
-from .utils import nunpack
 from .utils import choplist
 from .utils import isnumber
+from .utils import nunpack

-import six #Python 2+3 compatibility
+log = logging.getLogger(__name__)


 def get_widths(seq):
@ -99,7 +103,6 @@ class Type1FontHeaderParser(PSStackParser):
    KEYWORD_ARRAY = KWD(b'array')
    KEYWORD_READONLY = KWD(b'readonly')
    KEYWORD_FOR = KWD(b'for')
-    KEYWORD_FOR = KWD(b'for')

    def __init__(self, data):
        PSStackParser.__init__(self, data)
@ -107,6 +110,17 @@ class Type1FontHeaderParser(PSStackParser):
        return

    def get_encoding(self):
+        """Parse the font encoding
+
+        The Type1 font encoding maps character codes to character names. These character names could either be standard
+        Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
+        sequence of operations that describe how the character should be drawn.
+        Currently, character names that are associated with a CharString are skipped (logged at debug level).
+
+        References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
+
+        :returns mapping of character identifiers (cid's) to unicode characters
+        """
        while 1:
            try:
                (cid, name) = self.nextobject()
@ -114,8 +128,8 @@ class Type1FontHeaderParser(PSStackParser):
                break
            try:
                self._cid2unicode[cid] = name2unicode(name)
-            except KeyError:
-                pass
+            except KeyError as e:
+                log.debug(str(e))
        return self._cid2unicode

    def do_keyword(self, pos, token):
@ -460,7 +474,7 @@ class TrueTypeFont(object):
                assert False, str(('Unhandled', fmttype))
        # create unicode map
        unicode_map = FileUnicodeMap()
-        for (char, gid) in char2gid.iteritems():
+        for (char, gid) in six.iteritems(char2gid):
            unicode_map.add_cid2unichr(gid, char)
        return unicode_map

--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@ -96,7 +96,7 @@ def resolve_all(x, default=None):
    if isinstance(x, list):
        x = [resolve_all(v, default=default) for v in x]
    elif isinstance(x, dict):
-        for (k, v) in x.iteritems():
+        for (k, v) in six.iteritems(x):
            x[k] = resolve_all(v, default=default)
    return x

--- a/tests/test_encodingdb.py
+++ b/tests/test_encodingdb.py
@ -0,0 +1,121 @@
+"""
+Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
+
+While not in the specification, lowercase hexadecimal digits often occur in PDFs. Therefore lowercase variants of
+the unit tests are added.
+"""
+from nose.tools import assert_raises
+
+from pdfminer.encodingdb import name2unicode
+
+
+def test_name2unicode_name_in_agl():
+    """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL"""
+    assert u'\u013B' == name2unicode('Lcommaaccent')
+
+
+def test_name2unicode_uni():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('uni013B')
+
+
+def test_name2unicode_uni_lowercase():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('uni013b')
+
+
+def test_name2unicode_uni_with_sequence_of_digits():
+    """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
+    assert u'\u20AC\u0308' == name2unicode('uni20AC0308')
+
+
+def test_name2unicode_uni_with_sequence_of_digits_lowercase():
+    """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
+    assert u'\u20AC\u0308' == name2unicode('uni20ac0308')
+
+
+def test_name2unicode_uni_empty_string():
+    """The name "uni20ac" has a single component, which is mapped to a euro-sign.
+
+    According to the specification this should be mapped to an empty string, but we also want to support lowercase
+    hexadecimals
+    """
+    assert u'\u20ac' == name2unicode('uni20ac')
+
+
+def test_name2unicode_uni_empty_string_long():
+    """The name "uniD801DC0C" has a single component, which is mapped to an empty string
+
+    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
+    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
+    glyph name "u1040C".
+    """
+    assert_raises(KeyError, name2unicode, 'uniD801DC0C')
+
+
+def test_name2unicode_uni_empty_string_long_lowercase():
+    """The name "uniD801DC0C" has a single component, which is mapped to an empty string
+
+    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
+    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
+    glyph name "u1040C"."""
+    assert_raises(KeyError, name2unicode, 'uniD801DC0C')
+
+
+def test_name2unicode_uni_pua():
+    """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
+    assert u'\uF6FB' == name2unicode('uniF6FB')
+
+
+def test_name2unicode_uni_pua_lowercase():
+    """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
+    assert u'\uF6FB' == name2unicode('unif6fb')
+
+
+def test_name2unicode_u_with_4_digits():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('u013B')
+
+
+def test_name2unicode_u_with_4_digits_lowercase():
+    """The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
+    assert u'\u013B' == name2unicode('u013b')
+
+
+def test_name2unicode_u_with_5_digits():
+    """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
+    assert u'\U0001040C' == name2unicode('u1040C')
+
+
+def test_name2unicode_u_with_5_digits_lowercase():
+    """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
+    assert u'\U0001040C' == name2unicode('u1040c')
+
+
+def test_name2unicode_multiple_components():
+    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
+    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
+
+
+def test_name2unicode_multiple_components_lowercase():
+    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
+    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
+
+
+def test_name2unicode_foo():
+    """The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
+    assert_raises(KeyError, name2unicode, 'foo')
+
+
+def test_name2unicode_notdef():
+    """The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)"""
+    assert_raises(KeyError, name2unicode, '.notdef')
+
+
+def test_name2unicode_pua_ogoneksmall():
+    """"Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
+    assert u'\uF6FB' == name2unicode('Ogoneksmall')
+
+
+def test_name2unicode_overflow_error():
+    assert_raises(KeyError, name2unicode, '226215240241240240240240')
--- a/tools/conv_afm.py
+++ b/tools/conv_afm.py
@ -3,6 +3,8 @@
 import sys
 import fileinput

+import six #Python 2+3 compatibility
+
 def main(argv):
    fonts = {}
    for line in fileinput.input():
@ -33,7 +35,7 @@ def main(argv):
            props[k] = tuple(map(float, f[1:5]))
    print ('# -*- python -*-')
    print ('FONT_METRICS = {')
-    for (fontname,(props,chars)) in fonts.iteritems():
+    for (fontname,(props,chars)) in six.iteritems(fonts):
        print (' %r: %r,' % (fontname, (props,chars)))
    print ('}')
    return 0
--- a/tools/pdf2html.cgi
+++ b/tools/pdf2html.cgi
@ -26,6 +26,7 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.converter import HTMLConverter, TextConverter
 from pdfminer.layout import LAParams

+import six #Python 2+3 compatibility

 # quote HTML metacharacters
 def q(x):
@ -35,7 +36,7 @@ def q(x):
 Q = re.compile(r'[^a-zA-Z0-9_.-=]')
 def url(base, **kw):
    r = []
-    for (k,v) in kw.iteritems():
+    for (k,v) in six.iteritems(kw):
        v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
        r.append('%s=%s' % (k, v))
    return base+'&'.join(r)