diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b2680c2 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing guidelines + +Any contribution is appreciated! You might want to: + +* Fix spelling errors +* Improve documentation +* Add tests for untested code +* Add new features +* Fix bugs + +## How can I contribute? + +* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and request features + - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the + issue. +* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request) +* Help others by giving your thoughts on open issues and pull requests. + +## General guidelines for creating issues and pull requests + +* Search previous issues, as yours might be a duplicate. +* When creating a new issue for a bug, include a minimal reproducible example. +* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This + will help others to see the importance of your feature request. +* Link each pull request to a single issue. +* Pull requests should be merged to develop, not master. This ensures that master always equals the released version. +* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case + of features, this will show that your code works correctly. +* Code should work for Python 2.7 and Python 3.x (for now). +* Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is. +* New features should be well documented using docstrings. +* Check spelling and grammar. \ No newline at end of file diff --git a/README.md b/README.md index 2a96278..488027c 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Features How to Install -------------- - * Install Python 2.7 or newer. 
def _is_hexadecimal(text):
    """Tell whether text consists of one or more hexadecimal digits and nothing else.

    HEXADECIMAL.match only anchors at the start of the string, so the end of the match is compared with the length
    of the text to make sure that there is no trailing garbage.
    """
    match = HEXADECIMAL.match(text)
    return match is not None and match.end() == len(text)


def name2unicode(name):
    """Converts Adobe glyph names to unicode characters.

    The conversion follows the steps of the Adobe Glyph List specification:

    1. Everything from the first full stop onwards is dropped.
    2. The remainder is split on underscores. Each component is converted separately and the results are joined.
    3. A component is looked up in the glyph list first. Otherwise it is read as "uni" followed by groups of four
       hexadecimal digits, or as "u" followed by four to six hexadecimal digits.

    In contrast to the specification, this raises a KeyError instead of returning an empty string when the name is
    unknown. This way the caller must explicitly define what to do when there is no match. Also in contrast to the
    specification, lowercase hexadecimal digits are accepted.

    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, for example 'Lcommaaccent_uni20AC0308_u1040C.alternate'
    :returns: unicode string that the name stands for
    :raises KeyError: if the name does not match the specification or refers to an invalid unicode value
    """
    name = name.split('.')[0]
    components = name.split('_')

    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith('uni'):
        # Slice the prefix off. str.strip('uni') removes every leading and trailing 'u', 'n' and 'i' character,
        # which would silently accept names such as 'uniuni013B'.
        hex_digits = name[len('uni'):]

        if _is_hexadecimal(hex_digits) and len(hex_digits) % 4 == 0:
            unicode_digits = [int(hex_digits[i:i + 4], base=16) for i in range(0, len(hex_digits), 4)]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            return ''.join(map(six.unichr, unicode_digits))

    elif name.startswith('u'):
        hex_digits = name[len('u'):]

        if _is_hexadecimal(hex_digits) and 4 <= len(hex_digits) <= 6:
            unicode_digit = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            return six.unichr(unicode_digit)

    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)


def raise_key_error_for_invalid_unicode(unicode_digit):
    """Raise a KeyError when a number cannot be used as a unicode value.

    Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16.
    Values above 10FFFF are rejected as well: they lie outside the unicode code space, so converting them to a
    character would fail with a ValueError that callers of name2unicode do not expect.

    :param unicode_digit: integer value of the unicode character
    :raises KeyError: if unicode digit is invalid
    """
    if 0xD800 <= unicode_digit <= 0xDFFF:
        raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
    if unicode_digit > 0x10FFFF:
        raise KeyError('Unicode digit %d is invalid because it is larger than 10FFFF' % unicode_digit)
PSLiteral): try: cid2unicode[cid] = name2unicode(x.name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) cid += 1 return cid2unicode diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 0d4c175..54925f1 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -178,7 +178,7 @@ class TagExtractor(PDFDevice): s = '' if isinstance(props, dict): s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) - in sorted(props.iteritems())) + in sorted(six.iteritems(props))) out_s = '<%s%s>' % (utils.enc(tag.name), s) self.outfp.write(utils.make_compat_bytes(out_s)) self._stack.append(tag) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 17b80cd..4bfd6ed 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -1,35 +1,39 @@ - -import sys +import logging import struct +import sys from io import BytesIO + +import six # Python 2+3 compatibility + +from . import settings +from .cmapdb import CMap from .cmapdb import CMapDB from .cmapdb import CMapParser from .cmapdb import FileUnicodeMap -from .cmapdb import CMap from .encodingdb import EncodingDB from .encodingdb import name2unicode -from .psparser import PSStackParser -from .psparser import PSEOF -from .psparser import LIT -from .psparser import KWD -from . 
import settings -from .psparser import PSLiteral -from .psparser import literal_name +from .fontmetrics import FONT_METRICS from .pdftypes import PDFException from .pdftypes import PDFStream from .pdftypes import resolve1 -from .pdftypes import int_value -from .pdftypes import num_value -from .pdftypes import list_value from .pdftypes import dict_value +from .pdftypes import int_value +from .pdftypes import list_value +from .pdftypes import num_value +from .pdftypes import resolve1 from .pdftypes import stream_value -from .fontmetrics import FONT_METRICS +from .psparser import KWD +from .psparser import LIT +from .psparser import PSEOF +from .psparser import PSLiteral +from .psparser import PSStackParser +from .psparser import literal_name from .utils import apply_matrix_norm -from .utils import nunpack from .utils import choplist from .utils import isnumber +from .utils import nunpack -import six #Python 2+3 compatibility +log = logging.getLogger(__name__) def get_widths(seq): @@ -99,7 +103,6 @@ class Type1FontHeaderParser(PSStackParser): KEYWORD_ARRAY = KWD(b'array') KEYWORD_READONLY = KWD(b'readonly') KEYWORD_FOR = KWD(b'for') - KEYWORD_FOR = KWD(b'for') def __init__(self, data): PSStackParser.__init__(self, data) @@ -107,6 +110,17 @@ class Type1FontHeaderParser(PSStackParser): return def get_encoding(self): + """Parse the font encoding + + The Type1 font encoding maps character codes to character names. These character names could either be standard + Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a + sequence of operations that describe how the character should be drawn. + Currently, this function returns '' (empty string) for character names that are associated with a CharStrings. 
+ + References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf + + :returns mapping of character identifiers (cid's) to unicode characters + """ while 1: try: (cid, name) = self.nextobject() @@ -114,8 +128,8 @@ class Type1FontHeaderParser(PSStackParser): break try: self._cid2unicode[cid] = name2unicode(name) - except KeyError: - pass + except KeyError as e: + log.debug(str(e)) return self._cid2unicode def do_keyword(self, pos, token): @@ -460,7 +474,7 @@ class TrueTypeFont(object): assert False, str(('Unhandled', fmttype)) # create unicode map unicode_map = FileUnicodeMap() - for (char, gid) in char2gid.iteritems(): + for (char, gid) in six.iteritems(char2gid): unicode_map.add_cid2unichr(gid, char) return unicode_map diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 40cca46..c6e8d86 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -96,7 +96,7 @@ def resolve_all(x, default=None): if isinstance(x, list): x = [resolve_all(v, default=default) for v in x] elif isinstance(x, dict): - for (k, v) in x.iteritems(): + for (k, v) in six.iteritems(x): x[k] = resolve_all(v, default=default) return x diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py new file mode 100644 index 0000000..36e4b11 --- /dev/null +++ b/tests/test_encodingdb.py @@ -0,0 +1,121 @@ +""" +Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping) + +While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are +added. 
def test_name2unicode_name_in_agl():
    """The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL"""
    assert u'\u013B' == name2unicode('Lcommaaccent')


def test_name2unicode_uni():
    """The components "Lcommaaccent", "uni013B" and "u013B" all map to the string U+013B"""
    assert u'\u013B' == name2unicode('uni013B')


def test_name2unicode_uni_lowercase():
    """Lowercase variant: the component "uni013b" maps to the string U+013B"""
    assert u'\u013B' == name2unicode('uni013b')


def test_name2unicode_uni_with_sequence_of_digits():
    """The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
    assert u'\u20AC\u0308' == name2unicode('uni20AC0308')


def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """Lowercase variant: the name "uni20ac0308" is mapped to the string U+20AC U+0308"""
    assert u'\u20AC\u0308' == name2unicode('uni20ac0308')


def test_name2unicode_uni_empty_string():
    """The name "uni20ac" has a single component, which is mapped to a euro-sign.

    According to the specification this should be mapped to an empty string, but we also want to support lowercase
    hexadecimals
    """
    assert u'\u20ac' == name2unicode('uni20ac')


def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" has a single component, which the specification maps to an empty string

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C". name2unicode raises a KeyError where the specification prescribes an empty string.
    """
    assert_raises(KeyError, name2unicode, 'uniD801DC0C')


def test_name2unicode_uni_empty_string_long_lowercase():
    """Lowercase variant: the name "unid801dc0c" has a single component, which the specification maps to an empty
    string

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C". name2unicode raises a KeyError where the specification prescribes an empty string.
    """
    # The name must really be lowercase here, otherwise this only repeats the uppercase test above.
    assert_raises(KeyError, name2unicode, 'unid801dc0c')


def test_name2unicode_uni_pua():
    """Both "Ogoneksmall" and "uniF6FB" map to the string that corresponds to U+F6FB."""
    assert u'\uF6FB' == name2unicode('uniF6FB')


def test_name2unicode_uni_pua_lowercase():
    """Lowercase variant: "unif6fb" maps to the string that corresponds to U+F6FB."""
    assert u'\uF6FB' == name2unicode('unif6fb')


def test_name2unicode_u_with_4_digits():
    """The components "Lcommaaccent", "uni013B" and "u013B" all map to the string U+013B"""
    assert u'\u013B' == name2unicode('u013B')


def test_name2unicode_u_with_4_digits_lowercase():
    """Lowercase variant: the component "u013b" maps to the string U+013B"""
    assert u'\u013B' == name2unicode('u013b')


def test_name2unicode_u_with_5_digits():
    """The name "u1040C" has a single component, which is mapped to the string U+1040C"""
    assert u'\U0001040C' == name2unicode('u1040C')


def test_name2unicode_u_with_5_digits_lowercase():
    """Lowercase variant: the name "u1040c" is mapped to the string U+1040C"""
    assert u'\U0001040C' == name2unicode('u1040c')


def test_name2unicode_multiple_components():
    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')


def test_name2unicode_multiple_components_lowercase():
    """Lowercase variant: "Lcommaaccent_uni20ac0308_u1040c.alternate" is mapped to U+013B U+20AC U+0308 U+1040C"""
    assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')


def test_name2unicode_foo():
    """The name "foo" is not in AGL and does not start with a "u".

    The specification maps it to an empty string; name2unicode raises a KeyError instead.
    """
    assert_raises(KeyError, name2unicode, 'foo')


def test_name2unicode_notdef():
    """The name ".notdef" is reduced to an empty string (step 1), which the specification maps to an empty string
    (step 3); name2unicode raises a KeyError instead.
    """
    assert_raises(KeyError, name2unicode, '.notdef')


def test_name2unicode_pua_ogoneksmall():
    """Both "Ogoneksmall" and "uniF6FB" map to the string that corresponds to U+F6FB."""
    assert u'\uF6FB' == name2unicode('Ogoneksmall')


def test_name2unicode_overflow_error():
    """A name of only decimal digits does not match the specification and must raise a KeyError"""
    assert_raises(KeyError, name2unicode, '226215240241240240240240')
ord(m.group(0)), encoder(q(v), 'replace')[0]) r.append('%s=%s' % (k, v)) return base+'&'.join(r)