Merge branch 'pdfstream-as-cmap' of https://github.com/fakabbir/pdfminer.six into pdfstream-as-cmap

pull/283/head
Fakabbir Amin 2019-08-10 11:04:10 +05:30
commit 3f0f05def6
10 changed files with 260 additions and 45 deletions

32
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,32 @@
# Contributing guidelines
Any contribution is appreciated! You might want to:
* Fix spelling errors
* Improve documentation
* Add tests for untested code
* Add new features
* Fix bugs
## How can I contribute?
* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and request features
- If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
issue.
* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request)
* Help others by giving your thoughts on open issues and pull requests.
## General guidelines for creating issues and pull requests
* Search previous issues, as yours might be a duplicate.
* When creating a new issue for a bug, include a minimal reproducible example.
* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This
will help others to see the importance of your feature request.
* Link each pull request to a single issue.
* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
of features, this will show that your code works correctly.
* Code should work for Python 2.7 and Python 3.x (for now).
* Code changes should conform to PEP8 coding style (with a line-width of 120). Existing code may stay as it is.
* New features should be well documented using docstrings.
* Check spelling and grammar.

View File

@ -35,7 +35,7 @@ Features
How to Install How to Install
-------------- --------------
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six) * Install Python 2.7 or newer.
* Install * Install
`pip install pdfminer.six` `pip install pdfminer.six`
@ -81,6 +81,12 @@ TODO
* Performance improvements. * Performance improvements.
Contributing
------------
Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md).
Terms and Conditions Terms and Conditions
-------------------- --------------------

View File

@ -83,7 +83,7 @@ class CMap(CMapBase):
assert isinstance(cmap, CMap), str(type(cmap)) assert isinstance(cmap, CMap), str(type(cmap))
def copy(dst, src): def copy(dst, src):
for (k, v) in src.iteritems(): for (k, v) in six.iteritems(src):
if isinstance(v, dict): if isinstance(v, dict):
d = {} d = {}
dst[k] = d dst[k] = d
@ -110,7 +110,7 @@ class CMap(CMapBase):
if code2cid is None: if code2cid is None:
code2cid = self.code2cid code2cid = self.code2cid
code = () code = ()
for (k, v) in sorted(code2cid.iteritems()): for (k, v) in sorted(six.iteritems(code2cid)):
c = code+(k,) c = code+(k,)
if isinstance(v, int): if isinstance(v, int):
out.write('code %r = cid %d\n' % (c, v)) out.write('code %r = cid %d\n' % (c, v))
@ -157,7 +157,7 @@ class UnicodeMap(CMapBase):
return self.cid2unichr[cid] return self.cid2unichr[cid]
def dump(self, out=sys.stdout): def dump(self, out=sys.stdout):
for (k, v) in sorted(self.cid2unichr.iteritems()): for (k, v) in sorted(six.iteritems(self.cid2unichr)):
out.write('cid %d = unicode %r\n' % (k, v)) out.write('cid %d = unicode %r\n' % (k, v))
return return

View File

@ -1,28 +1,67 @@
import logging
import re import re
from .psparser import PSLiteral
import six # Python 2+3 compatibility
from .glyphlist import glyphname2unicode from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING from .latin_enc import ENCODING
from .psparser import PSLiteral
import six # Python 2+3 compatibility HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
STRIP_NAME = re.compile(r'[0-9]+') log = logging.getLogger(__name__)
## name2unicode
##
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is
    unknown. This way the caller must explicitly define what to do when there is not a match.

    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, for example 'Lcommaaccent', 'uni20AC0308', 'u1040C' or 'Lcommaaccent_u1040C.alternate'
    :returns unicode string that the glyph name maps to
    :raises KeyError if the name cannot be converted
    """
    # Everything from the first full stop onwards is a suffix that does not take part in the mapping.
    name = name.split('.')[0]

    # Underscores separate components; every component is converted on its own and the results are concatenated.
    components = name.split('_')
    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith('uni'):
        # Slice the prefix off. str.strip('uni') removes any run of the characters 'u', 'n' and 'i' from BOTH ends,
        # which accepted malformed names such as 'uni0041n'.
        hex_digits = name[len('uni'):]
        if _is_hexadecimal(hex_digits) and len(hex_digits) % 4 == 0:
            unicode_digits = [int(hex_digits[i:i + 4], base=16) for i in range(0, len(hex_digits), 4)]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            return ''.join(map(six.unichr, unicode_digits))

    elif name.startswith('u'):
        hex_digits = name[len('u'):]
        if _is_hexadecimal(hex_digits) and 4 <= len(hex_digits) <= 6:
            unicode_digit = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(unicode_digit)
            try:
                return six.unichr(unicode_digit)
            except ValueError:
                # Six hexadecimal digits can encode values the interpreter cannot turn into a character. Callers
                # only handle KeyError, so report the problem through that exception type.
                raise KeyError('Could not convert unicode digit %d of name "%s" to a character'
                               % (unicode_digit, name))

    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)


def _is_hexadecimal(text):
    """Tell whether text consists of hexadecimal digits only (and is not empty)."""
    # HEXADECIMAL.match() is anchored at the start only, so also require that the match runs to the end of the text.
    matched = HEXADECIMAL.match(text)
    return matched is not None and matched.end() == len(text)
def raise_key_error_for_invalid_unicode(unicode_digit):
    """Reject numbers that do not identify a single Unicode character.

    Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16.
    Values above 10FFFF are rejected as well, because they lie outside the Unicode code space and cannot be
    converted to a character.

    :param unicode_digit: code point as an integer
    :raises KeyError if unicode digit is invalid
    """
    if 0xD7FF < unicode_digit < 0xE000:
        raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
    if unicode_digit > 0x10FFFF:
        raise KeyError('Unicode digit %d is invalid because it is larger than 10FFFF' % unicode_digit)
## EncodingDB
##
class EncodingDB(object): class EncodingDB(object):
std2unicode = {} std2unicode = {}
@ -59,7 +98,7 @@ class EncodingDB(object):
elif isinstance(x, PSLiteral): elif isinstance(x, PSLiteral):
try: try:
cid2unicode[cid] = name2unicode(x.name) cid2unicode[cid] = name2unicode(x.name)
except KeyError: except KeyError as e:
pass log.debug(str(e))
cid += 1 cid += 1
return cid2unicode return cid2unicode

View File

@ -178,7 +178,7 @@ class TagExtractor(PDFDevice):
s = '' s = ''
if isinstance(props, dict): if isinstance(props, dict):
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
in sorted(props.iteritems())) in sorted(six.iteritems(props)))
out_s = '<%s%s>' % (utils.enc(tag.name), s) out_s = '<%s%s>' % (utils.enc(tag.name), s)
self.outfp.write(utils.make_compat_bytes(out_s)) self.outfp.write(utils.make_compat_bytes(out_s))
self._stack.append(tag) self._stack.append(tag)

View File

@ -1,35 +1,39 @@
import logging
import sys
import struct import struct
import sys
from io import BytesIO from io import BytesIO
import six # Python 2+3 compatibility
from . import settings
from .cmapdb import CMap
from .cmapdb import CMapDB from .cmapdb import CMapDB
from .cmapdb import CMapParser from .cmapdb import CMapParser
from .cmapdb import FileUnicodeMap from .cmapdb import FileUnicodeMap
from .cmapdb import CMap
from .encodingdb import EncodingDB from .encodingdb import EncodingDB
from .encodingdb import name2unicode from .encodingdb import name2unicode
from .psparser import PSStackParser from .fontmetrics import FONT_METRICS
from .psparser import PSEOF
from .psparser import LIT
from .psparser import KWD
from . import settings
from .psparser import PSLiteral
from .psparser import literal_name
from .pdftypes import PDFException from .pdftypes import PDFException
from .pdftypes import PDFStream from .pdftypes import PDFStream
from .pdftypes import resolve1 from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import num_value
from .pdftypes import list_value
from .pdftypes import dict_value from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import num_value
from .pdftypes import resolve1
from .pdftypes import stream_value from .pdftypes import stream_value
from .fontmetrics import FONT_METRICS from .psparser import KWD
from .psparser import LIT
from .psparser import PSEOF
from .psparser import PSLiteral
from .psparser import PSStackParser
from .psparser import literal_name
from .utils import apply_matrix_norm from .utils import apply_matrix_norm
from .utils import nunpack
from .utils import choplist from .utils import choplist
from .utils import isnumber from .utils import isnumber
from .utils import nunpack
import six #Python 2+3 compatibility log = logging.getLogger(__name__)
def get_widths(seq): def get_widths(seq):
@ -99,7 +103,6 @@ class Type1FontHeaderParser(PSStackParser):
KEYWORD_ARRAY = KWD(b'array') KEYWORD_ARRAY = KWD(b'array')
KEYWORD_READONLY = KWD(b'readonly') KEYWORD_READONLY = KWD(b'readonly')
KEYWORD_FOR = KWD(b'for') KEYWORD_FOR = KWD(b'for')
KEYWORD_FOR = KWD(b'for')
def __init__(self, data): def __init__(self, data):
PSStackParser.__init__(self, data) PSStackParser.__init__(self, data)
@ -107,6 +110,17 @@ class Type1FontHeaderParser(PSStackParser):
return return
def get_encoding(self): def get_encoding(self):
"""Parse the font encoding
The Type1 font encoding maps character codes to character names. These character names could either be standard
Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
sequence of operations that describe how the character should be drawn.
Currently, this function returns '' (empty string) for character names that are associated with a CharString.
References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
:returns mapping of character identifiers (cid's) to unicode characters
"""
while 1: while 1:
try: try:
(cid, name) = self.nextobject() (cid, name) = self.nextobject()
@ -114,8 +128,8 @@ class Type1FontHeaderParser(PSStackParser):
break break
try: try:
self._cid2unicode[cid] = name2unicode(name) self._cid2unicode[cid] = name2unicode(name)
except KeyError: except KeyError as e:
pass log.debug(str(e))
return self._cid2unicode return self._cid2unicode
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
@ -460,7 +474,7 @@ class TrueTypeFont(object):
assert False, str(('Unhandled', fmttype)) assert False, str(('Unhandled', fmttype))
# create unicode map # create unicode map
unicode_map = FileUnicodeMap() unicode_map = FileUnicodeMap()
for (char, gid) in char2gid.iteritems(): for (char, gid) in six.iteritems(char2gid):
unicode_map.add_cid2unichr(gid, char) unicode_map.add_cid2unichr(gid, char)
return unicode_map return unicode_map

View File

@ -96,7 +96,7 @@ def resolve_all(x, default=None):
if isinstance(x, list): if isinstance(x, list):
x = [resolve_all(v, default=default) for v in x] x = [resolve_all(v, default=default) for v in x]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k, v) in x.iteritems(): for (k, v) in six.iteritems(x):
x[k] = resolve_all(v, default=default) x[k] = resolve_all(v, default=default)
return x return x

121
tests/test_encodingdb.py Normal file
View File

@ -0,0 +1,121 @@
"""
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are
added.
"""
from nose.tools import assert_raises
from pdfminer.encodingdb import name2unicode
def test_name2unicode_name_in_agl():
    """A single-component name that is listed in the AGL ("Lcommaaccent") is mapped to the string U+013B."""
    converted = name2unicode('Lcommaaccent')
    assert converted == u'\u013B'
def test_name2unicode_uni():
    """The component "uni013B" maps to the string U+013B, just like "Lcommaaccent" and "u013B"."""
    converted = name2unicode('uni013B')
    assert converted == u'\u013B'
def test_name2unicode_uni_lowercase():
    """The component "uni013b" (lowercase hexadecimal digits) maps to the string U+013B as well."""
    converted = name2unicode('uni013b')
    assert converted == u'\u013B'
def test_name2unicode_uni_with_sequence_of_digits():
    """The single component "uni20AC0308" holds two groups of four digits and maps to the string U+20AC U+0308."""
    converted = name2unicode('uni20AC0308')
    assert converted == u'\u20AC\u0308'
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """The single component "uni20ac0308" (lowercase hexadecimal digits) maps to the string U+20AC U+0308."""
    converted = name2unicode('uni20ac0308')
    assert converted == u'\u20AC\u0308'
def test_name2unicode_uni_empty_string():
    """The name "uni20ac" has a single component, which is mapped to a euro-sign.

    According to the specification this should be mapped to an empty string, but lowercase hexadecimals are
    supported here on purpose.
    """
    converted = name2unicode('uni20ac')
    assert converted == u'\u20ac'
def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" has a single component, which the specification maps to an empty string.

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C". Here the unmappable name is reported with a KeyError.
    """
    rejected_name = 'uniD801DC0C'
    assert_raises(KeyError, name2unicode, rejected_name)
def test_name2unicode_uni_empty_string_long_lowercase():
    """The lowercase name "unid801dc0c" has a single component, which must be rejected like "uniD801DC0C".

    Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
    expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
    glyph name "u1040C".
    """
    # Use lowercase digits; the uppercase spelling is already covered by the test without the _lowercase suffix.
    assert_raises(KeyError, name2unicode, 'unid801dc0c')
def test_name2unicode_uni_pua():
    """Both "Ogoneksmall" and "uniF6FB" map to the string that corresponds to U+F6FB; this checks "uniF6FB"."""
    converted = name2unicode('uniF6FB')
    assert converted == u'\uF6FB'
def test_name2unicode_uni_pua_lowercase():
    """The lowercase spelling "unif6fb" maps to the string that corresponds to U+F6FB, like "uniF6FB"."""
    converted = name2unicode('unif6fb')
    assert converted == u'\uF6FB'
def test_name2unicode_u_with_4_digits():
    """The component "u013B" maps to the string U+013B, just like "Lcommaaccent" and "uni013B"."""
    converted = name2unicode('u013B')
    assert converted == u'\u013B'
def test_name2unicode_u_with_4_digits_lowercase():
    """The component "u013b" (lowercase hexadecimal digits) maps to the string U+013B as well."""
    converted = name2unicode('u013b')
    assert converted == u'\u013B'
def test_name2unicode_u_with_5_digits():
    """The single component "u1040C" carries five hexadecimal digits and is mapped to the string U+1040C."""
    converted = name2unicode('u1040C')
    assert converted == u'\U0001040C'
def test_name2unicode_u_with_5_digits_lowercase():
    """The single component "u1040c" (lowercase hexadecimal digits) is mapped to the string U+1040C."""
    converted = name2unicode('u1040c')
    assert converted == u'\U0001040C'
def test_name2unicode_multiple_components():
    """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C."""
    converted = name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
    assert converted == u'\u013B\u20AC\u0308\U0001040C'
def test_name2unicode_multiple_components_lowercase():
    """The name "Lcommaaccent_uni20ac0308_u1040c.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C."""
    converted = name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
    assert converted == u'\u013B\u20AC\u0308\U0001040C'
def test_name2unicode_foo():
    """The name 'foo' is not in the AGL and does not start with a 'u'.

    The specification maps it to an empty string; here that is reported with a KeyError.
    """
    rejected_name = 'foo'
    assert_raises(KeyError, name2unicode, rejected_name)
def test_name2unicode_notdef():
    """The name ".notdef" is reduced to an empty string (step 1), which the specification maps to an empty string.

    Here the empty result is reported with a KeyError.
    """
    rejected_name = '.notdef'
    assert_raises(KeyError, name2unicode, rejected_name)
def test_name2unicode_pua_ogoneksmall():
    """Both "Ogoneksmall" and "uniF6FB" map to the string that corresponds to U+F6FB; this checks "Ogoneksmall"."""
    converted = name2unicode('Ogoneksmall')
    assert converted == u'\uF6FB'
def test_name2unicode_overflow_error():
    """A name that consists of a long run of decimal digits is not a glyph name and must raise a KeyError.

    NOTE(review): judging by the test name, this presumably guards against an overflow when such a name is
    converted to a number - confirm against the issue that motivated it.
    """
    rejected_name = '226215240241240240240240'
    assert_raises(KeyError, name2unicode, rejected_name)

View File

@ -3,6 +3,8 @@
import sys import sys
import fileinput import fileinput
import six #Python 2+3 compatibility
def main(argv): def main(argv):
fonts = {} fonts = {}
for line in fileinput.input(): for line in fileinput.input():
@ -33,7 +35,7 @@ def main(argv):
props[k] = tuple(map(float, f[1:5])) props[k] = tuple(map(float, f[1:5]))
print ('# -*- python -*-') print ('# -*- python -*-')
print ('FONT_METRICS = {') print ('FONT_METRICS = {')
for (fontname,(props,chars)) in fonts.iteritems(): for (fontname,(props,chars)) in six.iteritems(fonts):
print (' %r: %r,' % (fontname, (props,chars))) print (' %r: %r,' % (fontname, (props,chars)))
print ('}') print ('}')
return 0 return 0

View File

@ -26,6 +26,7 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
import six #Python 2+3 compatibility
# quote HTML metacharacters # quote HTML metacharacters
def q(x): def q(x):
@ -35,7 +36,7 @@ def q(x):
Q = re.compile(r'[^a-zA-Z0-9_.-=]') Q = re.compile(r'[^a-zA-Z0-9_.-=]')
def url(base, **kw): def url(base, **kw):
r = [] r = []
for (k,v) in kw.iteritems(): for (k,v) in six.iteritems(kw):
v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0]) v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), encoder(q(v), 'replace')[0])
r.append('%s=%s' % (k, v)) r.append('%s=%s' % (k, v))
return base+'&'.join(r) return base+'&'.join(r)