commit
a5a34d53bd
|
@ -5,9 +5,6 @@ python:
|
||||||
- "3.5"
|
- "3.5"
|
||||||
- "3.6"
|
- "3.6"
|
||||||
install:
|
install:
|
||||||
- pip install six
|
- pip install tox-travis
|
||||||
- pip install pycryptodome
|
|
||||||
- pip install chardet
|
|
||||||
- pip install sortedcontainers
|
|
||||||
script:
|
script:
|
||||||
nosetests --nologcapture
|
- tox
|
||||||
|
|
41
CHANGELOG.md
41
CHANGELOG.md
|
@ -1,6 +1,37 @@
|
||||||
# List of changes
|
# Changelog
|
||||||
|
All notable changes in pdfminer.six will be documented in this file.
|
||||||
|
|
||||||
## Version 20181108
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- PR #141 to speedup layout analysis
|
|
||||||
- PR #173 for using argparse and replace deprecated getopt
|
## [Unreleased]
|
||||||
- PR #142 to compile pdfminer.six with cython, successfully
|
|
||||||
|
Nothing yet
|
||||||
|
|
||||||
|
## [20191020] - 2019-10-20
|
||||||
|
|
||||||
|
### Deprecated
|
||||||
|
- Support for Python 2 will be dropped on January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307))
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Contribution guidelines in [CONTRIBUTING.md](CONTRIBUTING.md) ([#259](https://github.com/pdfminer/pdfminer.six/pull/259))
|
||||||
|
- Support new encodings OneByteEncoding and DLIdent for CMaps ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Use `six.iteritems()` instead of `dict().iteritems()` to ensure Python2 and Python3 compatibility ([#274](https://github.com/pdfminer/pdfminer.six/pull/274))
|
||||||
|
- Properly convert Adobe Glyph names to unicode characters ([#263](https://github.com/pdfminer/pdfminer.six/pull/263))
|
||||||
|
- Allow CMap to be a content stream ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
|
||||||
|
- Resolve indirect objects for width and bounding boxes for fonts ([#273](https://github.com/pdfminer/pdfminer.six/pull/273))
|
||||||
|
- Actually update the stroke color in the graphic state ([#298](https://github.com/pdfminer/pdfminer.six/pull/298))
|
||||||
|
- Interpret (invalid) negative font descent as a positive descent ([#203](https://github.com/pdfminer/pdfminer.six/pull/203))
|
||||||
|
- Correct colorspace comparison for images ([#132](https://github.com/pdfminer/pdfminer.six/pull/132))
|
||||||
|
- Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
|
||||||
|
|
||||||
|
## [20181108] - 2018-11-08
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Speedup layout analysis ([#141](https://github.com/pdfminer/pdfminer.six/pull/141))
|
||||||
|
- Use argparse instead of the deprecated getopt ([#173](https://github.com/pdfminer/pdfminer.six/pull/173))
|
||||||
|
- Allow pdfminer.six to be compiled with cython ([#142](https://github.com/pdfminer/pdfminer.six/pull/142))
|
|
@ -0,0 +1,64 @@
|
||||||
|
# Contributing guidelines
|
||||||
|
|
||||||
|
Any contribution is appreciated! You might want to:
|
||||||
|
|
||||||
|
* Fix spelling errors
|
||||||
|
* Improve documentation
|
||||||
|
* Add tests for untested code
|
||||||
|
* Add new features
|
||||||
|
* Fix bugs
|
||||||
|
|
||||||
|
## How can I contribute?
|
||||||
|
|
||||||
|
* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and request features
|
||||||
|
- If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
|
||||||
|
issue.
|
||||||
|
* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request).
|
||||||
|
* Help others by giving your thoughts on open issues and pull requests.
|
||||||
|
|
||||||
|
## Guidelines for creating issues
|
||||||
|
|
||||||
|
* Search previous issues, as yours might be a duplicate.
|
||||||
|
* When creating a new issue for a bug, include a minimal reproducible example.
|
||||||
|
* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This
|
||||||
|
will help others to see the importance of your feature request.
|
||||||
|
|
||||||
|
## Guidelines for creating pull requests
|
||||||
|
|
||||||
|
* A pull request should close an existing issue.
|
||||||
|
* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.
|
||||||
|
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
|
||||||
|
of features, this will show that your code works correctly.
|
||||||
|
* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (with a line-width of 120)
|
||||||
|
and be properly documented with docstrings.
|
||||||
|
* Check spelling and grammar.
|
||||||
|
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#unreleased).
|
||||||
|
|
||||||
|
## Getting started
|
||||||
|
|
||||||
|
1. Clone the repository
|
||||||
|
|
||||||
|
```sh
|
||||||
|
git clone https://github.com/pdfminer/pdfminer.six
|
||||||
|
cd pdfminer.six
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Install dev dependencies
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pip install -e .[dev]
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Run the tests
|
||||||
|
|
||||||
|
On all Python versions:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
tox
|
||||||
|
```
|
||||||
|
|
||||||
|
Or on a single Python version:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
tox -e py36
|
||||||
|
```
|
31
README.md
31
README.md
|
@ -35,7 +35,7 @@ Features
|
||||||
How to Install
|
How to Install
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
|
* Install Python 2.7 or newer.
|
||||||
* Install
|
* Install
|
||||||
|
|
||||||
`pip install pdfminer.six`
|
`pip install pdfminer.six`
|
||||||
|
@ -81,30 +81,7 @@ TODO
|
||||||
* Performance improvements.
|
* Performance improvements.
|
||||||
|
|
||||||
|
|
||||||
Terms and Conditions
|
Contributing
|
||||||
--------------------
|
------------
|
||||||
|
|
||||||
(This is so-called MIT/X License)
|
Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md).
|
||||||
|
|
||||||
Copyright (c) 2004-2014 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person
|
|
||||||
obtaining a copy of this software and associated documentation
|
|
||||||
files (the "Software"), to deal in the Software without
|
|
||||||
restriction, including without limitation the rights to use,
|
|
||||||
copy, modify, merge, publish, distribute, sublicense, and/or
|
|
||||||
sell copies of the Software, and to permit persons to whom the
|
|
||||||
Software is furnished to do so, subject to the following
|
|
||||||
conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be
|
|
||||||
included in all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
|
||||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
||||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
|
||||||
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
||||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
||||||
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
||||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
|
|
|
@ -10,7 +10,15 @@ It includes a PDF converter that can transform PDF files into other text
|
||||||
formats (such as HTML). It has an extensible PDF parser that can be used for
|
formats (such as HTML). It has an extensible PDF parser that can be used for
|
||||||
other purposes instead of text analysis.
|
other purposes instead of text analysis.
|
||||||
"""
|
"""
|
||||||
__version__ = '20181108'
|
import sys
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
__version__ = '20191020'
|
||||||
|
|
||||||
|
|
||||||
|
if sys.version_info < (3, 0):
|
||||||
|
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
|
||||||
|
'more information see https://github.com/pdfminer/pdfminer.six/issues/194')
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(__version__)
|
print(__version__)
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
|
|
||||||
|
|
||||||
""" Adobe character mapping (CMap) support.
|
""" Adobe character mapping (CMap) support.
|
||||||
|
|
||||||
CMaps provide the mapping between character codes and Unicode
|
CMaps provide the mapping between character codes and Unicode
|
||||||
|
@ -40,8 +38,6 @@ class CMapError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
## CMapBase
|
|
||||||
##
|
|
||||||
class CMapBase(object):
|
class CMapBase(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
@ -67,8 +63,6 @@ class CMapBase(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## CMap
|
|
||||||
##
|
|
||||||
class CMap(CMapBase):
|
class CMap(CMapBase):
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
@ -83,7 +77,7 @@ class CMap(CMapBase):
|
||||||
assert isinstance(cmap, CMap), str(type(cmap))
|
assert isinstance(cmap, CMap), str(type(cmap))
|
||||||
|
|
||||||
def copy(dst, src):
|
def copy(dst, src):
|
||||||
for (k, v) in src.iteritems():
|
for (k, v) in six.iteritems(src):
|
||||||
if isinstance(v, dict):
|
if isinstance(v, dict):
|
||||||
d = {}
|
d = {}
|
||||||
dst[k] = d
|
dst[k] = d
|
||||||
|
@ -110,7 +104,7 @@ class CMap(CMapBase):
|
||||||
if code2cid is None:
|
if code2cid is None:
|
||||||
code2cid = self.code2cid
|
code2cid = self.code2cid
|
||||||
code = ()
|
code = ()
|
||||||
for (k, v) in sorted(code2cid.iteritems()):
|
for (k, v) in sorted(six.iteritems(code2cid)):
|
||||||
c = code+(k,)
|
c = code+(k,)
|
||||||
if isinstance(v, int):
|
if isinstance(v, int):
|
||||||
out.write('code %r = cid %d\n' % (c, v))
|
out.write('code %r = cid %d\n' % (c, v))
|
||||||
|
@ -119,8 +113,6 @@ class CMap(CMapBase):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## IdentityCMap
|
|
||||||
##
|
|
||||||
class IdentityCMap(CMapBase):
|
class IdentityCMap(CMapBase):
|
||||||
|
|
||||||
def decode(self, code):
|
def decode(self, code):
|
||||||
|
@ -131,8 +123,16 @@ class IdentityCMap(CMapBase):
|
||||||
return ()
|
return ()
|
||||||
|
|
||||||
|
|
||||||
## UnicodeMap
|
class IdentityCMapByte(IdentityCMap):
|
||||||
##
|
|
||||||
|
def decode(self, code):
|
||||||
|
n = len(code)
|
||||||
|
if n:
|
||||||
|
return struct.unpack('>%dB' % n, code)
|
||||||
|
else:
|
||||||
|
return ()
|
||||||
|
|
||||||
|
|
||||||
class UnicodeMap(CMapBase):
|
class UnicodeMap(CMapBase):
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
@ -148,13 +148,11 @@ class UnicodeMap(CMapBase):
|
||||||
return self.cid2unichr[cid]
|
return self.cid2unichr[cid]
|
||||||
|
|
||||||
def dump(self, out=sys.stdout):
|
def dump(self, out=sys.stdout):
|
||||||
for (k, v) in sorted(self.cid2unichr.iteritems()):
|
for (k, v) in sorted(six.iteritems(self.cid2unichr)):
|
||||||
out.write('cid %d = unicode %r\n' % (k, v))
|
out.write('cid %d = unicode %r\n' % (k, v))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## FileCMap
|
|
||||||
##
|
|
||||||
class FileCMap(CMap):
|
class FileCMap(CMap):
|
||||||
|
|
||||||
def add_code2cid(self, code, cid):
|
def add_code2cid(self, code, cid):
|
||||||
|
@ -173,8 +171,6 @@ class FileCMap(CMap):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## FileUnicodeMap
|
|
||||||
##
|
|
||||||
class FileUnicodeMap(UnicodeMap):
|
class FileUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
def add_cid2unichr(self, cid, code):
|
def add_cid2unichr(self, cid, code):
|
||||||
|
@ -192,8 +188,6 @@ class FileUnicodeMap(UnicodeMap):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## PyCMap
|
|
||||||
##
|
|
||||||
class PyCMap(CMap):
|
class PyCMap(CMap):
|
||||||
|
|
||||||
def __init__(self, name, module):
|
def __init__(self, name, module):
|
||||||
|
@ -204,8 +198,6 @@ class PyCMap(CMap):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## PyUnicodeMap
|
|
||||||
##
|
|
||||||
class PyUnicodeMap(UnicodeMap):
|
class PyUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
def __init__(self, name, module, vertical):
|
def __init__(self, name, module, vertical):
|
||||||
|
@ -218,8 +210,6 @@ class PyUnicodeMap(UnicodeMap):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## CMapDB
|
|
||||||
##
|
|
||||||
class CMapDB(object):
|
class CMapDB(object):
|
||||||
|
|
||||||
_cmap_cache = {}
|
_cmap_cache = {}
|
||||||
|
@ -252,6 +242,10 @@ class CMapDB(object):
|
||||||
return IdentityCMap(WMode=0)
|
return IdentityCMap(WMode=0)
|
||||||
elif name == 'Identity-V':
|
elif name == 'Identity-V':
|
||||||
return IdentityCMap(WMode=1)
|
return IdentityCMap(WMode=1)
|
||||||
|
elif name == 'OneByteIdentityH':
|
||||||
|
return IdentityCMapByte(WMode=0)
|
||||||
|
elif name == 'OneByteIdentityV':
|
||||||
|
return IdentityCMapByte(WMode=1)
|
||||||
try:
|
try:
|
||||||
return klass._cmap_cache[name]
|
return klass._cmap_cache[name]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -271,8 +265,6 @@ class CMapDB(object):
|
||||||
return umaps[vertical]
|
return umaps[vertical]
|
||||||
|
|
||||||
|
|
||||||
## CMapParser
|
|
||||||
##
|
|
||||||
class CMapParser(PSStackParser):
|
class CMapParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, cmap, fp):
|
def __init__(self, cmap, fp):
|
||||||
|
@ -360,7 +352,6 @@ class CMapParser(PSStackParser):
|
||||||
s1 = nunpack(svar)
|
s1 = nunpack(svar)
|
||||||
e1 = nunpack(evar)
|
e1 = nunpack(evar)
|
||||||
vlen = len(svar)
|
vlen = len(svar)
|
||||||
#assert s1 <= e1, str((s1, e1))
|
|
||||||
for i in range(e1-s1+1):
|
for i in range(e1-s1+1):
|
||||||
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||||
self.cmap.add_code2cid(x, cid+i)
|
self.cmap.add_code2cid(x, cid+i)
|
||||||
|
@ -387,7 +378,6 @@ class CMapParser(PSStackParser):
|
||||||
continue
|
continue
|
||||||
s1 = nunpack(s)
|
s1 = nunpack(s)
|
||||||
e1 = nunpack(e)
|
e1 = nunpack(e)
|
||||||
#assert s1 <= e1, str((s1, e1))
|
|
||||||
if isinstance(code, list):
|
if isinstance(code, list):
|
||||||
for i in range(e1-s1+1):
|
for i in range(e1-s1+1):
|
||||||
self.cmap.add_cid2unichr(s1+i, code[i])
|
self.cmap.add_cid2unichr(s1+i, code[i])
|
||||||
|
@ -422,17 +412,16 @@ class CMapParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# test
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
for fname in args:
|
for fname in args:
|
||||||
fp = open(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
cmap = FileUnicodeMap()
|
cmap = FileUnicodeMap()
|
||||||
#cmap = FileCMap()
|
|
||||||
CMapParser(cmap, fp).run()
|
CMapParser(cmap, fp).run()
|
||||||
fp.close()
|
fp.close()
|
||||||
cmap.dump()
|
cmap.dump()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -1,28 +1,67 @@
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from .psparser import PSLiteral
|
|
||||||
from .glyphlist import glyphname2unicode
|
|
||||||
from .latin_enc import ENCODING
|
|
||||||
|
|
||||||
import six # Python 2+3 compatibility
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
STRIP_NAME = re.compile(r'[0-9A-Fa-f]+')
|
from .glyphlist import glyphname2unicode
|
||||||
|
from .latin_enc import ENCODING
|
||||||
|
from .psparser import PSLiteral
|
||||||
|
|
||||||
|
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
## name2unicode
|
|
||||||
##
|
|
||||||
def name2unicode(name):
|
def name2unicode(name):
|
||||||
"""Converts Adobe glyph names to Unicode numbers."""
|
"""Converts Adobe glyph names to Unicode numbers.
|
||||||
|
|
||||||
|
In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown.
|
||||||
|
This way the caller must explicitly define what to do when there is not a match.
|
||||||
|
|
||||||
|
Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
||||||
|
|
||||||
|
:returns unicode character if name resembles something, otherwise a KeyError
|
||||||
|
"""
|
||||||
|
name = name.split('.')[0]
|
||||||
|
components = name.split('_')
|
||||||
|
|
||||||
|
if len(components) > 1:
|
||||||
|
return ''.join(map(name2unicode, components))
|
||||||
|
|
||||||
|
else:
|
||||||
if name in glyphname2unicode:
|
if name in glyphname2unicode:
|
||||||
return glyphname2unicode[name]
|
return glyphname2unicode.get(name)
|
||||||
m = STRIP_NAME.search(name)
|
|
||||||
if not m:
|
elif name.startswith('uni'):
|
||||||
raise KeyError(name)
|
name_without_uni = name.strip('uni')
|
||||||
return six.unichr(int(m.group(0), base=16))
|
|
||||||
|
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
|
||||||
|
unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
|
||||||
|
for digit in unicode_digits:
|
||||||
|
raise_key_error_for_invalid_unicode(digit)
|
||||||
|
characters = map(six.unichr, unicode_digits)
|
||||||
|
return ''.join(characters)
|
||||||
|
|
||||||
|
elif name.startswith('u'):
|
||||||
|
name_without_u = name.strip('u')
|
||||||
|
|
||||||
|
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
|
||||||
|
unicode_digit = int(name_without_u, base=16)
|
||||||
|
raise_key_error_for_invalid_unicode(unicode_digit)
|
||||||
|
return six.unichr(unicode_digit)
|
||||||
|
|
||||||
|
raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
|
||||||
|
|
||||||
|
|
||||||
|
def raise_key_error_for_invalid_unicode(unicode_digit):
|
||||||
|
"""Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16
|
||||||
|
|
||||||
|
:raises KeyError if unicode digit is invalid
|
||||||
|
"""
|
||||||
|
if 55295 < unicode_digit < 57344:
|
||||||
|
raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
|
||||||
|
|
||||||
|
|
||||||
## EncodingDB
|
|
||||||
##
|
|
||||||
class EncodingDB(object):
|
class EncodingDB(object):
|
||||||
|
|
||||||
std2unicode = {}
|
std2unicode = {}
|
||||||
|
@ -59,7 +98,7 @@ class EncodingDB(object):
|
||||||
elif isinstance(x, PSLiteral):
|
elif isinstance(x, PSLiteral):
|
||||||
try:
|
try:
|
||||||
cid2unicode[cid] = name2unicode(x.name)
|
cid2unicode[cid] = name2unicode(x.name)
|
||||||
except KeyError:
|
except KeyError as e:
|
||||||
pass
|
log.debug(str(e))
|
||||||
cid += 1
|
cid += 1
|
||||||
return cid2unicode
|
return cid2unicode
|
||||||
|
|
|
@ -74,7 +74,7 @@ class ImageWriter(object):
|
||||||
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
||||||
ext = '.jpg'
|
ext = '.jpg'
|
||||||
elif (image.bits == 1 or
|
elif (image.bits == 1 or
|
||||||
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
|
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
|
||||||
ext = '.%dx%d.bmp' % (width, height)
|
ext = '.%dx%d.bmp' % (width, height)
|
||||||
else:
|
else:
|
||||||
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
||||||
|
@ -101,7 +101,7 @@ class ImageWriter(object):
|
||||||
for y in range(height):
|
for y in range(height):
|
||||||
bmp.write_line(y, data[i:i+width])
|
bmp.write_line(y, data[i:i+width])
|
||||||
i += width
|
i += width
|
||||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
|
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
||||||
bmp = BMPWriter(fp, 24, width, height)
|
bmp = BMPWriter(fp, 24, width, height)
|
||||||
data = stream.get_data()
|
data = stream.get_data()
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -109,7 +109,7 @@ class ImageWriter(object):
|
||||||
for y in range(height):
|
for y in range(height):
|
||||||
bmp.write_line(y, data[i:i+width])
|
bmp.write_line(y, data[i:i+width])
|
||||||
i += width
|
i += width
|
||||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
|
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
||||||
bmp = BMPWriter(fp, 8, width, height)
|
bmp = BMPWriter(fp, 8, width, height)
|
||||||
data = stream.get_data()
|
data = stream.get_data()
|
||||||
i = 0
|
i = 0
|
||||||
|
|
|
@ -178,7 +178,7 @@ class TagExtractor(PDFDevice):
|
||||||
s = ''
|
s = ''
|
||||||
if isinstance(props, dict):
|
if isinstance(props, dict):
|
||||||
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
|
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
|
||||||
in sorted(props.iteritems()))
|
in sorted(six.iteritems(props)))
|
||||||
out_s = '<%s%s>' % (utils.enc(tag.name), s)
|
out_s = '<%s%s>' % (utils.enc(tag.name), s)
|
||||||
self.outfp.write(utils.make_compat_bytes(out_s))
|
self.outfp.write(utils.make_compat_bytes(out_s))
|
||||||
self._stack.append(tag)
|
self._stack.append(tag)
|
||||||
|
|
|
@ -1,36 +1,39 @@
|
||||||
|
import logging
|
||||||
import sys
|
|
||||||
import struct
|
import struct
|
||||||
|
import sys
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from .cmapdb import CMapDB
|
|
||||||
from .cmapdb import CMapParser
|
|
||||||
from .cmapdb import FileUnicodeMap
|
|
||||||
from .cmapdb import CMap
|
|
||||||
from .encodingdb import EncodingDB
|
|
||||||
from .encodingdb import name2unicode
|
|
||||||
from .psparser import PSStackParser
|
|
||||||
from .psparser import PSEOF
|
|
||||||
from .psparser import LIT
|
|
||||||
from .psparser import KWD
|
|
||||||
from . import settings
|
|
||||||
from .psparser import PSLiteral
|
|
||||||
from .psparser import literal_name
|
|
||||||
from .pdftypes import PDFException
|
|
||||||
from .pdftypes import PDFStream
|
|
||||||
from .pdftypes import resolve1
|
|
||||||
from .pdftypes import int_value
|
|
||||||
from .pdftypes import num_value
|
|
||||||
from .pdftypes import list_value
|
|
||||||
from .pdftypes import dict_value
|
|
||||||
from .pdftypes import stream_value
|
|
||||||
from .fontmetrics import FONT_METRICS
|
|
||||||
from .utils import apply_matrix_norm
|
|
||||||
from .utils import nunpack
|
|
||||||
from .utils import choplist
|
|
||||||
from .utils import isnumber
|
|
||||||
|
|
||||||
import six # Python 2+3 compatibility
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
|
from . import settings
|
||||||
|
from .cmapdb import CMap
|
||||||
|
from .cmapdb import CMapDB
|
||||||
|
from .cmapdb import CMapParser
|
||||||
|
from .cmapdb import FileUnicodeMap
|
||||||
|
from .encodingdb import EncodingDB
|
||||||
|
from .encodingdb import name2unicode
|
||||||
|
from .fontmetrics import FONT_METRICS
|
||||||
|
from .pdftypes import PDFException
|
||||||
|
from .pdftypes import PDFStream
|
||||||
|
from .pdftypes import resolve1
|
||||||
|
from .pdftypes import dict_value
|
||||||
|
from .pdftypes import int_value
|
||||||
|
from .pdftypes import list_value
|
||||||
|
from .pdftypes import num_value
|
||||||
|
from .pdftypes import resolve1, resolve_all
|
||||||
|
from .pdftypes import stream_value
|
||||||
|
from .psparser import KWD
|
||||||
|
from .psparser import LIT
|
||||||
|
from .psparser import PSEOF
|
||||||
|
from .psparser import PSLiteral
|
||||||
|
from .psparser import PSStackParser
|
||||||
|
from .psparser import literal_name
|
||||||
|
from .utils import apply_matrix_norm
|
||||||
|
from .utils import choplist
|
||||||
|
from .utils import isnumber
|
||||||
|
from .utils import nunpack
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
def get_widths(seq):
|
def get_widths(seq):
|
||||||
widths = {}
|
widths = {}
|
||||||
|
@ -50,10 +53,6 @@ def get_widths(seq):
|
||||||
widths[i] = w
|
widths[i] = w
|
||||||
r = []
|
r = []
|
||||||
return widths
|
return widths
|
||||||
#assert get_widths([1]) == {}
|
|
||||||
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
|
||||||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
|
||||||
|
|
||||||
|
|
||||||
def get_widths2(seq):
|
def get_widths2(seq):
|
||||||
widths = {}
|
widths = {}
|
||||||
|
@ -73,13 +72,8 @@ def get_widths2(seq):
|
||||||
widths[i] = (w, (vx, vy))
|
widths[i] = (w, (vx, vy))
|
||||||
r = []
|
r = []
|
||||||
return widths
|
return widths
|
||||||
#assert get_widths2([1]) == {}
|
|
||||||
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
|
|
||||||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
|
|
||||||
|
|
||||||
|
|
||||||
## FontMetricsDB
|
|
||||||
##
|
|
||||||
class FontMetricsDB(object):
|
class FontMetricsDB(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -87,8 +81,6 @@ class FontMetricsDB(object):
|
||||||
return FONT_METRICS[fontname]
|
return FONT_METRICS[fontname]
|
||||||
|
|
||||||
|
|
||||||
## Type1FontHeaderParser
|
|
||||||
##
|
|
||||||
class Type1FontHeaderParser(PSStackParser):
|
class Type1FontHeaderParser(PSStackParser):
|
||||||
|
|
||||||
KEYWORD_BEGIN = KWD(b'begin')
|
KEYWORD_BEGIN = KWD(b'begin')
|
||||||
|
@ -99,7 +91,6 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
KEYWORD_ARRAY = KWD(b'array')
|
KEYWORD_ARRAY = KWD(b'array')
|
||||||
KEYWORD_READONLY = KWD(b'readonly')
|
KEYWORD_READONLY = KWD(b'readonly')
|
||||||
KEYWORD_FOR = KWD(b'for')
|
KEYWORD_FOR = KWD(b'for')
|
||||||
KEYWORD_FOR = KWD(b'for')
|
|
||||||
|
|
||||||
def __init__(self, data):
|
def __init__(self, data):
|
||||||
PSStackParser.__init__(self, data)
|
PSStackParser.__init__(self, data)
|
||||||
|
@ -107,6 +98,17 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_encoding(self):
|
def get_encoding(self):
|
||||||
|
"""Parse the font encoding
|
||||||
|
|
||||||
|
The Type1 font encoding maps character codes to character names. These character names could either be standard
|
||||||
|
Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
|
||||||
|
sequence of operations that describe how the character should be drawn.
|
||||||
|
Currently, this function returns '' (empty string) for character names that are associated with a CharStrings.
|
||||||
|
|
||||||
|
References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
|
||||||
|
|
||||||
|
:returns mapping of character identifiers (cid's) to unicode characters
|
||||||
|
"""
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(cid, name) = self.nextobject()
|
(cid, name) = self.nextobject()
|
||||||
|
@ -114,8 +116,8 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
break
|
break
|
||||||
try:
|
try:
|
||||||
self._cid2unicode[cid] = name2unicode(name)
|
self._cid2unicode[cid] = name2unicode(name)
|
||||||
except KeyError:
|
except KeyError as e:
|
||||||
pass
|
log.debug(str(e))
|
||||||
return self._cid2unicode
|
return self._cid2unicode
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
|
@ -128,12 +130,17 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
|
|
||||||
|
|
||||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||||
IDENTITY_ENCODER = ('Identity-H', 'Identity-V')
|
|
||||||
|
|
||||||
## CFFFont
|
# Note: DLIdent-* isn't found in the PDF Reference but has been kept as
|
||||||
## (Format specified in Adobe Technical Note: #5176
|
# it is harmless and has the possibility of being a type. (induced from bug report/PR)
|
||||||
## "The Compact Font Format Specification")
|
IDENTITY_ENCODER = {'Identity-H':'Identity-H',
|
||||||
##
|
'Identity-V':'Identity-V',
|
||||||
|
'DLIdent-H':'Identity-H',
|
||||||
|
'DLIdent-V':'Identity-V',
|
||||||
|
'OneByteIdentityH':'OneByteIdentityH',
|
||||||
|
'OneByteIdentityV':'OneByteIdentityV',
|
||||||
|
}
|
||||||
|
|
||||||
def getdict(data):
|
def getdict(data):
|
||||||
d = {}
|
d = {}
|
||||||
fp = BytesIO(data)
|
fp = BytesIO(data)
|
||||||
|
@ -261,6 +268,7 @@ class CFFFont(object):
|
||||||
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class INDEX(object):
|
class INDEX(object):
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp):
|
||||||
|
@ -361,9 +369,6 @@ class CFFFont(object):
|
||||||
assert False, str(('Unhandled', format))
|
assert False, str(('Unhandled', format))
|
||||||
else:
|
else:
|
||||||
raise ValueError('unsupported charset format: %r' % format)
|
raise ValueError('unsupported charset format: %r' % format)
|
||||||
#print self.code2gid
|
|
||||||
#print self.name2gid
|
|
||||||
#assert 0
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def getstr(self, sid):
|
def getstr(self, sid):
|
||||||
|
@ -372,8 +377,6 @@ class CFFFont(object):
|
||||||
return self.string_index[sid-len(self.STANDARD_STRINGS)]
|
return self.string_index[sid-len(self.STANDARD_STRINGS)]
|
||||||
|
|
||||||
|
|
||||||
## TrueTypeFont
|
|
||||||
##
|
|
||||||
class TrueTypeFont(object):
|
class TrueTypeFont(object):
|
||||||
|
|
||||||
class CMapNotFound(Exception):
|
class CMapNotFound(Exception):
|
||||||
|
@ -454,13 +457,11 @@ class TrueTypeFont(object):
|
||||||
assert False, str(('Unhandled', fmttype))
|
assert False, str(('Unhandled', fmttype))
|
||||||
# create unicode map
|
# create unicode map
|
||||||
unicode_map = FileUnicodeMap()
|
unicode_map = FileUnicodeMap()
|
||||||
for (char, gid) in char2gid.iteritems():
|
for (char, gid) in six.iteritems(char2gid):
|
||||||
unicode_map.add_cid2unichr(gid, char)
|
unicode_map.add_cid2unichr(gid, char)
|
||||||
return unicode_map
|
return unicode_map
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
|
||||||
##
|
|
||||||
class PDFFontError(PDFException):
|
class PDFFontError(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -472,12 +473,11 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||||
LITERAL_TYPE1C = LIT('Type1C')
|
LITERAL_TYPE1C = LIT('Type1C')
|
||||||
|
|
||||||
|
|
||||||
# PDFFont
|
|
||||||
class PDFFont(object):
|
class PDFFont(object):
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, default_width=None):
|
def __init__(self, descriptor, widths, default_width=None):
|
||||||
self.descriptor = descriptor
|
self.descriptor = descriptor
|
||||||
self.widths = widths
|
self.widths = resolve_all(widths)
|
||||||
self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
|
self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
|
||||||
if isinstance(self.fontname, PSLiteral):
|
if isinstance(self.fontname, PSLiteral):
|
||||||
self.fontname = literal_name(self.fontname)
|
self.fontname = literal_name(self.fontname)
|
||||||
|
@ -487,8 +487,15 @@ class PDFFont(object):
|
||||||
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
||||||
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
||||||
self.leading = num_value(descriptor.get('Leading', 0))
|
self.leading = num_value(descriptor.get('Leading', 0))
|
||||||
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
|
self.bbox = list_value(resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0))))
|
||||||
self.hscale = self.vscale = .001
|
self.hscale = self.vscale = .001
|
||||||
|
|
||||||
|
# PDF RM 9.8.1 specifies /Descent should always be a negative number.
|
||||||
|
# PScript5.dll seems to produce Descent with a positive number, but
|
||||||
|
# text analysis will be wrong if this is taken as correct. So force
|
||||||
|
# descent to negative.
|
||||||
|
if self.descent > 0:
|
||||||
|
self.descent = -self.descent
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -504,9 +511,11 @@ class PDFFont(object):
|
||||||
return bytearray(bytes) # map(ord, bytes)
|
return bytearray(bytes) # map(ord, bytes)
|
||||||
|
|
||||||
def get_ascent(self):
    """Return the ascent above the baseline, scaled by ``vscale``."""
    scaled_ascent = self.ascent * self.vscale
    return scaled_ascent
|
||||||
|
|
||||||
def get_descent(self):
    """Return the descent below the baseline, scaled by ``vscale``.

    The stored descent is expected to be negative, so the result is too.
    """
    scaled_descent = self.descent * self.vscale
    return scaled_descent
|
||||||
|
|
||||||
def get_width(self):
|
def get_width(self):
|
||||||
|
@ -537,7 +546,6 @@ class PDFFont(object):
|
||||||
return sum(self.char_width(cid) for cid in self.decode(s))
|
return sum(self.char_width(cid) for cid in self.decode(s))
|
||||||
|
|
||||||
|
|
||||||
# PDFSimpleFont
|
|
||||||
class PDFSimpleFont(PDFFont):
|
class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, spec):
|
def __init__(self, descriptor, widths, spec):
|
||||||
|
@ -574,7 +582,6 @@ class PDFSimpleFont(PDFFont):
|
||||||
raise PDFUnicodeNotDefined(None, cid)
|
raise PDFUnicodeNotDefined(None, cid)
|
||||||
|
|
||||||
|
|
||||||
# PDFType1Font
|
|
||||||
class PDFType1Font(PDFSimpleFont):
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, spec):
|
def __init__(self, rsrcmgr, spec):
|
||||||
|
@ -606,14 +613,12 @@ class PDFType1Font(PDFSimpleFont):
|
||||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
|
||||||
# PDFTrueTypeFont
|
|
||||||
class PDFTrueTypeFont(PDFType1Font):
|
class PDFTrueTypeFont(PDFType1Font):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
|
||||||
# PDFType3Font
|
|
||||||
class PDFType3Font(PDFSimpleFont):
|
class PDFType3Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, spec):
|
def __init__(self, rsrcmgr, spec):
|
||||||
|
@ -636,7 +641,6 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
return '<PDFType3Font>'
|
return '<PDFType3Font>'
|
||||||
|
|
||||||
|
|
||||||
# PDFCIDFont
|
|
||||||
class PDFCIDFont(PDFFont):
|
class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
|
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
|
||||||
|
@ -701,9 +705,9 @@ class PDFCIDFont(PDFFont):
|
||||||
"""
|
"""
|
||||||
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
||||||
Encoding but as an attribute of CMapName, where CMapName is an
|
Encoding but as an attribute of CMapName, where CMapName is an
|
||||||
attribure of spec['Encoding'].
|
attribute of spec['Encoding'].
|
||||||
The horizaontal/vertical modes are mentioned with diffrent name
|
The horizontal/vertical modes are mentioned with different name
|
||||||
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'
|
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
spec_encoding = spec['Encoding']
|
spec_encoding = spec['Encoding']
|
||||||
|
@ -723,7 +727,7 @@ class PDFCIDFont(PDFFont):
|
||||||
raise PDFFontError('CMapName unspecified for encoding')
|
raise PDFFontError('CMapName unspecified for encoding')
|
||||||
cmap_name = 'unknown'
|
cmap_name = 'unknown'
|
||||||
if cmap_name in IDENTITY_ENCODER:
|
if cmap_name in IDENTITY_ENCODER:
|
||||||
return CMapDB.get_cmap(cmap_name)
|
return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name])
|
||||||
else:
|
else:
|
||||||
return CMap()
|
return CMap()
|
||||||
|
|
||||||
|
@ -751,16 +755,14 @@ class PDFCIDFont(PDFFont):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
|
|
||||||
|
|
||||||
# main
|
|
||||||
def main(argv):
    """Parse every file named on the command line as a CFF font and print it.

    :param argv: argument vector; ``argv[0]`` (the program name) is skipped.
    :return: None, so ``sys.exit(main(...))`` exits with status 0.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when CFFFont() raises.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -598,25 +598,25 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
# setrgb-stroking
|
# setrgb-stroking
|
||||||
def do_RG(self, r, g, b):
    """Set the stroking colour of the graphics state to an RGB triple.

    Only ``graphicstate.scolor`` is written, so the non-stroking colour
    set by ``rg`` is no longer overwritten.
    """
    # The colour space is left untouched: do_CS(LITERAL_DEVICE_RGB) is not called.
    self.graphicstate.scolor = (r, g, b)
    return
|
||||||
|
|
||||||
# setrgb-non-stroking
|
# setrgb-non-stroking
|
||||||
def do_rg(self, r, g, b):
    """Set the non-stroking colour of the graphics state to an RGB triple.

    Only ``graphicstate.ncolor`` is written, so the stroking colour set
    by ``RG`` is no longer overwritten.
    """
    # The colour space is left untouched: do_cs(LITERAL_DEVICE_RGB) is not called.
    self.graphicstate.ncolor = (r, g, b)
    return
|
||||||
|
|
||||||
# setcmyk-stroking
|
# setcmyk-stroking
|
||||||
def do_K(self, c, m, y, k):
    """Set the stroking colour of the graphics state to a CMYK quadruple.

    Only ``graphicstate.scolor`` is written, so the non-stroking colour
    set by ``k`` is no longer overwritten.
    """
    # The colour space is left untouched: do_CS(LITERAL_DEVICE_CMYK) is not called.
    self.graphicstate.scolor = (c, m, y, k)
    return
|
||||||
|
|
||||||
# setcmyk-non-stroking
|
# setcmyk-non-stroking
|
||||||
def do_k(self, c, m, y, k):
    """Set the non-stroking colour of the graphics state to a CMYK quadruple.

    Only ``graphicstate.ncolor`` is written, so the stroking colour set
    by ``K`` is no longer overwritten.
    """
    # The colour space is left untouched: do_cs(LITERAL_DEVICE_CMYK) is not called.
    self.graphicstate.ncolor = (c, m, y, k)
    return
|
||||||
|
|
||||||
|
|
|
@ -96,7 +96,7 @@ def resolve_all(x, default=None):
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [resolve_all(v, default=default) for v in x]
|
x = [resolve_all(v, default=default) for v in x]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k, v) in x.iteritems():
|
for (k, v) in six.iteritems(x):
|
||||||
x[k] = resolve_all(v, default=default)
|
x[k] = resolve_all(v, default=default)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
|
@ -1,53 +1,55 @@
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Miscellaneous Routines.
|
Miscellaneous Routines.
|
||||||
"""
|
"""
|
||||||
import struct
|
import struct
|
||||||
# from sys import maxint as INF #doesn't work anymore under Python3,
|
|
||||||
# but PDF still uses 32 bits ints
|
|
||||||
INF = (1<<31) - 1
|
|
||||||
|
|
||||||
import six #Python 2+3 compatibility
|
import six
|
||||||
|
|
||||||
|
# from sys import maxint as INF doesn't work anymore under Python3, but PDF still uses 32 bits ints
|
||||||
|
INF = (1 << 31) - 1
|
||||||
|
|
||||||
if six.PY3:
|
if six.PY3:
|
||||||
import chardet # For str encoding detection in Py3
|
import chardet # For str encoding detection in Py3
|
||||||
|
|
||||||
unicode = str
|
unicode = str
|
||||||
|
|
||||||
|
|
||||||
def make_compat_bytes(in_str):
    """Return ``in_str`` untouched on Py2; on Py3 encode it to bytes."""
    assert isinstance(in_str, str), str(type(in_str))
    return in_str if six.PY2 else in_str.encode()
|
||||||
|
|
||||||
|
|
||||||
def make_compat_str(in_str):
    """In Py2, does nothing. In Py3, converts bytes to str, guessing encoding.

    :param in_str: a bytes, str or unicode object.
    :return: ``in_str`` itself, or its decoded text when it is bytes on Py3.
    """
    assert isinstance(in_str, (bytes, str, unicode)), str(type(in_str))
    if six.PY3 and isinstance(in_str, bytes):
        detected = chardet.detect(in_str)
        # chardet reports an encoding of None when it cannot guess (e.g. for
        # empty input); bytes.decode(None) raises TypeError, so fall back.
        in_str = in_str.decode(detected['encoding'] or 'utf-8')
    return in_str
|
||||||
|
|
||||||
|
|
||||||
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
    """Bridge Py2 ``str.encode`` call sites to Py3.

    On Py2 the value is encoded; on Py3 a str is returned as is and bytes
    are decoded, both using ``encoding`` and ``erraction``.
    """
    if six.PY2:
        assert isinstance(bytesorstring, (str, unicode)), str(type(bytesorstring))
        encoded = bytesorstring.encode(encoding, erraction)
        return encoded
    if six.PY3:
        if not isinstance(bytesorstring, str):
            assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
            return bytesorstring.decode(encoding, erraction)
        return bytesorstring
|
||||||
|
|
||||||
## PNG Predictor
|
|
||||||
##
|
|
||||||
def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||||
if bitspercomponent != 8:
|
if bitspercomponent != 8:
|
||||||
# unsupported
|
# unsupported
|
||||||
raise ValueError("Unsupported `bitspercomponent': %d" %
|
raise ValueError("Unsupported `bitspercomponent': %d" %
|
||||||
bitspercomponent)
|
bitspercomponent)
|
||||||
nbytes = colors * columns * bitspercomponent // 8
|
nbytes = colors * columns * bitspercomponent // 8
|
||||||
i = 0
|
|
||||||
buf = b''
|
buf = b''
|
||||||
line0 = b'\x00' * columns
|
line0 = b'\x00' * columns
|
||||||
for i in range(0, len(data), nbytes + 1):
|
for i in range(0, len(data), nbytes + 1):
|
||||||
|
@ -91,8 +93,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
|
|
||||||
## Matrix operations
|
# Matrix operations
|
||||||
##
|
|
||||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
@ -109,31 +110,29 @@ def translate_matrix(m, v):
|
||||||
"""Translates a matrix by (x, y)."""
|
"""Translates a matrix by (x, y)."""
|
||||||
(a, b, c, d, e, f) = m
|
(a, b, c, d, e, f) = m
|
||||||
(x, y) = v
|
(x, y) = v
|
||||||
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
|
return a, b, c, d, x * a + y * c + e, x * b + y * d + f
|
||||||
|
|
||||||
|
|
||||||
def apply_matrix_pt(m, v):
    """Applies a matrix to a point.

    :param m: affine matrix as a 6-tuple ``(a, b, c, d, e, f)``.
    :param v: point as an ``(x, y)`` pair.
    :return: the pair ``(a*x + c*y + e, b*x + d*y + f)``.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f
|
||||||
|
|
||||||
|
|
||||||
def apply_matrix_norm(m, v):
    """Apply only the linear part of ``m`` to ``v``, ignoring the translation.

    Same as apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0)).
    """
    a, b, c, d, _e, _f = m
    p, q = v
    dx = a * p + c * q
    dy = b * p + d * q
    return dx, dy
|
||||||
|
|
||||||
|
|
||||||
## Utility functions
|
# Utility functions
|
||||||
##
|
|
||||||
|
|
||||||
# isnumber
|
|
||||||
def isnumber(x):
    """Tell whether ``x`` is an instance of an integer type or of float."""
    numeric_types = (six.integer_types, float)
    return isinstance(x, numeric_types)
|
||||||
|
|
||||||
# uniq
|
|
||||||
def uniq(objs):
|
def uniq(objs):
|
||||||
"""Eliminates duplicated elements."""
|
"""Eliminates duplicated elements."""
|
||||||
done = set()
|
done = set()
|
||||||
|
@ -145,7 +144,6 @@ def uniq(objs):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# fsplit
|
|
||||||
def fsplit(pred, objs):
|
def fsplit(pred, objs):
|
||||||
"""Split a list into two classes according to the predicate."""
|
"""Split a list into two classes according to the predicate."""
|
||||||
t = []
|
t = []
|
||||||
|
@ -155,17 +153,14 @@ def fsplit(pred, objs):
|
||||||
t.append(obj)
|
t.append(obj)
|
||||||
else:
|
else:
|
||||||
f.append(obj)
|
f.append(obj)
|
||||||
return (t, f)
|
return t, f
|
||||||
|
|
||||||
|
|
||||||
# drange
|
|
||||||
def drange(v0, v1, d):
    """Returns a discrete range.

    :param v0: lower bound.
    :param v1: upper bound.
    :param d: step (cell size) used for the floor division.
    :return: ``range(int(v0) // d, int(v1 + d) // d)``.

    No ordering of ``v0`` and ``v1`` is asserted: a degenerate interval
    (``v0 == v1``) yields one index and a reversed one an empty range.
    """
    return range(int(v0) // d, int(v1 + d) // d)
|
||||||
|
|
||||||
|
|
||||||
# get_bound
|
|
||||||
def get_bound(pts):
|
def get_bound(pts):
|
||||||
"""Compute a minimal rectangle that covers all the points."""
|
"""Compute a minimal rectangle that covers all the points."""
|
||||||
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
||||||
|
@ -174,10 +169,9 @@ def get_bound(pts):
|
||||||
y0 = min(y0, y)
|
y0 = min(y0, y)
|
||||||
x1 = max(x1, x)
|
x1 = max(x1, x)
|
||||||
y1 = max(y1, y)
|
y1 = max(y1, y)
|
||||||
return (x0, y0, x1, y1)
|
return x0, y0, x1, y1
|
||||||
|
|
||||||
|
|
||||||
# pick
|
|
||||||
def pick(seq, func, maxobj=None):
|
def pick(seq, func, maxobj=None):
|
||||||
"""Picks the object obj where func(obj) has the highest value."""
|
"""Picks the object obj where func(obj) has the highest value."""
|
||||||
maxscore = None
|
maxscore = None
|
||||||
|
@ -188,7 +182,6 @@ def pick(seq, func, maxobj=None):
|
||||||
return maxobj
|
return maxobj
|
||||||
|
|
||||||
|
|
||||||
# choplist
|
|
||||||
def choplist(n, seq):
|
def choplist(n, seq):
|
||||||
"""Groups every n elements of the list."""
|
"""Groups every n elements of the list."""
|
||||||
r = []
|
r = []
|
||||||
|
@ -200,7 +193,6 @@ def choplist(n, seq):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# nunpack
|
|
||||||
def nunpack(s, default=0):
|
def nunpack(s, default=0):
|
||||||
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
|
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
|
||||||
l = len(s)
|
l = len(s)
|
||||||
|
@ -220,7 +212,6 @@ def nunpack(s, default=0):
|
||||||
raise TypeError('invalid length: %d' % l)
|
raise TypeError('invalid length: %d' % l)
|
||||||
|
|
||||||
|
|
||||||
# decode_text
|
|
||||||
PDFDocEncoding = ''.join(six.unichr(x) for x in (
|
PDFDocEncoding = ''.join(six.unichr(x) for x in (
|
||||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||||
|
@ -265,7 +256,6 @@ def decode_text(s):
|
||||||
return ''.join(PDFDocEncoding[c] for c in s)
|
return ''.join(PDFDocEncoding[c] for c in s)
|
||||||
|
|
||||||
|
|
||||||
# enc
|
|
||||||
def enc(x, codec='ascii'):
|
def enc(x, codec='ascii'):
|
||||||
"""Encodes a string for SGML/XML/HTML"""
|
"""Encodes a string for SGML/XML/HTML"""
|
||||||
if six.PY3 and isinstance(x, bytes):
|
if six.PY3 and isinstance(x, bytes):
|
||||||
|
@ -285,6 +275,7 @@ def matrix2str(m):
|
||||||
(a, b, c, d, e, f) = m
|
(a, b, c, d, e, f) = m
|
||||||
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
|
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
|
||||||
|
|
||||||
|
|
||||||
def vecBetweenBoxes(obj1, obj2):
|
def vecBetweenBoxes(obj1, obj2):
|
||||||
"""A distance function between two TextBoxes.
|
"""A distance function between two TextBoxes.
|
||||||
|
|
||||||
|
@ -304,18 +295,18 @@ def vecBetweenBoxes(obj1, obj2):
|
||||||
# if one is inside another we compute euclidean distance
|
# if one is inside another we compute euclidean distance
|
||||||
(xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
|
(xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
|
||||||
(xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
|
(xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
|
||||||
return (xc1-xc2, yc1-yc2)
|
return xc1 - xc2, yc1 - yc2
|
||||||
else:
|
else:
|
||||||
return (max(0, iw), max(0, ih))
|
return max(0, iw), max(0, ih)
|
||||||
|
|
||||||
|
|
||||||
## Plane
|
|
||||||
##
|
|
||||||
## A set-like data structure for objects placed on a plane.
|
|
||||||
## Can efficiently find objects in a certain rectangular area.
|
|
||||||
## It maintains two parallel lists of objects, each of
|
|
||||||
## which is sorted by its x or y coordinate.
|
|
||||||
##
|
|
||||||
class Plane(object):
|
class Plane(object):
|
||||||
|
"""A set-like data structure for objects placed on a plane.
|
||||||
|
|
||||||
|
Can efficiently find objects in a certain rectangular area.
|
||||||
|
It maintains two parallel lists of objects, each of
|
||||||
|
which is sorted by its x or y coordinate.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, bbox, gridsize=50):
|
def __init__(self, bbox, gridsize=50):
|
||||||
self._seq = [] # preserve the object order.
|
self._seq = [] # preserve the object order.
|
||||||
|
@ -323,10 +314,9 @@ class Plane(object):
|
||||||
self._grid = {}
|
self._grid = {}
|
||||||
self.gridsize = gridsize
|
self.gridsize = gridsize
|
||||||
(self.x0, self.y0, self.x1, self.y1) = bbox
|
(self.x0, self.y0, self.x1, self.y1) = bbox
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
    """Show the objects obtained by iterating over this plane."""
    members = list(self)
    return '<Plane objs=%r>' % members
|
||||||
|
|
||||||
def __iter__(self):
    """Iterate, in ``_seq`` order, over the objects still present in ``_objs``."""
    ordered = self._seq
    return (member for member in ordered if member in self._objs)
|
||||||
|
@ -339,25 +329,22 @@ class Plane(object):
|
||||||
|
|
||||||
def _getrange(self, bbox):
    """Yield the ``(grid_x, grid_y)`` cell keys covered by ``bbox``.

    Nothing is yielded when ``bbox`` lies outside the plane; otherwise
    ``bbox`` is clipped to the plane before the cells are enumerated.
    """
    (x0, y0, x1, y1) = bbox
    outside = (x1 <= self.x0 or self.x1 <= x0 or
               y1 <= self.y0 or self.y1 <= y0)
    if outside:
        return
    # Clip the query rectangle to the plane's own bounds.
    x0, y0 = max(self.x0, x0), max(self.y0, y0)
    x1, y1 = min(self.x1, x1), min(self.y1, y1)
    step = self.gridsize
    for grid_y in drange(y0, y1, step):
        for grid_x in drange(x0, x1, step):
            yield (grid_x, grid_y)
|
|
||||||
|
|
||||||
# extend(objs)
|
|
||||||
def extend(self, objs):
    """Add every object of ``objs`` to the plane, in iteration order."""
    for item in objs:
        self.add(item)
|
|
||||||
|
|
||||||
# add(obj): place an object.
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
|
"""place an object."""
|
||||||
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
||||||
if k not in self._grid:
|
if k not in self._grid:
|
||||||
r = []
|
r = []
|
||||||
|
@ -367,20 +354,18 @@ class Plane(object):
|
||||||
r.append(obj)
|
r.append(obj)
|
||||||
self._seq.append(obj)
|
self._seq.append(obj)
|
||||||
self._objs.add(obj)
|
self._objs.add(obj)
|
||||||
return
|
|
||||||
|
|
||||||
# remove(obj): displace an object.
|
|
||||||
def remove(self, obj):
    """Displace an object: drop it from its grid cells and from ``_objs``."""
    bbox = (obj.x0, obj.y0, obj.x1, obj.y1)
    for cell_key in self._getrange(bbox):
        try:
            self._grid[cell_key].remove(obj)
        except (KeyError, ValueError):
            # The cell does not exist or does not hold obj; nothing to undo.
            continue
    self._objs.remove(obj)
|
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
|
||||||
def find(self, bbox):
|
def find(self, bbox):
|
||||||
|
"""finds objects that are in a certain area."""
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
done = set()
|
done = set()
|
||||||
for k in self._getrange(bbox):
|
for k in self._getrange(bbox):
|
||||||
|
@ -390,8 +375,6 @@ class Plane(object):
|
||||||
if obj in done:
|
if obj in done:
|
||||||
continue
|
continue
|
||||||
done.add(obj)
|
done.add(obj)
|
||||||
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
|
||||||
obj.y1 <= y0 or y1 <= obj.y0):
|
|
||||||
continue
|
continue
|
||||||
yield obj
|
yield obj
|
||||||
return
|
|
||||||
|
|
Binary file not shown.
13
setup.py
13
setup.py
|
@ -1,18 +1,19 @@
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
import sys
|
|
||||||
|
|
||||||
import pdfminer as package
|
import pdfminer as package
|
||||||
|
|
||||||
requires = ['six', 'pycryptodome', 'sortedcontainers']
|
|
||||||
if sys.version_info >= (3, 0):
|
|
||||||
requires.append('chardet')
|
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='pdfminer.six',
|
name='pdfminer.six',
|
||||||
version=package.__version__,
|
version=package.__version__,
|
||||||
packages=['pdfminer'],
|
packages=['pdfminer'],
|
||||||
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
||||||
install_requires=requires,
|
install_requires=[
|
||||||
|
'chardet ; python_version > "3.0"',
|
||||||
|
'pycryptodome',
|
||||||
|
'six',
|
||||||
|
'sortedcontainers',
|
||||||
|
],
|
||||||
|
extras_require={"dev": ["nose", "tox"]},
|
||||||
description='PDF parser and analyzer',
|
description='PDF parser and analyzer',
|
||||||
long_description=package.__doc__,
|
long_description=package.__doc__,
|
||||||
license='MIT/X',
|
license='MIT/X',
|
||||||
|
|
|
@ -0,0 +1,121 @@
|
||||||
|
"""
|
||||||
|
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
|
||||||
|
|
||||||
|
While not in the specification, lowercase unicode often occurs in PDFs. Therefore lowercase unittest variants are
|
||||||
|
added.
|
||||||
|
"""
|
||||||
|
from nose.tools import assert_raises
|
||||||
|
|
||||||
|
from pdfminer.encodingdb import name2unicode
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_name_in_agl():
    """AGL lookup: the single-component name "Lcommaaccent" maps to U+013B."""
    assert name2unicode('Lcommaaccent') == u'\u013B'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni():
    """The "uni" form "uni013B" maps to U+013B, like "Lcommaaccent" and "u013B"."""
    assert name2unicode('uni013B') == u'\u013B'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_lowercase():
    """Lowercase variant: "uni013b" also maps to U+013B."""
    assert name2unicode('uni013b') == u'\u013B'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_with_sequence_of_digits():
    """The single component "uni20AC0308" maps to the string U+20AC U+0308."""
    assert name2unicode('uni20AC0308') == u'\u20AC\u0308'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """Lowercase variant: "uni20ac0308" maps to the string U+20AC U+0308."""
    assert name2unicode('uni20ac0308') == u'\u20AC\u0308'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_empty_string():
    """Lowercase "uni20ac" is accepted and maps to the euro sign U+20AC.

    The specification would map this name to an empty string, but lowercase
    hexadecimals are supported here on purpose.
    """
    assert name2unicode('uni20ac') == u'\u20ac'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" is rejected with KeyError.

    The specification maps it to an empty string: neither D801 nor DC0C is
    in the appropriate set. This form cannot express the character written
    as D801 DC0C in UTF-16 (U+1040C); the glyph name "u1040C" can.
    """
    with assert_raises(KeyError):
        name2unicode('uniD801DC0C')
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_empty_string_long_lowercase():
    """Lowercase variant: the name "unid801dc0c" is rejected with KeyError.

    The specification maps "uniD801DC0C" to an empty string: neither D801
    nor DC0C is in the appropriate set. This form cannot express the
    character written as D801 DC0C in UTF-16 (U+1040C); "u1040C" can.
    """
    # Pass the lowercase spelling; the uppercase one only repeats the
    # previous test.
    assert_raises(KeyError, name2unicode, 'unid801dc0c')
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_pua():
    """"uniF6FB" maps to U+F6FB, the same string as "Ogoneksmall"."""
    assert name2unicode('uniF6FB') == u'\uF6FB'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_uni_pua_lowercase():
    """Lowercase variant: "unif6fb" maps to U+F6FB."""
    assert name2unicode('unif6fb') == u'\uF6FB'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_u_with_4_digits():
    """The "u" form "u013B" maps to U+013B, like "Lcommaaccent" and "uni013B"."""
    assert name2unicode('u013B') == u'\u013B'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_u_with_4_digits_lowercase():
    """Lowercase variant: "u013b" maps to U+013B."""
    assert name2unicode('u013b') == u'\u013B'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_u_with_5_digits():
    """The single component "u1040C" maps to the string U+1040C."""
    assert name2unicode('u1040C') == u'\U0001040C'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_u_with_5_digits_lowercase():
    """Lowercase variant: "u1040c" maps to the string U+1040C."""
    assert name2unicode('u1040c') == u'\U0001040C'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_multiple_components():
    """"Lcommaaccent_uni20AC0308_u1040C.alternate" maps to U+013B U+20AC U+0308 U+1040C."""
    glyph_name = 'Lcommaaccent_uni20AC0308_u1040C.alternate'
    assert name2unicode(glyph_name) == u'\u013B\u20AC\u0308\U0001040C'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_multiple_components_lowercase():
    """Lowercase variant: the hexadecimal digits of each component are lowercase."""
    glyph_name = 'Lcommaaccent_uni20ac0308_u1040c.alternate'
    assert name2unicode(glyph_name) == u'\u013B\u20AC\u0308\U0001040C'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_foo():
    """The name 'foo' is rejected with KeyError.

    The specification maps it to an empty string, because 'foo' is not in
    AGL and does not start with a 'u'.
    """
    with assert_raises(KeyError):
        name2unicode('foo')
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_notdef():
    """The name ".notdef" is rejected with KeyError.

    The specification reduces it to an empty string (step 1) and maps that
    to an empty string (step 3).
    """
    with assert_raises(KeyError):
        name2unicode('.notdef')
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_pua_ogoneksmall():
    """"Ogoneksmall" maps to U+F6FB, the same string as "uniF6FB"."""
    assert name2unicode('Ogoneksmall') == u'\uF6FB'
|
||||||
|
|
||||||
|
|
||||||
|
def test_name2unicode_overflow_error():
    """A long all-digit glyph name is rejected with KeyError."""
    with assert_raises(KeyError):
        name2unicode('226215240241240240240240')
|
|
@ -3,7 +3,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import nose, logging, os
|
import nose, logging, os
|
||||||
from pdfminer.cmapdb import IdentityCMap, CMap
|
from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
|
||||||
from pdfminer.pdffont import PDFCIDFont
|
from pdfminer.pdffont import PDFCIDFont
|
||||||
from pdfminer.pdftypes import PDFStream
|
from pdfminer.pdftypes import PDFStream
|
||||||
from pdfminer.psparser import PSLiteral
|
from pdfminer.psparser import PSLiteral
|
||||||
|
@ -14,13 +14,13 @@ class TestPDFEncoding():
|
||||||
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
|
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
|
||||||
spec = {'Encoding': stream}
|
spec = {'Encoding': stream}
|
||||||
font = PDFCIDFont(None, spec)
|
font = PDFCIDFont(None, spec)
|
||||||
assert isinstance(font.cmap, CMap)
|
assert isinstance(font.cmap, IdentityCMapByte)
|
||||||
|
|
||||||
def test_cmapname_onebyteidentityH(self):
    """A stream whose CMapName is 'OneByteIdentityH' yields an IdentityCMapByte."""
    stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
    font = PDFCIDFont(None, {'Encoding': stream})
    # Check the concrete identity map; the CMap base class alone is too weak.
    assert isinstance(font.cmap, IdentityCMapByte)
|
||||||
|
|
||||||
def test_cmapname_V(self):
|
def test_cmapname_V(self):
|
||||||
stream = PDFStream({'CMapName': PSLiteral('V')}, '')
|
stream = PDFStream({'CMapName': PSLiteral('V')}, '')
|
||||||
|
@ -68,6 +68,40 @@ class TestPDFEncoding():
|
||||||
font = PDFCIDFont(None, spec)
|
font = PDFCIDFont(None, spec)
|
||||||
assert isinstance(font.cmap, IdentityCMap)
|
assert isinstance(font.cmap, IdentityCMap)
|
||||||
|
|
||||||
|
def test_encoding_DLIdentH(self):
    """An Encoding literal 'DLIdent-H' yields an IdentityCMap."""
    font = PDFCIDFont(None, {'Encoding': PSLiteral('DLIdent-H')})
    assert isinstance(font.cmap, IdentityCMap)
|
||||||
|
|
||||||
|
def test_encoding_DLIdentV(self):
    """An Encoding literal 'DLIdent-V' yields an IdentityCMap."""
    font = PDFCIDFont(None, {'Encoding': PSLiteral('DLIdent-V')})
    assert isinstance(font.cmap, IdentityCMap)
|
||||||
|
|
||||||
|
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
    """A stream whose CMapName is the literal 'DLIdent-H' yields an IdentityCMap."""
    encoding = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
    font = PDFCIDFont(None, {'Encoding': encoding})
    assert isinstance(font.cmap, IdentityCMap)
|
||||||
|
|
||||||
|
def test_encoding_DLIdentV_as_PSLiteral_stream(self):
    # The encoding is a stream whose CMapName is the literal DLIdent-V.
    #
    # This method used to be named test_encoding_DLIdentH_as_PSLiteral_stream,
    # the same name as the DLIdent-H test defined earlier in the class. A later
    # definition with the same name replaces the earlier one in the class body,
    # so the DLIdent-H variant was silently never run. The name now matches
    # the DLIdent-V value that is actually tested here.
    stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
    spec = {'Encoding': stream}
    font = PDFCIDFont(None, spec)
    assert isinstance(font.cmap, IdentityCMap)
|
||||||
|
|
||||||
|
def test_encoding_DLIdentH_as_stream(self):
    # The encoding is a stream whose CMapName is the plain string 'DLIdent-H'
    # (not wrapped in a PSLiteral).
    cmap_stream = PDFStream({'CMapName': 'DLIdent-H'}, '')
    cid_font = PDFCIDFont(None, {'Encoding': cmap_stream})
    assert isinstance(cid_font.cmap, IdentityCMap)
|
||||||
|
|
||||||
|
def test_encoding_DLIdentV_as_stream(self):
    # The encoding is a stream whose CMapName is the plain string 'DLIdent-V'
    # (not wrapped in a PSLiteral).
    cmap_stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
    cid_font = PDFCIDFont(None, {'Encoding': cmap_stream})
    assert isinstance(cid_font.cmap, IdentityCMap)
|
||||||
|
|
||||||
def test_font_without_spec(self):
    # An empty spec has no 'Encoding' entry; the font must still expose a
    # CMap instance.
    cid_font = PDFCIDFont(None, {})
    assert isinstance(cid_font.cmap, CMap)
|
||||||
|
|
|
@ -1,22 +1,28 @@
|
||||||
#!/usr/bin/env python
|
import os
|
||||||
|
from shutil import rmtree
|
||||||
|
from tempfile import NamedTemporaryFile, mkdtemp
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
import nose
|
||||||
|
|
||||||
import nose, logging, os
|
|
||||||
|
|
||||||
import tools.pdf2txt as pdf2txt
|
import tools.pdf2txt as pdf2txt
|
||||||
|
|
||||||
path=os.path.dirname(os.path.abspath(__file__))+'/'
|
|
||||||
|
def full_path(relative_path_to_this_file):
    """Return the absolute form of a path given relative to this file's directory."""
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.abspath(os.path.join(here, relative_path_to_this_file))
|
||||||
|
|
||||||
|
|
||||||
def run(datapath, filename, options=None):
    """Convert a sample PDF to text by calling ``pdf2txt.main``.

    The input is ``<datapath><filename>.pdf`` and the output is
    ``<filename>.txt``; both are resolved with ``full_path``.

    :param datapath: directory of the sample, including the trailing separator.
    :param filename: sample name without the ``.pdf`` extension.
    :param options: optional string of extra pdf2txt arguments, separated by
        single spaces.
    """
    input_file = full_path(datapath + filename + '.pdf')
    output_file = full_path(filename + '.txt')
    # Build the argument list directly instead of formatting a command string
    # and splitting it on spaces: a space anywhere in the resolved paths would
    # otherwise be split into several bogus arguments.
    args = ['-o%s' % output_file]
    if options:
        args.extend(options.split(' '))
    args.append(input_file)
    pdf2txt.main(args)
|
||||||
|
|
||||||
|
|
||||||
class TestDumpPDF():
|
class TestDumpPDF():
|
||||||
|
|
||||||
def test_1(self):
|
def test_1(self):
|
||||||
|
@ -24,6 +30,7 @@ class TestDumpPDF():
|
||||||
run('../samples/', 'simple1')
|
run('../samples/', 'simple1')
|
||||||
run('../samples/', 'simple2')
|
run('../samples/', 'simple2')
|
||||||
run('../samples/', 'simple3')
|
run('../samples/', 'simple3')
|
||||||
|
run('../samples/','sampleOneByteIdentityEncode')
|
||||||
|
|
||||||
def test_2(self):
|
def test_2(self):
|
||||||
run('../samples/nonfree/', 'dmca')
|
run('../samples/nonfree/', 'dmca')
|
||||||
|
@ -57,5 +64,30 @@ class TestDumpPDF():
|
||||||
def test_10(self):
|
def test_10(self):
|
||||||
run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96
|
run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96
|
||||||
|
|
||||||
|
|
||||||
|
class TestDumpImages(object):
    """Runs pdf2txt with ``--output-dir`` and inspects the files it leaves there."""

    def extract_images(self, input_file):
        """Run pdf2txt on ``input_file`` and list what it wrote to the output dir.

        The text output goes to a throw-away temporary file and the
        ``--output-dir`` target is a fresh temporary directory.

        :param input_file: path of the PDF to process.
        :return: the names returned by ``os.listdir`` for the output directory.
        """
        output_dir = mkdtemp()
        try:
            with NamedTemporaryFile() as output_file:
                commands = ['-o', output_file.name, '--output-dir', output_dir, input_file]
                pdf2txt.main(commands)
            image_files = os.listdir(output_dir)
        finally:
            # mkdtemp() leaves deletion to the caller; clean up even when the
            # conversion raises so failing runs do not pile up directories.
            rmtree(output_dir)
        return image_files

    def test_nonfree_dmca(self):
        """Extract images of pdf containing bmp images

        Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131
        """
        image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf'))
        assert image_files[0].endswith('bmp')

    def test_nonfree_175(self):
        """Extract images of pdf containing jpg images"""
        self.extract_images(full_path('../samples/nonfree/175.pdf'))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
nose.runmodule()
|
nose.runmodule()
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
from nose.tools import assert_equal
|
||||||
|
|
||||||
|
from pdfminer.layout import LTComponent
|
||||||
|
from pdfminer.utils import make_compat_str, Plane
|
||||||
|
|
||||||
|
|
||||||
|
class TestPlane(object):
    """Exercises Plane.find() and Plane.remove() on a plane holding one object."""

    def test_find_nothing_in_empty_bbox(self):
        plane, _ = self.given_plane_with_one_object()
        assert_equal(list(plane.find((50, 50, 100, 100))), [])

    def test_find_nothing_after_removing(self):
        plane, component = self.given_plane_with_one_object()
        plane.remove(component)
        assert_equal(list(plane.find((0, 0, 100, 100))), [])

    def test_find_object_in_whole_plane(self):
        plane, component = self.given_plane_with_one_object()
        assert_equal(list(plane.find((0, 0, 100, 100))), [component])

    def test_find_if_object_is_smaller_than_gridsize(self):
        plane, component = self.given_plane_with_one_object(object_size=1, gridsize=100)
        assert_equal(list(plane.find((0, 0, 100, 100))), [component])

    def test_find_object_if_much_larger_than_gridsize(self):
        plane, component = self.given_plane_with_one_object(object_size=100, gridsize=10)
        assert_equal(list(plane.find((0, 0, 100, 100))), [component])

    @staticmethod
    def given_plane_with_one_object(object_size=50, gridsize=50):
        """Return ``(plane, obj)``: a Plane over (0, 0, 100, 100) holding one
        LTComponent whose box is (0, 0, object_size, object_size)."""
        plane = Plane((0, 0, 100, 100), gridsize)
        component = LTComponent((0, 0, object_size, object_size))
        plane.add(component)
        return plane, component
|
|
@ -3,6 +3,8 @@
|
||||||
import sys
|
import sys
|
||||||
import fileinput
|
import fileinput
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
fonts = {}
|
fonts = {}
|
||||||
for line in fileinput.input():
|
for line in fileinput.input():
|
||||||
|
@ -33,7 +35,7 @@ def main(argv):
|
||||||
props[k] = tuple(map(float, f[1:5]))
|
props[k] = tuple(map(float, f[1:5]))
|
||||||
print ('# -*- python -*-')
|
print ('# -*- python -*-')
|
||||||
print ('FONT_METRICS = {')
|
print ('FONT_METRICS = {')
|
||||||
for (fontname,(props,chars)) in fonts.iteritems():
|
for (fontname,(props,chars)) in six.iteritems(fonts):
|
||||||
print (' %r: %r,' % (fontname, (props,chars)))
|
print (' %r: %r,' % (fontname, (props,chars)))
|
||||||
print ('}')
|
print ('}')
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -26,6 +26,7 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
from pdfminer.converter import HTMLConverter, TextConverter
|
from pdfminer.converter import HTMLConverter, TextConverter
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
# quote HTML metacharacters
|
# quote HTML metacharacters
|
||||||
def q(x):
|
def q(x):
|
||||||
|
@ -35,7 +36,7 @@ def q(x):
|
||||||
# Matches every character that url() percent-escapes.
# NOTE(review): inside the character class `.-=` is parsed as a RANGE from '.'
# to '=' (i.e. . / 0-9 : ; < =), not as the three literals '.', '-' and '='.
# As written, '-' IS matched (escaped) while '/', ':', ';' and '<' are not.
# If the literals were intended, the class should end with `.=-]` - confirm
# before changing, since that alters which characters get escaped.
Q = re.compile(r'[^a-zA-Z0-9_.-=]')
|
||||||
def url(base, **kw):
    # Returns base followed by the keyword arguments rendered as "k=v" pairs
    # joined with '&'. Each value is passed through q() and encoder(), then
    # every character matched by Q is replaced by its %XX hex escape.
    def escape(match):
        return '%%%02X' % ord(match.group(0))

    pairs = [
        '%s=%s' % (key, Q.sub(escape, encoder(q(value), 'replace')[0]))
        for (key, value) in six.iteritems(kw)
    ]
    return base + '&'.join(pairs)
|
||||||
|
|
Loading…
Reference in New Issue