commit
a5a34d53bd
|
@ -5,9 +5,6 @@ python:
|
|||
- "3.5"
|
||||
- "3.6"
|
||||
install:
|
||||
- pip install six
|
||||
- pip install pycryptodome
|
||||
- pip install chardet
|
||||
- pip install sortedcontainers
|
||||
- pip install tox-travis
|
||||
script:
|
||||
nosetests --nologcapture
|
||||
- tox
|
||||
|
|
41
CHANGELOG.md
41
CHANGELOG.md
|
@ -1,6 +1,37 @@
|
|||
# List of changes
|
||||
# Changelog
|
||||
All notable changes in pdfminer.six will be documented in this file.
|
||||
|
||||
## Version 20181108
|
||||
- PR #141 to speedup layout analysis
|
||||
- PR #173 for using argparse and replace deprecated getopt
|
||||
- PR #142 to compile pdfminer.six with cython, successfully
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
Nothing yet
|
||||
|
||||
## [20191020] - 2019-10-20
|
||||
|
||||
### Deprecated
|
||||
- Support for Python 2 will be dropped on January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307))
|
||||
|
||||
### Added
|
||||
- Contribution guidelines in [CONTRIBUTING.md](CONTRIBUTING.md) ([#259](https://github.com/pdfminer/pdfminer.six/pull/259))
|
||||
- Support new encodings OneByteEncoding and DLIdent for CMaps ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
|
||||
|
||||
### Fixed
|
||||
- Use `six.iteritems()` instead of `dict().iteritems()` to ensure Python2 and Python3 compatibility ([#274](https://github.com/pdfminer/pdfminer.six/pull/274))
|
||||
- Properly convert Adobe Glyph names to unicode characters ([#263](https://github.com/pdfminer/pdfminer.six/pull/263))
|
||||
- Allow CMap to be a content stream ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
|
||||
- Resolve indirect objects for width and bounding boxes for fonts ([#273](https://github.com/pdfminer/pdfminer.six/pull/273))
|
||||
- Actually update the stroke color in the graphic state ([#298](https://github.com/pdfminer/pdfminer.six/pull/298))
|
||||
- Interpret (invalid) negative font descent as a positive descent ([#203](https://github.com/pdfminer/pdfminer.six/pull/203))
|
||||
- Correct colorspace comparison for images ([#132](https://github.com/pdfminer/pdfminer.six/pull/132))
|
||||
- Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))
|
||||
|
||||
### Changed
|
||||
- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
|
||||
|
||||
## [20181108] - 2018-11-08
|
||||
|
||||
### Changed
|
||||
- Speedup layout analysis ([#141](https://github.com/pdfminer/pdfminer.six/pull/141))
|
||||
- Use argparse instead of the deprecated getopt ([#173](https://github.com/pdfminer/pdfminer.six/pull/173))
|
||||
- Allow pdfminer.six to be compiled with cython ([#142](https://github.com/pdfminer/pdfminer.six/pull/142))
|
|
@ -0,0 +1,64 @@
|
|||
# Contributing guidelines
|
||||
|
||||
Any contribution is appreciated! You might want to:
|
||||
|
||||
* Fix spelling errors
|
||||
* Improve documentation
|
||||
* Add tests for untested code
|
||||
* Add new features
|
||||
* Fix bugs
|
||||
|
||||
## How can I contribute?
|
||||
|
||||
* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and request features
|
||||
- If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
|
||||
issue.
|
||||
* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request).
|
||||
* Help others by giving your thoughts on open issues and pull requests.
|
||||
|
||||
## Guidelines for creating issues
|
||||
|
||||
* Search previous issues, as yours might be a duplicate.
|
||||
* When creating a new issue for a bug, include a minimal reproducible example.
|
||||
* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This
|
||||
will help others to see the importance of your feature request.
|
||||
|
||||
## Guidelines for creating pull requests
|
||||
|
||||
* A pull request should close an existing issue.
|
||||
* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.
|
||||
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
|
||||
of features, this will show that your code works correctly.
|
||||
* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (with a line-width of 120)
|
||||
and be properly documented with docstrings.
|
||||
* Check spelling and grammar.
|
||||
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#unreleased).
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Clone the repository
|
||||
|
||||
```sh
|
||||
git clone https://github.com/pdfminer/pdfminer.six
|
||||
cd pdfminer.six
|
||||
```
|
||||
|
||||
2. Install dev dependencies
|
||||
|
||||
```sh
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
3. Run the tests
|
||||
|
||||
On all Python versions:
|
||||
|
||||
```sh
|
||||
tox
|
||||
```
|
||||
|
||||
Or on a single Python version:
|
||||
|
||||
```sh
|
||||
tox -e py36
|
||||
```
|
31
README.md
31
README.md
|
@ -35,7 +35,7 @@ Features
|
|||
How to Install
|
||||
--------------
|
||||
|
||||
* Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six)
|
||||
* Install Python 2.7 or newer.
|
||||
* Install
|
||||
|
||||
`pip install pdfminer.six`
|
||||
|
@ -81,30 +81,7 @@ TODO
|
|||
* Performance improvements.
|
||||
|
||||
|
||||
Terms and Conditions
|
||||
--------------------
|
||||
Contributing
|
||||
------------
|
||||
|
||||
(This is so-called MIT/X License)
|
||||
|
||||
Copyright (c) 2004-2014 Yusuke Shinyama <yusuke at cs dot nyu dot edu>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
||||
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
||||
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md).
|
||||
|
|
|
@ -10,7 +10,15 @@ It includes a PDF converter that can transform PDF files into other text
|
|||
formats (such as HTML). It has an extensible PDF parser that can be used for
|
||||
other purposes instead of text analysis.
|
||||
"""
|
||||
__version__ = '20181108'
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
__version__ = '20191020'
|
||||
|
||||
|
||||
if sys.version_info < (3, 0):
|
||||
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
|
||||
'more information see https://github.com/pdfminer/pdfminer.six/issues/194')
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
|
||||
|
||||
""" Adobe character mapping (CMap) support.
|
||||
|
||||
CMaps provide the mapping between character codes and Unicode
|
||||
|
@ -40,8 +38,6 @@ class CMapError(Exception):
|
|||
pass
|
||||
|
||||
|
||||
## CMapBase
|
||||
##
|
||||
class CMapBase(object):
|
||||
|
||||
debug = 0
|
||||
|
@ -67,8 +63,6 @@ class CMapBase(object):
|
|||
return
|
||||
|
||||
|
||||
## CMap
|
||||
##
|
||||
class CMap(CMapBase):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
@ -83,7 +77,7 @@ class CMap(CMapBase):
|
|||
assert isinstance(cmap, CMap), str(type(cmap))
|
||||
|
||||
def copy(dst, src):
|
||||
for (k, v) in src.iteritems():
|
||||
for (k, v) in six.iteritems(src):
|
||||
if isinstance(v, dict):
|
||||
d = {}
|
||||
dst[k] = d
|
||||
|
@ -110,7 +104,7 @@ class CMap(CMapBase):
|
|||
if code2cid is None:
|
||||
code2cid = self.code2cid
|
||||
code = ()
|
||||
for (k, v) in sorted(code2cid.iteritems()):
|
||||
for (k, v) in sorted(six.iteritems(code2cid)):
|
||||
c = code+(k,)
|
||||
if isinstance(v, int):
|
||||
out.write('code %r = cid %d\n' % (c, v))
|
||||
|
@ -119,8 +113,6 @@ class CMap(CMapBase):
|
|||
return
|
||||
|
||||
|
||||
## IdentityCMap
|
||||
##
|
||||
class IdentityCMap(CMapBase):
|
||||
|
||||
def decode(self, code):
|
||||
|
@ -131,8 +123,16 @@ class IdentityCMap(CMapBase):
|
|||
return ()
|
||||
|
||||
|
||||
## UnicodeMap
|
||||
##
|
||||
class IdentityCMapByte(IdentityCMap):
    """IdentityCMap subclass whose ``decode`` reads one code per byte."""

    def decode(self, code):
        """Unpack ``code`` into a tuple holding one unsigned integer per byte.

        :param code: the raw bytes to decode
        :returns tuple of integers, or an empty tuple when ``code`` is empty
        """
        length = len(code)
        if not length:
            return ()
        # '>%dB' asks struct for `length` unsigned 8-bit values.
        return struct.unpack('>%dB' % length, code)
|
||||
|
||||
|
||||
class UnicodeMap(CMapBase):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
@ -148,13 +148,11 @@ class UnicodeMap(CMapBase):
|
|||
return self.cid2unichr[cid]
|
||||
|
||||
def dump(self, out=sys.stdout):
    """Write every cid to unicode mapping to ``out``, ordered by cid.

    :param out: object with a ``write`` method; defaults to ``sys.stdout``
    """
    # ``dict.items()`` exists on both Python 2 and Python 3, unlike the
    # Python-2-only ``dict.iteritems()``.
    for (cid, unichr) in sorted(self.cid2unichr.items()):
        out.write('cid %d = unicode %r\n' % (cid, unichr))
    return
|
||||
|
||||
|
||||
## FileCMap
|
||||
##
|
||||
class FileCMap(CMap):
|
||||
|
||||
def add_code2cid(self, code, cid):
|
||||
|
@ -173,8 +171,6 @@ class FileCMap(CMap):
|
|||
return
|
||||
|
||||
|
||||
## FileUnicodeMap
|
||||
##
|
||||
class FileUnicodeMap(UnicodeMap):
|
||||
|
||||
def add_cid2unichr(self, cid, code):
|
||||
|
@ -192,8 +188,6 @@ class FileUnicodeMap(UnicodeMap):
|
|||
return
|
||||
|
||||
|
||||
## PyCMap
|
||||
##
|
||||
class PyCMap(CMap):
|
||||
|
||||
def __init__(self, name, module):
|
||||
|
@ -204,8 +198,6 @@ class PyCMap(CMap):
|
|||
return
|
||||
|
||||
|
||||
## PyUnicodeMap
|
||||
##
|
||||
class PyUnicodeMap(UnicodeMap):
|
||||
|
||||
def __init__(self, name, module, vertical):
|
||||
|
@ -218,8 +210,6 @@ class PyUnicodeMap(UnicodeMap):
|
|||
return
|
||||
|
||||
|
||||
## CMapDB
|
||||
##
|
||||
class CMapDB(object):
|
||||
|
||||
_cmap_cache = {}
|
||||
|
@ -252,6 +242,10 @@ class CMapDB(object):
|
|||
return IdentityCMap(WMode=0)
|
||||
elif name == 'Identity-V':
|
||||
return IdentityCMap(WMode=1)
|
||||
elif name == 'OneByteIdentityH':
|
||||
return IdentityCMapByte(WMode=0)
|
||||
elif name == 'OneByteIdentityV':
|
||||
return IdentityCMapByte(WMode=1)
|
||||
try:
|
||||
return klass._cmap_cache[name]
|
||||
except KeyError:
|
||||
|
@ -271,8 +265,6 @@ class CMapDB(object):
|
|||
return umaps[vertical]
|
||||
|
||||
|
||||
## CMapParser
|
||||
##
|
||||
class CMapParser(PSStackParser):
|
||||
|
||||
def __init__(self, cmap, fp):
|
||||
|
@ -360,7 +352,6 @@ class CMapParser(PSStackParser):
|
|||
s1 = nunpack(svar)
|
||||
e1 = nunpack(evar)
|
||||
vlen = len(svar)
|
||||
#assert s1 <= e1, str((s1, e1))
|
||||
for i in range(e1-s1+1):
|
||||
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||
self.cmap.add_code2cid(x, cid+i)
|
||||
|
@ -387,7 +378,6 @@ class CMapParser(PSStackParser):
|
|||
continue
|
||||
s1 = nunpack(s)
|
||||
e1 = nunpack(e)
|
||||
#assert s1 <= e1, str((s1, e1))
|
||||
if isinstance(code, list):
|
||||
for i in range(e1-s1+1):
|
||||
self.cmap.add_cid2unichr(s1+i, code[i])
|
||||
|
@ -422,17 +412,16 @@ class CMapParser(PSStackParser):
|
|||
return
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
    """Parse every CMap file named in ``argv[1:]`` and dump its unicode map.

    :param argv: command line arguments; ``argv[0]`` is skipped
    """
    for fname in argv[1:]:
        # ``with`` closes the file even when the parser raises.
        with open(fname, 'rb') as fp:
            cmap = FileUnicodeMap()
            CMapParser(cmap, fp).run()
        cmap.dump()
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -1,28 +1,67 @@
|
|||
|
||||
import logging
|
||||
import re
|
||||
from .psparser import PSLiteral
|
||||
from .glyphlist import glyphname2unicode
|
||||
from .latin_enc import ENCODING
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
STRIP_NAME = re.compile(r'[0-9A-Fa-f]+')
|
||||
from .glyphlist import glyphname2unicode
|
||||
from .latin_enc import ENCODING
|
||||
from .psparser import PSLiteral
|
||||
|
||||
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
## name2unicode
|
||||
##
|
||||
def _is_hexadecimal(text):
    """Return True when the whole of ``text`` consists of hexadecimal digits."""
    match = HEXADECIMAL.match(text)
    # HEXADECIMAL is not anchored at the end, so require the match to span the full text.
    return match is not None and match.end() == len(text)


def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    In contrast to the specification, this raises a KeyError instead of returning an empty string when the key is
    unknown. This way the caller must explicitly define what to do when there is not a match.

    Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name; everything after the first '.' is ignored and '_' separates components
    :returns unicode character(s) for the glyph name
    :raises KeyError if the name is not in the glyph list and is not a valid 'uniXXXX' or 'uXXXX' name
    """
    name = name.split('.')[0]
    components = name.split('_')

    if len(components) > 1:
        return ''.join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith('uni'):
        # Slice the prefix off; str.strip('uni') would strip the characters u/n/i from both ends instead.
        name_without_uni = name[len('uni'):]

        if _is_hexadecimal(name_without_uni) and len(name_without_uni) % 4 == 0:
            unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
            for digit in unicode_digits:
                raise_key_error_for_invalid_unicode(digit)
            return ''.join(map(six.unichr, unicode_digits))

    elif name.startswith('u'):
        name_without_u = name[len('u'):]

        if _is_hexadecimal(name_without_u) and 4 <= len(name_without_u) <= 6:
            unicode_digit = int(name_without_u, base=16)
            # Values above 10FFFF are no code points; fall through to the KeyError below instead of letting
            # six.unichr raise a ValueError.
            if unicode_digit <= 0x10FFFF:
                raise_key_error_for_invalid_unicode(unicode_digit)
                return six.unichr(unicode_digit)

    raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
|
||||
|
||||
|
||||
def raise_key_error_for_invalid_unicode(unicode_digit):
    """Reject values in the range D800 through DFFF, which UTF-16 reserves for surrogate pairs.

    :param unicode_digit: the number to validate
    :raises KeyError if unicode digit is invalid
    """
    # Exclusive bounds D7FF and E000 enclose exactly the surrogate range.
    in_surrogate_range = 0xD7FF < unicode_digit < 0xE000
    if in_surrogate_range:
        raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
|
||||
|
||||
|
||||
## EncodingDB
|
||||
##
|
||||
class EncodingDB(object):
|
||||
|
||||
std2unicode = {}
|
||||
|
@ -59,7 +98,7 @@ class EncodingDB(object):
|
|||
elif isinstance(x, PSLiteral):
|
||||
try:
|
||||
cid2unicode[cid] = name2unicode(x.name)
|
||||
except KeyError:
|
||||
pass
|
||||
except KeyError as e:
|
||||
log.debug(str(e))
|
||||
cid += 1
|
||||
return cid2unicode
|
||||
|
|
|
@ -74,7 +74,7 @@ class ImageWriter(object):
|
|||
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
||||
ext = '.jpg'
|
||||
elif (image.bits == 1 or
|
||||
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
|
||||
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
|
||||
ext = '.%dx%d.bmp' % (width, height)
|
||||
else:
|
||||
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
||||
|
@ -101,7 +101,7 @@ class ImageWriter(object):
|
|||
for y in range(height):
|
||||
bmp.write_line(y, data[i:i+width])
|
||||
i += width
|
||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
|
||||
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
|
||||
bmp = BMPWriter(fp, 24, width, height)
|
||||
data = stream.get_data()
|
||||
i = 0
|
||||
|
@ -109,7 +109,7 @@ class ImageWriter(object):
|
|||
for y in range(height):
|
||||
bmp.write_line(y, data[i:i+width])
|
||||
i += width
|
||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
|
||||
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
|
||||
bmp = BMPWriter(fp, 8, width, height)
|
||||
data = stream.get_data()
|
||||
i = 0
|
||||
|
|
|
@ -178,7 +178,7 @@ class TagExtractor(PDFDevice):
|
|||
s = ''
|
||||
if isinstance(props, dict):
|
||||
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
|
||||
in sorted(props.iteritems()))
|
||||
in sorted(six.iteritems(props)))
|
||||
out_s = '<%s%s>' % (utils.enc(tag.name), s)
|
||||
self.outfp.write(utils.make_compat_bytes(out_s))
|
||||
self._stack.append(tag)
|
||||
|
|
|
@ -1,36 +1,39 @@
|
|||
|
||||
import sys
|
||||
import logging
|
||||
import struct
|
||||
import sys
|
||||
from io import BytesIO
|
||||
from .cmapdb import CMapDB
|
||||
from .cmapdb import CMapParser
|
||||
from .cmapdb import FileUnicodeMap
|
||||
from .cmapdb import CMap
|
||||
from .encodingdb import EncodingDB
|
||||
from .encodingdb import name2unicode
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import PSEOF
|
||||
from .psparser import LIT
|
||||
from .psparser import KWD
|
||||
from . import settings
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import literal_name
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import num_value
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import stream_value
|
||||
from .fontmetrics import FONT_METRICS
|
||||
from .utils import apply_matrix_norm
|
||||
from .utils import nunpack
|
||||
from .utils import choplist
|
||||
from .utils import isnumber
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
from . import settings
|
||||
from .cmapdb import CMap
|
||||
from .cmapdb import CMapDB
|
||||
from .cmapdb import CMapParser
|
||||
from .cmapdb import FileUnicodeMap
|
||||
from .encodingdb import EncodingDB
|
||||
from .encodingdb import name2unicode
|
||||
from .fontmetrics import FONT_METRICS
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import list_value
|
||||
from .pdftypes import num_value
|
||||
from .pdftypes import resolve1, resolve_all
|
||||
from .pdftypes import stream_value
|
||||
from .psparser import KWD
|
||||
from .psparser import LIT
|
||||
from .psparser import PSEOF
|
||||
from .psparser import PSLiteral
|
||||
from .psparser import PSStackParser
|
||||
from .psparser import literal_name
|
||||
from .utils import apply_matrix_norm
|
||||
from .utils import choplist
|
||||
from .utils import isnumber
|
||||
from .utils import nunpack
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
def get_widths(seq):
|
||||
widths = {}
|
||||
|
@ -50,10 +53,6 @@ def get_widths(seq):
|
|||
widths[i] = w
|
||||
r = []
|
||||
return widths
|
||||
#assert get_widths([1]) == {}
|
||||
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
||||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
||||
|
||||
|
||||
def get_widths2(seq):
|
||||
widths = {}
|
||||
|
@ -73,13 +72,8 @@ def get_widths2(seq):
|
|||
widths[i] = (w, (vx, vy))
|
||||
r = []
|
||||
return widths
|
||||
#assert get_widths2([1]) == {}
|
||||
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
|
||||
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
|
||||
|
||||
|
||||
## FontMetricsDB
|
||||
##
|
||||
class FontMetricsDB(object):
|
||||
|
||||
@classmethod
|
||||
|
@ -87,8 +81,6 @@ class FontMetricsDB(object):
|
|||
return FONT_METRICS[fontname]
|
||||
|
||||
|
||||
## Type1FontHeaderParser
|
||||
##
|
||||
class Type1FontHeaderParser(PSStackParser):
|
||||
|
||||
KEYWORD_BEGIN = KWD(b'begin')
|
||||
|
@ -99,7 +91,6 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
KEYWORD_ARRAY = KWD(b'array')
|
||||
KEYWORD_READONLY = KWD(b'readonly')
|
||||
KEYWORD_FOR = KWD(b'for')
|
||||
KEYWORD_FOR = KWD(b'for')
|
||||
|
||||
def __init__(self, data):
|
||||
PSStackParser.__init__(self, data)
|
||||
|
@ -107,6 +98,17 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
return
|
||||
|
||||
def get_encoding(self):
|
||||
"""Parse the font encoding
|
||||
|
||||
The Type1 font encoding maps character codes to character names. These character names could either be standard
|
||||
Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
|
||||
sequence of operations that describe how the character should be drawn.
|
||||
Currently, this function returns '' (empty string) for character names that are associated with a CharStrings.
|
||||
|
||||
References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
|
||||
|
||||
:returns mapping of character identifiers (cid's) to unicode characters
|
||||
"""
|
||||
while 1:
|
||||
try:
|
||||
(cid, name) = self.nextobject()
|
||||
|
@ -114,8 +116,8 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
break
|
||||
try:
|
||||
self._cid2unicode[cid] = name2unicode(name)
|
||||
except KeyError:
|
||||
pass
|
||||
except KeyError as e:
|
||||
log.debug(str(e))
|
||||
return self._cid2unicode
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
|
@ -128,12 +130,17 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
|
||||
|
||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||
IDENTITY_ENCODER = ('Identity-H', 'Identity-V')
|
||||
|
||||
## CFFFont
|
||||
## (Format specified in Adobe Technical Note: #5176
|
||||
## "The Compact Font Format Specification")
|
||||
##
|
||||
#Note: DLIdent-* isn't found in the PDF Reference but has been kept as
|
||||
#it is harmless and has the possibility of being a type. (inferred from bug report/PR)
|
||||
IDENTITY_ENCODER = {'Identity-H':'Identity-H',
|
||||
'Identity-V':'Identity-V',
|
||||
'DLIdent-H':'Identity-H',
|
||||
'DLIdent-V':'Identity-V',
|
||||
'OneByteIdentityH':'OneByteIdentityH',
|
||||
'OneByteIdentityV':'OneByteIdentityV',
|
||||
}
|
||||
|
||||
def getdict(data):
|
||||
d = {}
|
||||
fp = BytesIO(data)
|
||||
|
@ -261,6 +268,7 @@ class CFFFont(object):
|
|||
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
||||
)
|
||||
|
||||
|
||||
class INDEX(object):
|
||||
|
||||
def __init__(self, fp):
|
||||
|
@ -361,9 +369,6 @@ class CFFFont(object):
|
|||
assert False, str(('Unhandled', format))
|
||||
else:
|
||||
raise ValueError('unsupported charset format: %r' % format)
|
||||
#print self.code2gid
|
||||
#print self.name2gid
|
||||
#assert 0
|
||||
return
|
||||
|
||||
def getstr(self, sid):
|
||||
|
@ -372,8 +377,6 @@ class CFFFont(object):
|
|||
return self.string_index[sid-len(self.STANDARD_STRINGS)]
|
||||
|
||||
|
||||
## TrueTypeFont
|
||||
##
|
||||
class TrueTypeFont(object):
|
||||
|
||||
class CMapNotFound(Exception):
|
||||
|
@ -454,13 +457,11 @@ class TrueTypeFont(object):
|
|||
assert False, str(('Unhandled', fmttype))
|
||||
# create unicode map
|
||||
unicode_map = FileUnicodeMap()
|
||||
for (char, gid) in char2gid.iteritems():
|
||||
for (char, gid) in six.iteritems(char2gid):
|
||||
unicode_map.add_cid2unichr(gid, char)
|
||||
return unicode_map
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
class PDFFontError(PDFException):
|
||||
pass
|
||||
|
||||
|
@ -472,12 +473,11 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
|||
LITERAL_TYPE1C = LIT('Type1C')
|
||||
|
||||
|
||||
# PDFFont
|
||||
class PDFFont(object):
|
||||
|
||||
def __init__(self, descriptor, widths, default_width=None):
|
||||
self.descriptor = descriptor
|
||||
self.widths = widths
|
||||
self.widths = resolve_all(widths)
|
||||
self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
|
||||
if isinstance(self.fontname, PSLiteral):
|
||||
self.fontname = literal_name(self.fontname)
|
||||
|
@ -487,8 +487,15 @@ class PDFFont(object):
|
|||
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
||||
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
||||
self.leading = num_value(descriptor.get('Leading', 0))
|
||||
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
|
||||
self.bbox = list_value(resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0))))
|
||||
self.hscale = self.vscale = .001
|
||||
|
||||
# PDF RM 9.8.1 specifies /Descent should always be a negative number.
|
||||
# PScript5.dll seems to produce Descent with a positive number, but
|
||||
# text analysis will be wrong if this is taken as correct. So force
|
||||
# descent to negative.
|
||||
if self.descent > 0:
|
||||
self.descent = -self.descent
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -504,9 +511,11 @@ class PDFFont(object):
|
|||
return bytearray(bytes) # map(ord, bytes)
|
||||
|
||||
def get_ascent(self):
    """Return the ascent above the baseline, in text space units."""
    ascent_in_text_space = self.ascent * self.vscale
    return ascent_in_text_space
|
||||
|
||||
def get_descent(self):
    """Return the descent below the baseline, in text space units; always negative."""
    descent_in_text_space = self.descent * self.vscale
    return descent_in_text_space
|
||||
|
||||
def get_width(self):
|
||||
|
@ -537,7 +546,6 @@ class PDFFont(object):
|
|||
return sum(self.char_width(cid) for cid in self.decode(s))
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
|
||||
|
||||
def __init__(self, descriptor, widths, spec):
|
||||
|
@ -574,7 +582,6 @@ class PDFSimpleFont(PDFFont):
|
|||
raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
|
@ -606,14 +613,12 @@ class PDFType1Font(PDFSimpleFont):
|
|||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
|
@ -636,7 +641,6 @@ class PDFType3Font(PDFSimpleFont):
|
|||
return '<PDFType3Font>'
|
||||
|
||||
|
||||
# PDFCIDFont
|
||||
class PDFCIDFont(PDFFont):
|
||||
|
||||
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
|
||||
|
@ -701,9 +705,9 @@ class PDFCIDFont(PDFFont):
|
|||
"""
|
||||
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
||||
Encoding but as an attribute of CMapName, where CMapName is an
|
||||
attribure of spec['Encoding'].
|
||||
The horizaontal/vertical modes are mentioned with diffrent name
|
||||
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'
|
||||
attribute of spec['Encoding'].
|
||||
The horizontal/vertical modes are mentioned with different name
|
||||
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
|
||||
"""
|
||||
try:
|
||||
spec_encoding = spec['Encoding']
|
||||
|
@ -723,7 +727,7 @@ class PDFCIDFont(PDFFont):
|
|||
raise PDFFontError('CMapName unspecified for encoding')
|
||||
cmap_name = 'unknown'
|
||||
if cmap_name in IDENTITY_ENCODER:
|
||||
return CMapDB.get_cmap(cmap_name)
|
||||
return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name])
|
||||
else:
|
||||
return CMap()
|
||||
|
||||
|
@ -751,16 +755,14 @@ class PDFCIDFont(PDFFont):
|
|||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
    """Load every file named in ``argv[1:]`` as a CFF font and print it.

    :param argv: command line arguments; ``argv[0]`` is skipped
    """
    for fname in argv[1:]:
        # ``with`` closes the file even when font parsing raises.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -598,25 +598,25 @@ class PDFPageInterpreter(object):
|
|||
|
||||
# setrgb-stroking
|
||||
def do_RG(self, r, g, b):
    """Set the stroking color of the graphic state to the RGB triple ``(r, g, b)``."""
    # Only ``scolor`` is written; the color space is left untouched
    # (the do_CS(LITERAL_DEVICE_RGB) call is disabled).
    self.graphicstate.scolor = (r, g, b)
    return
|
||||
|
||||
# setrgb-non-stroking
|
||||
def do_rg(self, r, g, b):
    """Set the non-stroking color of the graphic state to the RGB triple ``(r, g, b)``."""
    # Only ``ncolor`` is written; the color space is left untouched
    # (the do_cs(LITERAL_DEVICE_RGB) call is disabled).
    self.graphicstate.ncolor = (r, g, b)
    return
|
||||
|
||||
# setcmyk-stroking
|
||||
def do_K(self, c, m, y, k):
    """Set the stroking color of the graphic state to the CMYK tuple ``(c, m, y, k)``."""
    # Only ``scolor`` is written; the color space is left untouched
    # (the do_CS(LITERAL_DEVICE_CMYK) call is disabled).
    self.graphicstate.scolor = (c, m, y, k)
    return
|
||||
|
||||
# setcmyk-non-stroking
|
||||
def do_k(self, c, m, y, k):
    """Set the non-stroking color of the graphic state to the CMYK tuple ``(c, m, y, k)``."""
    # Only ``ncolor`` is written; the color space is left untouched
    # (the do_cs(LITERAL_DEVICE_CMYK) call is disabled).
    self.graphicstate.ncolor = (c, m, y, k)
    return
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@ def resolve_all(x, default=None):
|
|||
if isinstance(x, list):
|
||||
x = [resolve_all(v, default=default) for v in x]
|
||||
elif isinstance(x, dict):
|
||||
for (k, v) in x.iteritems():
|
||||
for (k, v) in six.iteritems(x):
|
||||
x[k] = resolve_all(v, default=default)
|
||||
return x
|
||||
|
||||
|
|
|
@ -1,53 +1,55 @@
|
|||
|
||||
"""
|
||||
Miscellaneous Routines.
|
||||
"""
|
||||
import struct
|
||||
# from sys import maxint as INF #doesn't work anymore under Python3,
|
||||
# but PDF still uses 32 bits ints
|
||||
INF = (1<<31) - 1
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
# from sys import maxint as INF doesn't work anymore under Python3, but PDF still uses 32 bits ints
|
||||
INF = (1 << 31) - 1
|
||||
|
||||
if six.PY3:
|
||||
import chardet # For str encoding detection in Py3
|
||||
|
||||
unicode = str
|
||||
|
||||
|
||||
def make_compat_bytes(in_str):
    """Return ``in_str`` unchanged on Py2; on Py3 return it encoded to bytes.

    :param in_str: a ``str`` instance (enforced with an assertion)
    """
    assert isinstance(in_str, str), str(type(in_str))
    return in_str if six.PY2 else in_str.encode()
|
||||
|
||||
|
||||
def make_compat_str(in_str):
    """In Py2, does nothing. In Py3, converts to string, guessing encoding.

    :param in_str: ``bytes``, ``str`` or ``unicode`` value (enforced with an assertion)
    :returns ``in_str`` itself, or the decoded text when it was ``bytes`` on Py3
    """
    assert isinstance(in_str, (bytes, str, unicode)), str(type(in_str))
    if six.PY3 and isinstance(in_str, bytes):
        enc = chardet.detect(in_str)
        # chardet reports ``None`` as the encoding when it cannot make a guess; ``bytes.decode(None)`` would
        # raise a TypeError, so fall back to utf-8 in that case.
        in_str = in_str.decode(enc['encoding'] or 'utf-8')
    return in_str
|
||||
|
||||
|
||||
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
    """Do what a Py2 ``str.encode`` call usually means on either Python version.

    On Py2 the value is encoded. On Py3 a ``str`` is returned as is and ``bytes`` are decoded.

    :param bytesorstring: the value to convert
    :param encoding: codec name passed to ``encode`` / ``decode``
    :param erraction: error handling scheme passed to ``encode`` / ``decode``
    """
    if six.PY2:
        assert isinstance(bytesorstring, (str, unicode)), str(type(bytesorstring))
        return bytesorstring.encode(encoding, erraction)
    if not six.PY3:
        return None
    if isinstance(bytesorstring, str):
        return bytesorstring
    assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
    return bytesorstring.decode(encoding, erraction)
|
||||
|
||||
## PNG Predictor
|
||||
##
|
||||
|
||||
def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||
if bitspercomponent != 8:
|
||||
# unsupported
|
||||
raise ValueError("Unsupported `bitspercomponent': %d" %
|
||||
bitspercomponent)
|
||||
nbytes = colors * columns * bitspercomponent // 8
|
||||
i = 0
|
||||
buf = b''
|
||||
line0 = b'\x00' * columns
|
||||
for i in range(0, len(data), nbytes + 1):
|
||||
|
@ -91,8 +93,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
|||
return buf
|
||||
|
||||
|
||||
## Matrix operations
|
||||
##
|
||||
# Matrix operations
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
|
||||
|
@ -109,31 +110,29 @@ def translate_matrix(m, v):
|
|||
"""Translates a matrix by (x, y)."""
|
||||
(a, b, c, d, e, f) = m
|
||||
(x, y) = v
|
||||
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
|
||||
return a, b, c, d, x * a + y * c + e, x * b + y * d + f
|
||||
|
||||
|
||||
def apply_matrix_pt(m, v):
    """Applies a matrix to a point.

    :param m: a 6-tuple ``(a, b, c, d, e, f)`` describing the matrix.
    :param v: the point as an ``(x, y)`` pair.
    :return: the transformed point
        ``(a * x + c * y + e, b * x + d * y + f)``.
    """
    # The docstring must be the first statement of the body; placed after
    # the unpacking below it would be a no-op expression.
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f
|
||||
|
||||
|
||||
def apply_matrix_norm(m, v):
    """Transform the vector ``v`` by ``m`` without the translation part.

    Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0)).
    """
    a, b, c, d, _e, _f = m
    p, q = v
    new_p = a * p + c * q
    new_q = b * p + d * q
    return new_p, new_q
|
||||
|
||||
|
||||
## Utility functions
|
||||
##
|
||||
# Utility functions
|
||||
|
||||
# isnumber
|
||||
def isnumber(x):
    """Tell whether ``x`` is an integer (any ``six`` integer type) or a float."""
    numeric_types = six.integer_types + (float,)
    return isinstance(x, numeric_types)
|
||||
|
||||
# uniq
|
||||
|
||||
def uniq(objs):
|
||||
"""Eliminates duplicated elements."""
|
||||
done = set()
|
||||
|
@ -145,7 +144,6 @@ def uniq(objs):
|
|||
return
|
||||
|
||||
|
||||
# fsplit
|
||||
def fsplit(pred, objs):
|
||||
"""Split a list into two classes according to the predicate."""
|
||||
t = []
|
||||
|
@ -155,17 +153,14 @@ def fsplit(pred, objs):
|
|||
t.append(obj)
|
||||
else:
|
||||
f.append(obj)
|
||||
return (t, f)
|
||||
return t, f
|
||||
|
||||
|
||||
# drange
|
||||
def drange(v0, v1, d):
    """Return the range of step-``d`` cell indices spanning ``v0`` to ``v1``.

    ``v0`` must be strictly smaller than ``v1``.
    """
    assert v0 < v1, str((v0, v1, d))
    first = int(v0) // d
    stop = int(v1 + d) // d
    return range(first, stop)
|
||||
|
||||
|
||||
# get_bound
|
||||
def get_bound(pts):
|
||||
"""Compute a minimal rectangle that covers all the points."""
|
||||
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
||||
|
@ -174,10 +169,9 @@ def get_bound(pts):
|
|||
y0 = min(y0, y)
|
||||
x1 = max(x1, x)
|
||||
y1 = max(y1, y)
|
||||
return (x0, y0, x1, y1)
|
||||
return x0, y0, x1, y1
|
||||
|
||||
|
||||
# pick
|
||||
def pick(seq, func, maxobj=None):
|
||||
"""Picks the object obj where func(obj) has the highest value."""
|
||||
maxscore = None
|
||||
|
@ -188,7 +182,6 @@ def pick(seq, func, maxobj=None):
|
|||
return maxobj
|
||||
|
||||
|
||||
# choplist
|
||||
def choplist(n, seq):
|
||||
"""Groups every n elements of the list."""
|
||||
r = []
|
||||
|
@ -200,7 +193,6 @@ def choplist(n, seq):
|
|||
return
|
||||
|
||||
|
||||
# nunpack
|
||||
def nunpack(s, default=0):
|
||||
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
|
||||
l = len(s)
|
||||
|
@ -220,7 +212,6 @@ def nunpack(s, default=0):
|
|||
raise TypeError('invalid length: %d' % l)
|
||||
|
||||
|
||||
# decode_text
|
||||
PDFDocEncoding = ''.join(six.unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
|
@ -265,7 +256,6 @@ def decode_text(s):
|
|||
return ''.join(PDFDocEncoding[c] for c in s)
|
||||
|
||||
|
||||
# enc
|
||||
def enc(x, codec='ascii'):
|
||||
"""Encodes a string for SGML/XML/HTML"""
|
||||
if six.PY3 and isinstance(x, bytes):
|
||||
|
@ -285,6 +275,7 @@ def matrix2str(m):
|
|||
(a, b, c, d, e, f) = m
|
||||
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
|
||||
|
||||
|
||||
def vecBetweenBoxes(obj1, obj2):
|
||||
"""A distance function between two TextBoxes.
|
||||
|
||||
|
@ -304,18 +295,18 @@ def vecBetweenBoxes(obj1, obj2):
|
|||
# if one is inside another we compute euclidean distance
|
||||
(xc1, yc1) = ((obj1.x0 + obj1.x1) / 2, (obj1.y0 + obj1.y1) / 2)
|
||||
(xc2, yc2) = ((obj2.x0 + obj2.x1) / 2, (obj2.y0 + obj2.y1) / 2)
|
||||
return (xc1-xc2, yc1-yc2)
|
||||
return xc1 - xc2, yc1 - yc2
|
||||
else:
|
||||
return (max(0, iw), max(0, ih))
|
||||
return max(0, iw), max(0, ih)
|
||||
|
||||
|
||||
## Plane
|
||||
##
|
||||
## A set-like data structure for objects placed on a plane.
|
||||
## Can efficiently find objects in a certain rectangular area.
|
||||
## It maintains two parallel lists of objects, each of
|
||||
## which is sorted by its x or y coordinate.
|
||||
##
|
||||
class Plane(object):
|
||||
"""A set-like data structure for objects placed on a plane.
|
||||
|
||||
Can efficiently find objects in a certain rectangular area.
|
||||
It maintains two parallel lists of objects, each of
|
||||
which is sorted by its x or y coordinate.
|
||||
"""
|
||||
|
||||
def __init__(self, bbox, gridsize=50):
|
||||
self._seq = [] # preserve the object order.
|
||||
|
@ -323,10 +314,9 @@ class Plane(object):
|
|||
self._grid = {}
|
||||
self.gridsize = gridsize
|
||||
(self.x0, self.y0, self.x1, self.y1) = bbox
|
||||
return
|
||||
|
||||
def __repr__(self):
    """Show the plane together with the objects it yields when iterated."""
    members = list(self)
    return '<Plane objs=%r>' % members
|
||||
|
||||
def __iter__(self):
    """Iterate over the objects on the plane in the order of ``_seq``.

    Only entries of ``_seq`` that are also members of ``_objs`` are yielded.
    """
    return (member for member in self._seq if member in self._objs)
|
||||
|
@ -339,25 +329,22 @@ class Plane(object):
|
|||
|
||||
def _getrange(self, bbox):
    """Yield the ``(x, y)`` grid indices of the cells touched by ``bbox``.

    Nothing is yielded when ``bbox`` lies outside the plane; otherwise the
    box is clipped to the plane before the cells are enumerated.
    """
    left, bottom, right, top = bbox
    outside = (right <= self.x0 or self.x1 <= left or
               top <= self.y0 or self.y1 <= bottom)
    if outside:
        return
    # Clip the requested box to the bounds of the plane.
    left = max(self.x0, left)
    bottom = max(self.y0, bottom)
    right = min(self.x1, right)
    top = min(self.y1, top)
    for grid_y in drange(bottom, top, self.gridsize):
        for grid_x in drange(left, right, self.gridsize):
            yield (grid_x, grid_y)
|
||||
|
||||
# extend(objs)
|
||||
def extend(self, objs):
    """Place every object of ``objs`` on the plane, in iteration order."""
    place = self.add
    for item in objs:
        place(item)
|
||||
|
||||
# add(obj): place an object.
|
||||
def add(self, obj):
|
||||
"""place an object."""
|
||||
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
||||
if k not in self._grid:
|
||||
r = []
|
||||
|
@ -367,20 +354,18 @@ class Plane(object):
|
|||
r.append(obj)
|
||||
self._seq.append(obj)
|
||||
self._objs.add(obj)
|
||||
return
|
||||
|
||||
# remove(obj): displace an object.
|
||||
def remove(self, obj):
    """Displace an object.

    The object is taken out of every grid cell covered by its bounding box
    (cells that are missing or do not hold it are ignored) and then out of
    ``_objs``.
    """
    bbox = (obj.x0, obj.y0, obj.x1, obj.y1)
    for cell in self._getrange(bbox):
        try:
            bucket = self._grid[cell]
            bucket.remove(obj)
        except (KeyError, ValueError):
            continue
    self._objs.remove(obj)
|
||||
|
||||
# find(): finds objects that are in a certain area.
|
||||
def find(self, bbox):
|
||||
"""finds objects that are in a certain area."""
|
||||
(x0, y0, x1, y1) = bbox
|
||||
done = set()
|
||||
for k in self._getrange(bbox):
|
||||
|
@ -390,8 +375,6 @@ class Plane(object):
|
|||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
||||
obj.y1 <= y0 or y1 <= obj.y0):
|
||||
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
|
||||
continue
|
||||
yield obj
|
||||
return
|
||||
|
|
Binary file not shown.
13
setup.py
13
setup.py
|
@ -1,18 +1,19 @@
|
|||
from setuptools import setup
|
||||
import sys
|
||||
|
||||
import pdfminer as package
|
||||
|
||||
requires = ['six', 'pycryptodome', 'sortedcontainers']
|
||||
if sys.version_info >= (3, 0):
|
||||
requires.append('chardet')
|
||||
|
||||
setup(
|
||||
name='pdfminer.six',
|
||||
version=package.__version__,
|
||||
packages=['pdfminer'],
|
||||
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
||||
install_requires=requires,
|
||||
install_requires=[
|
||||
'chardet ; python_version > "3.0"',
|
||||
'pycryptodome',
|
||||
'six',
|
||||
'sortedcontainers',
|
||||
],
|
||||
extras_require={"dev": ["nose", "tox"]},
|
||||
description='PDF parser and analyzer',
|
||||
long_description=package.__doc__,
|
||||
license='MIT/X',
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
"""
|
||||
Tests based on the Adobe Glyph List Specification (https://github.com/adobe-type-tools/agl-specification#2-the-mapping)
|
||||
|
||||
Although the specification does not cover them, glyph names with lowercase hexadecimal digits often occur in PDFs.
Lowercase variants of the unit tests are therefore added.
|
||||
"""
|
||||
from nose.tools import assert_raises
|
||||
|
||||
from pdfminer.encodingdb import name2unicode
|
||||
|
||||
|
||||
def test_name2unicode_name_in_agl():
    """A single-component name from the AGL ("Lcommaaccent") maps to U+013B."""
    result = name2unicode('Lcommaaccent')
    assert result == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_uni():
    """The "uni" form with four hex digits, "uni013B", maps to U+013B."""
    result = name2unicode('uni013B')
    assert result == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_uni_lowercase():
    """The "uni" form with lowercase hex digits, "uni013b", maps to U+013B."""
    result = name2unicode('uni013b')
    assert result == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits():
    """The single component "uni20AC0308" maps to the string U+20AC U+0308."""
    result = name2unicode('uni20AC0308')
    assert result == u'\u20AC\u0308'
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """The lowercase component "uni20ac0308" maps to the string U+20AC U+0308."""
    result = name2unicode('uni20ac0308')
    assert result == u'\u20AC\u0308'
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string():
    """The lowercase name "uni20ac" yields the euro sign.

    According to the specification this name should map to an empty string,
    but lowercase hexadecimals are supported here on purpose.
    """
    euro_sign = u'\u20ac'
    assert name2unicode('uni20ac') == euro_sign
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long():
    """The name "uniD801DC0C" cannot be mapped and must raise KeyError.

    Neither D801 nor DC0C is in the appropriate set, so the specification
    maps this name to an empty string. The "uni" form cannot express the
    character written as D801 DC0C in UTF-16 (U+1040C); the glyph name
    "u1040C" maps to it correctly.
    """
    with assert_raises(KeyError):
        name2unicode('uniD801DC0C')
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long_lowercase():
    """The lowercase name "unid801dc0c" cannot be mapped and must raise KeyError.

    Neither D801 nor DC0C is in the appropriate set, so the specification
    maps the uppercase name to an empty string. The "uni" form cannot
    express the character written as D801 DC0C in UTF-16 (U+1040C); the
    glyph name "u1040C" maps to it correctly.
    """
    # Use lowercase hex digits so this test differs from the uppercase
    # variant and exercises the lowercase path.
    assert_raises(KeyError, name2unicode, 'unid801dc0c')
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua():
    """The name "uniF6FB" maps to U+F6FB, the same string as "Ogoneksmall"."""
    result = name2unicode('uniF6FB')
    assert result == u'\uF6FB'
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua_lowercase():
    """The lowercase name "unif6fb" maps to U+F6FB as well."""
    result = name2unicode('unif6fb')
    assert result == u'\uF6FB'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits():
    """The "u" form with four hex digits, "u013B", maps to U+013B."""
    result = name2unicode('u013B')
    assert result == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits_lowercase():
    """The "u" form with lowercase hex digits, "u013b", maps to U+013B."""
    result = name2unicode('u013b')
    assert result == u'\u013B'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits():
    """The single component "u1040C" maps to the string U+1040C."""
    result = name2unicode('u1040C')
    assert result == u'\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits_lowercase():
    """The lowercase component "u1040c" maps to the string U+1040C."""
    result = name2unicode('u1040c')
    assert result == u'\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components():
    """A multi-component name with a suffix maps to the joined components.

    "Lcommaaccent_uni20AC0308_u1040C.alternate" is expected to give
    U+013B U+20AC U+0308 U+1040C.
    """
    result = name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
    assert result == u'\u013B\u20AC\u0308\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components_lowercase():
    """A multi-component name with lowercase hex digits maps the same way.

    "Lcommaaccent_uni20ac0308_u1040c.alternate" is expected to give
    U+013B U+20AC U+0308 U+1040C.
    """
    result = name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
    assert result == u'\u013B\u20AC\u0308\U0001040C'
|
||||
|
||||
|
||||
def test_name2unicode_foo():
    """The name 'foo' is not in the AGL and has no 'u' prefix: KeyError.

    The specification maps such a name to an empty string; here a KeyError
    is expected instead.
    """
    with assert_raises(KeyError):
        name2unicode('foo')
|
||||
|
||||
|
||||
def test_name2unicode_notdef():
    """The name ".notdef" must raise KeyError.

    The specification reduces it to an empty string (step 1) and maps that
    to an empty string (step 3); here a KeyError is expected instead.
    """
    with assert_raises(KeyError):
        name2unicode('.notdef')
|
||||
|
||||
|
||||
def test_name2unicode_pua_ogoneksmall():
    """The name "Ogoneksmall" maps to U+F6FB, the same string as "uniF6FB"."""
    result = name2unicode('Ogoneksmall')
    assert result == u'\uF6FB'
|
||||
|
||||
|
||||
def test_name2unicode_overflow_error():
    """A long all-digit name must raise KeyError."""
    with assert_raises(KeyError):
        name2unicode('226215240241240240240240')
|
|
@ -3,7 +3,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import nose, logging, os
|
||||
from pdfminer.cmapdb import IdentityCMap, CMap
|
||||
from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
|
||||
from pdfminer.pdffont import PDFCIDFont
|
||||
from pdfminer.pdftypes import PDFStream
|
||||
from pdfminer.psparser import PSLiteral
|
||||
|
@ -14,13 +14,13 @@ class TestPDFEncoding():
|
|||
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, CMap)
|
||||
assert isinstance(font.cmap, IdentityCMapByte)
|
||||
|
||||
def test_cmapname_onebyteidentityH(self):
    """An Encoding stream whose CMapName is OneByteIdentityH gives an IdentityCMapByte."""
    encoding = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
    font = PDFCIDFont(None, {'Encoding': encoding})
    assert isinstance(font.cmap, IdentityCMapByte)
|
||||
|
||||
def test_cmapname_V(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('V')}, '')
|
||||
|
@ -68,6 +68,40 @@ class TestPDFEncoding():
|
|||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH(self):
    """An Encoding given as the literal DLIdent-H gives an IdentityCMap."""
    font = PDFCIDFont(None, {'Encoding': PSLiteral('DLIdent-H')})
    assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV(self):
    """An Encoding given as the literal DLIdent-V gives an IdentityCMap."""
    font = PDFCIDFont(None, {'Encoding': PSLiteral('DLIdent-V')})
    assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
    """An Encoding stream whose CMapName is the literal DLIdent-H gives an IdentityCMap."""
    encoding = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
    font = PDFCIDFont(None, {'Encoding': encoding})
    assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV_as_PSLiteral_stream(self):
    """An Encoding stream whose CMapName is the literal DLIdent-V gives an IdentityCMap."""
    # This method was a second definition of
    # test_encoding_DLIdentH_as_PSLiteral_stream; the duplicate name shadowed
    # the DLIdent-H test, so it is renamed to match what it checks.
    stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
    spec = {'Encoding': stream}
    font = PDFCIDFont(None, spec)
    assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_stream(self):
    """An Encoding stream whose CMapName is the plain string DLIdent-H gives an IdentityCMap."""
    encoding = PDFStream({'CMapName': 'DLIdent-H'}, '')
    font = PDFCIDFont(None, {'Encoding': encoding})
    assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV_as_stream(self):
    """An Encoding stream whose CMapName is the plain string DLIdent-V gives an IdentityCMap."""
    encoding = PDFStream({'CMapName': 'DLIdent-V'}, '')
    font = PDFCIDFont(None, {'Encoding': encoding})
    assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_font_without_spec(self):
    """A font built from an empty spec still gets a CMap instance."""
    empty_spec = {}
    font = PDFCIDFont(None, empty_spec)
    assert isinstance(font.cmap, CMap)
|
||||
|
|
|
@ -1,22 +1,28 @@
|
|||
#!/usr/bin/env python
|
||||
import os
|
||||
from shutil import rmtree
|
||||
from tempfile import NamedTemporaryFile, mkdtemp
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import nose, logging, os
|
||||
import nose
|
||||
|
||||
import tools.pdf2txt as pdf2txt
|
||||
|
||||
path=os.path.dirname(os.path.abspath(__file__))+'/'
|
||||
|
||||
def full_path(relative_path_to_this_file):
    """Turn a path relative to this file's directory into an absolute path."""
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.abspath(os.path.join(here, relative_path_to_this_file))
|
||||
|
||||
|
||||
def run(datapath, filename, options=None):
    """Run pdf2txt on a sample PDF and write the result next to this file.

    :param datapath: directory of the sample, relative to this file.
    :param filename: name of the sample without the ``.pdf`` extension.
    :param options: optional string of extra command line options,
        separated by single spaces.
    """
    input_file = full_path(datapath + filename + '.pdf')
    output_file = full_path(filename + '.txt')
    # Build the argument list directly. Formatting one command string and
    # splitting it on spaces would break paths that contain a space.
    args = ['-o%s' % output_file]
    if options:
        args.extend(options.split(' '))
    args.append(input_file)
    pdf2txt.main(args)
|
||||
|
||||
|
||||
class TestDumpPDF():
|
||||
|
||||
def test_1(self):
|
||||
|
@ -24,6 +30,7 @@ class TestDumpPDF():
|
|||
run('../samples/', 'simple1')
|
||||
run('../samples/', 'simple2')
|
||||
run('../samples/', 'simple3')
|
||||
run('../samples/','sampleOneByteIdentityEncode')
|
||||
|
||||
def test_2(self):
|
||||
run('../samples/nonfree/', 'dmca')
|
||||
|
@ -57,5 +64,30 @@ class TestDumpPDF():
|
|||
def test_10(self):
|
||||
run('../samples/scancode/', 'patchelf') # https://github.com/euske/pdfminer/issues/96
|
||||
|
||||
|
||||
class TestDumpImages(object):
    """Tests for the image extraction of pdf2txt (``--output-dir``)."""

    def extract_images(self, input_file):
        """Run pdf2txt on ``input_file`` and list the files it extracted.

        :param input_file: path of the PDF to process.
        :return: names of the files written to the temporary output directory.
        """
        output_dir = mkdtemp()
        try:
            with NamedTemporaryFile() as output_file:
                commands = ['-o', output_file.name, '--output-dir', output_dir, input_file]
                pdf2txt.main(commands)
            return os.listdir(output_dir)
        finally:
            # Always remove the temporary directory, also when pdf2txt fails.
            rmtree(output_dir)

    def test_nonfree_dmca(self):
        """Extract images of pdf containing bmp images

        Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131
        """
        image_files = self.extract_images(full_path('../samples/nonfree/dmca.pdf'))
        assert image_files[0].endswith('bmp')

    def test_nonfree_175(self):
        """Extract images of pdf containing jpg images"""
        self.extract_images(full_path('../samples/nonfree/175.pdf'))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
nose.runmodule()
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
from nose.tools import assert_equal
|
||||
|
||||
from pdfminer.layout import LTComponent
|
||||
from pdfminer.utils import make_compat_str, Plane
|
||||
|
||||
|
||||
class TestPlane(object):
    """Checks of ``Plane.find`` on a 100x100 plane holding a single object."""

    @staticmethod
    def given_plane_with_one_object(object_size=50, gridsize=50):
        """Build a plane over (0, 0, 100, 100) with one square object at the origin.

        Returns the plane and the object that was added to it.
        """
        plane = Plane((0, 0, 100, 100), gridsize)
        obj = LTComponent((0, 0, object_size, object_size))
        plane.add(obj)
        return plane, obj

    def test_find_nothing_in_empty_bbox(self):
        """Searching a region away from the object finds nothing."""
        plane, _ = self.given_plane_with_one_object()
        found = list(plane.find((50, 50, 100, 100)))
        assert_equal(found, [])

    def test_find_nothing_after_removing(self):
        """A removed object is no longer found."""
        plane, obj = self.given_plane_with_one_object()
        plane.remove(obj)
        found = list(plane.find((0, 0, 100, 100)))
        assert_equal(found, [])

    def test_find_object_in_whole_plane(self):
        """Searching the whole plane finds the object."""
        plane, obj = self.given_plane_with_one_object()
        found = list(plane.find((0, 0, 100, 100)))
        assert_equal(found, [obj])

    def test_find_if_object_is_smaller_than_gridsize(self):
        """An object smaller than one grid cell is still found."""
        plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100)
        found = list(plane.find((0, 0, 100, 100)))
        assert_equal(found, [obj])

    def test_find_object_if_much_larger_than_gridsize(self):
        """An object spanning many grid cells is found exactly once."""
        plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10)
        found = list(plane.find((0, 0, 100, 100)))
        assert_equal(found, [obj])
|
|
@ -3,6 +3,8 @@
|
|||
import sys
|
||||
import fileinput
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
def main(argv):
|
||||
fonts = {}
|
||||
for line in fileinput.input():
|
||||
|
@ -33,7 +35,7 @@ def main(argv):
|
|||
props[k] = tuple(map(float, f[1:5]))
|
||||
print ('# -*- python -*-')
|
||||
print ('FONT_METRICS = {')
|
||||
for (fontname,(props,chars)) in fonts.iteritems():
|
||||
for (fontname,(props,chars)) in six.iteritems(fonts):
|
||||
print (' %r: %r,' % (fontname, (props,chars)))
|
||||
print ('}')
|
||||
return 0
|
||||
|
|
|
@ -26,6 +26,7 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|||
from pdfminer.converter import HTMLConverter, TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
# quote HTML metacharacters
|
||||
def q(x):
|
||||
|
@ -35,7 +36,7 @@ def q(x):
|
|||
Q = re.compile(r'[^a-zA-Z0-9_.-=]')
|
||||
def url(base, **kw):
    """Append the keyword arguments to ``base`` as ``key=value`` pairs joined by ``&``.

    Each value is passed through ``q``, encoded with ``encoder(..., 'replace')``,
    and every character matched by ``Q`` is replaced by ``%XX``.
    """
    def percent_escape(match):
        return '%%%02X' % ord(match.group(0))

    pairs = []
    for key, value in six.iteritems(kw):
        encoded = encoder(q(value), 'replace')[0]
        pairs.append('%s=%s' % (key, Q.sub(percent_escape, encoded)))
    return base + '&'.join(pairs)
|
||||
|
|
Loading…
Reference in New Issue