Enforce pep8 coding-style (#345)
* Code Refactor: Use code-style enforcement #312 * Add flake8 to travis-ci * Remove python 2 3 comment on six library. 891 errors > 870 errors. * Remove class and functions comments that consist of just the name. 870 errors > 855 errors. * Fix flake8 errors in pdftypes.py. 855 errors > 833 errors. * Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before committing * Cleanup pdfinterp.py and add documentation from PDF Reference * Cleanup pdfpage.py * Cleanup pdffont.py * Clean psparser.py * Cleanup high_level.py * Cleanup layout.py * Cleanup pdfparser.py * Cleanup pdfcolor.py * Cleanup rijndael.py * Cleanup converter.py * Rename klass to cls if it is the class variable, to be more consistent with standard practice * Cleanup cmap.py * Cleanup pdfdevice.py * flake8 ignore fontmetrics.py * Cleanup test_pdfminer_psparser.py * Fix flake8 in pdfdocument.py; 339 errors to go * Fix flake8 utils.py; 326 errors to go * pep8 correction for few files in /tools/ 328 > 160 to go (#342) * pep8 correction for few files in /tools/ 328 > 160 to go * pep8 correction: 160 > 5 to go * Fix ascii85.py errors * Fix error in getting index from target that does not exist * Remove commented print lines * Fix flake8 error in pdfinterp.py * Fix python2 specific error by removing argument from print statement * Ignore invalid python2 syntax * Update contributing.md * Added changelog * Remove unused import Co-authored-by: Fakabbir Amin <f4amin@gmail.com> pull/351/head
parent
78f06225b6
commit
f3ab1bc61e
|
@ -7,6 +7,6 @@ python:
|
|||
- "3.7"
|
||||
- "3.8"
|
||||
install:
|
||||
- pip install tox-travis
|
||||
- pip install tox-travis flake8
|
||||
script:
|
||||
- tox -r
|
||||
|
|
|
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Changed
|
||||
- Enforce pep8 coding style by adding flake8 to CI ([#345](https://github.com/pdfminer/pdfminer.six/pull/345))
|
||||
|
||||
## [20191110] - 2019-11-10
|
||||
|
||||
### Fixed
|
||||
|
|
|
@ -14,7 +14,7 @@ Any contribution is appreciated! You might want to:
|
|||
- If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
|
||||
issue.
|
||||
* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request).
|
||||
* Help others by giving your thoughts on open issues and pull requests.
|
||||
* Help others by sharing your thoughts in comments on issues and pull requests.
|
||||
|
||||
## Guidelines for creating issues
|
||||
|
||||
|
@ -29,11 +29,15 @@ Any contribution is appreciated! You might want to:
|
|||
* Pull requests should be merged to develop, not master. This ensures that master always equals the released version.
|
||||
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
|
||||
of features, this will show that your code works correctly.
|
||||
* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (with a line-width of 120)
|
||||
and properly documented with docstrings.
|
||||
* Code should work for Python 2.7 and Python 3.x (for now), conform to PEP8 code style (enforced by
|
||||
[flake8](http://flake8.pycqa.org/en/latest/)) and be properly documented with docstrings.
|
||||
* Check spelling and grammar.
|
||||
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#unreleased).
|
||||
|
||||
## Guidelines for posting comments
|
||||
|
||||
* [Be cordial and positive](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way)
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Clone the repository
|
||||
|
|
|
@ -12,7 +12,8 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../'))
|
||||
sys.path.insert(0, os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)), '../../'))
|
||||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
|
|
@ -17,8 +17,11 @@ __version__ = '20191110'
|
|||
|
||||
|
||||
if sys.version_info < (3, 0):
|
||||
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
|
||||
'more information see https://github.com/pdfminer/pdfminer.six/issues/194')
|
||||
warnings.warn('On January 1st, 2020, '
|
||||
'pdfminer.six will stop supporting Python 2. '
|
||||
'Please upgrade to Python 3. '
|
||||
'For more information see '
|
||||
'https://github.com/pdfminer/pdfminer.six/issues/194')
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(__version__)
|
||||
|
|
|
@ -7,16 +7,17 @@ This code is in the public domain.
|
|||
"""
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
## Arcfour
|
||||
##
|
||||
|
||||
|
||||
class Arcfour(object):
|
||||
|
||||
def __init__(self, key):
|
||||
s = [i for i in range(256)] #because Py3 range is not indexable
|
||||
# because Py3 range is not indexable
|
||||
s = [i for i in range(256)]
|
||||
j = 0
|
||||
klen = len(key)
|
||||
for i in range(256):
|
||||
j = (j + s[i] + six.indexbytes(key,i % klen)) % 256
|
||||
j = (j + s[i] + six.indexbytes(key, i % klen)) % 256
|
||||
(s[i], s[j]) = (s[j], s[i])
|
||||
self.s = s
|
||||
(self.i, self.j) = (0, 0)
|
||||
|
@ -37,4 +38,5 @@ class Arcfour(object):
|
|||
|
||||
encrypt = decrypt = process
|
||||
|
||||
|
||||
new = Arcfour
|
||||
|
|
|
@ -9,7 +9,7 @@ This code is in the public domain.
|
|||
import re
|
||||
import struct
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
|
||||
# ascii85decode(data)
|
||||
|
@ -27,7 +27,7 @@ def ascii85decode(data):
|
|||
n = b = 0
|
||||
out = b''
|
||||
for i in six.iterbytes(data):
|
||||
c=six.int2byte(i)
|
||||
c = six.int2byte(i)
|
||||
if b'!' <= c and c <= b'u':
|
||||
n += 1
|
||||
b = b*85+(ord(c)-33)
|
||||
|
@ -45,9 +45,11 @@ def ascii85decode(data):
|
|||
break
|
||||
return out
|
||||
|
||||
|
||||
# asciihexdecode(data)
|
||||
hex_re = re.compile(b'([a-f\d]{2})', re.IGNORECASE)
|
||||
trail_re = re.compile(b'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||
hex_re = re.compile(b'([a-f0-9]{2})', re.IGNORECASE)
|
||||
trail_re = re.compile(b'^(?:[a-f0-9]{2}|[ \t\n\r\f\v])*'
|
||||
b'([a-f0-9])[ \t\n\r\f\v>]*$', re.IGNORECASE)
|
||||
|
||||
|
||||
def asciihexdecode(data):
|
||||
|
@ -61,14 +63,14 @@ def asciihexdecode(data):
|
|||
will behave as if a 0 followed the last digit.
|
||||
"""
|
||||
def decode(x):
|
||||
i=int(x,16)
|
||||
i = int(x, 16)
|
||||
return six.int2byte(i)
|
||||
|
||||
out=b''
|
||||
out = b''
|
||||
for x in hex_re.findall(data):
|
||||
out+=decode(x)
|
||||
out += decode(x)
|
||||
|
||||
m = trail_re.search(data)
|
||||
if m:
|
||||
out+=decode(m.group(1)+b'0')
|
||||
out += decode(m.group(1)+b'0')
|
||||
return out
|
||||
|
|
|
@ -5,15 +5,17 @@
|
|||
#
|
||||
# cf.
|
||||
# ITU-T Recommendation T.4
|
||||
# "Standardization of Group 3 facsimile terminals for document transmission"
|
||||
# "Standardization of Group 3 facsimile terminals
|
||||
# for document transmission"
|
||||
# ITU-T Recommendation T.6
|
||||
# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS FOR GROUP 4 FACSIMILE APPARATUS"
|
||||
# "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
|
||||
# FOR GROUP 4 FACSIMILE APPARATUS"
|
||||
|
||||
|
||||
import sys
|
||||
import array
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
if six.PY3:
|
||||
def get_bytes(data):
|
||||
|
@ -25,8 +27,6 @@ else:
|
|||
yield ord(char)
|
||||
|
||||
|
||||
## BitParser
|
||||
##
|
||||
class BitParser(object):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -34,7 +34,7 @@ class BitParser(object):
|
|||
return
|
||||
|
||||
@classmethod
|
||||
def add(klass, root, v, bits):
|
||||
def add(cls, root, v, bits):
|
||||
p = root
|
||||
b = None
|
||||
for i in range(len(bits)):
|
||||
|
@ -68,8 +68,6 @@ class BitParser(object):
|
|||
return
|
||||
|
||||
|
||||
## CCITTG4Parser
|
||||
##
|
||||
class CCITTG4Parser(BitParser):
|
||||
|
||||
MODE = [None, None]
|
||||
|
@ -93,85 +91,85 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(MODE, 'e', '000000000001000000000001')
|
||||
|
||||
WHITE = [None, None]
|
||||
BitParser.add(WHITE, 0 , '00110101')
|
||||
BitParser.add(WHITE, 1 , '000111')
|
||||
BitParser.add(WHITE, 2 , '0111')
|
||||
BitParser.add(WHITE, 3 , '1000')
|
||||
BitParser.add(WHITE, 4 , '1011')
|
||||
BitParser.add(WHITE, 5 , '1100')
|
||||
BitParser.add(WHITE, 6 , '1110')
|
||||
BitParser.add(WHITE, 7 , '1111')
|
||||
BitParser.add(WHITE, 8 , '10011')
|
||||
BitParser.add(WHITE, 9 , '10100')
|
||||
BitParser.add(WHITE, 10 , '00111')
|
||||
BitParser.add(WHITE, 11 , '01000')
|
||||
BitParser.add(WHITE, 12 , '001000')
|
||||
BitParser.add(WHITE, 13 , '000011')
|
||||
BitParser.add(WHITE, 14 , '110100')
|
||||
BitParser.add(WHITE, 15 , '110101')
|
||||
BitParser.add(WHITE, 16 , '101010')
|
||||
BitParser.add(WHITE, 17 , '101011')
|
||||
BitParser.add(WHITE, 18 , '0100111')
|
||||
BitParser.add(WHITE, 19 , '0001100')
|
||||
BitParser.add(WHITE, 20 , '0001000')
|
||||
BitParser.add(WHITE, 21 , '0010111')
|
||||
BitParser.add(WHITE, 22 , '0000011')
|
||||
BitParser.add(WHITE, 23 , '0000100')
|
||||
BitParser.add(WHITE, 24 , '0101000')
|
||||
BitParser.add(WHITE, 25 , '0101011')
|
||||
BitParser.add(WHITE, 26 , '0010011')
|
||||
BitParser.add(WHITE, 27 , '0100100')
|
||||
BitParser.add(WHITE, 28 , '0011000')
|
||||
BitParser.add(WHITE, 29 , '00000010')
|
||||
BitParser.add(WHITE, 30 , '00000011')
|
||||
BitParser.add(WHITE, 31 , '00011010')
|
||||
BitParser.add(WHITE, 32 , '00011011')
|
||||
BitParser.add(WHITE, 33 , '00010010')
|
||||
BitParser.add(WHITE, 34 , '00010011')
|
||||
BitParser.add(WHITE, 35 , '00010100')
|
||||
BitParser.add(WHITE, 36 , '00010101')
|
||||
BitParser.add(WHITE, 37 , '00010110')
|
||||
BitParser.add(WHITE, 38 , '00010111')
|
||||
BitParser.add(WHITE, 39 , '00101000')
|
||||
BitParser.add(WHITE, 40 , '00101001')
|
||||
BitParser.add(WHITE, 41 , '00101010')
|
||||
BitParser.add(WHITE, 42 , '00101011')
|
||||
BitParser.add(WHITE, 43 , '00101100')
|
||||
BitParser.add(WHITE, 44 , '00101101')
|
||||
BitParser.add(WHITE, 45 , '00000100')
|
||||
BitParser.add(WHITE, 46 , '00000101')
|
||||
BitParser.add(WHITE, 47 , '00001010')
|
||||
BitParser.add(WHITE, 48 , '00001011')
|
||||
BitParser.add(WHITE, 49 , '01010010')
|
||||
BitParser.add(WHITE, 50 , '01010011')
|
||||
BitParser.add(WHITE, 51 , '01010100')
|
||||
BitParser.add(WHITE, 52 , '01010101')
|
||||
BitParser.add(WHITE, 53 , '00100100')
|
||||
BitParser.add(WHITE, 54 , '00100101')
|
||||
BitParser.add(WHITE, 55 , '01011000')
|
||||
BitParser.add(WHITE, 56 , '01011001')
|
||||
BitParser.add(WHITE, 57 , '01011010')
|
||||
BitParser.add(WHITE, 58 , '01011011')
|
||||
BitParser.add(WHITE, 59 , '01001010')
|
||||
BitParser.add(WHITE, 60 , '01001011')
|
||||
BitParser.add(WHITE, 61 , '00110010')
|
||||
BitParser.add(WHITE, 62 , '00110011')
|
||||
BitParser.add(WHITE, 63 , '00110100')
|
||||
BitParser.add(WHITE, 64 , '11011')
|
||||
BitParser.add(WHITE, 128 , '10010')
|
||||
BitParser.add(WHITE, 192 , '010111')
|
||||
BitParser.add(WHITE, 256 , '0110111')
|
||||
BitParser.add(WHITE, 320 , '00110110')
|
||||
BitParser.add(WHITE, 384 , '00110111')
|
||||
BitParser.add(WHITE, 448 , '01100100')
|
||||
BitParser.add(WHITE, 512 , '01100101')
|
||||
BitParser.add(WHITE, 576 , '01101000')
|
||||
BitParser.add(WHITE, 640 , '01100111')
|
||||
BitParser.add(WHITE, 704 , '011001100')
|
||||
BitParser.add(WHITE, 768 , '011001101')
|
||||
BitParser.add(WHITE, 832 , '011010010')
|
||||
BitParser.add(WHITE, 896 , '011010011')
|
||||
BitParser.add(WHITE, 960 , '011010100')
|
||||
BitParser.add(WHITE, 0, '00110101')
|
||||
BitParser.add(WHITE, 1, '000111')
|
||||
BitParser.add(WHITE, 2, '0111')
|
||||
BitParser.add(WHITE, 3, '1000')
|
||||
BitParser.add(WHITE, 4, '1011')
|
||||
BitParser.add(WHITE, 5, '1100')
|
||||
BitParser.add(WHITE, 6, '1110')
|
||||
BitParser.add(WHITE, 7, '1111')
|
||||
BitParser.add(WHITE, 8, '10011')
|
||||
BitParser.add(WHITE, 9, '10100')
|
||||
BitParser.add(WHITE, 10, '00111')
|
||||
BitParser.add(WHITE, 11, '01000')
|
||||
BitParser.add(WHITE, 12, '001000')
|
||||
BitParser.add(WHITE, 13, '000011')
|
||||
BitParser.add(WHITE, 14, '110100')
|
||||
BitParser.add(WHITE, 15, '110101')
|
||||
BitParser.add(WHITE, 16, '101010')
|
||||
BitParser.add(WHITE, 17, '101011')
|
||||
BitParser.add(WHITE, 18, '0100111')
|
||||
BitParser.add(WHITE, 19, '0001100')
|
||||
BitParser.add(WHITE, 20, '0001000')
|
||||
BitParser.add(WHITE, 21, '0010111')
|
||||
BitParser.add(WHITE, 22, '0000011')
|
||||
BitParser.add(WHITE, 23, '0000100')
|
||||
BitParser.add(WHITE, 24, '0101000')
|
||||
BitParser.add(WHITE, 25, '0101011')
|
||||
BitParser.add(WHITE, 26, '0010011')
|
||||
BitParser.add(WHITE, 27, '0100100')
|
||||
BitParser.add(WHITE, 28, '0011000')
|
||||
BitParser.add(WHITE, 29, '00000010')
|
||||
BitParser.add(WHITE, 30, '00000011')
|
||||
BitParser.add(WHITE, 31, '00011010')
|
||||
BitParser.add(WHITE, 32, '00011011')
|
||||
BitParser.add(WHITE, 33, '00010010')
|
||||
BitParser.add(WHITE, 34, '00010011')
|
||||
BitParser.add(WHITE, 35, '00010100')
|
||||
BitParser.add(WHITE, 36, '00010101')
|
||||
BitParser.add(WHITE, 37, '00010110')
|
||||
BitParser.add(WHITE, 38, '00010111')
|
||||
BitParser.add(WHITE, 39, '00101000')
|
||||
BitParser.add(WHITE, 40, '00101001')
|
||||
BitParser.add(WHITE, 41, '00101010')
|
||||
BitParser.add(WHITE, 42, '00101011')
|
||||
BitParser.add(WHITE, 43, '00101100')
|
||||
BitParser.add(WHITE, 44, '00101101')
|
||||
BitParser.add(WHITE, 45, '00000100')
|
||||
BitParser.add(WHITE, 46, '00000101')
|
||||
BitParser.add(WHITE, 47, '00001010')
|
||||
BitParser.add(WHITE, 48, '00001011')
|
||||
BitParser.add(WHITE, 49, '01010010')
|
||||
BitParser.add(WHITE, 50, '01010011')
|
||||
BitParser.add(WHITE, 51, '01010100')
|
||||
BitParser.add(WHITE, 52, '01010101')
|
||||
BitParser.add(WHITE, 53, '00100100')
|
||||
BitParser.add(WHITE, 54, '00100101')
|
||||
BitParser.add(WHITE, 55, '01011000')
|
||||
BitParser.add(WHITE, 56, '01011001')
|
||||
BitParser.add(WHITE, 57, '01011010')
|
||||
BitParser.add(WHITE, 58, '01011011')
|
||||
BitParser.add(WHITE, 59, '01001010')
|
||||
BitParser.add(WHITE, 60, '01001011')
|
||||
BitParser.add(WHITE, 61, '00110010')
|
||||
BitParser.add(WHITE, 62, '00110011')
|
||||
BitParser.add(WHITE, 63, '00110100')
|
||||
BitParser.add(WHITE, 64, '11011')
|
||||
BitParser.add(WHITE, 128, '10010')
|
||||
BitParser.add(WHITE, 192, '010111')
|
||||
BitParser.add(WHITE, 256, '0110111')
|
||||
BitParser.add(WHITE, 320, '00110110')
|
||||
BitParser.add(WHITE, 384, '00110111')
|
||||
BitParser.add(WHITE, 448, '01100100')
|
||||
BitParser.add(WHITE, 512, '01100101')
|
||||
BitParser.add(WHITE, 576, '01101000')
|
||||
BitParser.add(WHITE, 640, '01100111')
|
||||
BitParser.add(WHITE, 704, '011001100')
|
||||
BitParser.add(WHITE, 768, '011001101')
|
||||
BitParser.add(WHITE, 832, '011010010')
|
||||
BitParser.add(WHITE, 896, '011010011')
|
||||
BitParser.add(WHITE, 960, '011010100')
|
||||
BitParser.add(WHITE, 1024, '011010101')
|
||||
BitParser.add(WHITE, 1088, '011010110')
|
||||
BitParser.add(WHITE, 1152, '011010111')
|
||||
|
@ -199,85 +197,85 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(WHITE, 2560, '000000011111')
|
||||
|
||||
BLACK = [None, None]
|
||||
BitParser.add(BLACK, 0 , '0000110111')
|
||||
BitParser.add(BLACK, 1 , '010')
|
||||
BitParser.add(BLACK, 2 , '11')
|
||||
BitParser.add(BLACK, 3 , '10')
|
||||
BitParser.add(BLACK, 4 , '011')
|
||||
BitParser.add(BLACK, 5 , '0011')
|
||||
BitParser.add(BLACK, 6 , '0010')
|
||||
BitParser.add(BLACK, 7 , '00011')
|
||||
BitParser.add(BLACK, 8 , '000101')
|
||||
BitParser.add(BLACK, 9 , '000100')
|
||||
BitParser.add(BLACK, 10 , '0000100')
|
||||
BitParser.add(BLACK, 11 , '0000101')
|
||||
BitParser.add(BLACK, 12 , '0000111')
|
||||
BitParser.add(BLACK, 13 , '00000100')
|
||||
BitParser.add(BLACK, 14 , '00000111')
|
||||
BitParser.add(BLACK, 15 , '000011000')
|
||||
BitParser.add(BLACK, 16 , '0000010111')
|
||||
BitParser.add(BLACK, 17 , '0000011000')
|
||||
BitParser.add(BLACK, 18 , '0000001000')
|
||||
BitParser.add(BLACK, 19 , '00001100111')
|
||||
BitParser.add(BLACK, 20 , '00001101000')
|
||||
BitParser.add(BLACK, 21 , '00001101100')
|
||||
BitParser.add(BLACK, 22 , '00000110111')
|
||||
BitParser.add(BLACK, 23 , '00000101000')
|
||||
BitParser.add(BLACK, 24 , '00000010111')
|
||||
BitParser.add(BLACK, 25 , '00000011000')
|
||||
BitParser.add(BLACK, 26 , '000011001010')
|
||||
BitParser.add(BLACK, 27 , '000011001011')
|
||||
BitParser.add(BLACK, 28 , '000011001100')
|
||||
BitParser.add(BLACK, 29 , '000011001101')
|
||||
BitParser.add(BLACK, 30 , '000001101000')
|
||||
BitParser.add(BLACK, 31 , '000001101001')
|
||||
BitParser.add(BLACK, 32 , '000001101010')
|
||||
BitParser.add(BLACK, 33 , '000001101011')
|
||||
BitParser.add(BLACK, 34 , '000011010010')
|
||||
BitParser.add(BLACK, 35 , '000011010011')
|
||||
BitParser.add(BLACK, 36 , '000011010100')
|
||||
BitParser.add(BLACK, 37 , '000011010101')
|
||||
BitParser.add(BLACK, 38 , '000011010110')
|
||||
BitParser.add(BLACK, 39 , '000011010111')
|
||||
BitParser.add(BLACK, 40 , '000001101100')
|
||||
BitParser.add(BLACK, 41 , '000001101101')
|
||||
BitParser.add(BLACK, 42 , '000011011010')
|
||||
BitParser.add(BLACK, 43 , '000011011011')
|
||||
BitParser.add(BLACK, 44 , '000001010100')
|
||||
BitParser.add(BLACK, 45 , '000001010101')
|
||||
BitParser.add(BLACK, 46 , '000001010110')
|
||||
BitParser.add(BLACK, 47 , '000001010111')
|
||||
BitParser.add(BLACK, 48 , '000001100100')
|
||||
BitParser.add(BLACK, 49 , '000001100101')
|
||||
BitParser.add(BLACK, 50 , '000001010010')
|
||||
BitParser.add(BLACK, 51 , '000001010011')
|
||||
BitParser.add(BLACK, 52 , '000000100100')
|
||||
BitParser.add(BLACK, 53 , '000000110111')
|
||||
BitParser.add(BLACK, 54 , '000000111000')
|
||||
BitParser.add(BLACK, 55 , '000000100111')
|
||||
BitParser.add(BLACK, 56 , '000000101000')
|
||||
BitParser.add(BLACK, 57 , '000001011000')
|
||||
BitParser.add(BLACK, 58 , '000001011001')
|
||||
BitParser.add(BLACK, 59 , '000000101011')
|
||||
BitParser.add(BLACK, 60 , '000000101100')
|
||||
BitParser.add(BLACK, 61 , '000001011010')
|
||||
BitParser.add(BLACK, 62 , '000001100110')
|
||||
BitParser.add(BLACK, 63 , '000001100111')
|
||||
BitParser.add(BLACK, 64 , '0000001111')
|
||||
BitParser.add(BLACK, 128 , '000011001000')
|
||||
BitParser.add(BLACK, 192 , '000011001001')
|
||||
BitParser.add(BLACK, 256 , '000001011011')
|
||||
BitParser.add(BLACK, 320 , '000000110011')
|
||||
BitParser.add(BLACK, 384 , '000000110100')
|
||||
BitParser.add(BLACK, 448 , '000000110101')
|
||||
BitParser.add(BLACK, 512 , '0000001101100')
|
||||
BitParser.add(BLACK, 576 , '0000001101101')
|
||||
BitParser.add(BLACK, 640 , '0000001001010')
|
||||
BitParser.add(BLACK, 704 , '0000001001011')
|
||||
BitParser.add(BLACK, 768 , '0000001001100')
|
||||
BitParser.add(BLACK, 832 , '0000001001101')
|
||||
BitParser.add(BLACK, 896 , '0000001110010')
|
||||
BitParser.add(BLACK, 960 , '0000001110011')
|
||||
BitParser.add(BLACK, 0, '0000110111')
|
||||
BitParser.add(BLACK, 1, '010')
|
||||
BitParser.add(BLACK, 2, '11')
|
||||
BitParser.add(BLACK, 3, '10')
|
||||
BitParser.add(BLACK, 4, '011')
|
||||
BitParser.add(BLACK, 5, '0011')
|
||||
BitParser.add(BLACK, 6, '0010')
|
||||
BitParser.add(BLACK, 7, '00011')
|
||||
BitParser.add(BLACK, 8, '000101')
|
||||
BitParser.add(BLACK, 9, '000100')
|
||||
BitParser.add(BLACK, 10, '0000100')
|
||||
BitParser.add(BLACK, 11, '0000101')
|
||||
BitParser.add(BLACK, 12, '0000111')
|
||||
BitParser.add(BLACK, 13, '00000100')
|
||||
BitParser.add(BLACK, 14, '00000111')
|
||||
BitParser.add(BLACK, 15, '000011000')
|
||||
BitParser.add(BLACK, 16, '0000010111')
|
||||
BitParser.add(BLACK, 17, '0000011000')
|
||||
BitParser.add(BLACK, 18, '0000001000')
|
||||
BitParser.add(BLACK, 19, '00001100111')
|
||||
BitParser.add(BLACK, 20, '00001101000')
|
||||
BitParser.add(BLACK, 21, '00001101100')
|
||||
BitParser.add(BLACK, 22, '00000110111')
|
||||
BitParser.add(BLACK, 23, '00000101000')
|
||||
BitParser.add(BLACK, 24, '00000010111')
|
||||
BitParser.add(BLACK, 25, '00000011000')
|
||||
BitParser.add(BLACK, 26, '000011001010')
|
||||
BitParser.add(BLACK, 27, '000011001011')
|
||||
BitParser.add(BLACK, 28, '000011001100')
|
||||
BitParser.add(BLACK, 29, '000011001101')
|
||||
BitParser.add(BLACK, 30, '000001101000')
|
||||
BitParser.add(BLACK, 31, '000001101001')
|
||||
BitParser.add(BLACK, 32, '000001101010')
|
||||
BitParser.add(BLACK, 33, '000001101011')
|
||||
BitParser.add(BLACK, 34, '000011010010')
|
||||
BitParser.add(BLACK, 35, '000011010011')
|
||||
BitParser.add(BLACK, 36, '000011010100')
|
||||
BitParser.add(BLACK, 37, '000011010101')
|
||||
BitParser.add(BLACK, 38, '000011010110')
|
||||
BitParser.add(BLACK, 39, '000011010111')
|
||||
BitParser.add(BLACK, 40, '000001101100')
|
||||
BitParser.add(BLACK, 41, '000001101101')
|
||||
BitParser.add(BLACK, 42, '000011011010')
|
||||
BitParser.add(BLACK, 43, '000011011011')
|
||||
BitParser.add(BLACK, 44, '000001010100')
|
||||
BitParser.add(BLACK, 45, '000001010101')
|
||||
BitParser.add(BLACK, 46, '000001010110')
|
||||
BitParser.add(BLACK, 47, '000001010111')
|
||||
BitParser.add(BLACK, 48, '000001100100')
|
||||
BitParser.add(BLACK, 49, '000001100101')
|
||||
BitParser.add(BLACK, 50, '000001010010')
|
||||
BitParser.add(BLACK, 51, '000001010011')
|
||||
BitParser.add(BLACK, 52, '000000100100')
|
||||
BitParser.add(BLACK, 53, '000000110111')
|
||||
BitParser.add(BLACK, 54, '000000111000')
|
||||
BitParser.add(BLACK, 55, '000000100111')
|
||||
BitParser.add(BLACK, 56, '000000101000')
|
||||
BitParser.add(BLACK, 57, '000001011000')
|
||||
BitParser.add(BLACK, 58, '000001011001')
|
||||
BitParser.add(BLACK, 59, '000000101011')
|
||||
BitParser.add(BLACK, 60, '000000101100')
|
||||
BitParser.add(BLACK, 61, '000001011010')
|
||||
BitParser.add(BLACK, 62, '000001100110')
|
||||
BitParser.add(BLACK, 63, '000001100111')
|
||||
BitParser.add(BLACK, 64, '0000001111')
|
||||
BitParser.add(BLACK, 128, '000011001000')
|
||||
BitParser.add(BLACK, 192, '000011001001')
|
||||
BitParser.add(BLACK, 256, '000001011011')
|
||||
BitParser.add(BLACK, 320, '000000110011')
|
||||
BitParser.add(BLACK, 384, '000000110100')
|
||||
BitParser.add(BLACK, 448, '000000110101')
|
||||
BitParser.add(BLACK, 512, '0000001101100')
|
||||
BitParser.add(BLACK, 576, '0000001101101')
|
||||
BitParser.add(BLACK, 640, '0000001001010')
|
||||
BitParser.add(BLACK, 704, '0000001001011')
|
||||
BitParser.add(BLACK, 768, '0000001001100')
|
||||
BitParser.add(BLACK, 832, '0000001001101')
|
||||
BitParser.add(BLACK, 896, '0000001110010')
|
||||
BitParser.add(BLACK, 960, '0000001110011')
|
||||
BitParser.add(BLACK, 1024, '0000001110100')
|
||||
BitParser.add(BLACK, 1088, '0000001110101')
|
||||
BitParser.add(BLACK, 1152, '0000001110110')
|
||||
|
@ -434,7 +432,7 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def output_line(self, y, bits):
|
||||
print (y, ''.join(str(b) for b in bits))
|
||||
print(y, ''.join(str(b) for b in bits))
|
||||
return
|
||||
|
||||
def _reset_line(self):
|
||||
|
@ -454,8 +452,6 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def _do_vertical(self, dx):
|
||||
#print '* vertical(%d): curpos=%r, color=%r' % (dx, self._curpos, self._color)
|
||||
#print ' refline:', self._get_refline(self._curpos+1)
|
||||
x1 = self._curpos+1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
|
@ -481,8 +477,6 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def _do_pass(self):
|
||||
#print '* pass: curpos=%r, color=%r' % (self._curpos, self._color)
|
||||
#print ' refline:', self._get_refline(self._curpos+1)
|
||||
x1 = self._curpos+1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
|
@ -510,7 +504,6 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def _do_horizontal(self, n1, n2):
|
||||
#print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color)
|
||||
if self._curpos < 0:
|
||||
self._curpos = 0
|
||||
x = self._curpos
|
||||
|
@ -528,7 +521,6 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def _do_uncompressed(self, bits):
|
||||
#print '* uncompressed(%r): curpos=%r' % (bits, self._curpos)
|
||||
for c in bits:
|
||||
self._curline[self._curpos] = int(c)
|
||||
self._curpos += 1
|
||||
|
@ -536,8 +528,6 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
|
||||
|
||||
|
||||
class CCITTFaxDecoder(CCITTG4Parser):
|
||||
|
||||
def __init__(self, width, bytealign=False, reversed=False):
|
||||
|
@ -607,5 +597,6 @@ def main(argv):
|
|||
fp.close()
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -29,7 +29,7 @@ from .encodingdb import name2unicode
|
|||
from .utils import choplist
|
||||
from .utils import nunpack
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -156,7 +156,8 @@ class UnicodeMap(CMapBase):
|
|||
class FileCMap(CMap):
|
||||
|
||||
def add_code2cid(self, code, cid):
|
||||
assert isinstance(code, str) and isinstance(cid, int), str((type(code), type(cid)))
|
||||
assert isinstance(code, str) and isinstance(cid, int),\
|
||||
str((type(code), type(cid)))
|
||||
d = self.code2cid
|
||||
for c in code[:-1]:
|
||||
c = ord(c)
|
||||
|
@ -219,7 +220,7 @@ class CMapDB(object):
|
|||
pass
|
||||
|
||||
@classmethod
|
||||
def _load_data(klass, name):
|
||||
def _load_data(cls, name):
|
||||
name = name.replace("\0", "")
|
||||
filename = '%s.pickle.gz' % name
|
||||
log.info('loading: %r', name)
|
||||
|
@ -237,7 +238,7 @@ class CMapDB(object):
|
|||
raise CMapDB.CMapNotFound(name)
|
||||
|
||||
@classmethod
|
||||
def get_cmap(klass, name):
|
||||
def get_cmap(cls, name):
|
||||
if name == 'Identity-H':
|
||||
return IdentityCMap(WMode=0)
|
||||
elif name == 'Identity-V':
|
||||
|
@ -247,22 +248,23 @@ class CMapDB(object):
|
|||
elif name == 'OneByteIdentityV':
|
||||
return IdentityCMapByte(WMode=1)
|
||||
try:
|
||||
return klass._cmap_cache[name]
|
||||
return cls._cmap_cache[name]
|
||||
except KeyError:
|
||||
pass
|
||||
data = klass._load_data(name)
|
||||
klass._cmap_cache[name] = cmap = PyCMap(name, data)
|
||||
data = cls._load_data(name)
|
||||
cls._cmap_cache[name] = cmap = PyCMap(name, data)
|
||||
return cmap
|
||||
|
||||
@classmethod
|
||||
def get_unicode_map(klass, name, vertical=False):
|
||||
def get_unicode_map(cls, name, vertical=False):
|
||||
try:
|
||||
return klass._umap_cache[name][vertical]
|
||||
return cls._umap_cache[name][vertical]
|
||||
except KeyError:
|
||||
pass
|
||||
data = klass._load_data('to-unicode-%s' % name)
|
||||
klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
|
||||
return umaps[vertical]
|
||||
data = cls._load_data('to-unicode-%s' % name)
|
||||
cls._umap_cache[name] = [PyUnicodeMap(name, data, v)
|
||||
for v in (False, True)]
|
||||
return cls._umap_cache[name][vertical]
|
||||
|
||||
|
||||
class CMapParser(PSStackParser):
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# -*- coding: utf-8 -*-
|
||||
import logging
|
||||
import re
|
||||
|
@ -24,15 +23,12 @@ from .utils import enc
|
|||
from .utils import bbox2str
|
||||
from . import utils
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
## PDFLayoutAnalyzer
|
||||
##
|
||||
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||
|
||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||
PDFTextDevice.__init__(self, rsrcmgr)
|
||||
self.pageno = pageno
|
||||
|
@ -87,7 +83,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
if x0 == x1 or y0 == y1:
|
||||
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1),
|
||||
stroke, fill, evenodd, gstate.scolor, gstate.ncolor))
|
||||
stroke, fill, evenodd, gstate.scolor,
|
||||
gstate.ncolor))
|
||||
return
|
||||
if shape == 'mlllh':
|
||||
# rectangle
|
||||
|
@ -99,21 +96,23 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
|
||||
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
||||
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2),
|
||||
stroke, fill, evenodd, gstate.scolor, gstate.ncolor))
|
||||
stroke, fill, evenodd, gstate.scolor,
|
||||
gstate.ncolor))
|
||||
return
|
||||
# other shapes
|
||||
pts = []
|
||||
for p in path:
|
||||
for i in range(1, len(p), 2):
|
||||
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
||||
self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill,
|
||||
evenodd, gstate.scolor, gstate.ncolor))
|
||||
self.cur_item.add(LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||
gstate.scolor, gstate.ncolor))
|
||||
return
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||
graphicstate):
|
||||
try:
|
||||
text = font.to_unichr(cid)
|
||||
assert isinstance(text, six.text_type), str(type(text))
|
||||
|
@ -121,7 +120,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
text = self.handle_undefined_char(font, cid)
|
||||
textwidth = font.char_width(cid)
|
||||
textdisp = font.char_disp(cid)
|
||||
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate)
|
||||
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth,
|
||||
textdisp, ncs, graphicstate)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
||||
|
@ -133,12 +133,10 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
return
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFLayoutAnalyzer):
|
||||
|
||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
||||
laparams=laparams)
|
||||
self.result = None
|
||||
return
|
||||
|
||||
|
@ -150,12 +148,11 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
|
|||
return self.result
|
||||
|
||||
|
||||
## PDFConverter
|
||||
##
|
||||
class PDFConverter(PDFLayoutAnalyzer):
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||
laparams=None):
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
||||
laparams=laparams)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
if hasattr(self.outfp, 'mode'):
|
||||
|
@ -178,13 +175,11 @@ class PDFConverter(PDFLayoutAnalyzer):
|
|||
return
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
showpageno=False, imagewriter=None):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||
laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
self.imagewriter = imagewriter
|
||||
return
|
||||
|
@ -227,12 +222,8 @@ class TextConverter(PDFConverter):
|
|||
return
|
||||
|
||||
|
||||
## HTMLConverter
|
||||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
RECT_COLORS = {
|
||||
#'char': 'green',
|
||||
'figure': 'yellow',
|
||||
'textline': 'magenta',
|
||||
'textbox': 'cyan',
|
||||
|
@ -248,10 +239,15 @@ class HTMLConverter(PDFConverter):
|
|||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
|
||||
pagemargin=50, imagewriter=None, debug=0,
|
||||
rect_colors={'curve': 'black', 'page': 'gray'},
|
||||
text_colors={'char': 'black'}):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
pagemargin=50, imagewriter=None, debug=0, rect_colors=None,
|
||||
text_colors=None):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||
laparams=laparams)
|
||||
if text_colors is None:
|
||||
text_colors = {'char': 'black'}
|
||||
if rect_colors is None:
|
||||
rect_colors = {'curve': 'black', 'page': 'gray'}
|
||||
|
||||
self.scale = scale
|
||||
self.fontscale = fontscale
|
||||
self.layoutmode = layoutmode
|
||||
|
@ -280,15 +276,20 @@ class HTMLConverter(PDFConverter):
|
|||
def write_header(self):
|
||||
self.write('<html><head>\n')
|
||||
if self.codec:
|
||||
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
|
||||
s = '<meta http-equiv="Content-Type" content="text/html; ' \
|
||||
'charset=%s">\n' % self.codec
|
||||
else:
|
||||
self.write('<meta http-equiv="Content-Type" content="text/html">\n')
|
||||
s = '<meta http-equiv="Content-Type" content="text/html">\n'
|
||||
self.write(s)
|
||||
self.write('</head><body>\n')
|
||||
return
|
||||
|
||||
def write_footer(self):
|
||||
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
||||
', '.join('<a href="#%s">%s</a>' % (i, i) for i in range(1, self.pageno)))
|
||||
page_links = ['<a href="#%s">%s</a>' % (i, i)
|
||||
for i in range(1, self.pageno)]
|
||||
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
|
||||
', '.join(page_links)
|
||||
self.write(s)
|
||||
self.write('</body></html>\n')
|
||||
return
|
||||
|
||||
|
@ -299,32 +300,39 @@ class HTMLConverter(PDFConverter):
|
|||
def place_rect(self, color, borderwidth, x, y, w, h):
|
||||
color = self.rect_colors.get(color)
|
||||
if color is not None:
|
||||
self.write('<span style="position:absolute; border: %s %dpx solid; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(color, borderwidth,
|
||||
x*self.scale, (self._yoffset-y)*self.scale,
|
||||
w*self.scale, h*self.scale))
|
||||
s = '<span style="position:absolute; border: %s %dpx solid; ' \
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % \
|
||||
(color, borderwidth, x * self.scale,
|
||||
(self._yoffset - y) * self.scale, w * self.scale,
|
||||
h * self.scale)
|
||||
self.write(
|
||||
s)
|
||||
return
|
||||
|
||||
def place_border(self, color, borderwidth, item):
|
||||
self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
|
||||
self.place_rect(color, borderwidth, item.x0, item.y1, item.width,
|
||||
item.height)
|
||||
return
|
||||
|
||||
def place_image(self, item, borderwidth, x, y, w, h):
|
||||
if self.imagewriter is not None:
|
||||
name = self.imagewriter.export_image(item)
|
||||
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||
'width="%d" height="%d" />\n' %
|
||||
(enc(name, None), borderwidth,
|
||||
x*self.scale, (self._yoffset-y)*self.scale,
|
||||
w*self.scale, h*self.scale))
|
||||
s = '<img src="%s" border="%d" style="position:absolute; ' \
|
||||
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
|
||||
(enc(name, None), borderwidth, x * self.scale,
|
||||
(self._yoffset - y) * self.scale, w * self.scale,
|
||||
h * self.scale)
|
||||
self.write(s)
|
||||
return
|
||||
|
||||
def place_text(self, color, text, x, y, size):
|
||||
color = self.text_colors.get(color)
|
||||
if color is not None:
|
||||
self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||
(color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale))
|
||||
s = '<span style="position:absolute; color:%s; left:%dpx; ' \
|
||||
'top:%dpx; font-size:%dpx;">' % \
|
||||
(color, x * self.scale, (self._yoffset - y) * self.scale,
|
||||
size * self.scale * self.fontscale)
|
||||
self.write(s)
|
||||
self.write_text(text)
|
||||
self.write('</span>\n')
|
||||
return
|
||||
|
@ -332,11 +340,12 @@ class HTMLConverter(PDFConverter):
|
|||
def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
|
||||
self._fontstack.append(self._font)
|
||||
self._font = None
|
||||
self.write('<div style="position:absolute; border: %s %dpx solid; writing-mode:%s; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;">' %
|
||||
(color, borderwidth, writing_mode,
|
||||
x*self.scale, (self._yoffset-y)*self.scale,
|
||||
w*self.scale, h*self.scale))
|
||||
s = '<div style="position:absolute; border: %s %dpx solid; ' \
|
||||
'writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; ' \
|
||||
'height:%dpx;">' % \
|
||||
(color, borderwidth, writing_mode, x * self.scale,
|
||||
(self._yoffset - y) * self.scale, w * self.scale, h * self.scale)
|
||||
self.write(s)
|
||||
return
|
||||
|
||||
def end_div(self, color):
|
||||
|
@ -376,7 +385,8 @@ class HTMLConverter(PDFConverter):
|
|||
if self.showpageno:
|
||||
self.write('<div style="position:absolute; top:%dpx;">' %
|
||||
((self._yoffset-item.y1)*self.scale))
|
||||
self.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
|
||||
self.write('<a name="%s">Page %s</a></div>\n' % (
|
||||
item.pageid, item.pageid))
|
||||
for child in item:
|
||||
render(child)
|
||||
if item.groups is not None:
|
||||
|
@ -385,12 +395,14 @@ class HTMLConverter(PDFConverter):
|
|||
elif isinstance(item, LTCurve):
|
||||
self.place_border('curve', 1, item)
|
||||
elif isinstance(item, LTFigure):
|
||||
self.begin_div('figure', 1, item.x0, item.y1, item.width, item.height)
|
||||
self.begin_div('figure', 1, item.x0, item.y1, item.width,
|
||||
item.height)
|
||||
for child in item:
|
||||
render(child)
|
||||
self.end_div('figure')
|
||||
elif isinstance(item, LTImage):
|
||||
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
|
||||
self.place_image(item, 1, item.x0, item.y1, item.width,
|
||||
item.height)
|
||||
else:
|
||||
if self.layoutmode == 'exact':
|
||||
if isinstance(item, LTTextLine):
|
||||
|
@ -399,12 +411,14 @@ class HTMLConverter(PDFConverter):
|
|||
render(child)
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.place_border('textbox', 1, item)
|
||||
self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
|
||||
self.place_text('textbox', str(item.index+1), item.x0,
|
||||
item.y1, 20)
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTChar):
|
||||
self.place_border('char', 1, item)
|
||||
self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
|
||||
self.place_text('char', item.get_text(), item.x0,
|
||||
item.y1, item.size)
|
||||
else:
|
||||
if isinstance(item, LTTextLine):
|
||||
for child in item:
|
||||
|
@ -412,13 +426,15 @@ class HTMLConverter(PDFConverter):
|
|||
if self.layoutmode != 'loose':
|
||||
self.put_newline()
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.begin_div('textbox', 1, item.x0, item.y1, item.width, item.height,
|
||||
self.begin_div('textbox', 1, item.x0, item.y1,
|
||||
item.width, item.height,
|
||||
item.get_writing_mode())
|
||||
for child in item:
|
||||
render(child)
|
||||
self.end_div('textbox')
|
||||
elif isinstance(item, LTChar):
|
||||
self.put_text(item.get_text(), item.fontname, item.size)
|
||||
self.put_text(item.get_text(), item.fontname,
|
||||
item.size)
|
||||
elif isinstance(item, LTText):
|
||||
self.write_text(item.get_text())
|
||||
return
|
||||
|
@ -431,15 +447,14 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
|
||||
## XMLConverter
|
||||
##
|
||||
class XMLConverter(PDFConverter):
|
||||
|
||||
CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||
laparams=None, imagewriter=None, stripcontrol=False):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
imagewriter=None, stripcontrol=False):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||
laparams=laparams)
|
||||
self.imagewriter = imagewriter
|
||||
self.stripcontrol = stripcontrol
|
||||
self.write_header()
|
||||
|
@ -483,8 +498,9 @@ class XMLConverter(PDFConverter):
|
|||
|
||||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
(item.pageid, bbox2str(item.bbox), item.rotate))
|
||||
s = '<page id="%s" bbox="%s" rotate="%d">\n' % \
|
||||
(item.pageid, bbox2str(item.bbox), item.rotate)
|
||||
self.write(s)
|
||||
for child in item:
|
||||
render(child)
|
||||
if item.groups is not None:
|
||||
|
@ -494,17 +510,21 @@ class XMLConverter(PDFConverter):
|
|||
self.write('</layout>\n')
|
||||
self.write('</page>\n')
|
||||
elif isinstance(item, LTLine):
|
||||
self.write('<line linewidth="%d" bbox="%s" />\n' %
|
||||
(item.linewidth, bbox2str(item.bbox)))
|
||||
s = '<line linewidth="%d" bbox="%s" />\n' % \
|
||||
(item.linewidth, bbox2str(item.bbox))
|
||||
self.write(s)
|
||||
elif isinstance(item, LTRect):
|
||||
self.write('<rect linewidth="%d" bbox="%s" />\n' %
|
||||
(item.linewidth, bbox2str(item.bbox)))
|
||||
s = '<rect linewidth="%d" bbox="%s" />\n' % \
|
||||
(item.linewidth, bbox2str(item.bbox))
|
||||
self.write(s)
|
||||
elif isinstance(item, LTCurve):
|
||||
self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
|
||||
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
|
||||
s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % \
|
||||
(item.linewidth, bbox2str(item.bbox), item.get_pts())
|
||||
self.write(s)
|
||||
elif isinstance(item, LTFigure):
|
||||
self.write('<figure name="%s" bbox="%s">\n' %
|
||||
(item.name, bbox2str(item.bbox)))
|
||||
s = '<figure name="%s" bbox="%s">\n' % \
|
||||
(item.name, bbox2str(item.bbox))
|
||||
self.write(s)
|
||||
for child in item:
|
||||
render(child)
|
||||
self.write('</figure>\n')
|
||||
|
@ -517,15 +537,18 @@ class XMLConverter(PDFConverter):
|
|||
wmode = ''
|
||||
if isinstance(item, LTTextBoxVertical):
|
||||
wmode = ' wmode="vertical"'
|
||||
self.write('<textbox id="%d" bbox="%s"%s>\n' %
|
||||
(item.index, bbox2str(item.bbox), wmode))
|
||||
s = '<textbox id="%d" bbox="%s"%s>\n' %\
|
||||
(item.index, bbox2str(item.bbox), wmode)
|
||||
self.write(s)
|
||||
for child in item:
|
||||
render(child)
|
||||
self.write('</textbox>\n')
|
||||
elif isinstance(item, LTChar):
|
||||
self.write('<text font="%s" bbox="%s" colourspace="%s" ncolour="%s" size="%.3f">' %
|
||||
s = '<text font="%s" bbox="%s" colourspace="%s" ' \
|
||||
'ncolour="%s" size="%.3f">' % \
|
||||
(enc(item.fontname, None), bbox2str(item.bbox),
|
||||
item.ncs.name, item.graphicstate.ncolor, item.size))
|
||||
item.ncs.name, item.graphicstate.ncolor, item.size)
|
||||
self.write(s)
|
||||
self.write_text(item.get_text())
|
||||
self.write('</text>\n')
|
||||
elif isinstance(item, LTText):
|
||||
|
|
|
@ -15,12 +15,16 @@ log = logging.getLogger(__name__)
|
|||
def name2unicode(name):
|
||||
"""Converts Adobe glyph names to Unicode numbers.
|
||||
|
||||
In contrast to the specification, this raises a KeyError instead of return an empty string when the key is unknown.
|
||||
This way the caller must explicitly define what to do when there is not a match.
|
||||
In contrast to the specification, this raises a KeyError instead of
|
||||
returning an empty string when the key is unknown.
|
||||
This way the caller must explicitly define what to do
|
||||
when there is not a match.
|
||||
|
||||
Reference: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
||||
Reference:
|
||||
https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
||||
|
||||
:returns unicode character if name resembles something, otherwise a KeyError
|
||||
:returns unicode character if name resembles something,
|
||||
otherwise raises a KeyError
|
||||
"""
|
||||
name = name.split('.')[0]
|
||||
components = name.split('_')
|
||||
|
@ -35,8 +39,10 @@ def name2unicode(name):
|
|||
elif name.startswith('uni'):
|
||||
name_without_uni = name.strip('uni')
|
||||
|
||||
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
|
||||
unicode_digits = [int(name_without_uni[i:i + 4], base=16) for i in range(0, len(name_without_uni), 4)]
|
||||
if HEXADECIMAL.match(name_without_uni) and \
|
||||
len(name_without_uni) % 4 == 0:
|
||||
unicode_digits = [int(name_without_uni[i:i + 4], base=16)
|
||||
for i in range(0, len(name_without_uni), 4)]
|
||||
for digit in unicode_digits:
|
||||
raise_key_error_for_invalid_unicode(digit)
|
||||
characters = map(six.unichr, unicode_digits)
|
||||
|
@ -45,21 +51,25 @@ def name2unicode(name):
|
|||
elif name.startswith('u'):
|
||||
name_without_u = name.strip('u')
|
||||
|
||||
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
|
||||
if HEXADECIMAL.match(name_without_u) and \
|
||||
4 <= len(name_without_u) <= 6:
|
||||
unicode_digit = int(name_without_u, base=16)
|
||||
raise_key_error_for_invalid_unicode(unicode_digit)
|
||||
return six.unichr(unicode_digit)
|
||||
|
||||
raise KeyError('Could not convert unicode name "%s" to character because it does not match specification' % name)
|
||||
raise KeyError('Could not convert unicode name "%s" to character because '
|
||||
'it does not match specification' % name)
|
||||
|
||||
|
||||
def raise_key_error_for_invalid_unicode(unicode_digit):
|
||||
"""Unicode values should not be in the range D800 through DFFF because that is used for surrogate pairs in UTF-16
|
||||
"""Unicode values should not be in the range D800 through DFFF because
|
||||
that is used for surrogate pairs in UTF-16
|
||||
|
||||
:raises KeyError if unicode digit is invalid
|
||||
"""
|
||||
if 55295 < unicode_digit < 57344:
|
||||
raise KeyError('Unicode digit %d is invalid because it is in the range D800 through DFFF' % unicode_digit)
|
||||
raise KeyError('Unicode digit %d is invalid because '
|
||||
'it is in the range D800 through DFFF' % unicode_digit)
|
||||
|
||||
|
||||
class EncodingDB(object):
|
||||
|
@ -87,8 +97,8 @@ class EncodingDB(object):
|
|||
}
|
||||
|
||||
@classmethod
|
||||
def get_encoding(klass, name, diff=None):
|
||||
cid2unicode = klass.encodings.get(name, klass.std2unicode)
|
||||
def get_encoding(cls, name, diff=None):
|
||||
cid2unicode = cls.encodings.get(name, cls.std2unicode)
|
||||
if diff:
|
||||
cid2unicode = cid2unicode.copy()
|
||||
cid = 0
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
|
||||
|
||||
""" Font metrics for the Adobe core 14 fonts.
|
||||
|
||||
Font metrics are used to compute the boundary of each character
|
||||
|
@ -28,6 +26,8 @@ The following data were extracted from the AFM files:
|
|||
|
||||
### END Verbatim copy of the license part
|
||||
|
||||
# flake8: noqa
|
||||
|
||||
FONT_METRICS = {
|
||||
'Courier': ({'FontName': 'Courier', 'Descent': -194.0, 'FontBBox': (-6.0, -249.0, 639.0, 803.0), 'FontWeight': 'Medium', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {u' ': 600, u'!': 600, u'"': 600, u'#': 600, u'$': 600, u'%': 600, u'&': 600, u"'": 600, u'(': 600, u')': 600, u'*': 600, u'+': 600, u',': 600, u'-': 600, u'.': 600, u'/': 600, u'0': 600, u'1': 600, u'2': 600, u'3': 600, u'4': 600, u'5': 600, u'6': 600, u'7': 600, u'8': 600, u'9': 600, u':': 600, u';': 600, u'<': 600, u'=': 600, u'>': 600, u'?': 600, u'@': 600, u'A': 600, u'B': 600, u'C': 600, u'D': 600, u'E': 600, u'F': 600, u'G': 600, u'H': 600, u'I': 600, u'J': 600, u'K': 600, u'L': 600, u'M': 600, u'N': 600, u'O': 600, u'P': 600, u'Q': 600, u'R': 600, u'S': 600, u'T': 600, u'U': 600, u'V': 600, u'W': 600, u'X': 600, u'Y': 600, u'Z': 600, u'[': 600, u'\\': 600, u']': 600, u'^': 600, u'_': 600, u'`': 600, u'a': 600, u'b': 600, u'c': 600, u'd': 600, u'e': 600, u'f': 600, u'g': 600, u'h': 600, u'i': 600, u'j': 600, u'k': 600, u'l': 600, u'm': 600, u'n': 600, u'o': 600, u'p': 600, u'q': 600, u'r': 600, u's': 600, u't': 600, u'u': 600, u'v': 600, u'w': 600, u'x': 600, u'y': 600, u'z': 600, u'{': 600, u'|': 600, u'}': 600, u'~': 600, u'\xa1': 600, u'\xa2': 600, u'\xa3': 600, u'\xa4': 600, u'\xa5': 600, u'\xa6': 600, u'\xa7': 600, u'\xa8': 600, u'\xa9': 600, u'\xaa': 600, u'\xab': 600, u'\xac': 600, u'\xae': 600, u'\xaf': 600, u'\xb0': 600, u'\xb1': 600, u'\xb2': 600, u'\xb3': 600, u'\xb4': 600, u'\xb5': 600, u'\xb6': 600, u'\xb7': 600, u'\xb8': 600, u'\xb9': 600, u'\xba': 600, u'\xbb': 600, u'\xbc': 600, u'\xbd': 600, u'\xbe': 600, u'\xbf': 600, u'\xc0': 600, u'\xc1': 600, u'\xc2': 600, u'\xc3': 600, u'\xc4': 600, u'\xc5': 600, u'\xc6': 600, u'\xc7': 600, u'\xc8': 600, u'\xc9': 600, u'\xca': 600, u'\xcb': 600, u'\xcc': 600, u'\xcd': 600, u'\xce': 600, u'\xcf': 600, u'\xd0': 600, u'\xd1': 600, u'\xd2': 600, u'\xd3': 600, u'\xd4': 600, 
u'\xd5': 600, u'\xd6': 600, u'\xd7': 600, u'\xd8': 600, u'\xd9': 600, u'\xda': 600, u'\xdb': 600, u'\xdc': 600, u'\xdd': 600, u'\xde': 600, u'\xdf': 600, u'\xe0': 600, u'\xe1': 600, u'\xe2': 600, u'\xe3': 600, u'\xe4': 600, u'\xe5': 600, u'\xe6': 600, u'\xe7': 600, u'\xe8': 600, u'\xe9': 600, u'\xea': 600, u'\xeb': 600, u'\xec': 600, u'\xed': 600, u'\xee': 600, u'\xef': 600, u'\xf0': 600, u'\xf1': 600, u'\xf2': 600, u'\xf3': 600, u'\xf4': 600, u'\xf5': 600, u'\xf6': 600, u'\xf7': 600, u'\xf8': 600, u'\xf9': 600, u'\xfa': 600, u'\xfb': 600, u'\xfc': 600, u'\xfd': 600, u'\xfe': 600, u'\xff': 600, u'\u0100': 600, u'\u0101': 600, u'\u0102': 600, u'\u0103': 600, u'\u0104': 600, u'\u0105': 600, u'\u0106': 600, u'\u0107': 600, u'\u010c': 600, u'\u010d': 600, u'\u010e': 600, u'\u010f': 600, u'\u0110': 600, u'\u0111': 600, u'\u0112': 600, u'\u0113': 600, u'\u0116': 600, u'\u0117': 600, u'\u0118': 600, u'\u0119': 600, u'\u011a': 600, u'\u011b': 600, u'\u011e': 600, u'\u011f': 600, u'\u0122': 600, u'\u0123': 600, u'\u012a': 600, u'\u012b': 600, u'\u012e': 600, u'\u012f': 600, u'\u0130': 600, u'\u0131': 600, u'\u0136': 600, u'\u0137': 600, u'\u0139': 600, u'\u013a': 600, u'\u013b': 600, u'\u013c': 600, u'\u013d': 600, u'\u013e': 600, u'\u0141': 600, u'\u0142': 600, u'\u0143': 600, u'\u0144': 600, u'\u0145': 600, u'\u0146': 600, u'\u0147': 600, u'\u0148': 600, u'\u014c': 600, u'\u014d': 600, u'\u0150': 600, u'\u0151': 600, u'\u0152': 600, u'\u0153': 600, u'\u0154': 600, u'\u0155': 600, u'\u0156': 600, u'\u0157': 600, u'\u0158': 600, u'\u0159': 600, u'\u015a': 600, u'\u015b': 600, u'\u015e': 600, u'\u015f': 600, u'\u0160': 600, u'\u0161': 600, u'\u0162': 600, u'\u0163': 600, u'\u0164': 600, u'\u0165': 600, u'\u016a': 600, u'\u016b': 600, u'\u016e': 600, u'\u016f': 600, u'\u0170': 600, u'\u0171': 600, u'\u0172': 600, u'\u0173': 600, u'\u0178': 600, u'\u0179': 600, u'\u017a': 600, u'\u017b': 600, u'\u017c': 600, u'\u017d': 600, u'\u017e': 600, u'\u0192': 600, u'\u0218': 600, 
u'\u0219': 600, u'\u02c6': 600, u'\u02c7': 600, u'\u02d8': 600, u'\u02d9': 600, u'\u02da': 600, u'\u02db': 600, u'\u02dc': 600, u'\u02dd': 600, u'\u2013': 600, u'\u2014': 600, u'\u2018': 600, u'\u2019': 600, u'\u201a': 600, u'\u201c': 600, u'\u201d': 600, u'\u201e': 600, u'\u2020': 600, u'\u2021': 600, u'\u2022': 600, u'\u2026': 600, u'\u2030': 600, u'\u2039': 600, u'\u203a': 600, u'\u2044': 600, u'\u2122': 600, u'\u2202': 600, u'\u2206': 600, u'\u2211': 600, u'\u2212': 600, u'\u221a': 600, u'\u2260': 600, u'\u2264': 600, u'\u2265': 600, u'\u25ca': 600, u'\uf6c3': 600, u'\ufb01': 600, u'\ufb02': 600}),
|
||||
'Courier-Bold': ({'FontName': 'Courier-Bold', 'Descent': -194.0, 'FontBBox': (-88.0, -249.0, 697.0, 811.0), 'FontWeight': 'Bold', 'CapHeight': 572.0, 'FontFamily': 'Courier', 'Flags': 64, 'XHeight': 434.0, 'ItalicAngle': 0.0, 'Ascent': 627.0}, {u' ': 600, u'!': 600, u'"': 600, u'#': 600, u'$': 600, u'%': 600, u'&': 600, u"'": 600, u'(': 600, u')': 600, u'*': 600, u'+': 600, u',': 600, u'-': 600, u'.': 600, u'/': 600, u'0': 600, u'1': 600, u'2': 600, u'3': 600, u'4': 600, u'5': 600, u'6': 600, u'7': 600, u'8': 600, u'9': 600, u':': 600, u';': 600, u'<': 600, u'=': 600, u'>': 600, u'?': 600, u'@': 600, u'A': 600, u'B': 600, u'C': 600, u'D': 600, u'E': 600, u'F': 600, u'G': 600, u'H': 600, u'I': 600, u'J': 600, u'K': 600, u'L': 600, u'M': 600, u'N': 600, u'O': 600, u'P': 600, u'Q': 600, u'R': 600, u'S': 600, u'T': 600, u'U': 600, u'V': 600, u'W': 600, u'X': 600, u'Y': 600, u'Z': 600, u'[': 600, u'\\': 600, u']': 600, u'^': 600, u'_': 600, u'`': 600, u'a': 600, u'b': 600, u'c': 600, u'd': 600, u'e': 600, u'f': 600, u'g': 600, u'h': 600, u'i': 600, u'j': 600, u'k': 600, u'l': 600, u'm': 600, u'n': 600, u'o': 600, u'p': 600, u'q': 600, u'r': 600, u's': 600, u't': 600, u'u': 600, u'v': 600, u'w': 600, u'x': 600, u'y': 600, u'z': 600, u'{': 600, u'|': 600, u'}': 600, u'~': 600, u'\xa1': 600, u'\xa2': 600, u'\xa3': 600, u'\xa4': 600, u'\xa5': 600, u'\xa6': 600, u'\xa7': 600, u'\xa8': 600, u'\xa9': 600, u'\xaa': 600, u'\xab': 600, u'\xac': 600, u'\xae': 600, u'\xaf': 600, u'\xb0': 600, u'\xb1': 600, u'\xb2': 600, u'\xb3': 600, u'\xb4': 600, u'\xb5': 600, u'\xb6': 600, u'\xb7': 600, u'\xb8': 600, u'\xb9': 600, u'\xba': 600, u'\xbb': 600, u'\xbc': 600, u'\xbd': 600, u'\xbe': 600, u'\xbf': 600, u'\xc0': 600, u'\xc1': 600, u'\xc2': 600, u'\xc3': 600, u'\xc4': 600, u'\xc5': 600, u'\xc6': 600, u'\xc7': 600, u'\xc8': 600, u'\xc9': 600, u'\xca': 600, u'\xcb': 600, u'\xcc': 600, u'\xcd': 600, u'\xce': 600, u'\xcf': 600, u'\xd0': 600, u'\xd1': 600, u'\xd2': 600, u'\xd3': 600, u'\xd4': 
600, u'\xd5': 600, u'\xd6': 600, u'\xd7': 600, u'\xd8': 600, u'\xd9': 600, u'\xda': 600, u'\xdb': 600, u'\xdc': 600, u'\xdd': 600, u'\xde': 600, u'\xdf': 600, u'\xe0': 600, u'\xe1': 600, u'\xe2': 600, u'\xe3': 600, u'\xe4': 600, u'\xe5': 600, u'\xe6': 600, u'\xe7': 600, u'\xe8': 600, u'\xe9': 600, u'\xea': 600, u'\xeb': 600, u'\xec': 600, u'\xed': 600, u'\xee': 600, u'\xef': 600, u'\xf0': 600, u'\xf1': 600, u'\xf2': 600, u'\xf3': 600, u'\xf4': 600, u'\xf5': 600, u'\xf6': 600, u'\xf7': 600, u'\xf8': 600, u'\xf9': 600, u'\xfa': 600, u'\xfb': 600, u'\xfc': 600, u'\xfd': 600, u'\xfe': 600, u'\xff': 600, u'\u0100': 600, u'\u0101': 600, u'\u0102': 600, u'\u0103': 600, u'\u0104': 600, u'\u0105': 600, u'\u0106': 600, u'\u0107': 600, u'\u010c': 600, u'\u010d': 600, u'\u010e': 600, u'\u010f': 600, u'\u0110': 600, u'\u0111': 600, u'\u0112': 600, u'\u0113': 600, u'\u0116': 600, u'\u0117': 600, u'\u0118': 600, u'\u0119': 600, u'\u011a': 600, u'\u011b': 600, u'\u011e': 600, u'\u011f': 600, u'\u0122': 600, u'\u0123': 600, u'\u012a': 600, u'\u012b': 600, u'\u012e': 600, u'\u012f': 600, u'\u0130': 600, u'\u0131': 600, u'\u0136': 600, u'\u0137': 600, u'\u0139': 600, u'\u013a': 600, u'\u013b': 600, u'\u013c': 600, u'\u013d': 600, u'\u013e': 600, u'\u0141': 600, u'\u0142': 600, u'\u0143': 600, u'\u0144': 600, u'\u0145': 600, u'\u0146': 600, u'\u0147': 600, u'\u0148': 600, u'\u014c': 600, u'\u014d': 600, u'\u0150': 600, u'\u0151': 600, u'\u0152': 600, u'\u0153': 600, u'\u0154': 600, u'\u0155': 600, u'\u0156': 600, u'\u0157': 600, u'\u0158': 600, u'\u0159': 600, u'\u015a': 600, u'\u015b': 600, u'\u015e': 600, u'\u015f': 600, u'\u0160': 600, u'\u0161': 600, u'\u0162': 600, u'\u0163': 600, u'\u0164': 600, u'\u0165': 600, u'\u016a': 600, u'\u016b': 600, u'\u016e': 600, u'\u016f': 600, u'\u0170': 600, u'\u0171': 600, u'\u0172': 600, u'\u0173': 600, u'\u0178': 600, u'\u0179': 600, u'\u017a': 600, u'\u017b': 600, u'\u017c': 600, u'\u017d': 600, u'\u017e': 600, u'\u0192': 600, u'\u0218': 600, 
u'\u0219': 600, u'\u02c6': 600, u'\u02c7': 600, u'\u02d8': 600, u'\u02d9': 600, u'\u02da': 600, u'\u02db': 600, u'\u02dc': 600, u'\u02dd': 600, u'\u2013': 600, u'\u2014': 600, u'\u2018': 600, u'\u2019': 600, u'\u201a': 600, u'\u201c': 600, u'\u201d': 600, u'\u201e': 600, u'\u2020': 600, u'\u2021': 600, u'\u2022': 600, u'\u2026': 600, u'\u2030': 600, u'\u2039': 600, u'\u203a': 600, u'\u2044': 600, u'\u2122': 600, u'\u2202': 600, u'\u2206': 600, u'\u2211': 600, u'\u2212': 600, u'\u221a': 600, u'\u2260': 600, u'\u2264': 600, u'\u2265': 600, u'\u25ca': 600, u'\uf6c3': 600, u'\ufb01': 600, u'\ufb02': 600}),
|
||||
|
|
|
@ -4336,4 +4336,4 @@ glyphname2unicode = {
|
|||
'zuhiragana': u'\u305A',
|
||||
'zukatakana': u'\u30BA',
|
||||
}
|
||||
#--end
|
||||
# --end
|
||||
|
|
|
@ -5,12 +5,6 @@ import sys
|
|||
|
||||
import six
|
||||
|
||||
# Conditional import because python 2 is stupid
|
||||
if sys.version_info > (3, 0):
|
||||
from io import StringIO
|
||||
else:
|
||||
from io import BytesIO as StringIO
|
||||
|
||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from .pdfdevice import TagExtractor
|
||||
from .pdfpage import PDFPage
|
||||
|
@ -18,36 +12,46 @@ from .converter import XMLConverter, HTMLConverter, TextConverter
|
|||
from .image import ImageWriter
|
||||
from .layout import LAParams
|
||||
|
||||
# Conditional import because python 2 is stupid
|
||||
if sys.version_info > (3, 0):
|
||||
from io import StringIO
|
||||
else:
|
||||
from io import BytesIO as StringIO
|
||||
|
||||
|
||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||
laparams=None, maxpages=0, page_numbers=None,
|
||||
password="", scale=1.0, rotation=0, layoutmode='normal',
|
||||
output_dir=None, strip_control=False, debug=False,
|
||||
disable_caching=False, **kwargs):
|
||||
"""Parses text from inf-file and writes to outfp file-like object.
|
||||
|
||||
def extract_text_to_fp(inf, outfp,
|
||||
output_type='text', codec='utf-8', laparams = None,
|
||||
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
|
||||
layoutmode='normal', output_dir=None, strip_control=False,
|
||||
debug=False, disable_caching=False, **kwargs):
|
||||
"""
|
||||
Parses text from inf-file and writes to outfp file-like object.
|
||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
||||
Beware laparams: Including an empty LAParams is not the same as passing None!
|
||||
Returns nothing, acting as it does on two streams. Use StringIO to get strings.
|
||||
Beware laparams: Including an empty LAParams is not the same as passing
|
||||
None!
|
||||
|
||||
:param inf: a file-like object to read PDF structure from, such as a
|
||||
file handler (using the builtin `open()` function) or a `BytesIO`.
|
||||
:param outfp: a file-like object to write the text to.
|
||||
:param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works properly.
|
||||
:param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
|
||||
properly.
|
||||
:param codec: Text decoding codec
|
||||
:param laparams: An LAParams object from pdfminer.layout. Default is None but may not layout correctly.
|
||||
:param laparams: An LAParams object from pdfminer.layout. Default is None
|
||||
but may not layout correctly.
|
||||
:param maxpages: How many pages to stop parsing after
|
||||
:param page_numbers: zero-indexed page numbers to operate on.
|
||||
:param password: For encrypted PDFs, the password to decrypt.
|
||||
:param scale: Scale factor
|
||||
:param rotation: Rotation factor
|
||||
:param layoutmode: Default is 'normal', see pdfminer.converter.HTMLConverter
|
||||
:param layoutmode: Default is 'normal', see
|
||||
pdfminer.converter.HTMLConverter
|
||||
:param output_dir: If given, creates an ImageWriter for extracted images.
|
||||
:param strip_control: Does what it says on the tin
|
||||
:param debug: Output more logging data
|
||||
:param disable_caching: Does what it says on the tin
|
||||
:param other:
|
||||
:return:
|
||||
:return: nothing, acting as it does on two streams. Use StringIO to get
|
||||
strings.
|
||||
"""
|
||||
if '_py2_no_more_posargs' in kwargs is not None:
|
||||
raise DeprecationWarning(
|
||||
|
@ -134,4 +138,3 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
interpreter.process_page(page)
|
||||
|
||||
return output_string.getvalue()
|
||||
|
||||
|
|
|
@ -15,8 +15,6 @@ def align32(x):
|
|||
return ((x+3)//4)*4
|
||||
|
||||
|
||||
## BMPWriter
|
||||
##
|
||||
class BMPWriter(object):
|
||||
|
||||
def __init__(self, fp, bits, width, height):
|
||||
|
@ -35,9 +33,11 @@ class BMPWriter(object):
|
|||
self.linesize = align32((self.width*self.bits+7)//8)
|
||||
self.datasize = self.linesize * self.height
|
||||
headersize = 14+40+ncols*4
|
||||
info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height, 1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
|
||||
info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height,
|
||||
1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
|
||||
assert len(info) == 40, str(len(info))
|
||||
header = struct.pack('<ccIHHI', b'B', b'M', headersize+self.datasize, 0, 0, headersize)
|
||||
header = struct.pack('<ccIHHI', b'B', b'M',
|
||||
headersize+self.datasize, 0, 0, headersize)
|
||||
assert len(header) == 14, str(len(header))
|
||||
self.fp.write(header)
|
||||
self.fp.write(info)
|
||||
|
@ -76,7 +76,8 @@ class ImageWriter(object):
|
|||
|
||||
is_jbig2 = self.is_jbig2_image(image)
|
||||
ext = self._get_image_extension(image, width, height, is_jbig2)
|
||||
name, path = self._create_unique_image_name(self.outdir, image.name, ext)
|
||||
name, path = self._create_unique_image_name(self.outdir,
|
||||
image.name, ext)
|
||||
|
||||
fp = open(path, 'wb')
|
||||
if ext == '.jpg':
|
||||
|
@ -146,7 +147,9 @@ class ImageWriter(object):
|
|||
elif is_jbig2:
|
||||
ext = '.jb2'
|
||||
elif (image.bits == 1 or
|
||||
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
|
||||
image.bits == 8 and
|
||||
(LITERAL_DEVICE_RGB in image.colorspace or
|
||||
LITERAL_DEVICE_GRAY in image.colorspace)):
|
||||
ext = '.%dx%d.bmp' % (width, height)
|
||||
else:
|
||||
ext = '.%d.%dx%d.img' % (image.bits, width, height)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import heapq
|
||||
import logging
|
||||
|
||||
from .utils import INF, shorten_str
|
||||
from .utils import INF
|
||||
from .utils import Plane
|
||||
from .utils import apply_matrix_pt
|
||||
from .utils import bbox2str
|
||||
|
@ -75,8 +75,10 @@ class LAParams(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<LAParams: char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
|
||||
(self.char_margin, self.line_margin, self.word_margin, self.all_texts))
|
||||
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
||||
'word_margin=%.1f all_texts=%r>' % \
|
||||
(self.char_margin, self.line_margin, self.word_margin,
|
||||
self.all_texts)
|
||||
|
||||
|
||||
class LTItem(object):
|
||||
|
@ -178,7 +180,8 @@ class LTComponent(LTItem):
|
|||
class LTCurve(LTComponent):
|
||||
"""A generic Bezier curve"""
|
||||
|
||||
def __init__(self, linewidth, pts, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None):
|
||||
def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False,
|
||||
stroking_color=None, non_stroking_color=None):
|
||||
LTComponent.__init__(self, get_bound(pts))
|
||||
self.pts = pts
|
||||
self.linewidth = linewidth
|
||||
|
@ -199,8 +202,10 @@ class LTLine(LTCurve):
|
|||
Could be used for separating text or figures.
|
||||
"""
|
||||
|
||||
def __init__(self, linewidth, p0, p1, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None):
|
||||
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, stroking_color, non_stroking_color)
|
||||
def __init__(self, linewidth, p0, p1, stroke=False, fill=False,
|
||||
evenodd=False, stroking_color=None, non_stroking_color=None):
|
||||
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd,
|
||||
stroking_color, non_stroking_color)
|
||||
return
|
||||
|
||||
|
||||
|
@ -210,9 +215,12 @@ class LTRect(LTCurve):
|
|||
Could be used for framing other pictures or figures.
|
||||
"""
|
||||
|
||||
def __init__(self, linewidth, bbox, stroke = False, fill = False, evenodd = False, stroking_color = None, non_stroking_color = None):
|
||||
def __init__(self, linewidth, bbox, stroke=False, fill=False,
|
||||
evenodd=False, stroking_color=None, non_stroking_color=None):
|
||||
(x0, y0, x1, y1) = bbox
|
||||
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke, fill, evenodd, stroking_color, non_stroking_color)
|
||||
LTCurve.__init__(self, linewidth,
|
||||
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke,
|
||||
fill, evenodd, stroking_color, non_stroking_color)
|
||||
return
|
||||
|
||||
|
||||
|
@ -367,7 +375,8 @@ class LTTextContainer(LTExpandableContainer, LTText):
|
|||
return
|
||||
|
||||
def get_text(self):
|
||||
return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
|
||||
return ''.join(obj.get_text() for obj in self
|
||||
if isinstance(obj, LTText))
|
||||
|
||||
|
||||
class LTTextLine(LTTextContainer):
|
||||
|
@ -449,9 +458,9 @@ class LTTextLineVertical(LTTextLine):
|
|||
class LTTextBox(LTTextContainer):
|
||||
"""Represents a group of text chunks in a rectangular area.
|
||||
|
||||
Note that this box is created by geometric analysis and does not necessarily
|
||||
represents a logical boundary of the text. It contains a list of
|
||||
LTTextLine objects.
|
||||
Note that this box is created by geometric analysis and does not
|
||||
necessarily represent a logical boundary of the text. It contains a list
|
||||
of LTTextLine objects.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -496,9 +505,9 @@ class LTTextGroupLRTB(LTTextGroup):
|
|||
def analyze(self, laparams):
|
||||
LTTextGroup.analyze(self, laparams)
|
||||
# reorder the objects from top-left to bottom-right.
|
||||
self._objs.sort(key=lambda obj:
|
||||
(1-laparams.boxes_flow)*(obj.x0) -
|
||||
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
||||
self._objs.sort(
|
||||
key=lambda obj: (1 - laparams.boxes_flow) * obj.x0
|
||||
- (1 + laparams.boxes_flow) * (obj.y0 + obj.y1))
|
||||
return
|
||||
|
||||
|
||||
|
@ -506,9 +515,9 @@ class LTTextGroupTBRL(LTTextGroup):
|
|||
def analyze(self, laparams):
|
||||
LTTextGroup.analyze(self, laparams)
|
||||
# reorder the objects from top-right to bottom-left.
|
||||
self._objs.sort(key=lambda obj:
|
||||
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
||||
-(1-laparams.boxes_flow)*(obj.y1))
|
||||
self._objs.sort(
|
||||
key=lambda obj: - (1 + laparams.boxes_flow) * (obj.x0 + obj.x1)
|
||||
- (1 - laparams.boxes_flow) * obj.y1)
|
||||
return
|
||||
|
||||
|
||||
|
@ -534,12 +543,13 @@ class LTLayoutContainer(LTContainer):
|
|||
#
|
||||
# |<--->|
|
||||
# (char_margin)
|
||||
halign = (obj0.is_compatible(obj1) and
|
||||
obj0.is_voverlap(obj1) and
|
||||
(min(obj0.height, obj1.height) * laparams.line_overlap <
|
||||
obj0.voverlap(obj1)) and
|
||||
(obj0.hdistance(obj1) <
|
||||
max(obj0.width, obj1.width) * laparams.char_margin))
|
||||
halign = \
|
||||
obj0.is_compatible(obj1) \
|
||||
and obj0.is_voverlap(obj1) \
|
||||
and min(obj0.height, obj1.height) * laparams.line_overlap \
|
||||
< obj0.voverlap(obj1) \
|
||||
and obj0.hdistance(obj1) \
|
||||
< max(obj0.width, obj1.width) * laparams.char_margin
|
||||
|
||||
# valign: obj0 and obj1 are vertically aligned.
|
||||
#
|
||||
|
@ -555,13 +565,14 @@ class LTLayoutContainer(LTContainer):
|
|||
#
|
||||
# |<-->|
|
||||
# (line_overlap)
|
||||
valign = (laparams.detect_vertical and
|
||||
obj0.is_compatible(obj1) and
|
||||
obj0.is_hoverlap(obj1) and
|
||||
(min(obj0.width, obj1.width) * laparams.line_overlap <
|
||||
obj0.hoverlap(obj1)) and
|
||||
(obj0.vdistance(obj1) <
|
||||
max(obj0.height, obj1.height) * laparams.char_margin))
|
||||
valign = \
|
||||
laparams.detect_vertical \
|
||||
and obj0.is_compatible(obj1) \
|
||||
and obj0.is_hoverlap(obj1) \
|
||||
and min(obj0.width, obj1.width) * laparams.line_overlap \
|
||||
< obj0.hoverlap(obj1) \
|
||||
and obj0.vdistance(obj1) \
|
||||
< max(obj0.height, obj1.height) * laparams.char_margin
|
||||
|
||||
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
||||
(valign and isinstance(line, LTTextLineVertical))):
|
||||
|
@ -598,7 +609,8 @@ class LTLayoutContainer(LTContainer):
|
|||
boxes = {}
|
||||
for line in lines:
|
||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||
if line not in neighbors: continue
|
||||
if line not in neighbors:
|
||||
continue
|
||||
members = []
|
||||
for obj1 in neighbors:
|
||||
members.append(obj1)
|
||||
|
@ -613,7 +625,8 @@ class LTLayoutContainer(LTContainer):
|
|||
boxes[obj] = box
|
||||
done = set()
|
||||
for line in lines:
|
||||
if line not in boxes: continue
|
||||
if line not in boxes:
|
||||
continue
|
||||
box = boxes[line]
|
||||
if box in done:
|
||||
continue
|
||||
|
@ -625,14 +638,16 @@ class LTLayoutContainer(LTContainer):
|
|||
def group_textboxes(self, laparams, boxes):
|
||||
"""Group textboxes hierarchically.
|
||||
|
||||
Get pair-wise distances, via dist func defined below, and then merge from the closest textbox pair. Once
|
||||
obj1 and obj2 are merged / grouped, the resulting group is considered as a new object, and its distances to
|
||||
other objects & groups are added to the process queue.
|
||||
Get pair-wise distances, via dist func defined below, and then merge
|
||||
from the closest textbox pair. Once obj1 and obj2 are merged /
|
||||
grouped, the resulting group is considered as a new object, and its
|
||||
distances to other objects & groups are added to the process queue.
|
||||
|
||||
For performance reason, pair-wise distances and object pair info are maintained in a heap of
|
||||
(idx, dist, id(obj1), id(obj2), obj1, obj2) tuples. It ensures quick access to the smallest element. Note that
|
||||
since comparison operators, e.g., __lt__, are disabled for LTComponent, id(obj) has to appear before obj in
|
||||
element tuples.
|
||||
For performance reasons, pair-wise distances and object pair info are
|
||||
maintained in a heap of (idx, dist, id(obj1), id(obj2), obj1, obj2)
|
||||
tuples. It ensures quick access to the smallest element. Note that
|
||||
since comparison operators, e.g., __lt__, are disabled for
|
||||
LTComponent, id(obj) has to appear before obj in element tuples.
|
||||
|
||||
:param laparams: LAParams object.
|
||||
:param boxes: All textbox objects to be grouped.
|
||||
|
@ -655,7 +670,8 @@ class LTLayoutContainer(LTContainer):
|
|||
y0 = min(obj1.y0, obj2.y0)
|
||||
x1 = max(obj1.x1, obj2.x1)
|
||||
y1 = max(obj1.y1, obj2.y1)
|
||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||
return (x1 - x0) * (y1 - y0) \
|
||||
- obj1.width*obj1.height - obj2.width*obj2.height
|
||||
|
||||
def isany(obj1, obj2):
|
||||
"""Check if there's any other object between obj1 and obj2."""
|
||||
|
@ -695,14 +711,16 @@ class LTLayoutContainer(LTContainer):
|
|||
done.update([id1, id2])
|
||||
|
||||
for other in plane:
|
||||
heapq.heappush(dists, (False, dist(group, other), id(group), id(other), group, other))
|
||||
heapq.heappush(dists, (False, dist(group, other),
|
||||
id(group), id(other), group, other))
|
||||
plane.add(group)
|
||||
return list(plane)
|
||||
|
||||
def analyze(self, laparams):
|
||||
# textobjs is a list of LTChar objects, i.e.
|
||||
# it has all the individual characters in the page.
|
||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
|
||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
|
||||
self)
|
||||
for obj in otherobjs:
|
||||
obj.analyze(laparams)
|
||||
if not textobjs:
|
||||
|
@ -712,7 +730,8 @@ class LTLayoutContainer(LTContainer):
|
|||
for obj in empties:
|
||||
obj.analyze(laparams)
|
||||
textboxes = list(self.group_textlines(laparams, textlines))
|
||||
if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 and textboxes:
|
||||
if -1 <= laparams.boxes_flow and laparams.boxes_flow <= +1 \
|
||||
and textboxes:
|
||||
self.groups = self.group_textboxes(laparams, textboxes)
|
||||
assigner = IndexAssigner()
|
||||
for group in self.groups:
|
||||
|
@ -742,8 +761,8 @@ class LTFigure(LTLayoutContainer):
|
|||
self.name = name
|
||||
self.matrix = matrix
|
||||
(x, y, w, h) = bbox
|
||||
bbox = get_bound(apply_matrix_pt(matrix, (p, q))
|
||||
for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
|
||||
bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h))
|
||||
bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in bounds)
|
||||
LTLayoutContainer.__init__(self, bbox)
|
||||
return
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
|
||||
from io import BytesIO
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
import six
|
||||
import logging
|
||||
|
||||
|
||||
|
@ -12,8 +10,7 @@ logger = logging.getLogger(__name__)
|
|||
class CorruptDataError(Exception):
|
||||
pass
|
||||
|
||||
## LZWDecoder
|
||||
##
|
||||
|
||||
class LZWDecoder(object):
|
||||
|
||||
def __init__(self, fp):
|
||||
|
@ -71,12 +68,12 @@ class LZWDecoder(object):
|
|||
x = self.table[code]
|
||||
else:
|
||||
raise CorruptDataError
|
||||
l = len(self.table)
|
||||
if l == 511:
|
||||
table_length = len(self.table)
|
||||
if table_length == 511:
|
||||
self.nbits = 10
|
||||
elif l == 1023:
|
||||
elif table_length == 1023:
|
||||
self.nbits = 11
|
||||
elif l == 2047:
|
||||
elif table_length == 2047:
|
||||
self.nbits = 12
|
||||
self.prevbuf = x
|
||||
return x
|
||||
|
@ -93,13 +90,12 @@ class LZWDecoder(object):
|
|||
# just ignore corrupt data and stop yielding there
|
||||
break
|
||||
yield x
|
||||
logger.debug('nbits=%d, code=%d, output=%r, table=%r' %
|
||||
(self.nbits, code, x, self.table[258:]))
|
||||
logger.debug('nbits=%d, code=%d, output=%r, table=%r'
|
||||
% (self.nbits, code, x, self.table[258:]))
|
||||
return
|
||||
|
||||
|
||||
# lzwdecode
|
||||
def lzwdecode(data):
|
||||
fp = BytesIO(data)
|
||||
s=LZWDecoder(fp).run()
|
||||
s = LZWDecoder(fp).run()
|
||||
return b''.join(s)
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
import collections
|
||||
from .psparser import LIT
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
|
||||
## PDFColorSpace
|
||||
##
|
||||
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||
|
@ -18,7 +17,8 @@ class PDFColorSpace(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||
return '<PDFColorSpace: %s, ncomponents=%d>' % \
|
||||
(self.name, self.ncomponents)
|
||||
|
||||
|
||||
if six.PY2:
|
||||
|
@ -37,4 +37,4 @@ for (name, n) in [
|
|||
('Indexed', 1),
|
||||
('Pattern', 1),
|
||||
]:
|
||||
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
|
||||
PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
|
||||
|
|
|
@ -62,8 +62,6 @@ class PDFDevice(object):
|
|||
return
|
||||
|
||||
|
||||
## PDFTextDevice
|
||||
##
|
||||
class PDFTextDevice(PDFDevice):
|
||||
|
||||
def render_string(self, textstate, seq, ncs, graphicstate):
|
||||
|
@ -80,11 +78,13 @@ class PDFTextDevice(PDFDevice):
|
|||
if font.is_vertical():
|
||||
textstate.linematrix = self.render_string_vertical(
|
||||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
|
||||
scaling, charspace, wordspace, rise, dxscale, ncs,
|
||||
graphicstate)
|
||||
else:
|
||||
textstate.linematrix = self.render_string_horizontal(
|
||||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
|
||||
scaling, charspace, wordspace, rise, dxscale, ncs,
|
||||
graphicstate)
|
||||
return
|
||||
|
||||
def render_string_horizontal(self, seq, matrix, pos,
|
||||
|
@ -100,9 +100,9 @@ class PDFTextDevice(PDFDevice):
|
|||
for cid in font.decode(obj):
|
||||
if needcharspace:
|
||||
x += charspace
|
||||
x += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
||||
font, fontsize, scaling, rise, cid,
|
||||
ncs, graphicstate)
|
||||
x += self.render_char(
|
||||
utils.translate_matrix(matrix, (x, y)), font,
|
||||
fontsize, scaling, rise, cid, ncs, graphicstate)
|
||||
if cid == 32 and wordspace:
|
||||
x += wordspace
|
||||
needcharspace = True
|
||||
|
@ -121,20 +121,19 @@ class PDFTextDevice(PDFDevice):
|
|||
for cid in font.decode(obj):
|
||||
if needcharspace:
|
||||
y += charspace
|
||||
y += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
||||
font, fontsize, scaling, rise, cid,
|
||||
ncs, graphicstate)
|
||||
y += self.render_char(
|
||||
utils.translate_matrix(matrix, (x, y)), font, fontsize,
|
||||
scaling, rise, cid, ncs, graphicstate)
|
||||
if cid == 32 and wordspace:
|
||||
y += wordspace
|
||||
needcharspace = True
|
||||
return (x, y)
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||
graphicstate):
|
||||
return 0
|
||||
|
||||
|
||||
## TagExtractor
|
||||
##
|
||||
class TagExtractor(PDFDevice):
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8'):
|
||||
|
@ -165,7 +164,8 @@ class TagExtractor(PDFDevice):
|
|||
return
|
||||
|
||||
def begin_page(self, page, ctm):
|
||||
output = '<page id="%s" bbox="%s" rotate="%d">' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
||||
output = '<page id="%s" bbox="%s" rotate="%d">' %\
|
||||
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
||||
self.outfp.write(utils.make_compat_bytes(output))
|
||||
return
|
||||
|
||||
|
@ -177,8 +177,8 @@ class TagExtractor(PDFDevice):
|
|||
def begin_tag(self, tag, props=None):
|
||||
s = ''
|
||||
if isinstance(props, dict):
|
||||
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
|
||||
in sorted(six.iteritems(props)))
|
||||
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v)))
|
||||
for (k, v) in sorted(six.iteritems(props)))
|
||||
out_s = '<%s%s>' % (utils.enc(tag.name), s)
|
||||
self.outfp.write(utils.make_compat_bytes(out_s))
|
||||
self._stack.append(tag)
|
||||
|
|
|
@ -3,7 +3,7 @@ import re
|
|||
import struct
|
||||
import logging
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
import six
|
||||
try:
|
||||
import hashlib as md5
|
||||
except ImportError:
|
||||
|
@ -39,34 +39,37 @@ from .utils import decode_text
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
## Exceptions
|
||||
##
|
||||
|
||||
class PDFNoValidXRef(PDFSyntaxError):
|
||||
pass
|
||||
|
||||
|
||||
class PDFNoOutlines(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFDestinationNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFEncryptionError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFPasswordIncorrect(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_OBJSTM = LIT('ObjStm')
|
||||
LITERAL_XREF = LIT('XRef')
|
||||
LITERAL_CATALOG = LIT('Catalog')
|
||||
|
||||
|
||||
## XRefs
|
||||
##
|
||||
class PDFBaseXRef(object):
|
||||
|
||||
def get_trailer(self):
|
||||
|
@ -82,8 +85,6 @@ class PDFBaseXRef(object):
|
|||
raise KeyError(objid)
|
||||
|
||||
|
||||
## PDFXRef
|
||||
##
|
||||
class PDFXRef(PDFBaseXRef):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -109,14 +110,16 @@ class PDFXRef(PDFBaseXRef):
|
|||
break
|
||||
f = line.strip().split(b' ')
|
||||
if len(f) != 2:
|
||||
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
|
||||
error_msg = 'Trailer not found: %r: line=%r' % (parser, line)
|
||||
raise PDFNoValidXRef(error_msg)
|
||||
try:
|
||||
if six.PY2:
|
||||
(start, nobjs) = map(long, f)
|
||||
(start, nobjs) = map(long, f) # noqa F821
|
||||
else:
|
||||
(start, nobjs) = map(int, f)
|
||||
except ValueError:
|
||||
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
|
||||
error_msg = 'Invalid line: %r: line=%r' % (parser, line)
|
||||
raise PDFNoValidXRef(error_msg)
|
||||
for objid in range(start, start+nobjs):
|
||||
try:
|
||||
(_, line) = parser.nextline()
|
||||
|
@ -124,11 +127,17 @@ class PDFXRef(PDFBaseXRef):
|
|||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||
f = line.strip().split(b' ')
|
||||
if len(f) != 3:
|
||||
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
||||
error_msg = 'Invalid XRef format: %r, line=%r' \
|
||||
% (parser, line)
|
||||
raise PDFNoValidXRef(error_msg)
|
||||
(pos, genno, use) = f
|
||||
if use != b'n':
|
||||
continue
|
||||
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
|
||||
if six.PY2:
|
||||
position = long(pos) # noqa F821
|
||||
else:
|
||||
position = int(pos)
|
||||
self.offsets[objid] = None, position, int(genno)
|
||||
log.info('xref objects: %r', self.offsets)
|
||||
self.load_trailer(parser)
|
||||
return
|
||||
|
@ -160,8 +169,6 @@ class PDFXRef(PDFBaseXRef):
|
|||
raise
|
||||
|
||||
|
||||
## PDFXRefFallback
|
||||
##
|
||||
class PDFXRefFallback(PDFXRef):
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -182,7 +189,7 @@ class PDFXRefFallback(PDFXRef):
|
|||
log.info('trailer: %r', self.trailer)
|
||||
break
|
||||
if six.PY3:
|
||||
line=line.decode('latin-1') #default pdf encoding
|
||||
line = line.decode('latin-1') # default pdf encoding
|
||||
m = self.PDFOBJ_CUE.match(line)
|
||||
if not m:
|
||||
continue
|
||||
|
@ -193,7 +200,8 @@ class PDFXRefFallback(PDFXRef):
|
|||
# expand ObjStm.
|
||||
parser.seek(pos)
|
||||
(_, obj) = parser.nextobject()
|
||||
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
|
||||
if isinstance(obj, PDFStream) \
|
||||
and obj.get('Type') is LITERAL_OBJSTM:
|
||||
stream = stream_value(obj)
|
||||
try:
|
||||
n = stream['N']
|
||||
|
@ -216,8 +224,6 @@ class PDFXRefFallback(PDFXRef):
|
|||
return
|
||||
|
||||
|
||||
## PDFXRefStream
|
||||
##
|
||||
class PDFXRefStream(PDFBaseXRef):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -235,7 +241,8 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
(_, genno) = parser.nexttoken() # ignored
|
||||
(_, kwd) = parser.nexttoken()
|
||||
(_, stream) = parser.nextobject()
|
||||
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
|
||||
if not isinstance(stream, PDFStream) \
|
||||
or stream['Type'] is not LITERAL_XREF:
|
||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||
size = stream['Size']
|
||||
index_array = stream.get('Index', (0, size))
|
||||
|
@ -288,8 +295,6 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
raise KeyError(objid)
|
||||
|
||||
|
||||
## PDFSecurityHandler
|
||||
##
|
||||
class PDFStandardSecurityHandler(object):
|
||||
|
||||
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
|
||||
|
@ -306,7 +311,8 @@ class PDFStandardSecurityHandler(object):
|
|||
def init(self):
|
||||
self.init_params()
|
||||
if self.r not in self.supported_revisions:
|
||||
raise PDFEncryptionError('Unsupported revision: param=%r' % self.param)
|
||||
error_msg = 'Unsupported revision: param=%r' % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
self.init_key()
|
||||
return
|
||||
|
||||
|
@ -412,7 +418,8 @@ class PDFStandardSecurityHandler(object):
|
|||
return self.decrypt_rc4(objid, genno, data)
|
||||
|
||||
def decrypt_rc4(self, objid, genno, data):
|
||||
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
|
||||
key = self.key + struct.pack('<L', objid)[:3] \
|
||||
+ struct.pack('<L', genno)[:2]
|
||||
hash = md5.md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return ARC4.new(key).decrypt(data)
|
||||
|
@ -430,16 +437,20 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
|||
self.strf = literal_name(self.param['StrF'])
|
||||
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
|
||||
if self.stmf != self.strf:
|
||||
raise PDFEncryptionError('Unsupported crypt filter: param=%r' % self.param)
|
||||
error_msg = 'Unsupported crypt filter: param=%r' % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
self.cfm = {}
|
||||
for k, v in self.cf.items():
|
||||
f = self.get_cfm(literal_name(v['CFM']))
|
||||
if f is None:
|
||||
raise PDFEncryptionError('Unknown crypt filter method: param=%r' % self.param)
|
||||
error_msg = 'Unknown crypt filter method: param=%r' \
|
||||
% self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
self.cfm[k] = f
|
||||
self.cfm['Identity'] = self.decrypt_identity
|
||||
if self.strf not in self.cfm:
|
||||
raise PDFEncryptionError('Undefined crypt filter: param=%r' % self.param)
|
||||
error_msg = 'Undefined crypt filter: param=%r' % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
return
|
||||
|
||||
def get_cfm(self, name):
|
||||
|
@ -463,7 +474,8 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
|||
return data
|
||||
|
||||
def decrypt_aes128(self, objid, genno, data):
|
||||
key = self.key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2] + b'sAlT'
|
||||
key = self.key + struct.pack('<L', objid)[:3] \
|
||||
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
||||
hash = md5.md5(key)
|
||||
key = hash.digest()[:min(len(key), 16)]
|
||||
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
|
||||
|
@ -501,23 +513,23 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
hash = SHA256.new(password)
|
||||
hash.update(self.o_key_salt)
|
||||
hash.update(self.u)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16).decrypt(self.oe)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
|
||||
.decrypt(self.oe)
|
||||
hash = SHA256.new(password)
|
||||
hash.update(self.u_validation_salt)
|
||||
if hash.digest() == self.u_hash:
|
||||
hash = SHA256.new(password)
|
||||
hash.update(self.u_key_salt)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16).decrypt(self.ue)
|
||||
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
|
||||
.decrypt(self.ue)
|
||||
return None
|
||||
|
||||
def decrypt_aes256(self, objid, genno, data):
|
||||
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
|
||||
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16])\
|
||||
.decrypt(data[16:])
|
||||
|
||||
|
||||
## PDFDocument
|
||||
##
|
||||
class PDFDocument(object):
|
||||
|
||||
"""PDFDocument object represents a PDF document.
|
||||
|
||||
Since a PDF file can be very big, normally it is not loaded at
|
||||
|
@ -571,7 +583,6 @@ class PDFDocument(object):
|
|||
continue
|
||||
# If there's an encryption info, remember it.
|
||||
if 'Encrypt' in trailer:
|
||||
#assert not self.encryption, str(self.encryption)
|
||||
self.encryption = (list_value(trailer['ID']),
|
||||
dict_value(trailer['Encrypt']))
|
||||
self._initialize_password(password)
|
||||
|
@ -648,18 +659,18 @@ class PDFDocument(object):
|
|||
(_, objid1) = self._parser.nexttoken() # objid
|
||||
(_, genno) = self._parser.nexttoken() # genno
|
||||
(_, kwd) = self._parser.nexttoken()
|
||||
# #### hack around malformed pdf files
|
||||
# copied from https://github.com/jaepil/pdfminer3k/blob/master/pdfminer/pdfparser.py#L399
|
||||
#to solve https://github.com/pdfminer/pdfminer.six/issues/56
|
||||
#assert objid1 == objid, str((objid1, objid))
|
||||
# hack around malformed pdf files
|
||||
# copied from https://github.com/jaepil/pdfminer3k/blob/master/
|
||||
# pdfminer/pdfparser.py#L399
|
||||
# to solve https://github.com/pdfminer/pdfminer.six/issues/56
|
||||
# assert objid1 == objid, str((objid1, objid))
|
||||
if objid1 != objid:
|
||||
x = []
|
||||
while kwd is not self.KEYWORD_OBJ:
|
||||
(_,kwd) = self._parser.nexttoken()
|
||||
(_, kwd) = self._parser.nexttoken()
|
||||
x.append(kwd)
|
||||
if x:
|
||||
objid1 = x[-2]
|
||||
genno = x[-1]
|
||||
# #### end hack around malformed pdf files
|
||||
if objid1 != objid:
|
||||
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
||||
|
@ -694,7 +705,8 @@ class PDFDocument(object):
|
|||
else:
|
||||
obj = self._getobj_parse(index, objid)
|
||||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
obj = decipher_all(self.decipher, objid, genno,
|
||||
obj)
|
||||
|
||||
if isinstance(obj, PDFStream):
|
||||
obj.set_objid(objid, genno)
|
||||
|
@ -784,7 +796,10 @@ class PDFDocument(object):
|
|||
else:
|
||||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
log.info('xref found: pos=%r', prev)
|
||||
return long(prev) if six.PY2 else int(prev)
|
||||
if six.PY2:
|
||||
return long(prev) # noqa F821
|
||||
else:
|
||||
return int(prev)
|
||||
|
||||
# read xref table
|
||||
def read_xref_from(self, parser, start, xrefs):
|
||||
|
|
|
@ -3,7 +3,7 @@ import struct
|
|||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
from . import settings
|
||||
from .cmapdb import CMap
|
||||
|
@ -15,7 +15,6 @@ from .encodingdb import name2unicode
|
|||
from .fontmetrics import FONT_METRICS
|
||||
from .pdftypes import PDFException
|
||||
from .pdftypes import PDFStream
|
||||
from .pdftypes import resolve1
|
||||
from .pdftypes import dict_value
|
||||
from .pdftypes import int_value
|
||||
from .pdftypes import list_value
|
||||
|
@ -35,6 +34,7 @@ from .utils import nunpack
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_widths(seq):
|
||||
widths = {}
|
||||
r = []
|
||||
|
@ -54,6 +54,7 @@ def get_widths(seq):
|
|||
r = []
|
||||
return widths
|
||||
|
||||
|
||||
def get_widths2(seq):
|
||||
widths = {}
|
||||
r = []
|
||||
|
@ -77,7 +78,7 @@ def get_widths2(seq):
|
|||
class FontMetricsDB(object):
|
||||
|
||||
@classmethod
|
||||
def get_metrics(klass, fontname):
|
||||
def get_metrics(cls, fontname):
|
||||
return FONT_METRICS[fontname]
|
||||
|
||||
|
||||
|
@ -98,14 +99,16 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
return
|
||||
|
||||
def get_encoding(self):
|
||||
"""Parse the font encoding
|
||||
"""Parse the font encoding.
|
||||
|
||||
The Type1 font encoding maps character codes to character names. These character names could either be standard
|
||||
Adobe glyph names, or character names associated with custom CharStrings for this font. A CharString is a
|
||||
sequence of operations that describe how the character should be drawn.
|
||||
Currently, this function returns '' (empty string) for character names that are associated with a CharStrings.
|
||||
The Type1 font encoding maps character codes to character names. These
|
||||
character names could either be standard Adobe glyph names, or
|
||||
character names associated with custom CharStrings for this font. A
|
||||
CharString is a sequence of operations that describe how the character
|
||||
should be drawn. Currently, this function returns '' (empty string)
|
||||
for character names that are associated with a CharString.
|
||||
|
||||
References: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/font/pdfs/T1_SPEC.pdf
|
||||
Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format
|
||||
|
||||
:returns mapping of character identifiers (cid's) to unicode characters
|
||||
"""
|
||||
|
@ -123,24 +126,26 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
def do_keyword(self, pos, token):
|
||||
if token is self.KEYWORD_PUT:
|
||||
((_, key), (_, value)) = self.pop(2)
|
||||
if (isinstance(key, int) and
|
||||
isinstance(value, PSLiteral)):
|
||||
if (isinstance(key, int) and isinstance(value, PSLiteral)):
|
||||
self.add_results((key, literal_name(value)))
|
||||
return
|
||||
|
||||
|
||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-',
|
||||
None, '-')
|
||||
|
||||
#Note: DLIdent-* isn't found in PDF Reference but is been kept as
|
||||
#it is harmless and have possibility of been a type. (induced from bug report/PR)
|
||||
IDENTITY_ENCODER = {'Identity-H':'Identity-H',
|
||||
'Identity-V':'Identity-V',
|
||||
'DLIdent-H':'Identity-H',
|
||||
'DLIdent-V':'Identity-V',
|
||||
'OneByteIdentityH':'OneByteIdentityH',
|
||||
'OneByteIdentityV':'OneByteIdentityV',
|
||||
# Note: DLIdent-* isn't found in PDF Reference but has been kept as
|
||||
# it is harmless and has the possibility of being a type.
|
||||
# (induced from bug report/PR)
|
||||
IDENTITY_ENCODER = {'Identity-H': 'Identity-H',
|
||||
'Identity-V': 'Identity-V',
|
||||
'DLIdent-H': 'Identity-H',
|
||||
'DLIdent-V': 'Identity-V',
|
||||
'OneByteIdentityH': 'OneByteIdentityH',
|
||||
'OneByteIdentityV': 'OneByteIdentityV',
|
||||
}
|
||||
|
||||
|
||||
def getdict(data):
|
||||
d = {}
|
||||
fp = BytesIO(data)
|
||||
|
@ -180,7 +185,8 @@ def getdict(data):
|
|||
if b0 == 28:
|
||||
value = b1 << 8 | b2
|
||||
else:
|
||||
value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
|
||||
value = b1 << 24 | b2 << 16 | \
|
||||
struct.unpack('>H', fp.read(2))[0]
|
||||
stack.append(value)
|
||||
return d
|
||||
|
||||
|
@ -268,7 +274,6 @@ class CFFFont(object):
|
|||
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
|
||||
)
|
||||
|
||||
|
||||
class INDEX(object):
|
||||
|
||||
def __init__(self, fp):
|
||||
|
@ -298,7 +303,8 @@ class CFFFont(object):
|
|||
self.name = name
|
||||
self.fp = fp
|
||||
# Header
|
||||
(_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
|
||||
(_major, _minor, hdrsize, offsize) = struct.unpack('BBBB',
|
||||
self.fp.read(4))
|
||||
self.fp.read(hdrsize-4)
|
||||
# Name INDEX
|
||||
self.name_index = self.INDEX(self.fp)
|
||||
|
@ -325,7 +331,8 @@ class CFFFont(object):
|
|||
if format == b'\x00':
|
||||
# Format 0
|
||||
(n,) = struct.unpack('B', self.fp.read(1))
|
||||
for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
|
||||
for (code, gid) in enumerate(struct.unpack('B'*n,
|
||||
self.fp.read(n))):
|
||||
self.code2gid[code] = gid
|
||||
self.gid2code[gid] = code
|
||||
elif format == b'\x01':
|
||||
|
@ -348,7 +355,8 @@ class CFFFont(object):
|
|||
if format == b'\x00':
|
||||
# Format 0
|
||||
n = self.nglyphs-1
|
||||
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
|
||||
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n,
|
||||
self.fp.read(2*n))):
|
||||
gid += 1
|
||||
name = self.getstr(sid)
|
||||
self.name2gid[name] = gid
|
||||
|
@ -390,7 +398,8 @@ class TrueTypeFont(object):
|
|||
try:
|
||||
(ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
|
||||
for _ in range(ntables):
|
||||
(name, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16))
|
||||
(name, tsum, offset, length) = struct.unpack('>4sLLL',
|
||||
fp.read(16))
|
||||
self.tables[name] = (offset, length)
|
||||
except struct.error:
|
||||
# Do not fail if there are not enough bytes to read. Even for
|
||||
|
@ -415,7 +424,8 @@ class TrueTypeFont(object):
|
|||
fp.seek(base_offset+st_offset)
|
||||
(fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
|
||||
if fmttype == 0:
|
||||
char2gid.update(enumerate(struct.unpack('>256B', fp.read(256))))
|
||||
char2gid.update(enumerate(struct.unpack('>256B',
|
||||
fp.read(256))))
|
||||
elif fmttype == 2:
|
||||
subheaderkeys = struct.unpack('>256H', fp.read(512))
|
||||
firstbytes = [0]*8192
|
||||
|
@ -424,8 +434,10 @@ class TrueTypeFont(object):
|
|||
nhdrs = max(subheaderkeys)//8 + 1
|
||||
hdrs = []
|
||||
for i in range(nhdrs):
|
||||
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
|
||||
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
|
||||
(firstcode, entcount, delta, offset) = \
|
||||
struct.unpack('>HHhH', fp.read(8))
|
||||
hdrs.append((i, firstcode, entcount, delta,
|
||||
fp.tell()-2+offset))
|
||||
for (i, firstcode, entcount, delta, pos) in hdrs:
|
||||
if not entcount:
|
||||
continue
|
||||
|
@ -449,7 +461,8 @@ class TrueTypeFont(object):
|
|||
if idr:
|
||||
fp.seek(pos+idr)
|
||||
for c in range(sc, ec+1):
|
||||
char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff
|
||||
b = struct.unpack('>H', fp.read(2))[0]
|
||||
char2gid[c] = (b + idd) & 0xffff
|
||||
else:
|
||||
for c in range(sc, ec+1):
|
||||
char2gid[c] = (c + idd) & 0xffff
|
||||
|
@ -469,6 +482,7 @@ class PDFFontError(PDFException):
|
|||
class PDFUnicodeNotDefined(PDFFontError):
|
||||
pass
|
||||
|
||||
|
||||
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||
LITERAL_TYPE1C = LIT('Type1C')
|
||||
|
||||
|
@ -485,9 +499,13 @@ class PDFFont(object):
|
|||
self.ascent = num_value(descriptor.get('Ascent', 0))
|
||||
self.descent = num_value(descriptor.get('Descent', 0))
|
||||
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
|
||||
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
|
||||
if default_width is None:
|
||||
self.default_width = num_value(descriptor.get('MissingWidth', 0))
|
||||
else:
|
||||
self.default_width = default_width
|
||||
self.leading = num_value(descriptor.get('Leading', 0))
|
||||
self.bbox = list_value(resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0))))
|
||||
self.bbox = list_value(resolve_all(descriptor.get('FontBBox',
|
||||
(0, 0, 0, 0))))
|
||||
self.hscale = self.vscale = .001
|
||||
|
||||
# PDF RM 9.8.1 specifies /Descent should always be a negative number.
|
||||
|
@ -557,7 +575,8 @@ class PDFSimpleFont(PDFFont):
|
|||
else:
|
||||
encoding = LITERAL_STANDARD_ENCODING
|
||||
if isinstance(encoding, dict):
|
||||
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
|
||||
name = literal_name(encoding.get('BaseEncoding',
|
||||
LITERAL_STANDARD_ENCODING))
|
||||
diff = list_value(encoding.get('Differences', []))
|
||||
self.cid2unicode = EncodingDB.get_encoding(name, diff)
|
||||
else:
|
||||
|
@ -596,7 +615,7 @@ class PDFType1Font(PDFSimpleFont):
|
|||
except KeyError:
|
||||
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
||||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
#lastchar = int_value(spec.get('LastChar', 255))
|
||||
# lastchar = int_value(spec.get('LastChar', 255))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
|
@ -623,7 +642,7 @@ class PDFType3Font(PDFSimpleFont):
|
|||
|
||||
def __init__(self, rsrcmgr, spec):
|
||||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
#lastchar = int_value(spec.get('LastChar', 0))
|
||||
# lastchar = int_value(spec.get('LastChar', 0))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
|
||||
if 'FontDescriptor' in spec:
|
||||
|
@ -651,8 +670,11 @@ class PDFCIDFont(PDFFont):
|
|||
raise PDFFontError('BaseFont is missing')
|
||||
self.basefont = 'unknown'
|
||||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
|
||||
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
|
||||
cid_registry = resolve1(
|
||||
self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1")
|
||||
cid_ordering = resolve1(
|
||||
self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
|
||||
self.cidcoding = '%s-%s' % (cid_registry, cid_ordering)
|
||||
self.cmap = self.get_cmap_from_spec(spec, strict)
|
||||
|
||||
try:
|
||||
|
@ -679,15 +701,17 @@ class PDFCIDFont(PDFFont):
|
|||
pass
|
||||
else:
|
||||
try:
|
||||
self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
|
||||
except CMapDB.CMapNotFound as e:
|
||||
self.unicode_map = CMapDB.get_unicode_map(
|
||||
self.cidcoding, self.cmap.is_vertical())
|
||||
except CMapDB.CMapNotFound:
|
||||
pass
|
||||
|
||||
self.vertical = self.cmap.is_vertical()
|
||||
if self.vertical:
|
||||
# writing mode: vertical
|
||||
widths = get_widths2(list_value(spec.get('W2', [])))
|
||||
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in six.iteritems(widths))
|
||||
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy)))
|
||||
in six.iteritems(widths))
|
||||
(vy, w) = spec.get('DW2', [880, -1000])
|
||||
self.default_disp = (None, vy)
|
||||
widths = dict((cid, w) for (cid, (w, _)) in six.iteritems(widths))
|
||||
|
@ -732,7 +756,8 @@ class PDFCIDFont(PDFFont):
|
|||
return CMap()
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
|
||||
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' \
|
||||
% (self.basefont, self.cidcoding)
|
||||
|
||||
def is_vertical(self):
|
||||
return self.vertical
|
||||
|
@ -755,11 +780,12 @@ class PDFCIDFont(PDFFont):
|
|||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
|
||||
|
||||
def main(argv):
|
||||
for fname in argv[1:]:
|
||||
fp = open(fname, 'rb')
|
||||
font = CFFFont(fname, fp)
|
||||
print (font)
|
||||
print(font)
|
||||
fp.close()
|
||||
return
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
import re
|
||||
import logging
|
||||
from io import BytesIO
|
||||
|
@ -31,28 +30,26 @@ from .utils import choplist
|
|||
from .utils import mult_matrix
|
||||
from .utils import MATRIX_IDENTITY
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
## Exceptions
|
||||
##
|
||||
|
||||
class PDFResourceError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFInterpreterError(PDFException):
|
||||
pass
|
||||
|
||||
## Constants
|
||||
##
|
||||
|
||||
LITERAL_PDF = LIT('PDF')
|
||||
LITERAL_TEXT = LIT('Text')
|
||||
LITERAL_FONT = LIT('Font')
|
||||
LITERAL_FORM = LIT('Form')
|
||||
LITERAL_IMAGE = LIT('Image')
|
||||
|
||||
## PDFTextState
|
||||
##
|
||||
|
||||
class PDFTextState(object):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -70,12 +67,12 @@ class PDFTextState(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
|
||||
' scaling=%r, leading=%r, render=%r, rise=%r, '
|
||||
' matrix=%r, linematrix=%r>' %
|
||||
(self.font, self.fontsize, self.charspace, self.wordspace,
|
||||
return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \
|
||||
'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \
|
||||
'matrix=%r, linematrix=%r>' \
|
||||
% (self.font, self.fontsize, self.charspace, self.wordspace,
|
||||
self.scaling, self.leading, self.render, self.rise,
|
||||
self.matrix, self.linematrix))
|
||||
self.matrix, self.linematrix)
|
||||
|
||||
def copy(self):
|
||||
obj = PDFTextState()
|
||||
|
@ -97,8 +94,6 @@ class PDFTextState(object):
|
|||
return
|
||||
|
||||
|
||||
## PDFGraphicState
|
||||
##
|
||||
class PDFGraphicState(object):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -139,10 +134,7 @@ class PDFGraphicState(object):
|
|||
self.scolor, self.ncolor))
|
||||
|
||||
|
||||
## Resource Manager
|
||||
##
|
||||
class PDFResourceManager(object):
|
||||
|
||||
"""Repository of shared resources.
|
||||
|
||||
ResourceManager facilitates reuse of shared resources
|
||||
|
@ -162,7 +154,6 @@ class PDFResourceManager(object):
|
|||
elif proc is LITERAL_TEXT:
|
||||
pass
|
||||
else:
|
||||
#raise PDFResourceError('ProcSet %r is not supported.' % proc)
|
||||
pass
|
||||
return
|
||||
|
||||
|
@ -219,8 +210,6 @@ class PDFResourceManager(object):
|
|||
return font
|
||||
|
||||
|
||||
## PDFContentParser
|
||||
##
|
||||
class PDFContentParser(PSStackParser):
|
||||
|
||||
def __init__(self, streams):
|
||||
|
@ -264,20 +253,20 @@ class PDFContentParser(PSStackParser):
|
|||
while i <= len(target):
|
||||
self.fillbuf()
|
||||
if i:
|
||||
c = six.indexbytes(self.buf,self.charpos)
|
||||
c=six.int2byte(c)
|
||||
c = six.indexbytes(self.buf, self.charpos)
|
||||
c = six.int2byte(c)
|
||||
data += c
|
||||
self.charpos += 1
|
||||
if len(target) <= i and c.isspace():
|
||||
i += 1
|
||||
elif i < len(target) and c == (six.int2byte(target[i]) if six.PY3 else target[i]):
|
||||
elif i < len(target) and (c == six.int2byte(target[i])
|
||||
if six.PY3 else target[i]):
|
||||
i += 1
|
||||
else:
|
||||
i = 0
|
||||
else:
|
||||
try:
|
||||
j = self.buf.index(target[0], self.charpos)
|
||||
#print 'found', (0, self.buf[j:j+10])
|
||||
data += self.buf[self.charpos:j+1]
|
||||
self.charpos = j+1
|
||||
i = 1
|
||||
|
@ -304,7 +293,8 @@ class PDFContentParser(PSStackParser):
|
|||
try:
|
||||
(_, objs) = self.end_type('inline')
|
||||
if len(objs) % 2 != 0:
|
||||
raise PSTypeError('Invalid dictionary construct: %r' % objs)
|
||||
error_msg = 'Invalid dictionary construct: %r' % objs
|
||||
raise PSTypeError(error_msg)
|
||||
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
|
||||
(pos, data) = self.get_inline_data(pos+len(b'ID '))
|
||||
obj = PDFStream(d, data)
|
||||
|
@ -319,7 +309,10 @@ class PDFContentParser(PSStackParser):
|
|||
|
||||
|
||||
class PDFPageInterpreter(object):
|
||||
"""Processor for the content of a PDF page"""
|
||||
"""Processor for the content of a PDF page
|
||||
|
||||
Reference: PDF Reference, Appendix A, Operator Summary
|
||||
"""
|
||||
|
||||
def __init__(self, rsrcmgr, device):
|
||||
self.rsrcmgr = rsrcmgr
|
||||
|
@ -329,9 +322,8 @@ class PDFPageInterpreter(object):
|
|||
def dup(self):
|
||||
return self.__class__(self.rsrcmgr, self.device)
|
||||
|
||||
# init_resources(resources):
|
||||
# Prepare the fonts and XObjects listed in the Resource attribute.
|
||||
def init_resources(self, resources):
|
||||
"""Prepare the fonts and XObjects listed in the Resource attribute."""
|
||||
self.resources = resources
|
||||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
|
@ -344,9 +336,11 @@ class PDFPageInterpreter(object):
|
|||
name = literal_name(spec[0])
|
||||
else:
|
||||
name = literal_name(spec)
|
||||
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
||||
if name == 'ICCBased' and isinstance(spec, list) \
|
||||
and 2 <= len(spec):
|
||||
return PDFColorSpace(name, stream_value(spec[1])['N'])
|
||||
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
||||
elif name == 'DeviceN' and isinstance(spec, list) \
|
||||
and 2 <= len(spec):
|
||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE.get(name)
|
||||
|
@ -369,11 +363,9 @@ class PDFPageInterpreter(object):
|
|||
self.xobjmap[xobjid] = xobjstrm
|
||||
return
|
||||
|
||||
# init_state(ctm)
|
||||
# Initialize the text and graphic states for rendering a page.
|
||||
def init_state(self, ctm):
|
||||
# gstack: stack for graphical states.
|
||||
self.gstack = []
|
||||
"""Initialize the text and graphic states for rendering a page."""
|
||||
self.gstack = [] # stack for graphical states.
|
||||
self.ctm = ctm
|
||||
self.device.set_ctm(self.ctm)
|
||||
self.textstate = PDFTextState()
|
||||
|
@ -406,95 +398,95 @@ class PDFPageInterpreter(object):
|
|||
self.device.set_ctm(self.ctm)
|
||||
return
|
||||
|
||||
# gsave
|
||||
def do_q(self):
|
||||
"""Save graphics state"""
|
||||
self.gstack.append(self.get_current_state())
|
||||
return
|
||||
|
||||
# grestore
|
||||
def do_Q(self):
|
||||
"""Restore graphics state"""
|
||||
if self.gstack:
|
||||
self.set_current_state(self.gstack.pop())
|
||||
return
|
||||
|
||||
# concat-matrix
|
||||
def do_cm(self, a1, b1, c1, d1, e1, f1):
|
||||
"""Concatenate matrix to current transformation matrix"""
|
||||
self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
|
||||
self.device.set_ctm(self.ctm)
|
||||
return
|
||||
|
||||
# setlinewidth
|
||||
def do_w(self, linewidth):
|
||||
"""Set line width"""
|
||||
self.graphicstate.linewidth = linewidth
|
||||
return
|
||||
|
||||
# setlinecap
|
||||
def do_J(self, linecap):
|
||||
"""Set line cap style"""
|
||||
self.graphicstate.linecap = linecap
|
||||
return
|
||||
|
||||
# setlinejoin
|
||||
def do_j(self, linejoin):
|
||||
"""Set line join style"""
|
||||
self.graphicstate.linejoin = linejoin
|
||||
return
|
||||
|
||||
# setmiterlimit
|
||||
def do_M(self, miterlimit):
|
||||
"""Set miter limit"""
|
||||
self.graphicstate.miterlimit = miterlimit
|
||||
return
|
||||
|
||||
# setdash
|
||||
def do_d(self, dash, phase):
|
||||
"""Set line dash pattern"""
|
||||
self.graphicstate.dash = (dash, phase)
|
||||
return
|
||||
|
||||
# setintent
|
||||
def do_ri(self, intent):
|
||||
"""Set color rendering intent"""
|
||||
self.graphicstate.intent = intent
|
||||
return
|
||||
|
||||
# setflatness
|
||||
def do_i(self, flatness):
|
||||
"""Set flatness tolerance"""
|
||||
self.graphicstate.flatness = flatness
|
||||
return
|
||||
|
||||
# load-gstate
|
||||
def do_gs(self, name):
|
||||
#XXX
|
||||
"""Set parameters from graphics state parameter dictionary"""
|
||||
# todo
|
||||
return
|
||||
|
||||
# moveto
|
||||
def do_m(self, x, y):
|
||||
"""Begin new subpath"""
|
||||
self.curpath.append(('m', x, y))
|
||||
return
|
||||
|
||||
# lineto
|
||||
def do_l(self, x, y):
|
||||
"""Append straight line segment to path"""
|
||||
self.curpath.append(('l', x, y))
|
||||
return
|
||||
|
||||
# curveto
|
||||
def do_c(self, x1, y1, x2, y2, x3, y3):
|
||||
"""Append curved segment to path (three control points)"""
|
||||
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
|
||||
return
|
||||
|
||||
# urveto
|
||||
def do_v(self, x2, y2, x3, y3):
|
||||
"""Append curved segment to path (initial point replicated)"""
|
||||
self.curpath.append(('v', x2, y2, x3, y3))
|
||||
return
|
||||
|
||||
# rveto
|
||||
def do_y(self, x1, y1, x3, y3):
|
||||
"""Append curved segment to path (final point replicated)"""
|
||||
self.curpath.append(('y', x1, y1, x3, y3))
|
||||
return
|
||||
|
||||
# closepath
|
||||
def do_h(self):
|
||||
"""Close subpath"""
|
||||
self.curpath.append(('h',))
|
||||
return
|
||||
|
||||
# rectangle
|
||||
def do_re(self, x, y, w, h):
|
||||
"""Append rectangle to path"""
|
||||
self.curpath.append(('m', x, y))
|
||||
self.curpath.append(('l', x+w, y))
|
||||
self.curpath.append(('l', x+w, y+h))
|
||||
|
@ -502,71 +494,81 @@ class PDFPageInterpreter(object):
|
|||
self.curpath.append(('h',))
|
||||
return
|
||||
|
||||
# stroke
|
||||
def do_S(self):
|
||||
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
||||
"""Stroke path"""
|
||||
self.device.paint_path(self.graphicstate, True, False, False,
|
||||
self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# close-and-stroke
|
||||
def do_s(self):
|
||||
"""Close and stroke path"""
|
||||
self.do_h()
|
||||
self.do_S()
|
||||
return
|
||||
|
||||
# fill
|
||||
def do_f(self):
|
||||
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
||||
"""Fill path using nonzero winding number rule"""
|
||||
self.device.paint_path(self.graphicstate, False, True, False,
|
||||
self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
# fill (obsolete)
|
||||
do_F = do_f
|
||||
|
||||
# fill-even-odd
|
||||
def do_F(self):
|
||||
"""Fill path using nonzero winding number rule (obsolete)"""
|
||||
return self.do_f()
|
||||
|
||||
def do_f_a(self):
|
||||
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
||||
"""Fill path using even-odd rule"""
|
||||
self.device.paint_path(self.graphicstate, False, True, True,
|
||||
self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# fill-and-stroke
|
||||
def do_B(self):
|
||||
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
||||
"""Fill and stroke path using nonzero winding number rule"""
|
||||
self.device.paint_path(self.graphicstate, True, True, False,
|
||||
self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# fill-and-stroke-even-odd
|
||||
def do_B_a(self):
|
||||
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
||||
"""Fill and stroke path using even-odd rule"""
|
||||
self.device.paint_path(self.graphicstate, True, True, True,
|
||||
self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# close-fill-and-stroke
|
||||
def do_b(self):
|
||||
"""Close, fill, and stroke path using nonzero winding number rule"""
|
||||
self.do_h()
|
||||
self.do_B()
|
||||
return
|
||||
|
||||
# close-fill-and-stroke-even-odd
|
||||
def do_b_a(self):
|
||||
"""Close, fill, and stroke path using even-odd rule"""
|
||||
self.do_h()
|
||||
self.do_B_a()
|
||||
return
|
||||
|
||||
# close-only
|
||||
def do_n(self):
|
||||
"""End path without filling or stroking"""
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# clip
|
||||
def do_W(self):
|
||||
"""Set clipping path using nonzero winding number rule"""
|
||||
return
|
||||
|
||||
# clip-even-odd
|
||||
def do_W_a(self):
|
||||
"""Set clipping path using even-odd rule"""
|
||||
return
|
||||
|
||||
# setcolorspace-stroking
|
||||
def do_CS(self, name):
|
||||
"""Set color space for stroking operations
|
||||
|
||||
Introduced in PDF 1.1
|
||||
"""
|
||||
try:
|
||||
self.scs = self.csmap[literal_name(name)]
|
||||
except KeyError:
|
||||
|
@ -574,8 +576,8 @@ class PDFPageInterpreter(object):
|
|||
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
||||
return
|
||||
|
||||
# setcolorspace-non-strokine
|
||||
def do_cs(self, name):
|
||||
"""Set color space for nonstroking operations"""
|
||||
try:
|
||||
self.ncs = self.csmap[literal_name(name)]
|
||||
except KeyError:
|
||||
|
@ -583,44 +585,38 @@ class PDFPageInterpreter(object):
|
|||
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
||||
return
|
||||
|
||||
# setgray-stroking
|
||||
def do_G(self, gray):
|
||||
"""Set gray level for stroking operations"""
|
||||
self.graphicstate.scolor = gray
|
||||
#self.do_CS(LITERAL_DEVICE_GRAY)
|
||||
return
|
||||
|
||||
# setgray-non-stroking
|
||||
def do_g(self, gray):
|
||||
"""Set gray level for nonstroking operations"""
|
||||
self.graphicstate.ncolor = gray
|
||||
#self.do_cs(LITERAL_DEVICE_GRAY)
|
||||
return
|
||||
|
||||
# setrgb-stroking
|
||||
def do_RG(self, r, g, b):
|
||||
"""Set RGB color for stroking operations"""
|
||||
self.graphicstate.scolor = (r, g, b)
|
||||
#self.do_CS(LITERAL_DEVICE_RGB)
|
||||
return
|
||||
|
||||
# setrgb-non-stroking
|
||||
def do_rg(self, r, g, b):
|
||||
"""Set RGB color for nonstroking operations"""
|
||||
self.graphicstate.ncolor = (r, g, b)
|
||||
#self.do_cs(LITERAL_DEVICE_RGB)
|
||||
return
|
||||
|
||||
# setcmyk-stroking
|
||||
def do_K(self, c, m, y, k):
|
||||
"""Set CMYK color for stroking operations"""
|
||||
self.graphicstate.scolor = (c, m, y, k)
|
||||
#self.do_CS(LITERAL_DEVICE_CMYK)
|
||||
return
|
||||
|
||||
# setcmyk-non-stroking
|
||||
def do_k(self, c, m, y, k):
|
||||
"""Set CMYK color for nonstroking operations"""
|
||||
self.graphicstate.ncolor = (c, m, y, k)
|
||||
#self.do_cs(LITERAL_DEVICE_CMYK)
|
||||
return
|
||||
|
||||
# setcolor
|
||||
def do_SCN(self):
|
||||
"""Set color for stroking operations."""
|
||||
if self.scs:
|
||||
n = self.scs.ncomponents
|
||||
else:
|
||||
|
@ -631,6 +627,7 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
def do_scn(self):
|
||||
"""Set color for nonstroking operations"""
|
||||
if self.ncs:
|
||||
n = self.ncs.ncomponents
|
||||
else:
|
||||
|
@ -641,77 +638,111 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
def do_SC(self):
|
||||
"""Set color for stroking operations"""
|
||||
self.do_SCN()
|
||||
return
|
||||
|
||||
def do_sc(self):
|
||||
"""Set color for nonstroking operations"""
|
||||
self.do_scn()
|
||||
return
|
||||
|
||||
# sharing-name
|
||||
def do_sh(self, name):
|
||||
"""Paint area defined by shading pattern"""
|
||||
return
|
||||
|
||||
# begin-text
|
||||
def do_BT(self):
|
||||
"""Begin text object
|
||||
|
||||
Initializing the text matrix, Tm, and the text line matrix, Tlm, to
|
||||
the identity matrix. Text objects cannot be nested; a second BT cannot
|
||||
appear before an ET.
|
||||
"""
|
||||
self.textstate.reset()
|
||||
return
|
||||
|
||||
# end-text
|
||||
def do_ET(self):
|
||||
"""End a text object"""
|
||||
return
|
||||
|
||||
# begin-compat
|
||||
def do_BX(self):
|
||||
"""Begin compatibility section"""
|
||||
return
|
||||
|
||||
# end-compat
|
||||
def do_EX(self):
|
||||
"""End compatibility section"""
|
||||
return
|
||||
|
||||
# marked content operators
|
||||
def do_MP(self, tag):
|
||||
"""Define marked-content point"""
|
||||
self.device.do_tag(tag)
|
||||
return
|
||||
|
||||
def do_DP(self, tag, props):
|
||||
"""Define marked-content point with property list"""
|
||||
self.device.do_tag(tag, props)
|
||||
return
|
||||
|
||||
def do_BMC(self, tag):
|
||||
"""Begin marked-content sequence"""
|
||||
self.device.begin_tag(tag)
|
||||
return
|
||||
|
||||
def do_BDC(self, tag, props):
|
||||
"""Begin marked-content sequence with property list"""
|
||||
self.device.begin_tag(tag, props)
|
||||
return
|
||||
|
||||
def do_EMC(self):
|
||||
"""End marked-content sequence"""
|
||||
self.device.end_tag()
|
||||
return
|
||||
|
||||
# setcharspace
|
||||
def do_Tc(self, space):
|
||||
"""Set character spacing.
|
||||
|
||||
Character spacing is used by the Tj, TJ, and ' operators.
|
||||
|
||||
:param space: a number expressed in unscaled text space units.
|
||||
"""
|
||||
self.textstate.charspace = space
|
||||
return
|
||||
|
||||
# setwordspace
|
||||
def do_Tw(self, space):
|
||||
"""Set the word spacing.
|
||||
|
||||
Word spacing is used by the Tj, TJ, and ' operators.
|
||||
|
||||
:param space: a number expressed in unscaled text space units
|
||||
"""
|
||||
self.textstate.wordspace = space
|
||||
return
|
||||
|
||||
# textscale
|
||||
def do_Tz(self, scale):
|
||||
"""Set the horizontal scaling.
|
||||
|
||||
:param scale: is a number specifying the percentage of the normal width
|
||||
"""
|
||||
self.textstate.scaling = scale
|
||||
return
|
||||
|
||||
# setleading
|
||||
def do_TL(self, leading):
|
||||
"""Set the text leading.
|
||||
|
||||
Text leading is used only by the T*, ', and " operators.
|
||||
|
||||
:param leading: a number expressed in unscaled text space units
|
||||
"""
|
||||
self.textstate.leading = -leading
|
||||
return
|
||||
|
||||
# selectfont
|
||||
def do_Tf(self, fontid, fontsize):
|
||||
"""Set the text font
|
||||
|
||||
:param fontid: the name of a font resource in the Font subdictionary
|
||||
of the current resource dictionary
|
||||
:param fontsize: size is a number representing a scale factor.
|
||||
"""
|
||||
try:
|
||||
self.textstate.font = self.fontmap[literal_name(fontid)]
|
||||
except KeyError:
|
||||
|
@ -721,82 +752,92 @@ class PDFPageInterpreter(object):
|
|||
self.textstate.fontsize = fontsize
|
||||
return
|
||||
|
||||
# setrendering
|
||||
def do_Tr(self, render):
|
||||
"""Set the text rendering mode"""
|
||||
self.textstate.render = render
|
||||
return
|
||||
|
||||
# settextrise
|
||||
def do_Ts(self, rise):
|
||||
"""Set the text rise
|
||||
|
||||
:param rise: a number expressed in unscaled text space units
|
||||
"""
|
||||
self.textstate.rise = rise
|
||||
return
|
||||
|
||||
# text-move
|
||||
def do_Td(self, tx, ty):
|
||||
"""Move text position"""
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
|
||||
return
|
||||
|
||||
# text-move
|
||||
def do_TD(self, tx, ty):
|
||||
"""Move text position and set leading"""
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||
self.textstate.leading = ty
|
||||
self.textstate.linematrix = (0, 0)
|
||||
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
|
||||
return
|
||||
|
||||
# textmatrix
|
||||
def do_Tm(self, a, b, c, d, e, f):
|
||||
"""Set text matrix and text line matrix"""
|
||||
self.textstate.matrix = (a, b, c, d, e, f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
# nextline
|
||||
def do_T_a(self):
|
||||
"""Move to start of next text line"""
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
|
||||
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e,
|
||||
self.textstate.leading*d+f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
# show-pos
|
||||
def do_TJ(self, seq):
|
||||
#print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
|
||||
"""Show text, allowing individual glyph positioning"""
|
||||
if self.textstate.font is None:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('No font specified!')
|
||||
return
|
||||
self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy())
|
||||
self.device.render_string(self.textstate, seq, self.ncs,
|
||||
self.graphicstate.copy())
|
||||
return
|
||||
|
||||
# show
|
||||
def do_Tj(self, s):
|
||||
"""Show text"""
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# quote
|
||||
def do__q(self, s):
|
||||
"""Move to next line and show text
|
||||
|
||||
The ' (single quote) operator.
|
||||
"""
|
||||
self.do_T_a()
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# doublequote
|
||||
def do__w(self, aw, ac, s):
|
||||
"""Set word and character spacing, move to next line, and show text
|
||||
|
||||
The " (double quote) operator.
|
||||
"""
|
||||
self.do_Tw(aw)
|
||||
self.do_Tc(ac)
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# inline image
|
||||
def do_BI(self): # never called
|
||||
def do_BI(self):
|
||||
"""Begin inline image object"""
|
||||
return
|
||||
|
||||
def do_ID(self): # never called
|
||||
def do_ID(self):
|
||||
"""Begin inline image data"""
|
||||
return
|
||||
|
||||
def do_EI(self, obj):
|
||||
"""End inline image object"""
|
||||
if 'W' in obj and 'H' in obj:
|
||||
iobjid = str(id(obj))
|
||||
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||
|
@ -804,8 +845,8 @@ class PDFPageInterpreter(object):
|
|||
self.device.end_figure(iobjid)
|
||||
return
|
||||
|
||||
# invoke an XObject
|
||||
def do_Do(self, xobjid):
|
||||
"""Invoke named XObject"""
|
||||
xobjid = literal_name(xobjid)
|
||||
try:
|
||||
xobj = stream_value(self.xobjmap[xobjid])
|
||||
|
@ -823,9 +864,13 @@ class PDFPageInterpreter(object):
|
|||
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
||||
# instead of having their own Resources entry.
|
||||
xobjres = xobj.get('Resources')
|
||||
resources = dict_value(xobjres) if xobjres else self.resources.copy()
|
||||
if xobjres:
|
||||
resources = dict_value(xobjres)
|
||||
else:
|
||||
resources = self.resources.copy()
|
||||
self.device.begin_figure(xobjid, bbox, matrix)
|
||||
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||
interpreter.render_contents(resources, [xobj],
|
||||
ctm=mult_matrix(matrix, self.ctm))
|
||||
self.device.end_figure(xobjid)
|
||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
||||
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||
|
@ -852,10 +897,11 @@ class PDFPageInterpreter(object):
|
|||
self.device.end_page(page)
|
||||
return
|
||||
|
||||
# render_contents(resources, streams, ctm)
|
||||
# Render the content streams.
|
||||
# This method may be called recursively.
|
||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
||||
"""Render the content streams.
|
||||
|
||||
This method may be called recursively.
|
||||
"""
|
||||
log.info('render_contents: resources=%r, streams=%r, ctm=%r',
|
||||
resources, streams, ctm)
|
||||
self.init_resources(resources)
|
||||
|
@ -876,7 +922,8 @@ class PDFPageInterpreter(object):
|
|||
break
|
||||
if isinstance(obj, PSKeyword):
|
||||
name = keyword_name(obj)
|
||||
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
|
||||
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w')\
|
||||
.replace("'", '_q')
|
||||
if hasattr(self, method):
|
||||
func = getattr(self, method)
|
||||
nargs = six.get_function_code(func).co_argcount-1
|
||||
|
@ -890,7 +937,8 @@ class PDFPageInterpreter(object):
|
|||
func()
|
||||
else:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('Unknown operator: %r' % name)
|
||||
error_msg = 'Unknown operator: %r' % name
|
||||
raise PDFInterpreterError(error_msg)
|
||||
else:
|
||||
self.push(obj)
|
||||
return
|
||||
|
|
|
@ -11,7 +11,7 @@ from .pdfparser import PDFParser
|
|||
from .pdfdocument import PDFDocument
|
||||
from .pdfdocument import PDFTextExtractionNotAllowed
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -19,8 +19,7 @@ log = logging.getLogger(__name__)
|
|||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
|
||||
class PDFPage(object):
|
||||
|
||||
"""An object that holds the information about a page.
|
||||
|
@ -73,12 +72,13 @@ class PDFPage(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % \
|
||||
(self.resources, self.mediabox)
|
||||
|
||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||
|
||||
@classmethod
|
||||
def create_pages(klass, document):
|
||||
def create_pages(cls, document):
|
||||
def search(obj, parent):
|
||||
if isinstance(obj, int):
|
||||
objid = obj
|
||||
|
@ -87,7 +87,7 @@ class PDFPage(object):
|
|||
objid = obj.objid
|
||||
tree = dict_value(obj).copy()
|
||||
for (k, v) in six.iteritems(parent):
|
||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||
if k in cls.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
|
||||
tree_type = tree.get('Type')
|
||||
|
@ -104,8 +104,9 @@ class PDFPage(object):
|
|||
yield (objid, tree)
|
||||
pages = False
|
||||
if 'Pages' in document.catalog:
|
||||
for (objid, tree) in search(document.catalog['Pages'], document.catalog):
|
||||
yield klass(document, objid, tree)
|
||||
objects = search(document.catalog['Pages'], document.catalog)
|
||||
for (objid, tree) in objects:
|
||||
yield cls(document, objid, tree)
|
||||
pages = True
|
||||
if not pages:
|
||||
# fallback when /Pages is missing.
|
||||
|
@ -113,14 +114,15 @@ class PDFPage(object):
|
|||
for objid in xref.get_objids():
|
||||
try:
|
||||
obj = document.getobj(objid)
|
||||
if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
|
||||
yield klass(document, objid, obj)
|
||||
if isinstance(obj, dict) \
|
||||
and obj.get('Type') is LITERAL_PAGE:
|
||||
yield cls(document, objid, obj)
|
||||
except PDFObjectNotFound:
|
||||
pass
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def get_pages(klass, fp,
|
||||
def get_pages(cls, fp,
|
||||
pagenos=None, maxpages=0, password='',
|
||||
caching=True, check_extractable=True):
|
||||
# Create a PDF parser object associated with the file object.
|
||||
|
@ -129,9 +131,10 @@ class PDFPage(object):
|
|||
doc = PDFDocument(parser, password=password, caching=caching)
|
||||
# Check if the document allows text extraction. If not, abort.
|
||||
if check_extractable and not doc.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
error_msg = 'Text extraction is not allowed: %r' % fp
|
||||
raise PDFTextExtractionNotAllowed(error_msg)
|
||||
# Process each page contained in the document.
|
||||
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||
for (pageno, page) in enumerate(cls.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos):
|
||||
continue
|
||||
yield page
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from .psparser import PSStackParser
|
||||
|
@ -15,16 +14,11 @@ from .pdftypes import dict_value
|
|||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFSyntaxError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## PDFParser
|
||||
##
|
||||
class PDFParser(PSStackParser):
|
||||
|
||||
"""
|
||||
PDFParser fetch PDF objects from a file stream.
|
||||
It can handle indirect references by referring to
|
||||
|
@ -123,7 +117,8 @@ class PDFParser(PSStackParser):
|
|||
data = bytes(data)
|
||||
self.seek(pos+objlen)
|
||||
# XXX limit objlen not to exceed object boundary
|
||||
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, objlen, dic, data[:10])
|
||||
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
|
||||
objlen, dic, data[:10])
|
||||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
self.push((pos, obj))
|
||||
|
||||
|
@ -134,10 +129,7 @@ class PDFParser(PSStackParser):
|
|||
return
|
||||
|
||||
|
||||
## PDFStreamParser
|
||||
##
|
||||
class PDFStreamParser(PDFParser):
|
||||
|
||||
"""
|
||||
PDFStreamParser is used to parse PDF content streams
|
||||
that is contained in each page and has instructions
|
||||
|
@ -155,6 +147,7 @@ class PDFStreamParser(PDFParser):
|
|||
return
|
||||
|
||||
KEYWORD_OBJ = KWD(b'obj')
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
if token is self.KEYWORD_R:
|
||||
# reference to indirect object
|
||||
|
|
|
@ -13,7 +13,7 @@ from . import settings
|
|||
from .utils import apply_png_predictor
|
||||
from .utils import isnumber
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -29,29 +29,31 @@ LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
|
|||
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
|
||||
|
||||
## PDF Objects
|
||||
##
|
||||
|
||||
class PDFObject(PSObject):
|
||||
pass
|
||||
|
||||
|
||||
class PDFException(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFTypeError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFValueError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFObjectNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFNotImplementedError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
##
|
||||
class PDFObjRef(PDFObject):
|
||||
|
||||
def __init__(self, doc, objid, _):
|
||||
|
@ -60,7 +62,6 @@ class PDFObjRef(PDFObject):
|
|||
raise PDFValueError('PDF object id cannot be 0.')
|
||||
self.doc = doc
|
||||
self.objid = objid
|
||||
#self.genno = genno # Never used.
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -73,7 +74,6 @@ class PDFObjRef(PDFObject):
|
|||
return default
|
||||
|
||||
|
||||
# resolve
|
||||
def resolve1(x, default=None):
|
||||
"""Resolves an object.
|
||||
|
||||
|
@ -114,7 +114,6 @@ def decipher_all(decipher, objid, genno, x):
|
|||
return x
|
||||
|
||||
|
||||
# Type cheking
|
||||
def int_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, int):
|
||||
|
@ -179,8 +178,6 @@ def stream_value(x):
|
|||
return x
|
||||
|
||||
|
||||
## PDFStream type
|
||||
##
|
||||
class PDFStream(PDFObject):
|
||||
|
||||
def __init__(self, attrs, rawdata, decipher=None):
|
||||
|
@ -201,10 +198,12 @@ class PDFStream(PDFObject):
|
|||
def __repr__(self):
|
||||
if self.data is None:
|
||||
assert self.rawdata is not None
|
||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
|
||||
return '<PDFStream(%r): raw=%d, %r>' % \
|
||||
(self.objid, len(self.rawdata), self.attrs)
|
||||
else:
|
||||
assert self.data is not None
|
||||
return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)
|
||||
return '<PDFStream(%r): len=%d, %r>' % \
|
||||
(self.objid, len(self.data), self.attrs)
|
||||
|
||||
def __contains__(self, name):
|
||||
return name in self.attrs
|
||||
|
@ -239,10 +238,12 @@ class PDFStream(PDFObject):
|
|||
if hasattr(fltr, 'resolve'):
|
||||
fltr = fltr.resolve()[0]
|
||||
_filters.append(fltr)
|
||||
return list(zip(_filters, params)) #solves https://github.com/pdfminer/pdfminer.six/issues/15
|
||||
# return list solves https://github.com/pdfminer/pdfminer.six/issues/15
|
||||
return list(zip(_filters, params))
|
||||
|
||||
def decode(self):
|
||||
assert self.data is None and self.rawdata is not None, str((self.data, self.rawdata))
|
||||
assert self.data is None \
|
||||
and self.rawdata is not None, str((self.data, self.rawdata))
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
|
@ -252,14 +253,15 @@ class PDFStream(PDFObject):
|
|||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
for (f,params) in filters:
|
||||
for (f, params) in filters:
|
||||
if f in LITERALS_FLATE_DECODE:
|
||||
# will get errors if the document is encrypted.
|
||||
try:
|
||||
data = zlib.decompress(data)
|
||||
except zlib.error as e:
|
||||
if settings.STRICT:
|
||||
raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
|
||||
error_msg = 'Invalid zlib bytes: %r, %r' % (e, data)
|
||||
raise PDFException(error_msg)
|
||||
data = b''
|
||||
elif f in LITERALS_LZW_DECODE:
|
||||
data = lzwdecode(data)
|
||||
|
@ -272,7 +274,8 @@ class PDFStream(PDFObject):
|
|||
elif f in LITERALS_CCITTFAX_DECODE:
|
||||
data = ccittfaxdecode(data, params)
|
||||
elif f in LITERALS_DCT_DECODE:
|
||||
# This is probably a JPG stream - it does not need to be decoded twice.
|
||||
# This is probably a JPG stream
|
||||
# it does not need to be decoded twice.
|
||||
# Just return the stream to the user.
|
||||
pass
|
||||
elif f in LITERALS_JBIG2_DECODE:
|
||||
|
@ -292,10 +295,13 @@ class PDFStream(PDFObject):
|
|||
# PNG predictor
|
||||
colors = int_value(params.get('Colors', 1))
|
||||
columns = int_value(params.get('Columns', 1))
|
||||
bitspercomponent = int_value(params.get('BitsPerComponent', 8))
|
||||
data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
|
||||
raw_bits_per_component = params.get('BitsPerComponent', 8)
|
||||
bitspercomponent = int_value(raw_bits_per_component)
|
||||
data = apply_png_predictor(pred, colors, columns,
|
||||
bitspercomponent, data)
|
||||
else:
|
||||
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
||||
error_msg = 'Unsupported predictor: %r' % pred
|
||||
raise PDFNotImplementedError(error_msg)
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
import re
|
||||
import logging
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
from . import settings
|
||||
from .utils import choplist
|
||||
|
@ -13,8 +13,6 @@ from .utils import choplist
|
|||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
## PS Exceptions
|
||||
##
|
||||
class PSException(Exception):
|
||||
pass
|
||||
|
||||
|
@ -35,11 +33,6 @@ class PSValueError(PSException):
|
|||
pass
|
||||
|
||||
|
||||
## Basic PostScript Types
|
||||
##
|
||||
|
||||
## PSObject
|
||||
##
|
||||
class PSObject(object):
|
||||
|
||||
"""Base class for all PS or PDF-related data types."""
|
||||
|
@ -47,8 +40,6 @@ class PSObject(object):
|
|||
pass
|
||||
|
||||
|
||||
## PSLiteral
|
||||
##
|
||||
class PSLiteral(PSObject):
|
||||
|
||||
"""A class that represents a PostScript literal.
|
||||
|
@ -66,12 +57,10 @@ class PSLiteral(PSObject):
|
|||
self.name = name
|
||||
|
||||
def __repr__(self):
|
||||
name=self.name
|
||||
name = self.name
|
||||
return '/%r' % name
|
||||
|
||||
|
||||
## PSKeyword
|
||||
##
|
||||
class PSKeyword(PSObject):
|
||||
|
||||
"""A class that represents a PostScript keyword.
|
||||
|
@ -89,12 +78,10 @@ class PSKeyword(PSObject):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
name=self.name
|
||||
name = self.name
|
||||
return '/%r' % name
|
||||
|
||||
|
||||
## PSSymbolTable
|
||||
##
|
||||
class PSSymbolTable(object):
|
||||
|
||||
"""A utility class for storing PSLiteral/PSKeyword objects.
|
||||
|
@ -115,6 +102,7 @@ class PSSymbolTable(object):
|
|||
self.dict[name] = lit
|
||||
return lit
|
||||
|
||||
|
||||
PSLiteralTable = PSSymbolTable(PSLiteral)
|
||||
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||
LIT = PSLiteralTable.intern
|
||||
|
@ -132,31 +120,30 @@ def literal_name(x):
|
|||
if settings.STRICT:
|
||||
raise PSTypeError('Literal required: %r' % (x,))
|
||||
else:
|
||||
name=x
|
||||
name = x
|
||||
else:
|
||||
name=x.name
|
||||
name = x.name
|
||||
if six.PY3:
|
||||
try:
|
||||
name = str(name,'utf-8')
|
||||
except:
|
||||
name = str(name, 'utf-8')
|
||||
except Exception:
|
||||
pass
|
||||
return name
|
||||
|
||||
|
||||
def keyword_name(x):
|
||||
if not isinstance(x, PSKeyword):
|
||||
if settings.STRICT:
|
||||
raise PSTypeError('Keyword required: %r' % x)
|
||||
else:
|
||||
name=x
|
||||
name = x
|
||||
else:
|
||||
name=x.name
|
||||
name = x.name
|
||||
if six.PY3:
|
||||
name = str(name,'utf-8','ignore')
|
||||
name = str(name, 'utf-8', 'ignore')
|
||||
return name
|
||||
|
||||
|
||||
## PSBaseParser
|
||||
##
|
||||
EOL = re.compile(br'[\r\n]')
|
||||
SPC = re.compile(br'\s')
|
||||
NONSPC = re.compile(br'\S')
|
||||
|
@ -168,7 +155,16 @@ END_NUMBER = re.compile(br'[^0-9]')
|
|||
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
|
||||
END_STRING = re.compile(br'[()\134]')
|
||||
OCT_STRING = re.compile(br'[0-7]')
|
||||
ESC_STRING = {b'b': 8, b't': 9, b'n': 10, b'f': 12, b'r': 13, b'(': 40, b')': 41, b'\\': 92}
|
||||
ESC_STRING = {
|
||||
b'b': 8,
|
||||
b't': 9,
|
||||
b'n': 10,
|
||||
b'f': 12,
|
||||
b'r': 13,
|
||||
b'(': 40,
|
||||
b')': 41,
|
||||
b'\\': 92
|
||||
}
|
||||
|
||||
|
||||
class PSBaseParser(object):
|
||||
|
@ -183,7 +179,8 @@ class PSBaseParser(object):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos)
|
||||
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
|
||||
self.bufpos)
|
||||
|
||||
def flush(self):
|
||||
return
|
||||
|
@ -343,7 +340,7 @@ class PSBaseParser(object):
|
|||
self._curtoken += s[i:j]
|
||||
self._parse1 = self._parse_main
|
||||
# We ignore comments.
|
||||
#self._tokens.append(self._curtoken)
|
||||
# self._tokens.append(self._curtoken)
|
||||
return j
|
||||
|
||||
def _parse_literal(self, s, i):
|
||||
|
@ -359,8 +356,8 @@ class PSBaseParser(object):
|
|||
self._parse1 = self._parse_literal_hex
|
||||
return j+1
|
||||
try:
|
||||
self._curtoken=str(self._curtoken,'utf-8')
|
||||
except:
|
||||
self._curtoken = str(self._curtoken, 'utf-8')
|
||||
except Exception:
|
||||
pass
|
||||
self._add_token(LIT(self._curtoken))
|
||||
self._parse1 = self._parse_main
|
||||
|
@ -444,7 +441,8 @@ class PSBaseParser(object):
|
|||
return j+1
|
||||
if c == b')':
|
||||
self.paren -= 1
|
||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||
if self.paren:
|
||||
# WTF, they said balanced parens need no special treatment.
|
||||
self._curtoken += c
|
||||
return j+1
|
||||
self._add_token(self._curtoken)
|
||||
|
@ -490,7 +488,8 @@ class PSBaseParser(object):
|
|||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
|
||||
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),
|
||||
SPC.sub(b'', self._curtoken))
|
||||
self._add_token(token)
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
@ -504,10 +503,7 @@ class PSBaseParser(object):
|
|||
return token
|
||||
|
||||
|
||||
## PSStackParser
|
||||
##
|
||||
class PSStackParser(PSBaseParser):
|
||||
|
||||
def __init__(self, fp):
|
||||
PSBaseParser.__init__(self, fp)
|
||||
self.reset()
|
||||
|
@ -542,7 +538,7 @@ class PSStackParser(PSBaseParser):
|
|||
def add_results(self, *objs):
|
||||
try:
|
||||
log.debug('add_results: %r', objs)
|
||||
except:
|
||||
except Exception:
|
||||
log.debug('add_results: (unprintable object)')
|
||||
self.results.extend(objs)
|
||||
return
|
||||
|
@ -567,13 +563,16 @@ class PSStackParser(PSBaseParser):
|
|||
def nextobject(self):
|
||||
"""Yields a list of objects.
|
||||
|
||||
Returns keywords, literals, strings, numbers, arrays and dictionaries.
|
||||
Arrays and dictionaries are represented as Python lists and dictionaries.
|
||||
Arrays and dictionaries are represented as Python lists and
|
||||
dictionaries.
|
||||
|
||||
:return: keywords, literals, strings, numbers, arrays and dictionaries.
|
||||
"""
|
||||
while not self.results:
|
||||
(pos, token) = self.nexttoken()
|
||||
#print (pos,token), (self.curtype, self.curstack)
|
||||
if isinstance(token, (six.integer_types, float, bool, six.string_types, six.binary_type, PSLiteral)):
|
||||
if isinstance(token, (six.integer_types, float, bool,
|
||||
six.string_types, six.binary_type,
|
||||
PSLiteral)):
|
||||
# normal token
|
||||
self.push((pos, token))
|
||||
elif token == KEYWORD_ARRAY_BEGIN:
|
||||
|
@ -594,9 +593,11 @@ class PSStackParser(PSBaseParser):
|
|||
try:
|
||||
(pos, objs) = self.end_type('d')
|
||||
if len(objs) % 2 != 0:
|
||||
raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
|
||||
error_msg = 'Invalid dictionary construct: %r' % objs
|
||||
raise PSSyntaxError(error_msg)
|
||||
# construct a Python dictionary.
|
||||
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
|
||||
d = dict((literal_name(k), v)
|
||||
for (k, v) in choplist(2, objs) if v is not None)
|
||||
self.push((pos, d))
|
||||
except PSTypeError:
|
||||
if settings.STRICT:
|
||||
|
@ -611,11 +612,13 @@ class PSStackParser(PSBaseParser):
|
|||
except PSTypeError:
|
||||
if settings.STRICT:
|
||||
raise
|
||||
elif isinstance(token,PSKeyword):
|
||||
log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
|
||||
elif isinstance(token, PSKeyword):
|
||||
log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
|
||||
token, self.curstack)
|
||||
self.do_keyword(pos, token)
|
||||
else:
|
||||
log.error('unknown token: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
|
||||
log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
|
||||
token, self.curstack)
|
||||
self.do_keyword(pos, token)
|
||||
raise
|
||||
if self.context:
|
||||
|
@ -625,6 +628,6 @@ class PSStackParser(PSBaseParser):
|
|||
obj = self.results.pop(0)
|
||||
try:
|
||||
log.debug('nextobject: %r', obj)
|
||||
except:
|
||||
except Exception:
|
||||
log.debug('nextobject: (unprintable object)')
|
||||
return obj
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
|
||||
|
||||
""" Python implementation of Rijndael encryption algorithm.
|
||||
|
||||
This code is in the public domain.
|
||||
|
@ -24,6 +22,7 @@ def RKLENGTH(keybits):
|
|||
def NROUNDS(keybits):
|
||||
return (keybits)//32+6
|
||||
|
||||
|
||||
Te0 = [
|
||||
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
|
||||
0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
|
||||
|
@ -701,7 +700,7 @@ rcon = [
|
|||
# 128-bit blocks, Rijndael never uses more than 10 rcon values
|
||||
]
|
||||
|
||||
if len(struct.pack('L',0)) == 4:
|
||||
if len(struct.pack('L', 0)) == 4:
|
||||
# 32bit
|
||||
def GETU32(x): return struct.unpack('>L', x)[0]
|
||||
def PUTU32(x): return struct.pack('>L', x)
|
||||
|
@ -734,7 +733,8 @@ def rijndaelSetupEncrypt(key, keybits):
|
|||
rk[p+6] = rk[p+2] ^ rk[p+5]
|
||||
rk[p+7] = rk[p+3] ^ rk[p+6]
|
||||
i += 1
|
||||
if i == 10: return (rk, 10)
|
||||
if i == 10:
|
||||
return (rk, 10)
|
||||
p += 4
|
||||
|
||||
rk[4] = GETU32(key[16:20])
|
||||
|
@ -752,7 +752,8 @@ def rijndaelSetupEncrypt(key, keybits):
|
|||
rk[p+8] = rk[p+2] ^ rk[p+7]
|
||||
rk[p+9] = rk[p+3] ^ rk[p+8]
|
||||
i += 1
|
||||
if i == 8: return (rk, 12)
|
||||
if i == 8:
|
||||
return (rk, 12)
|
||||
rk[p+10] = rk[p+4] ^ rk[p+9]
|
||||
rk[p+11] = rk[p+5] ^ rk[p+10]
|
||||
p += 6
|
||||
|
@ -772,7 +773,8 @@ def rijndaelSetupEncrypt(key, keybits):
|
|||
rk[p+10] = rk[p+2] ^ rk[p+9]
|
||||
rk[p+11] = rk[p+3] ^ rk[p+10]
|
||||
i += 1
|
||||
if i == 7: return (rk, 14)
|
||||
if i == 7:
|
||||
return (rk, 14)
|
||||
temp = rk[p+11]
|
||||
rk[p+12] = (rk[p+4] ^
|
||||
(Te4[(temp >> 24) ] & 0xff000000) ^
|
||||
|
@ -796,15 +798,28 @@ def rijndaelSetupDecrypt(key, keybits):
|
|||
(rk, nrounds) = rijndaelSetupEncrypt(key, keybits)
|
||||
# invert the order of the round keys:
|
||||
i = 0
|
||||
j = 4*nrounds
|
||||
j = 4 * nrounds
|
||||
while i < j:
|
||||
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp
|
||||
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp
|
||||
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp
|
||||
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp
|
||||
temp = rk[i]
|
||||
rk[i] = rk[j]
|
||||
rk[j] = temp
|
||||
|
||||
temp = rk[i + 1]
|
||||
rk[i + 1] = rk[j + 1]
|
||||
rk[j + 1] = temp
|
||||
|
||||
temp = rk[i + 2]
|
||||
rk[i + 2] = rk[j + 2]
|
||||
rk[j + 2] = temp
|
||||
|
||||
temp = rk[i + 3]
|
||||
rk[i + 3] = rk[j + 3]
|
||||
rk[j + 3] = temp
|
||||
|
||||
i += 4
|
||||
j -= 4
|
||||
# apply the inverse MixColumn transform to all round keys but the first and the last:
|
||||
# apply the inverse MixColumn transform to all round keys but the first
|
||||
# and the last:
|
||||
p = 0
|
||||
for i in range(1, nrounds):
|
||||
p += 4
|
||||
|
@ -872,7 +887,8 @@ def rijndaelEncrypt(rk, nrounds, plaintext):
|
|||
rk[p+7])
|
||||
p += 8
|
||||
r -= 1
|
||||
if r == 0: break
|
||||
if r == 0:
|
||||
break
|
||||
s0 = (
|
||||
Te0[(t0 >> 24) ] ^
|
||||
Te1[(t1 >> 16) & 0xff] ^
|
||||
|
@ -975,7 +991,8 @@ def rijndaelDecrypt(rk, nrounds, ciphertext):
|
|||
rk[p+7])
|
||||
p += 8
|
||||
r -= 1
|
||||
if r == 0: break
|
||||
if r == 0:
|
||||
break
|
||||
s0 = (
|
||||
Td0[(t0 >> 24) ] ^
|
||||
Td1[(t3 >> 16) & 0xff] ^
|
||||
|
@ -1049,10 +1066,13 @@ class RijndaelDecryptor(object):
|
|||
"""
|
||||
|
||||
def __init__(self, key, keybits=256):
|
||||
assert len(key) == KEYLENGTH(keybits), str((len(key), KEYLENGTH(keybits)))
|
||||
assert len(key) == KEYLENGTH(keybits), \
|
||||
str((len(key), KEYLENGTH(keybits)))
|
||||
(self.rk, self.nrounds) = rijndaelSetupDecrypt(key, keybits)
|
||||
assert len(self.rk) == RKLENGTH(keybits), str((len(self.rk), RKLENGTH(keybits)))
|
||||
assert self.nrounds == NROUNDS(keybits), str((self.nrounds, NROUNDS(keybits)))
|
||||
assert len(self.rk) == RKLENGTH(keybits), \
|
||||
str((len(self.rk), RKLENGTH(keybits)))
|
||||
assert self.nrounds == NROUNDS(keybits), \
|
||||
str((self.nrounds, NROUNDS(keybits)))
|
||||
return
|
||||
|
||||
def decrypt(self, ciphertext):
|
||||
|
@ -1064,10 +1084,13 @@ class RijndaelDecryptor(object):
|
|||
class RijndaelEncryptor(object):
|
||||
|
||||
def __init__(self, key, keybits=256):
|
||||
assert len(key) == KEYLENGTH(keybits), str((len(key), KEYLENGTH(keybits)))
|
||||
assert len(key) == KEYLENGTH(keybits), \
|
||||
str((len(key), KEYLENGTH(keybits)))
|
||||
(self.rk, self.nrounds) = rijndaelSetupEncrypt(key, keybits)
|
||||
assert len(self.rk) == RKLENGTH(keybits), str((len(self.rk), RKLENGTH(keybits)))
|
||||
assert self.nrounds == NROUNDS(keybits), str((self.nrounds, NROUNDS(keybits)))
|
||||
assert len(self.rk) == RKLENGTH(keybits),\
|
||||
str((len(self.rk), RKLENGTH(keybits)))
|
||||
assert self.nrounds == NROUNDS(keybits), \
|
||||
str((self.nrounds, NROUNDS(keybits)))
|
||||
return
|
||||
|
||||
def encrypt(self, plaintext):
|
||||
|
|
|
@ -6,7 +6,8 @@
|
|||
# * public domain *
|
||||
#
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
|
||||
def rldecode(data):
|
||||
"""
|
||||
|
@ -25,20 +26,15 @@ def rldecode(data):
|
|||
decoded = b''
|
||||
i = 0
|
||||
while i < len(data):
|
||||
#print 'data[%d]=:%d:' % (i,ord(data[i]))
|
||||
length = six.indexbytes(data,i)
|
||||
length = six.indexbytes(data, i)
|
||||
if length == 128:
|
||||
break
|
||||
if length >= 0 and length < 128:
|
||||
for j in range(i+1,(i+1)+(length+1)):
|
||||
decoded+=six.int2byte(six.indexbytes(data,j))
|
||||
#print 'length=%d, run=%s' % (length+1,run)
|
||||
|
||||
for j in range(i+1, (i+1)+(length+1)):
|
||||
decoded += six.int2byte(six.indexbytes(data, j))
|
||||
i = (i+1) + (length+1)
|
||||
if length > 128:
|
||||
run = six.int2byte(six.indexbytes(data,i+1))*(257-length)
|
||||
#print 'length=%d, run=%s' % (257-length,run)
|
||||
decoded+=run
|
||||
run = six.int2byte(six.indexbytes(data, i+1))*(257-length)
|
||||
decoded += run
|
||||
i = (i+1) + 1
|
||||
return decoded
|
||||
|
||||
|
|
|
@ -5,7 +5,8 @@ import struct
|
|||
|
||||
import six
|
||||
|
||||
# from sys import maxint as INF doesn't work anymore under Python3, but PDF still uses 32 bits ints
|
||||
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
|
||||
# still uses 32 bits ints
|
||||
INF = (1 << 31) - 1
|
||||
|
||||
if six.PY3:
|
||||
|
@ -42,10 +43,15 @@ def shorten_str(s, size):
|
|||
return s
|
||||
|
||||
|
||||
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
|
||||
"""When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."""
|
||||
def compatible_encode_method(bytesorstring, encoding='utf-8',
|
||||
erraction='ignore'):
|
||||
"""When Py2 str.encode is called, it often means bytes.encode in Py3.
|
||||
|
||||
This does either.
|
||||
"""
|
||||
if six.PY2:
|
||||
assert isinstance(bytesorstring, (str, unicode)), str(type(bytesorstring))
|
||||
error_msg = str(type(bytesorstring))
|
||||
assert isinstance(bytesorstring, (str, unicode)), error_msg
|
||||
return bytesorstring.encode(encoding, erraction)
|
||||
if six.PY3:
|
||||
if isinstance(bytesorstring, str):
|
||||
|
@ -205,21 +211,21 @@ def choplist(n, seq):
|
|||
|
||||
def nunpack(s, default=0):
|
||||
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
|
||||
l = len(s)
|
||||
if not l:
|
||||
length = len(s)
|
||||
if not length:
|
||||
return default
|
||||
elif l == 1:
|
||||
elif length == 1:
|
||||
return ord(s)
|
||||
elif l == 2:
|
||||
elif length == 2:
|
||||
return struct.unpack('>H', s)[0]
|
||||
elif l == 3:
|
||||
elif length == 3:
|
||||
return struct.unpack('>L', b'\x00' + s)[0]
|
||||
elif l == 4:
|
||||
elif length == 4:
|
||||
return struct.unpack('>L', s)[0]
|
||||
elif l == 8:
|
||||
elif length == 8:
|
||||
return struct.unpack('>Q', s)[0]
|
||||
else:
|
||||
raise TypeError('invalid length: %d' % l)
|
||||
raise TypeError('invalid length: %d' % length)
|
||||
|
||||
|
||||
PDFDocEncoding = ''.join(six.unichr(x) for x in (
|
||||
|
@ -270,7 +276,8 @@ def enc(x, codec='ascii'):
|
|||
"""Encodes a string for SGML/XML/HTML"""
|
||||
if six.PY3 and isinstance(x, bytes):
|
||||
return ''
|
||||
x = x.replace('&', '&').replace('>', '>').replace('<', '<').replace('"', '"')
|
||||
x = x.replace('&', '&').replace('>', '>').replace('<', '<') \
|
||||
.replace('"', '"')
|
||||
if codec:
|
||||
x = x.encode(codec, 'xmlcharrefreplace')
|
||||
return x
|
||||
|
@ -290,7 +297,9 @@ def vecBetweenBoxes(obj1, obj2):
|
|||
"""A distance function between two TextBoxes.
|
||||
|
||||
Consider the bounding rectangle for obj1 and obj2.
|
||||
Return vector between 2 boxes boundaries if they don't overlap, otherwise returns vector betweeen boxes centers
|
||||
Return vector between 2 boxes boundaries if they don't overlap, otherwise
|
||||
returns vector betweeen boxes centers
|
||||
|
||||
+------+..........+ (x1, y1)
|
||||
| obj1 | :
|
||||
+------+www+------+
|
||||
|
@ -385,6 +394,7 @@ class Plane(object):
|
|||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
|
||||
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 \
|
||||
or y1 <= obj.y0:
|
||||
continue
|
||||
yield obj
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
|
||||
|
||||
def absolute_sample_path(relative_sample_path):
|
||||
sample_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../samples'))
|
||||
sample_dir = os.path.abspath(
|
||||
os.path.join(os.path.dirname(__file__), '../samples'))
|
||||
sample_file = os.path.join(sample_dir, relative_sample_path)
|
||||
return sample_file
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
"""Tests based on the Adobe Glyph List Specification
|
||||
See: https://github.com/adobe-type-tools/agl-specification#2-the-mapping
|
||||
|
||||
While not in the specification, lowercase unicode often occurs in pdf's. Therefore lowercase unittest variants are
|
||||
added.
|
||||
While not in the specification, lowercase unicode often occurs in pdf's.
|
||||
Therefore lowercase unittest variants are added.
|
||||
"""
|
||||
from nose.tools import assert_raises
|
||||
|
||||
|
@ -10,110 +10,142 @@ from pdfminer.encodingdb import name2unicode
|
|||
|
||||
|
||||
def test_name2unicode_name_in_agl():
|
||||
"""The name "Lcommaaccent" has a single component, which is mapped to the string U+013B by AGL"""
|
||||
"""The name "Lcommaaccent" has a single component,
|
||||
which is mapped to the string U+013B by AGL"""
|
||||
assert u'\u013B' == name2unicode('Lcommaaccent')
|
||||
|
||||
|
||||
def test_name2unicode_uni():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B"
|
||||
all map to the string U+013B"""
|
||||
assert u'\u013B' == name2unicode('uni013B')
|
||||
|
||||
|
||||
def test_name2unicode_uni_lowercase():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B"
|
||||
all map to the string U+013B"""
|
||||
assert u'\u013B' == name2unicode('uni013b')
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits():
|
||||
"""The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
|
||||
"""The name "uni20AC0308" has a single component,
|
||||
which is mapped to the string U+20AC U+0308"""
|
||||
assert u'\u20AC\u0308' == name2unicode('uni20AC0308')
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
|
||||
"""The name "uni20AC0308" has a single component, which is mapped to the string U+20AC U+0308"""
|
||||
"""The name "uni20AC0308" has a single component,
|
||||
which is mapped to the string U+20AC U+0308"""
|
||||
assert u'\u20AC\u0308' == name2unicode('uni20ac0308')
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string():
|
||||
"""The name "uni20ac" has a single component, which is mapped to a euro-sign.
|
||||
"""The name "uni20ac" has a single component,
|
||||
which is mapped to a euro-sign.
|
||||
|
||||
According to the specification this should be mapped to an empty string, but we also want to support lowercase
|
||||
hexadecimals
|
||||
"""
|
||||
According to the specification this should be mapped to an empty string,
|
||||
but we also want to support lowercase hexadecimals"""
|
||||
assert u'\u20ac' == name2unicode('uni20ac')
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long():
|
||||
"""The name "uniD801DC0C" has a single component, which is mapped to an empty string
|
||||
"""The name "uniD801DC0C" has a single component,
|
||||
which is mapped to an empty string
|
||||
|
||||
Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
|
||||
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
|
||||
Neither D801 nor DC0C are in the appropriate set.
|
||||
This form cannot be used to map to the character which is
|
||||
expressed as D801 DC0C in UTF-16, specifically U+1040C.
|
||||
This character can be correctly mapped by using the
|
||||
glyph name "u1040C.
|
||||
"""
|
||||
assert_raises(KeyError, name2unicode, 'uniD801DC0C')
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long_lowercase():
|
||||
"""The name "uniD801DC0C" has a single component, which is mapped to an empty string
|
||||
"""The name "uniD801DC0C" has a single component,
|
||||
which is mapped to an empty string
|
||||
|
||||
Neither D801 nor DC0C are in the appropriate set. This form cannot be used to map to the character which is
|
||||
expressed as D801 DC0C in UTF-16, specifically U+1040C. This character can be correctly mapped by using the
|
||||
Neither D801 nor DC0C are in the appropriate set.
|
||||
This form cannot be used to map to the character which is
|
||||
expressed as D801 DC0C in UTF-16, specifically U+1040C.
|
||||
This character can be correctly mapped by using the
|
||||
glyph name "u1040C."""
|
||||
assert_raises(KeyError, name2unicode, 'uniD801DC0C')
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua():
|
||||
""""Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
|
||||
assert u'\uF6FB' == name2unicode('uniF6FB')
|
||||
""""Ogoneksmall" and "uniF6FB" both map
|
||||
to the string that corresponds to U+F6FB."""
|
||||
assert u'\uF6FB' == \
|
||||
name2unicode('uniF6FB')
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua_lowercase():
|
||||
""""Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
|
||||
assert u'\uF6FB' == name2unicode('unif6fb')
|
||||
""""Ogoneksmall" and "uniF6FB" both map
|
||||
to the string that corresponds to U+F6FB."""
|
||||
assert u'\uF6FB' == \
|
||||
name2unicode('unif6fb')
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
|
||||
assert u'\u013B' == name2unicode('u013B')
|
||||
"""The components "Lcommaaccent," "uni013B,"
|
||||
and "u013B" all map to the string U+013B"""
|
||||
assert u'\u013B' == \
|
||||
name2unicode('u013B')
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits_lowercase():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the string U+013B"""
|
||||
assert u'\u013B' == name2unicode('u013b')
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B"
|
||||
all map to the string U+013B"""
|
||||
assert u'\u013B' == \
|
||||
name2unicode('u013b')
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits():
|
||||
"""The name "u1040C" has a single component, which is mapped to the string U+1040C"""
|
||||
assert u'\U0001040C' == name2unicode('u1040C')
|
||||
"""The name "u1040C" has a single component, which
|
||||
is mapped to the string U+1040C"""
|
||||
assert u'\U0001040C' == \
|
||||
name2unicode('u1040C')
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits_lowercase():
|
||||
"""The name "u1040C" has a single component, which is mapped to the string U+1040C"""
|
||||
assert u'\U0001040C' == name2unicode('u1040c')
|
||||
"""The name "u1040C" has a single component, which
|
||||
is mapped to the string U+1040C"""
|
||||
assert u'\U0001040C' == \
|
||||
name2unicode('u1040c')
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components():
|
||||
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
|
||||
assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
|
||||
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate"
|
||||
is mapped to the string U+013B U+20AC U+0308 U+1040C"""
|
||||
assert u'\u013B\u20AC\u0308\U0001040C' == \
|
||||
name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components_lowercase():
|
||||
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string U+013B U+20AC U+0308 U+1040C"""
|
||||
assert u'\u013B\u20AC\u0308\U0001040C' == name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
|
||||
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate"
|
||||
is mapped to the string U+013B U+20AC U+0308 U+1040C"""
|
||||
assert u'\u013B\u20AC\u0308\U0001040C' == \
|
||||
name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
|
||||
|
||||
|
||||
def test_name2unicode_foo():
|
||||
"""The name 'foo' maps to an empty string, because 'foo' is not in AGL, and because it does not start with a 'u.'"""
|
||||
"""The name 'foo' maps to an empty string,
|
||||
because 'foo' is not in AGL,
|
||||
and because it does not start with a 'u.'"""
|
||||
assert_raises(KeyError, name2unicode, 'foo')
|
||||
|
||||
|
||||
def test_name2unicode_notdef():
|
||||
"""The name ".notdef" is reduced to an empty string (step 1) and mapped to an empty string (step 3)"""
|
||||
"""The name ".notdef" is reduced to an empty string (step 1)
|
||||
and mapped to an empty string (step 3)"""
|
||||
assert_raises(KeyError, name2unicode, '.notdef')
|
||||
|
||||
|
||||
def test_name2unicode_pua_ogoneksmall():
|
||||
""""Ogoneksmall" and "uniF6FB" both map to the string that corresponds to U+F6FB."""
|
||||
""""
|
||||
Ogoneksmall" and "uniF6FB" both map to the string
|
||||
that corresponds to U+F6FB."""
|
||||
assert u'\uF6FB' == name2unicode('Ogoneksmall')
|
||||
|
||||
|
||||
|
|
|
@ -11,7 +11,9 @@ def run(sample_path):
|
|||
|
||||
|
||||
test_strings = {
|
||||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\f",
|
||||
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
|
||||
"H e l l o \n\nW o r l d\n\n"
|
||||
"H e l l o \n\nW o r l d\n\n\f",
|
||||
"simple2.pdf": "\f",
|
||||
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
|
||||
}
|
||||
|
|
|
@ -86,7 +86,7 @@ class TestPDFEncoding():
|
|||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
|
||||
def test_encoding_DLIdentV_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from nose.tools import assert_equal
|
||||
|
||||
from pdfminer.ccitt import *
|
||||
from pdfminer.ccitt import CCITTG4Parser
|
||||
|
||||
|
||||
class TestCCITTG4Parser():
|
||||
|
|
|
@ -4,11 +4,11 @@ import binascii
|
|||
|
||||
from nose.tools import assert_equal
|
||||
|
||||
from pdfminer.arcfour import *
|
||||
from pdfminer.ascii85 import *
|
||||
from pdfminer.lzw import *
|
||||
from pdfminer.rijndael import *
|
||||
from pdfminer.runlength import *
|
||||
from pdfminer.arcfour import Arcfour
|
||||
from pdfminer.ascii85 import asciihexdecode, ascii85decode
|
||||
from pdfminer.lzw import lzwdecode
|
||||
from pdfminer.rijndael import RijndaelEncryptor
|
||||
from pdfminer.runlength import rldecode
|
||||
|
||||
|
||||
def hex(b):
|
||||
|
@ -23,35 +23,47 @@ def dehex(b):
|
|||
|
||||
class TestAscii85():
|
||||
def test_ascii85decode(self):
|
||||
"""The sample string is taken from: http://en.wikipedia.org/w/index.php?title=Ascii85"""
|
||||
assert_equal(ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q'), b'Man is distinguished')
|
||||
assert_equal(ascii85decode(b'E,9)oF*2M7/c~>'), b'pleasure.')
|
||||
"""The sample string is taken from:
|
||||
http://en.wikipedia.org/w/index.php?title=Ascii85"""
|
||||
assert_equal(ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q'),
|
||||
b'Man is distinguished')
|
||||
assert_equal(ascii85decode(b'E,9)oF*2M7/c~>'),
|
||||
b'pleasure.')
|
||||
|
||||
def test_asciihexdecode(self):
|
||||
assert_equal(asciihexdecode(b'61 62 2e6364 65'), b'ab.cde')
|
||||
assert_equal(asciihexdecode(b'61 62 2e6364 657>'), b'ab.cdep')
|
||||
assert_equal(asciihexdecode(b'7>'), b'p')
|
||||
assert_equal(asciihexdecode(b'61 62 2e6364 65'),
|
||||
b'ab.cde')
|
||||
assert_equal(asciihexdecode(b'61 62 2e6364 657>'),
|
||||
b'ab.cdep')
|
||||
assert_equal(asciihexdecode(b'7>'),
|
||||
b'p')
|
||||
|
||||
|
||||
class TestArcfour():
|
||||
def test(self):
|
||||
assert_equal(hex(Arcfour(b'Key').process(b'Plaintext')), b'bbf316e8d940af0ad3')
|
||||
assert_equal(hex(Arcfour(b'Wiki').process(b'pedia')), b'1021bf0420')
|
||||
assert_equal(hex(Arcfour(b'Secret').process(b'Attack at dawn')), b'45a01f645fc35b383552544b9bf5')
|
||||
assert_equal(hex(Arcfour(b'Key').process(b'Plaintext')),
|
||||
b'bbf316e8d940af0ad3')
|
||||
assert_equal(hex(Arcfour(b'Wiki').process(b'pedia')),
|
||||
b'1021bf0420')
|
||||
assert_equal(hex(Arcfour(b'Secret').process(b'Attack at dawn')),
|
||||
b'45a01f645fc35b383552544b9bf5')
|
||||
|
||||
|
||||
class TestLzw():
|
||||
def test_lzwdecode(self):
|
||||
assert_equal(lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'), b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42')
|
||||
assert_equal(lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'),
|
||||
b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42')
|
||||
|
||||
|
||||
class TestRunlength():
|
||||
def test_rldecode(self):
|
||||
assert_equal(rldecode(b'\x05123456\xfa7\x04abcde\x80junk'), b'1234567777777abcde')
|
||||
assert_equal(rldecode(b'\x05123456\xfa7\x04abcde\x80junk'),
|
||||
b'1234567777777abcde')
|
||||
|
||||
|
||||
class TestRijndaelEncryptor():
|
||||
def test_RijndaelEncryptor(self):
|
||||
key = dehex(b'00010203050607080a0b0c0d0f101112')
|
||||
plaintext = dehex(b'506812a45f08c889b97f5980038b8359')
|
||||
assert_equal(hex(RijndaelEncryptor(key, 128).encrypt(plaintext)), b'd8f532538289ef7d06b506a4fd5be9c9')
|
||||
assert_equal(hex(RijndaelEncryptor(key, 128).encrypt(plaintext)),
|
||||
b'd8f532538289ef7d06b506a4fd5be9c9')
|
||||
|
|
|
@ -2,9 +2,9 @@ import logging
|
|||
|
||||
from nose.tools import assert_equal
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from pdfminer.psparser import KWD, LIT, PSBaseParser, PSStackParser, PSEOF
|
||||
|
||||
from pdfminer.psparser import *
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestPSBaseParser:
|
||||
|
@ -31,18 +31,19 @@ func/a/b{(c)do*}def
|
|||
'''
|
||||
|
||||
TOKENS = [
|
||||
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
|
||||
(21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
||||
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
|
||||
(191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
|
||||
(226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
|
||||
(234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
|
||||
(242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
|
||||
(256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
|
||||
(272, KWD(b'>>'))
|
||||
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')),
|
||||
(19, KWD(b'@')), (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')),
|
||||
(30, LIT('Some_Name')), (41, LIT('foo_xbaa')), (54, 0), (56, 1),
|
||||
(59, -2), (62, 0.5), (65, 1.234), (71, b'abc'), (77, b''),
|
||||
(80, b'abc ( def ) ghi'), (98, b'def \x00 4ghi'),
|
||||
(118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'),
|
||||
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '),
|
||||
(211, b'\xab\xcd\x00\x124\x05'), (226, KWD(b'func')), (230, LIT('a')),
|
||||
(232, LIT('b')), (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')),
|
||||
(241, KWD(b'}')), (242, KWD(b'def')), (246, KWD(b'[')), (248, 1),
|
||||
(250, b'z'), (254, KWD(b'!')), (256, KWD(b']')), (258, KWD(b'<<')),
|
||||
(261, LIT('foo')), (266, b'bar'), (272, KWD(b'>>'))
|
||||
]
|
||||
|
||||
OBJS = [
|
||||
|
@ -50,10 +51,10 @@ func/a/b{(c)do*}def
|
|||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
||||
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
|
||||
(191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
|
||||
(230, LIT('a')), (232, LIT('b')), (234, [b'c']), (246, [1, b'z']),
|
||||
(258, {'foo': b'bar'}),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'),
|
||||
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '),
|
||||
(211, b'\xab\xcd\x00\x124\x05'), (230, LIT('a')), (232, LIT('b')),
|
||||
(234, [b'c']), (246, [1, b'z']), (258, {'foo': b'bar'}),
|
||||
]
|
||||
|
||||
def get_tokens(self, s):
|
||||
|
|
|
@ -8,7 +8,8 @@ def run(filename, options=None):
|
|||
absolute_path = absolute_sample_path(filename)
|
||||
with NamedTemporaryFile() as output_file:
|
||||
if options:
|
||||
s = 'dumppdf -o %s %s %s' % (output_file.name, options, absolute_path)
|
||||
s = 'dumppdf -o %s %s %s' % (output_file.name,
|
||||
options, absolute_path)
|
||||
else:
|
||||
s = 'dumppdf -o %s %s' % (output_file.name, absolute_path)
|
||||
dumppdf.main(s.split(' ')[1:])
|
||||
|
|
|
@ -10,7 +10,8 @@ def run(sample_path, options=None):
|
|||
absolute_path = absolute_sample_path(sample_path)
|
||||
with NamedTemporaryFile() as output_file:
|
||||
if options:
|
||||
s = 'pdf2txt -o %s %s %s' % (output_file.name, options, absolute_path)
|
||||
s = 'pdf2txt -o %s %s %s' % (output_file.name,
|
||||
options, absolute_path)
|
||||
else:
|
||||
s = 'pdf2txt -o %s %s' % (output_file.name, absolute_path)
|
||||
pdf2txt.main(s.split(' ')[1:])
|
||||
|
@ -34,7 +35,8 @@ class TestDumpPDF():
|
|||
run('sampleOneByteIdentityEncode.pdf')
|
||||
|
||||
def test_nonfree_175(self):
|
||||
"""Regression test for https://github.com/pdfminer/pdfminer.six/issues/65"""
|
||||
"""Regression test for:
|
||||
https://github.com/pdfminer/pdfminer.six/issues/65"""
|
||||
run('nonfree/175.pdf')
|
||||
|
||||
def test_nonfree_dmca(self):
|
||||
|
@ -69,7 +71,8 @@ class TestDumpImages(object):
|
|||
def extract_images(input_file):
|
||||
output_dir = mkdtemp()
|
||||
with NamedTemporaryFile() as output_file:
|
||||
commands = ['-o', output_file.name, '--output-dir', output_dir, input_file]
|
||||
commands = ['-o', output_file.name, '--output-dir',
|
||||
output_dir, input_file]
|
||||
pdf2txt.main(commands)
|
||||
image_files = os.listdir(output_dir)
|
||||
rmtree(output_dir)
|
||||
|
@ -78,9 +81,11 @@ class TestDumpImages(object):
|
|||
def test_nonfree_dmca(self):
|
||||
"""Extract images of pdf containing bmp images
|
||||
|
||||
Regression test for: https://github.com/pdfminer/pdfminer.six/issues/131
|
||||
Regression test for:
|
||||
https://github.com/pdfminer/pdfminer.six/issues/131
|
||||
"""
|
||||
image_files = self.extract_images(absolute_sample_path('../samples/nonfree/dmca.pdf'))
|
||||
image_files = self.extract_images(
|
||||
absolute_sample_path('../samples/nonfree/dmca.pdf'))
|
||||
assert image_files[0].endswith('bmp')
|
||||
|
||||
def test_nonfree_175(self):
|
||||
|
@ -92,7 +97,8 @@ class TestDumpImages(object):
|
|||
|
||||
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
|
||||
"""
|
||||
image_files = self.extract_images(absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf'))
|
||||
image_files = self.extract_images(
|
||||
absolute_sample_path('../samples/contrib/pdf-with-jbig2.pdf'))
|
||||
assert image_files[0].endswith('.jb2')
|
||||
|
||||
def test_contrib_matplotlib(self):
|
||||
|
|
|
@ -22,12 +22,14 @@ class TestPlane(object):
|
|||
assert_equal(result, [obj])
|
||||
|
||||
def test_find_if_object_is_smaller_than_gridsize(self):
|
||||
plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100)
|
||||
plane, obj = self.given_plane_with_one_object(object_size=1,
|
||||
gridsize=100)
|
||||
result = list(plane.find((0, 0, 100, 100)))
|
||||
assert_equal(result, [obj])
|
||||
|
||||
def test_find_object_if_much_larger_than_gridsize(self):
|
||||
plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10)
|
||||
plane, obj = self.given_plane_with_one_object(object_size=100,
|
||||
gridsize=10)
|
||||
result = list(plane.find((0, 0, 100, 100)))
|
||||
assert_equal(result, [obj])
|
||||
|
||||
|
|
|
@ -3,13 +3,15 @@
|
|||
import sys
|
||||
import fileinput
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
import six
|
||||
|
||||
|
||||
def main(argv):
|
||||
fonts = {}
|
||||
for line in fileinput.input():
|
||||
f = line.strip().split(' ')
|
||||
if not f: continue
|
||||
if not f:
|
||||
continue
|
||||
k = f[0]
|
||||
if k == 'FontName':
|
||||
fontname = f[1]
|
||||
|
@ -23,21 +25,23 @@ def main(argv):
|
|||
chars[cid] = width
|
||||
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
|
||||
'Ascender', 'Descender'):
|
||||
k = {'Ascender':'Ascent', 'Descender':'Descent'}.get(k,k)
|
||||
k = {'Ascender': 'Ascent', 'Descender': 'Descent'}.get(k, k)
|
||||
props[k] = float(f[1])
|
||||
elif k in ('FontName', 'FamilyName', 'Weight'):
|
||||
k = {'FamilyName':'FontFamily', 'Weight':'FontWeight'}.get(k,k)
|
||||
k = {'FamilyName': 'FontFamily', 'Weight': 'FontWeight'}.get(k, k)
|
||||
props[k] = f[1]
|
||||
elif k == 'IsFixedPitch':
|
||||
if f[1].lower() == 'true':
|
||||
props['Flags'] = 64
|
||||
elif k == 'FontBBox':
|
||||
props[k] = tuple(map(float, f[1:5]))
|
||||
print ('# -*- python -*-')
|
||||
print ('FONT_METRICS = {')
|
||||
for (fontname,(props,chars)) in six.iteritems(fonts):
|
||||
print (' %r: %r,' % (fontname, (props,chars)))
|
||||
print ('}')
|
||||
print('# -*- python -*-')
|
||||
print('FONT_METRICS = {')
|
||||
for (fontname, (props, chars)) in six.iteritems(fonts):
|
||||
print(' %r: %r,' % (fontname, (props, chars)))
|
||||
print('}')
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -9,8 +9,6 @@ import codecs
|
|||
import six
|
||||
|
||||
|
||||
## CMapConverter
|
||||
##
|
||||
class CMapConverter(object):
|
||||
|
||||
def __init__(self, enc2codec={}):
|
||||
|
@ -49,8 +47,9 @@ class CMapConverter(object):
|
|||
def load(self, fp):
|
||||
encs = None
|
||||
for line in fp:
|
||||
(line,_,_) = line.strip().partition('#')
|
||||
if not line: continue
|
||||
(line, _, _) = line.strip().partition('#')
|
||||
if not line:
|
||||
continue
|
||||
values = line.split('\t')
|
||||
if encs is None:
|
||||
assert values[0] == 'CID', str(values)
|
||||
|
@ -90,16 +89,18 @@ class CMapConverter(object):
|
|||
|
||||
def pick(unimap):
|
||||
chars = list(unimap.items())
|
||||
chars.sort(key=(lambda x:(x[1],-ord(x[0]))), reverse=True)
|
||||
(c,_) = chars[0]
|
||||
chars.sort(key=(lambda x: (x[1], -ord(x[0]))), reverse=True)
|
||||
(c, _) = chars[0]
|
||||
return c
|
||||
|
||||
cid = int(values[0])
|
||||
unimap_h = {}
|
||||
unimap_v = {}
|
||||
for (enc,value) in zip(encs, values):
|
||||
if enc == 'CID': continue
|
||||
if value == '*': continue
|
||||
for (enc, value) in zip(encs, values):
|
||||
if enc == 'CID':
|
||||
continue
|
||||
if value == '*':
|
||||
continue
|
||||
|
||||
# hcodes, vcodes: encoded bytes for each writing mode.
|
||||
hcodes = []
|
||||
|
@ -110,7 +111,7 @@ class CMapConverter(object):
|
|||
code = code[:-1]
|
||||
try:
|
||||
code = codecs.decode(code, 'hex_codec')
|
||||
except:
|
||||
except Exception:
|
||||
code = chr(int(code, 16))
|
||||
if vertical:
|
||||
vcodes.append(code)
|
||||
|
@ -155,14 +156,15 @@ class CMapConverter(object):
|
|||
fp.write(pickle.dumps(data, 2))
|
||||
return
|
||||
|
||||
# main
|
||||
|
||||
def main(argv):
|
||||
import getopt
|
||||
import gzip
|
||||
import os.path
|
||||
|
||||
def usage():
|
||||
print ('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0])
|
||||
print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]'
|
||||
% argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'c:')
|
||||
|
@ -171,16 +173,18 @@ def main(argv):
|
|||
enc2codec = {}
|
||||
for (k, v) in opts:
|
||||
if k == '-c':
|
||||
(enc,_,codec) = v.partition('=')
|
||||
(enc, _, codec) = v.partition('=')
|
||||
enc2codec[enc] = codec
|
||||
if not args: return usage()
|
||||
if not args:
|
||||
return usage()
|
||||
outdir = args.pop(0)
|
||||
if not args: return usage()
|
||||
if not args:
|
||||
return usage()
|
||||
regname = args.pop(0)
|
||||
|
||||
converter = CMapConverter(enc2codec)
|
||||
for path in args:
|
||||
print ('reading: %r...' % path)
|
||||
print('reading: %r...' % path)
|
||||
fp = open(path)
|
||||
converter.load(fp)
|
||||
fp.close()
|
||||
|
@ -188,17 +192,19 @@ def main(argv):
|
|||
for enc in converter.get_encs():
|
||||
fname = '%s.pickle.gz' % enc
|
||||
path = os.path.join(outdir, fname)
|
||||
print ('writing: %r...' % path)
|
||||
print('writing: %r...' % path)
|
||||
fp = gzip.open(path, 'wb')
|
||||
converter.dump_cmap(fp, enc)
|
||||
fp.close()
|
||||
|
||||
fname = 'to-unicode-%s.pickle.gz' % regname
|
||||
path = os.path.join(outdir, fname)
|
||||
print ('writing: %r...' % path)
|
||||
print('writing: %r...' % path)
|
||||
fp = gzip.open(path, 'wb')
|
||||
converter.dump_unicodemap(fp)
|
||||
fp.close()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
import sys
|
||||
import fileinput
|
||||
|
||||
|
||||
def main(argv):
|
||||
state = 0
|
||||
for line in fileinput.input():
|
||||
|
@ -10,14 +11,17 @@ def main(argv):
|
|||
if not line or line.startswith('#'):
|
||||
if state == 1:
|
||||
state = 2
|
||||
print ('}\n')
|
||||
print (line)
|
||||
print('}\n')
|
||||
print(line)
|
||||
continue
|
||||
if state == 0:
|
||||
print ('\nglyphname2unicode = {')
|
||||
print('\nglyphname2unicode = {')
|
||||
state = 1
|
||||
(name,x) = line.split(';')
|
||||
(name, x) = line.split(';')
|
||||
codes = x.split(' ')
|
||||
print (' %r: u\'%s\',' % (name, ''.join( '\\u%s' % code for code in codes )))
|
||||
print(' %r: u\'%s\',' % (name,
|
||||
''.join('\\u%s' % code for code in codes)))
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -100,11 +100,13 @@ def dumpallobjs(out, doc, codec=None):
|
|||
out.write('<pdf>')
|
||||
for xref in doc.xrefs:
|
||||
for objid in xref.get_objids():
|
||||
if objid in visited: continue
|
||||
if objid in visited:
|
||||
continue
|
||||
visited.add(objid)
|
||||
try:
|
||||
obj = doc.getobj(objid)
|
||||
if obj is None: continue
|
||||
if obj is None:
|
||||
continue
|
||||
out.write('<object id="%d">\n' % objid)
|
||||
dumpxml(out, obj, codec=codec)
|
||||
out.write('\n</object>\n\n')
|
||||
|
@ -183,8 +185,8 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
|||
(filename))
|
||||
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
|
||||
raise PDFValueError(
|
||||
'unable to process PDF: reference for %r is not an EmbeddedFile' %
|
||||
(filename))
|
||||
'unable to process PDF: reference for %r '
|
||||
'is not an EmbeddedFile' % (filename))
|
||||
path = os.path.join(extractdir, filename)
|
||||
if os.path.exists(path):
|
||||
raise IOError('file exists: %r' % path)
|
||||
|
|
|
@ -13,8 +13,8 @@ This is an in-house mapping table for some Latin-1 characters
|
|||
"""
|
||||
|
||||
LATIN2ASCII = {
|
||||
#0x00a0: '',
|
||||
#0x00a7: '',
|
||||
# 0x00a0: '',
|
||||
# 0x00a7: '',
|
||||
|
||||
# iso-8859-1
|
||||
0x00c0: 'A`',
|
||||
|
@ -94,38 +94,45 @@ LATIN2ASCII = {
|
|||
0xfb06: 'st',
|
||||
|
||||
# Symbols
|
||||
#0x2013: '',
|
||||
# 0x2013: '',
|
||||
0x2014: '--',
|
||||
0x2015: '||',
|
||||
0x2018: '`',
|
||||
0x2019: "'",
|
||||
0x201c: '``',
|
||||
0x201d: "''",
|
||||
#0x2022: '',
|
||||
#0x2212: '',
|
||||
# 0x2022: '',
|
||||
# 0x2212: '',
|
||||
|
||||
}
|
||||
|
||||
|
||||
def latin2ascii(s):
|
||||
return ''.join( LATIN2ASCII.get(ord(c),c) for c in s )
|
||||
return ''.join(LATIN2ASCII.get(ord(c), c) for c in s)
|
||||
|
||||
|
||||
def main(argv):
|
||||
import getopt, fileinput
|
||||
import getopt
|
||||
import fileinput
|
||||
|
||||
def usage():
|
||||
print ('usage: %s [-c codec] file ...' % argv[0])
|
||||
print('usage: %s [-c codec] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'c')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
if not args:
|
||||
return usage()
|
||||
codec = 'utf-8'
|
||||
for (k, v) in opts:
|
||||
if k == '-c': codec = v
|
||||
if k == '-c':
|
||||
codec = v
|
||||
for line in fileinput.input(args):
|
||||
line = latin2ascii(unicode(line, codec, 'ignore'))
|
||||
line = latin2ascii(str(line, codec, 'ignore'))
|
||||
sys.stdout.write(line.encode('ascii', 'replace'))
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
158
tools/pdf2txt.py
158
tools/pdf2txt.py
|
@ -1,12 +1,13 @@
|
|||
"""A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags."""
|
||||
"""A command line tool for extracting text and images from PDF and
|
||||
output it to plain text, html, xml or tags."""
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import six
|
||||
|
||||
import pdfminer.high_level
|
||||
import pdfminer.layout
|
||||
from pdfminer.image import ImageWriter
|
||||
|
||||
logging.basicConfig()
|
||||
|
||||
|
@ -17,12 +18,13 @@ OUTPUT_TYPES = ((".htm", "html"),
|
|||
|
||||
|
||||
def extract_text(files=[], outfile='-',
|
||||
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
|
||||
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
|
||||
output_type='text', codec='utf-8', strip_control=False,
|
||||
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
|
||||
layoutmode='normal', output_dir=None, debug=False,
|
||||
disable_caching=False, **kwargs):
|
||||
no_laparams=False, all_texts=None, detect_vertical=None,
|
||||
word_margin=None, char_margin=None, line_margin=None,
|
||||
boxes_flow=None, output_type='text', codec='utf-8',
|
||||
strip_control=False, maxpages=0, page_numbers=None,
|
||||
password="", scale=1.0, rotation=0, layoutmode='normal',
|
||||
output_dir=None, debug=False, disable_caching=False,
|
||||
**kwargs):
|
||||
if '_py2_no_more_posargs' in kwargs is not None:
|
||||
raise DeprecationWarning(
|
||||
'The `_py2_no_more_posargs will be removed on January, 2020. At '
|
||||
|
@ -33,11 +35,13 @@ def extract_text(files=[], outfile='-',
|
|||
if not files:
|
||||
raise ValueError("Must provide files to work upon!")
|
||||
|
||||
# If any LAParams group arguments were passed, create an LAParams object and
|
||||
# If any LAParams group arguments were passed,
|
||||
# create an LAParams object and
|
||||
# populate with given args. Otherwise, set it to None.
|
||||
if not no_laparams:
|
||||
laparams = pdfminer.layout.LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||
for param in ("all_texts", "detect_vertical", "word_margin",
|
||||
"char_margin", "line_margin", "boxes_flow"):
|
||||
paramv = locals().get(param, None)
|
||||
if paramv is not None:
|
||||
setattr(laparams, param, paramv)
|
||||
|
@ -64,68 +68,105 @@ def extract_text(files=[], outfile='-',
|
|||
|
||||
def maketheparser():
|
||||
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
||||
parser.add_argument("files", type=str, default=None, nargs="+", help="One or more paths to PDF files.")
|
||||
parser.add_argument(
|
||||
"files", type=str, default=None, nargs="+",
|
||||
help="One or more paths to PDF files.")
|
||||
|
||||
parser.add_argument("--debug", "-d", default=False, action="store_true",
|
||||
parser.add_argument(
|
||||
"--debug", "-d", default=False, action="store_true",
|
||||
help="Use debug logging level.")
|
||||
parser.add_argument("--disable-caching", "-C", default=False, action="store_true",
|
||||
parser.add_argument(
|
||||
"--disable-caching", "-C", default=False, action="store_true",
|
||||
help="If caching or resources, such as fonts, should be disabled.")
|
||||
|
||||
parse_params = parser.add_argument_group('Parser', description='Used during PDF parsing')
|
||||
parse_params.add_argument("--page-numbers", type=int, default=None, nargs="+",
|
||||
parse_params = parser.add_argument_group(
|
||||
'Parser', description='Used during PDF parsing')
|
||||
parse_params.add_argument(
|
||||
"--page-numbers", type=int, default=None, nargs="+",
|
||||
help="A space-seperated list of page numbers to parse.")
|
||||
parse_params.add_argument("--pagenos", "-p", type=str,
|
||||
help="A comma-separated list of page numbers to parse. Included for legacy applications, "
|
||||
"use --page-numbers for more idiomatic argument entry.")
|
||||
parse_params.add_argument("--maxpages", "-m", type=int, default=0,
|
||||
parse_params.add_argument(
|
||||
"--pagenos", "-p", type=str,
|
||||
help="A comma-separated list of page numbers to parse. "
|
||||
"Included for legacy applications, use --page-numbers "
|
||||
"for more idiomatic argument entry.")
|
||||
parse_params.add_argument(
|
||||
"--maxpages", "-m", type=int, default=0,
|
||||
help="The maximum number of pages to parse.")
|
||||
parse_params.add_argument("--password", "-P", type=str, default="",
|
||||
parse_params.add_argument(
|
||||
"--password", "-P", type=str, default="",
|
||||
help="The password to use for decrypting PDF file.")
|
||||
parse_params.add_argument("--rotation", "-R", default=0, type=int,
|
||||
help="The number of degrees to rotate the PDF before other types of processing.")
|
||||
parse_params.add_argument(
|
||||
"--rotation", "-R", default=0, type=int,
|
||||
help="The number of degrees to rotate the PDF "
|
||||
"before other types of processing.")
|
||||
|
||||
la_params = parser.add_argument_group('Layout analysis', description='Used during layout analysis.')
|
||||
la_params.add_argument("--no-laparams", "-n", default=False, action="store_true",
|
||||
la_params = parser.add_argument_group(
|
||||
'Layout analysis', description='Used during layout analysis.')
|
||||
la_params.add_argument(
|
||||
"--no-laparams", "-n", default=False, action="store_true",
|
||||
help="If layout analysis parameters should be ignored.")
|
||||
la_params.add_argument("--detect-vertical", "-V", default=False, action="store_true",
|
||||
la_params.add_argument(
|
||||
"--detect-vertical", "-V", default=False, action="store_true",
|
||||
help="If vertical text should be considered during layout analysis")
|
||||
la_params.add_argument("--char-margin", "-M", type=float, default=2.0,
|
||||
help="If two characters are closer together than this margin they are considered to be part "
|
||||
"of the same word. The margin is specified relative to the width of the character.")
|
||||
la_params.add_argument("--word-margin", "-W", type=float, default=0.1,
|
||||
help="If two words are are closer together than this margin they are considered to be part "
|
||||
"of the same line. A space is added in between for readability. The margin is "
|
||||
"specified relative to the width of the word.")
|
||||
la_params.add_argument("--line-margin", "-L", type=float, default=0.5,
|
||||
help="If two lines are are close together they are considered to be part of the same "
|
||||
"paragraph. The margin is specified relative to the height of a line.")
|
||||
la_params.add_argument("--boxes-flow", "-F", type=float, default=0.5,
|
||||
help="Specifies how much a horizontal and vertical position of a text matters when "
|
||||
"determining the order of lines. The value should be within the range of -1.0 (only "
|
||||
"horizontal position matters) to +1.0 (only vertical position matters).")
|
||||
la_params.add_argument("--all-texts", "-A", default=True, action="store_true",
|
||||
la_params.add_argument(
|
||||
"--char-margin", "-M", type=float, default=2.0,
|
||||
help="If two characters are closer together than this margin they "
|
||||
"are considered to be part of the same word. The margin is "
|
||||
"specified relative to the width of the character.")
|
||||
la_params.add_argument(
|
||||
"--word-margin", "-W", type=float, default=0.1,
|
||||
help="If two words are are closer together than this margin they "
|
||||
"are considered to be part of the same line. A space is added "
|
||||
"in between for readability. The margin is specified relative "
|
||||
"to the width of the word.")
|
||||
la_params.add_argument(
|
||||
"--line-margin", "-L", type=float, default=0.5,
|
||||
help="If two lines are are close together they are considered to "
|
||||
"be part of the same paragraph. The margin is specified "
|
||||
"relative to the height of a line.")
|
||||
la_params.add_argument(
|
||||
"--boxes-flow", "-F", type=float, default=0.5,
|
||||
help="Specifies how much a horizontal and vertical position of a "
|
||||
"text matters when determining the order of lines. The value "
|
||||
"should be within the range of -1.0 (only horizontal position "
|
||||
"matters) to +1.0 (only vertical position matters).")
|
||||
la_params.add_argument(
|
||||
"--all-texts", "-A", default=True, action="store_true",
|
||||
help="If layout analysis should be performed on text in figures.")
|
||||
|
||||
output_params = parser.add_argument_group('Output', description='Used during output generation.')
|
||||
output_params.add_argument("--outfile", "-o", type=str, default="-",
|
||||
help="Path to file where output is written. Or \"-\" (default) to write to stdout.")
|
||||
output_params.add_argument("--output_type", "-t", type=str, default="text",
|
||||
output_params = parser.add_argument_group(
|
||||
'Output', description='Used during output generation.')
|
||||
output_params.add_argument(
|
||||
"--outfile", "-o", type=str, default="-",
|
||||
help="Path to file where output is written. "
|
||||
"Or \"-\" (default) to write to stdout.")
|
||||
output_params.add_argument(
|
||||
"--output_type", "-t", type=str, default="text",
|
||||
help="Type of output to generate {text,html,xml,tag}.")
|
||||
output_params.add_argument("--codec", "-c", type=str, default="utf-8",
|
||||
output_params.add_argument(
|
||||
"--codec", "-c", type=str, default="utf-8",
|
||||
help="Text encoding to use in output file.")
|
||||
output_params.add_argument("--output-dir", "-O", default=None,
|
||||
help="The output directory to put extracted images in. If not given, images are not "
|
||||
"extracted.")
|
||||
output_params.add_argument("--layoutmode", "-Y", default="normal", type=str,
|
||||
help="Type of layout to use when generating html {normal,exact,loose}. If normal, "
|
||||
"each line is positioned separately in the html. If exact, each character is "
|
||||
"positioned separately in the html. If loose, same result as normal but with an "
|
||||
"additional newline after each text line. Only used when output_type is html.")
|
||||
output_params.add_argument("--scale", "-s", type=float, default=1.0,
|
||||
help="The amount of zoom to use when generating html file. Only used when output_type "
|
||||
"is html.")
|
||||
output_params.add_argument("--strip-control", "-S", default=False, action="store_true",
|
||||
help="Remove control statement from text. Only used when output_type is xml.")
|
||||
output_params.add_argument(
|
||||
"--output-dir", "-O", default=None,
|
||||
help="The output directory to put extracted images in. If not given, "
|
||||
"images are not extracted.")
|
||||
output_params.add_argument(
|
||||
"--layoutmode", "-Y", default="normal",
|
||||
type=str, help="Type of layout to use when generating html "
|
||||
"{normal,exact,loose}. If normal,each line is"
|
||||
" positioned separately in the html. If exact"
|
||||
", each character is positioned separately in"
|
||||
" the html. If loose, same result as normal "
|
||||
"but with an additional newline after each "
|
||||
"text line. Only used when output_type is html.")
|
||||
output_params.add_argument(
|
||||
"--scale", "-s", type=float, default=1.0,
|
||||
help="The amount of zoom to use when generating html file. "
|
||||
"Only used when output_type is html.")
|
||||
output_params.add_argument(
|
||||
"--strip-control", "-S", default=False, action="store_true",
|
||||
help="Remove control statement from text. "
|
||||
"Only used when output_type is xml.")
|
||||
return parser
|
||||
|
||||
|
||||
|
@ -150,7 +191,6 @@ def main(args=None):
|
|||
if A.outfile.endswith(override):
|
||||
A.output_type = alttype
|
||||
|
||||
## Test Code
|
||||
outfp = extract_text(**vars(A))
|
||||
outfp.close()
|
||||
return 0
|
||||
|
|
121
tools/pdfdiff.py
121
tools/pdfdiff.py
|
@ -3,13 +3,13 @@
|
|||
"""
|
||||
compares two pdf files.
|
||||
"""
|
||||
from pdfminer import high_level, layout
|
||||
import sys
|
||||
import logging
|
||||
import six
|
||||
import pdfminer.settings
|
||||
pdfminer.settings.STRICT = False
|
||||
import pdfminer.high_level
|
||||
import pdfminer.layout
|
||||
|
||||
|
||||
logging.basicConfig()
|
||||
|
||||
|
@ -22,34 +22,33 @@ def compare(file1, file2, **kwargs):
|
|||
'upgrade to Python 3. For more information see '
|
||||
'https://github.com/pdfminer/pdfminer .six/issues/194')
|
||||
|
||||
# If any LAParams group arguments were passed, create an LAParams object and
|
||||
# If any LAParams group arguments were passed,
|
||||
# create an LAParams object and
|
||||
# populate with given args. Otherwise, set it to None.
|
||||
if kwargs.get('laparams', None) is None:
|
||||
laparams = pdfminer.layout.LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||
laparams = layout.LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin",
|
||||
"char_margin", "line_margin", "boxes_flow"):
|
||||
paramv = kwargs.get(param, None)
|
||||
if paramv is not None:
|
||||
laparams[param]=paramv
|
||||
kwargs['laparams']=laparams
|
||||
|
||||
s1=six.StringIO()
|
||||
laparams[param] = paramv
|
||||
kwargs['laparams'] = laparams
|
||||
s1 = six.StringIO()
|
||||
with open(file1, "rb") as fp:
|
||||
pdfminer.high_level.extract_text_to_fp(fp, s1, **kwargs)
|
||||
|
||||
s2=six.StringIO()
|
||||
high_level.extract_text_to_fp(fp, s1, **kwargs)
|
||||
s2 = six.StringIO()
|
||||
with open(file2, "rb") as fp:
|
||||
pdfminer.high_level.extract_text_to_fp(fp, s2, **kwargs)
|
||||
|
||||
high_level.extract_text_to_fp(fp, s2, **kwargs)
|
||||
import difflib
|
||||
s1.seek(0)
|
||||
s2.seek(0)
|
||||
s1,s2=s1.readlines(), s2.readlines()
|
||||
s1, s2 = s1.readlines(), s2.readlines()
|
||||
|
||||
import os.path
|
||||
try:
|
||||
extension = os.path.splitext(kwargs['outfile'])[1][1:4]
|
||||
if extension.lower()=='htm':
|
||||
return difflib.HtmlDiff().make_file(s1,s2)
|
||||
if extension.lower() == 'htm':
|
||||
return difflib.HtmlDiff().make_file(s1, s2)
|
||||
except KeyError:
|
||||
pass
|
||||
return difflib.unified_diff(s1, s2, n=kwargs['context_lines'])
|
||||
|
@ -62,35 +61,60 @@ def main(args=None):
|
|||
P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
|
||||
P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
|
||||
P.add_argument("-o", "--outfile", type=str, default="-",
|
||||
help="Output file (default/'-' is stdout) \
|
||||
if .htm or .html, create an HTML table (or a complete HTML file containing the table) \
|
||||
showing a side by side, line by line comparison of text with inter-line \
|
||||
and intra-line change highlights. \
|
||||
The table can be generated in either full or contextual difference mode."
|
||||
)
|
||||
P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown")
|
||||
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
|
||||
help="Output file(default/'-' is stdout) if .htm or .html,"
|
||||
" create an HTML table (or a complete HTML file "
|
||||
"containing the table) showing a side by side, "
|
||||
"line by line comparison of text with inter-line and "
|
||||
"intra-line change highlights. The table can be "
|
||||
"generated in either full or "
|
||||
"contextual difference mode.")
|
||||
P.add_argument("-N", "--context-lines", default=3, type=int,
|
||||
help="context lines shown")
|
||||
P.add_argument("-d", "--debug", default=False, action="store_true",
|
||||
help="Debug output.")
|
||||
|
||||
# params for pdf2txt
|
||||
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
|
||||
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
|
||||
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
|
||||
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")
|
||||
P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)")
|
||||
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
|
||||
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
|
||||
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
|
||||
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
|
||||
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
|
||||
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
|
||||
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
|
||||
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
|
||||
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
|
||||
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
|
||||
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
|
||||
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
|
||||
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
|
||||
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
|
||||
P.add_argument("-p", "--pagenos", type=str,
|
||||
help="Comma-separated list of page numbers to parse. "
|
||||
"Included for legacy applications, "
|
||||
"use --page-numbers for more "
|
||||
"idiomatic argument entry.")
|
||||
P.add_argument("--page-numbers", type=int, default=None, nargs="+",
|
||||
help="Alternative to --pagenos with space-separated "
|
||||
"numbers; supercedes --pagenos where it is used.")
|
||||
P.add_argument("-m", "--maxpages", type=int, default=0,
|
||||
help="Maximum pages to parse")
|
||||
P.add_argument("-P", "--password", type=str, default="",
|
||||
help="Decryption password for both PDFs")
|
||||
P.add_argument("-t", "--output_type", type=str, default="text",
|
||||
help="pdf2txt type: text|html|xml|tag (default is text)")
|
||||
P.add_argument("-c", "--codec", type=str, default="utf-8",
|
||||
help="Text encoding")
|
||||
P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
|
||||
P.add_argument("-A", "--all-texts", default=None, action="store_true",
|
||||
help="LAParams all texts")
|
||||
P.add_argument("-V", "--detect-vertical", default=None,
|
||||
action="store_true", help="LAParams detect vertical")
|
||||
P.add_argument("-W", "--word-margin", type=float, default=None,
|
||||
help="LAParams word margin")
|
||||
P.add_argument("-M", "--char-margin", type=float, default=None,
|
||||
help="LAParams char margin")
|
||||
P.add_argument("-L", "--line-margin", type=float, default=None,
|
||||
help="LAParams line margin")
|
||||
P.add_argument("-F", "--boxes-flow", type=float, default=None,
|
||||
help="LAParams boxes flow")
|
||||
P.add_argument("-Y", "--layoutmode", default="normal", type=str,
|
||||
help="HTML Layout Mode")
|
||||
P.add_argument("-n", "--no-laparams", default=False,
|
||||
action="store_true", help="Pass None as LAParams")
|
||||
P.add_argument("-R", "--rotation", default=0, type=int,
|
||||
help="Rotation")
|
||||
P.add_argument("-O", "--output-dir", default=None,
|
||||
help="Output directory for images")
|
||||
P.add_argument("-C", "--disable-caching", default=False,
|
||||
action="store_true", help="Disable caching")
|
||||
P.add_argument("-S", "--strip-control", default=False,
|
||||
action="store_true", help="Strip control in XML mode")
|
||||
|
||||
A = P.parse_args(args=args)
|
||||
|
||||
|
@ -106,10 +130,10 @@ def main(args=None):
|
|||
A.password = A.password.decode(sys.stdin.encoding)
|
||||
|
||||
if A.output_type == "text" and A.outfile != "-":
|
||||
for override, alttype in ( (".htm", "html"),
|
||||
for override, alttype in ((".htm", "html"),
|
||||
(".html", "html"),
|
||||
(".xml", "xml" ),
|
||||
(".tag", "tag" ) ):
|
||||
(".xml", "xml"),
|
||||
(".tag", "tag")):
|
||||
if A.outfile.endswith(override):
|
||||
A.output_type = alttype
|
||||
|
||||
|
@ -122,4 +146,5 @@ def main(args=None):
|
|||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__': sys.exit(main())
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
|
|
@ -1,24 +1,27 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Exercise pdfminer, looking deeply into a PDF document, print some stats to stdout
|
||||
# Exercise pdfminer, looking deeply into a PDF document,
|
||||
# print some stats to stdout
|
||||
# Usage: pdfstats.py <PDF-filename>
|
||||
|
||||
import sys, os
|
||||
import sys
|
||||
import os
|
||||
import collections
|
||||
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import LAParams, LTContainer
|
||||
|
||||
|
||||
_, SCRIPT = os.path.split(__file__)
|
||||
|
||||
|
||||
def msg(*args, **kwargs):
|
||||
print(' '.join(map(str, args)), file=sys.stdout, **kwargs)
|
||||
print(' '.join(map(str, args)), **kwargs) # noqa E999
|
||||
|
||||
|
||||
def flat_iter(obj):
|
||||
yield obj
|
||||
|
@ -26,6 +29,7 @@ def flat_iter(obj):
|
|||
for ob in obj:
|
||||
yield from flat_iter(ob)
|
||||
|
||||
|
||||
def main(args):
|
||||
msg(SCRIPT, args)
|
||||
|
||||
|
@ -49,19 +53,16 @@ def main(args):
|
|||
document = PDFDocument(parser, password)
|
||||
# Check if the document allows text extraction.
|
||||
if not document.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed(filename)
|
||||
raise PDFTextExtractionNotAllowed(infilename)
|
||||
|
||||
# Make a page iterator
|
||||
pages = PDFPage.create_pages(document)
|
||||
|
||||
|
||||
# Set up for some analysis
|
||||
rsrcmgr = PDFResourceManager()
|
||||
laparams = LAParams(
|
||||
detect_vertical=True,
|
||||
all_texts=True,
|
||||
)
|
||||
#device = PDFDevice(rsrcmgr)
|
||||
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
|
||||
|
@ -76,5 +77,6 @@ def main(args):
|
|||
msg('page_count', page_count)
|
||||
msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv[1:]))
|
||||
|
|
|
@ -1,14 +1,16 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
def prof_main(argv):
|
||||
import hotshot, hotshot.stats
|
||||
import hotshot.stats
|
||||
|
||||
def usage():
|
||||
print ('usage: %s module.function [args ...]' % argv[0])
|
||||
print('usage: %s module.function [args ...]' % argv[0])
|
||||
return 100
|
||||
args = argv[1:]
|
||||
if len(args) < 1: return usage()
|
||||
if len(args) < 1:
|
||||
return usage()
|
||||
name = args.pop(0)
|
||||
prof = name+'.prof'
|
||||
i = name.rindex('.')
|
||||
|
@ -18,7 +20,7 @@ def prof_main(argv):
|
|||
if args:
|
||||
args.insert(0, argv[0])
|
||||
prof = hotshot.Profile(prof)
|
||||
prof.runcall(lambda : func(args))
|
||||
prof.runcall(lambda: func(args))
|
||||
prof.close()
|
||||
else:
|
||||
stats = hotshot.stats.load(prof)
|
||||
|
@ -27,4 +29,6 @@ def prof_main(argv):
|
|||
stats.print_stats(1000)
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(prof_main(sys.argv))
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(prof_main(sys.argv))
|
||||
|
|
7
tox.ini
7
tox.ini
|
@ -1,11 +1,18 @@
|
|||
[tox]
|
||||
envlist = py{27,34,35,36,37,38}
|
||||
|
||||
[flake8]
|
||||
per-file-ignores =
|
||||
pdfminer/rijndael.py:E202,E222
|
||||
|
||||
[testenv]
|
||||
extras =
|
||||
dev
|
||||
docs
|
||||
whitelist_externals =
|
||||
flake8
|
||||
commands =
|
||||
flake8 pdfminer/ tools/ tests/ --count --statistics
|
||||
nosetests --nologcapture
|
||||
python -m sphinx -b html docs/source docs/build/html
|
||||
python -m sphinx -b doctest docs/source docs/build/doctest
|
||||
|
|
Loading…
Reference in New Issue