Check blackness in github actions (#711)
* Check blackness in github actions * Blacken code * Update github action names * Add contributing guidelines on using black * Add to checklist for PR — pull/688/head^2
parent
830acff94c
commit
b9a8920cdf
|
@ -0,0 +1,5 @@
|
|||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore =
|
||||
# See https://github.com/PyCQA/pycodestyle/issues/373
|
||||
E203,
|
|
@ -1,22 +1,17 @@
|
|||
**Pull request**
|
||||
|
||||
Thanks for improving pdfminer.six! Please include the following information to
|
||||
help us discuss and merge this PR:
|
||||
|
||||
- A description of why this PR is needed. What does it fix? What does it
|
||||
improve?
|
||||
- A summary of the things that this PR changes.
|
||||
- Reference the issues that this PR fixes (use the fixes #(issue nr) syntax).
|
||||
If this PR does not fix any issue, create the issue first and mention that
|
||||
you are willing to work on it.
|
||||
Please remove this paragraph and replace it with a description of your PR.
|
||||
Also include links to the issues that it fixes.
|
||||
|
||||
**How Has This Been Tested?**
|
||||
|
||||
Please describe the tests that you ran to verify your changes. Provide
|
||||
instructions so we can reproduce. Include an example pdf if you have one.
|
||||
Please replace this paragraph with a description of how this PR has been
|
||||
tested. Include the necessary instructions and files so that others can
|
||||
reproduce it.
|
||||
|
||||
**Checklist**
|
||||
|
||||
- [ ] I have formatted my code with [black](https://github.com/psf/black).
|
||||
- [ ] I have added tests that prove my fix is effective or that my feature
|
||||
works
|
||||
- [ ] I have added docstrings to newly created methods and classes
|
||||
|
|
|
@ -15,6 +15,15 @@ env:
|
|||
|
||||
jobs:
|
||||
|
||||
check-code-formatting:
|
||||
name: Check code formatting
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v2
|
||||
- name: Check code formatting
|
||||
uses: psf/black@stable
|
||||
|
||||
check-coding-style:
|
||||
name: Check coding style
|
||||
runs-on: ubuntu-latest
|
||||
|
|
|
@ -31,7 +31,7 @@ Any contribution is appreciated! You might want to:
|
|||
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
|
||||
of features, this will show that your code works correctly.
|
||||
* Code should work for Python 3.6+.
|
||||
* Code should conform to PEP8 coding style.
|
||||
* Code should be formatted with [black](https://github.com/psf/black).
|
||||
* New features should be well documented using docstrings.
|
||||
* Check spelling and grammar.
|
||||
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased])
|
||||
|
@ -68,3 +68,9 @@ Any contribution is appreciated! You might want to:
|
|||
```sh
|
||||
nox -e py36
|
||||
```
|
||||
|
||||
4. After changing the code, run the black formatter.
|
||||
|
||||
```sh
|
||||
black .
|
||||
```
|
||||
|
|
|
@ -16,14 +16,13 @@ from typing import List
|
|||
|
||||
import pdfminer
|
||||
|
||||
sys.path.insert(0, os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)), '../../'))
|
||||
sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../"))
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'pdfminer.six'
|
||||
copyright = '2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
|
||||
author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
|
||||
project = "pdfminer.six"
|
||||
copyright = "2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
|
||||
author = "Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = pdfminer.__version__
|
||||
|
@ -35,16 +34,16 @@ release = pdfminer.__version__
|
|||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
'sphinxarg.ext',
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.doctest',
|
||||
"sphinxarg.ext",
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.doctest",
|
||||
]
|
||||
|
||||
# Root rst file
|
||||
master_doc = 'index'
|
||||
master_doc = "index"
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
|
@ -57,9 +56,9 @@ exclude_patterns: List[str] = []
|
|||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = 'alabaster'
|
||||
html_theme = "alabaster"
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['_static']
|
||||
html_static_path = ["_static"]
|
||||
|
|
37
noxfile.py
37
noxfile.py
|
@ -6,53 +6,30 @@ PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"]
|
|||
|
||||
@nox.session
|
||||
def lint(session):
|
||||
session.install('flake8')
|
||||
session.run(
|
||||
'flake8',
|
||||
'pdfminer/',
|
||||
'tools/',
|
||||
'tests/',
|
||||
'--count',
|
||||
'--statistics'
|
||||
)
|
||||
session.install("flake8")
|
||||
session.run("flake8", "pdfminer/", "tools/", "tests/", "--count", "--statistics")
|
||||
|
||||
|
||||
@nox.session
|
||||
def types(session):
|
||||
session.install('mypy')
|
||||
session.install("mypy")
|
||||
session.run(
|
||||
'mypy',
|
||||
'--install-types',
|
||||
'--non-interactive',
|
||||
'--show-error-codes',
|
||||
'.'
|
||||
"mypy", "--install-types", "--non-interactive", "--show-error-codes", "."
|
||||
)
|
||||
|
||||
|
||||
@nox.session(python=PYTHON_ALL_VERSIONS)
|
||||
def tests(session):
|
||||
session.install("-e", ".[dev]")
|
||||
session.run('pytest')
|
||||
session.run("pytest")
|
||||
|
||||
|
||||
@nox.session
|
||||
def docs(session):
|
||||
session.install("-e", ".[docs]")
|
||||
session.run(
|
||||
'python',
|
||||
'-m',
|
||||
'sphinx',
|
||||
'-b',
|
||||
'html',
|
||||
'docs/source',
|
||||
'docs/build/html'
|
||||
"python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"
|
||||
)
|
||||
session.run(
|
||||
'python',
|
||||
'-m',
|
||||
'sphinx',
|
||||
'-b',
|
||||
'doctest',
|
||||
'docs/source',
|
||||
'docs/build/doctest'
|
||||
"python", "-m", "sphinx", "-b", "doctest", "docs/source", "docs/build/doctest"
|
||||
)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
__version__ = '20211012'
|
||||
__version__ = "20211012"
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
print(__version__)
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
"""An implementation of RFC4013 SASLprep."""
|
||||
|
||||
__all__ = ['saslprep']
|
||||
__all__ = ["saslprep"]
|
||||
|
||||
import stringprep
|
||||
from typing import Callable, Tuple
|
||||
|
@ -37,7 +37,8 @@ _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
|
|||
stringprep.in_table_c6,
|
||||
stringprep.in_table_c7,
|
||||
stringprep.in_table_c8,
|
||||
stringprep.in_table_c9)
|
||||
stringprep.in_table_c9,
|
||||
)
|
||||
|
||||
|
||||
def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
|
||||
|
@ -63,12 +64,12 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
|
|||
in_table_c12 = stringprep.in_table_c12
|
||||
in_table_b1 = stringprep.in_table_b1
|
||||
data = "".join(
|
||||
["\u0020" if in_table_c12(elt) else elt
|
||||
for elt in data if not in_table_b1(elt)])
|
||||
["\u0020" if in_table_c12(elt) else elt for elt in data if not in_table_b1(elt)]
|
||||
)
|
||||
|
||||
# RFC3454 section 2, step 2 - Normalize
|
||||
# RFC4013 section 2.2 normalization
|
||||
data = unicodedata.ucd_3_2_0.normalize('NFKC', data)
|
||||
data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
|
||||
|
||||
in_table_d1 = stringprep.in_table_d1
|
||||
if in_table_d1(data[0]):
|
||||
|
@ -89,7 +90,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
|
|||
# RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
|
||||
for char in data:
|
||||
if any(in_table(char) for in_table in prohibited):
|
||||
raise ValueError(
|
||||
"SASLprep: failed prohibited character check")
|
||||
raise ValueError("SASLprep: failed prohibited character check")
|
||||
|
||||
return data
|
||||
|
|
|
@ -9,7 +9,6 @@ from typing import Sequence
|
|||
|
||||
|
||||
class Arcfour:
|
||||
|
||||
def __init__(self, key: Sequence[int]) -> None:
|
||||
# because Py3 range is not indexable
|
||||
s = [i for i in range(256)]
|
||||
|
@ -24,7 +23,7 @@ class Arcfour:
|
|||
def process(self, data: bytes) -> bytes:
|
||||
(i, j) = (self.i, self.j)
|
||||
s = self.s
|
||||
r = b''
|
||||
r = b""
|
||||
for c in iter(data):
|
||||
i = (i + 1) % 256
|
||||
j = (j + s[i]) % 256
|
||||
|
|
|
@ -21,30 +21,30 @@ def ascii85decode(data: bytes) -> bytes:
|
|||
|
||||
"""
|
||||
n = b = 0
|
||||
out = b''
|
||||
out = b""
|
||||
for i in iter(data):
|
||||
c = bytes((i,))
|
||||
if b'!' <= c and c <= b'u':
|
||||
if b"!" <= c and c <= b"u":
|
||||
n += 1
|
||||
b = b * 85 + (ord(c) - 33)
|
||||
if n == 5:
|
||||
out += struct.pack('>L', b)
|
||||
out += struct.pack(">L", b)
|
||||
n = b = 0
|
||||
elif c == b'z':
|
||||
elif c == b"z":
|
||||
assert n == 0, str(n)
|
||||
out += b'\0\0\0\0'
|
||||
elif c == b'~':
|
||||
out += b"\0\0\0\0"
|
||||
elif c == b"~":
|
||||
if n:
|
||||
for _ in range(5 - n):
|
||||
b = b * 85 + 84
|
||||
out += struct.pack('>L', b)[:n-1]
|
||||
out += struct.pack(">L", b)[: n - 1]
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
# asciihexdecode(data)
|
||||
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
|
||||
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||
hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
|
||||
trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
|
||||
|
||||
|
||||
def asciihexdecode(data: bytes) -> bytes:
|
||||
|
@ -57,15 +57,16 @@ def asciihexdecode(data: bytes) -> bytes:
|
|||
the EOD marker after reading an odd number of hexadecimal digits, it
|
||||
will behave as if a 0 followed the last digit.
|
||||
"""
|
||||
|
||||
def decode(x: bytes) -> bytes:
|
||||
i = int(x, 16)
|
||||
return bytes((i,))
|
||||
|
||||
out = b''
|
||||
out = b""
|
||||
for x in hex_re.findall(data):
|
||||
out += decode(x)
|
||||
|
||||
m = trail_re.search(data)
|
||||
if m:
|
||||
out += decode(m.group(1)+b'0')
|
||||
out += decode(m.group(1) + b"0")
|
||||
return out
|
||||
|
|
|
@ -12,8 +12,18 @@
|
|||
|
||||
|
||||
import array
|
||||
from typing import (Any, Callable, Dict, Iterator, List, MutableSequence,
|
||||
Optional, Sequence, Union, cast)
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
MutableSequence,
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
|
||||
def get_bytes(data: bytes) -> Iterator[int]:
|
||||
|
@ -46,7 +56,7 @@ class BitParser:
|
|||
if p[b] is None:
|
||||
p[b] = [None, None]
|
||||
p = p[b]
|
||||
if bits[i] == '1':
|
||||
if bits[i] == "1":
|
||||
b = 1
|
||||
else:
|
||||
b = 0
|
||||
|
@ -74,252 +84,252 @@ class BitParser:
|
|||
class CCITTG4Parser(BitParser):
|
||||
|
||||
MODE = [None, None]
|
||||
BitParser.add(MODE, 0, '1')
|
||||
BitParser.add(MODE, +1, '011')
|
||||
BitParser.add(MODE, -1, '010')
|
||||
BitParser.add(MODE, 'h', '001')
|
||||
BitParser.add(MODE, 'p', '0001')
|
||||
BitParser.add(MODE, +2, '000011')
|
||||
BitParser.add(MODE, -2, '000010')
|
||||
BitParser.add(MODE, +3, '0000011')
|
||||
BitParser.add(MODE, -3, '0000010')
|
||||
BitParser.add(MODE, 'u', '0000001111')
|
||||
BitParser.add(MODE, 'x1', '0000001000')
|
||||
BitParser.add(MODE, 'x2', '0000001001')
|
||||
BitParser.add(MODE, 'x3', '0000001010')
|
||||
BitParser.add(MODE, 'x4', '0000001011')
|
||||
BitParser.add(MODE, 'x5', '0000001100')
|
||||
BitParser.add(MODE, 'x6', '0000001101')
|
||||
BitParser.add(MODE, 'x7', '0000001110')
|
||||
BitParser.add(MODE, 'e', '000000000001000000000001')
|
||||
BitParser.add(MODE, 0, "1")
|
||||
BitParser.add(MODE, +1, "011")
|
||||
BitParser.add(MODE, -1, "010")
|
||||
BitParser.add(MODE, "h", "001")
|
||||
BitParser.add(MODE, "p", "0001")
|
||||
BitParser.add(MODE, +2, "000011")
|
||||
BitParser.add(MODE, -2, "000010")
|
||||
BitParser.add(MODE, +3, "0000011")
|
||||
BitParser.add(MODE, -3, "0000010")
|
||||
BitParser.add(MODE, "u", "0000001111")
|
||||
BitParser.add(MODE, "x1", "0000001000")
|
||||
BitParser.add(MODE, "x2", "0000001001")
|
||||
BitParser.add(MODE, "x3", "0000001010")
|
||||
BitParser.add(MODE, "x4", "0000001011")
|
||||
BitParser.add(MODE, "x5", "0000001100")
|
||||
BitParser.add(MODE, "x6", "0000001101")
|
||||
BitParser.add(MODE, "x7", "0000001110")
|
||||
BitParser.add(MODE, "e", "000000000001000000000001")
|
||||
|
||||
WHITE = [None, None]
|
||||
BitParser.add(WHITE, 0, '00110101')
|
||||
BitParser.add(WHITE, 1, '000111')
|
||||
BitParser.add(WHITE, 2, '0111')
|
||||
BitParser.add(WHITE, 3, '1000')
|
||||
BitParser.add(WHITE, 4, '1011')
|
||||
BitParser.add(WHITE, 5, '1100')
|
||||
BitParser.add(WHITE, 6, '1110')
|
||||
BitParser.add(WHITE, 7, '1111')
|
||||
BitParser.add(WHITE, 8, '10011')
|
||||
BitParser.add(WHITE, 9, '10100')
|
||||
BitParser.add(WHITE, 10, '00111')
|
||||
BitParser.add(WHITE, 11, '01000')
|
||||
BitParser.add(WHITE, 12, '001000')
|
||||
BitParser.add(WHITE, 13, '000011')
|
||||
BitParser.add(WHITE, 14, '110100')
|
||||
BitParser.add(WHITE, 15, '110101')
|
||||
BitParser.add(WHITE, 16, '101010')
|
||||
BitParser.add(WHITE, 17, '101011')
|
||||
BitParser.add(WHITE, 18, '0100111')
|
||||
BitParser.add(WHITE, 19, '0001100')
|
||||
BitParser.add(WHITE, 20, '0001000')
|
||||
BitParser.add(WHITE, 21, '0010111')
|
||||
BitParser.add(WHITE, 22, '0000011')
|
||||
BitParser.add(WHITE, 23, '0000100')
|
||||
BitParser.add(WHITE, 24, '0101000')
|
||||
BitParser.add(WHITE, 25, '0101011')
|
||||
BitParser.add(WHITE, 26, '0010011')
|
||||
BitParser.add(WHITE, 27, '0100100')
|
||||
BitParser.add(WHITE, 28, '0011000')
|
||||
BitParser.add(WHITE, 29, '00000010')
|
||||
BitParser.add(WHITE, 30, '00000011')
|
||||
BitParser.add(WHITE, 31, '00011010')
|
||||
BitParser.add(WHITE, 32, '00011011')
|
||||
BitParser.add(WHITE, 33, '00010010')
|
||||
BitParser.add(WHITE, 34, '00010011')
|
||||
BitParser.add(WHITE, 35, '00010100')
|
||||
BitParser.add(WHITE, 36, '00010101')
|
||||
BitParser.add(WHITE, 37, '00010110')
|
||||
BitParser.add(WHITE, 38, '00010111')
|
||||
BitParser.add(WHITE, 39, '00101000')
|
||||
BitParser.add(WHITE, 40, '00101001')
|
||||
BitParser.add(WHITE, 41, '00101010')
|
||||
BitParser.add(WHITE, 42, '00101011')
|
||||
BitParser.add(WHITE, 43, '00101100')
|
||||
BitParser.add(WHITE, 44, '00101101')
|
||||
BitParser.add(WHITE, 45, '00000100')
|
||||
BitParser.add(WHITE, 46, '00000101')
|
||||
BitParser.add(WHITE, 47, '00001010')
|
||||
BitParser.add(WHITE, 48, '00001011')
|
||||
BitParser.add(WHITE, 49, '01010010')
|
||||
BitParser.add(WHITE, 50, '01010011')
|
||||
BitParser.add(WHITE, 51, '01010100')
|
||||
BitParser.add(WHITE, 52, '01010101')
|
||||
BitParser.add(WHITE, 53, '00100100')
|
||||
BitParser.add(WHITE, 54, '00100101')
|
||||
BitParser.add(WHITE, 55, '01011000')
|
||||
BitParser.add(WHITE, 56, '01011001')
|
||||
BitParser.add(WHITE, 57, '01011010')
|
||||
BitParser.add(WHITE, 58, '01011011')
|
||||
BitParser.add(WHITE, 59, '01001010')
|
||||
BitParser.add(WHITE, 60, '01001011')
|
||||
BitParser.add(WHITE, 61, '00110010')
|
||||
BitParser.add(WHITE, 62, '00110011')
|
||||
BitParser.add(WHITE, 63, '00110100')
|
||||
BitParser.add(WHITE, 64, '11011')
|
||||
BitParser.add(WHITE, 128, '10010')
|
||||
BitParser.add(WHITE, 192, '010111')
|
||||
BitParser.add(WHITE, 256, '0110111')
|
||||
BitParser.add(WHITE, 320, '00110110')
|
||||
BitParser.add(WHITE, 384, '00110111')
|
||||
BitParser.add(WHITE, 448, '01100100')
|
||||
BitParser.add(WHITE, 512, '01100101')
|
||||
BitParser.add(WHITE, 576, '01101000')
|
||||
BitParser.add(WHITE, 640, '01100111')
|
||||
BitParser.add(WHITE, 704, '011001100')
|
||||
BitParser.add(WHITE, 768, '011001101')
|
||||
BitParser.add(WHITE, 832, '011010010')
|
||||
BitParser.add(WHITE, 896, '011010011')
|
||||
BitParser.add(WHITE, 960, '011010100')
|
||||
BitParser.add(WHITE, 1024, '011010101')
|
||||
BitParser.add(WHITE, 1088, '011010110')
|
||||
BitParser.add(WHITE, 1152, '011010111')
|
||||
BitParser.add(WHITE, 1216, '011011000')
|
||||
BitParser.add(WHITE, 1280, '011011001')
|
||||
BitParser.add(WHITE, 1344, '011011010')
|
||||
BitParser.add(WHITE, 1408, '011011011')
|
||||
BitParser.add(WHITE, 1472, '010011000')
|
||||
BitParser.add(WHITE, 1536, '010011001')
|
||||
BitParser.add(WHITE, 1600, '010011010')
|
||||
BitParser.add(WHITE, 1664, '011000')
|
||||
BitParser.add(WHITE, 1728, '010011011')
|
||||
BitParser.add(WHITE, 1792, '00000001000')
|
||||
BitParser.add(WHITE, 1856, '00000001100')
|
||||
BitParser.add(WHITE, 1920, '00000001101')
|
||||
BitParser.add(WHITE, 1984, '000000010010')
|
||||
BitParser.add(WHITE, 2048, '000000010011')
|
||||
BitParser.add(WHITE, 2112, '000000010100')
|
||||
BitParser.add(WHITE, 2176, '000000010101')
|
||||
BitParser.add(WHITE, 2240, '000000010110')
|
||||
BitParser.add(WHITE, 2304, '000000010111')
|
||||
BitParser.add(WHITE, 2368, '000000011100')
|
||||
BitParser.add(WHITE, 2432, '000000011101')
|
||||
BitParser.add(WHITE, 2496, '000000011110')
|
||||
BitParser.add(WHITE, 2560, '000000011111')
|
||||
BitParser.add(WHITE, 0, "00110101")
|
||||
BitParser.add(WHITE, 1, "000111")
|
||||
BitParser.add(WHITE, 2, "0111")
|
||||
BitParser.add(WHITE, 3, "1000")
|
||||
BitParser.add(WHITE, 4, "1011")
|
||||
BitParser.add(WHITE, 5, "1100")
|
||||
BitParser.add(WHITE, 6, "1110")
|
||||
BitParser.add(WHITE, 7, "1111")
|
||||
BitParser.add(WHITE, 8, "10011")
|
||||
BitParser.add(WHITE, 9, "10100")
|
||||
BitParser.add(WHITE, 10, "00111")
|
||||
BitParser.add(WHITE, 11, "01000")
|
||||
BitParser.add(WHITE, 12, "001000")
|
||||
BitParser.add(WHITE, 13, "000011")
|
||||
BitParser.add(WHITE, 14, "110100")
|
||||
BitParser.add(WHITE, 15, "110101")
|
||||
BitParser.add(WHITE, 16, "101010")
|
||||
BitParser.add(WHITE, 17, "101011")
|
||||
BitParser.add(WHITE, 18, "0100111")
|
||||
BitParser.add(WHITE, 19, "0001100")
|
||||
BitParser.add(WHITE, 20, "0001000")
|
||||
BitParser.add(WHITE, 21, "0010111")
|
||||
BitParser.add(WHITE, 22, "0000011")
|
||||
BitParser.add(WHITE, 23, "0000100")
|
||||
BitParser.add(WHITE, 24, "0101000")
|
||||
BitParser.add(WHITE, 25, "0101011")
|
||||
BitParser.add(WHITE, 26, "0010011")
|
||||
BitParser.add(WHITE, 27, "0100100")
|
||||
BitParser.add(WHITE, 28, "0011000")
|
||||
BitParser.add(WHITE, 29, "00000010")
|
||||
BitParser.add(WHITE, 30, "00000011")
|
||||
BitParser.add(WHITE, 31, "00011010")
|
||||
BitParser.add(WHITE, 32, "00011011")
|
||||
BitParser.add(WHITE, 33, "00010010")
|
||||
BitParser.add(WHITE, 34, "00010011")
|
||||
BitParser.add(WHITE, 35, "00010100")
|
||||
BitParser.add(WHITE, 36, "00010101")
|
||||
BitParser.add(WHITE, 37, "00010110")
|
||||
BitParser.add(WHITE, 38, "00010111")
|
||||
BitParser.add(WHITE, 39, "00101000")
|
||||
BitParser.add(WHITE, 40, "00101001")
|
||||
BitParser.add(WHITE, 41, "00101010")
|
||||
BitParser.add(WHITE, 42, "00101011")
|
||||
BitParser.add(WHITE, 43, "00101100")
|
||||
BitParser.add(WHITE, 44, "00101101")
|
||||
BitParser.add(WHITE, 45, "00000100")
|
||||
BitParser.add(WHITE, 46, "00000101")
|
||||
BitParser.add(WHITE, 47, "00001010")
|
||||
BitParser.add(WHITE, 48, "00001011")
|
||||
BitParser.add(WHITE, 49, "01010010")
|
||||
BitParser.add(WHITE, 50, "01010011")
|
||||
BitParser.add(WHITE, 51, "01010100")
|
||||
BitParser.add(WHITE, 52, "01010101")
|
||||
BitParser.add(WHITE, 53, "00100100")
|
||||
BitParser.add(WHITE, 54, "00100101")
|
||||
BitParser.add(WHITE, 55, "01011000")
|
||||
BitParser.add(WHITE, 56, "01011001")
|
||||
BitParser.add(WHITE, 57, "01011010")
|
||||
BitParser.add(WHITE, 58, "01011011")
|
||||
BitParser.add(WHITE, 59, "01001010")
|
||||
BitParser.add(WHITE, 60, "01001011")
|
||||
BitParser.add(WHITE, 61, "00110010")
|
||||
BitParser.add(WHITE, 62, "00110011")
|
||||
BitParser.add(WHITE, 63, "00110100")
|
||||
BitParser.add(WHITE, 64, "11011")
|
||||
BitParser.add(WHITE, 128, "10010")
|
||||
BitParser.add(WHITE, 192, "010111")
|
||||
BitParser.add(WHITE, 256, "0110111")
|
||||
BitParser.add(WHITE, 320, "00110110")
|
||||
BitParser.add(WHITE, 384, "00110111")
|
||||
BitParser.add(WHITE, 448, "01100100")
|
||||
BitParser.add(WHITE, 512, "01100101")
|
||||
BitParser.add(WHITE, 576, "01101000")
|
||||
BitParser.add(WHITE, 640, "01100111")
|
||||
BitParser.add(WHITE, 704, "011001100")
|
||||
BitParser.add(WHITE, 768, "011001101")
|
||||
BitParser.add(WHITE, 832, "011010010")
|
||||
BitParser.add(WHITE, 896, "011010011")
|
||||
BitParser.add(WHITE, 960, "011010100")
|
||||
BitParser.add(WHITE, 1024, "011010101")
|
||||
BitParser.add(WHITE, 1088, "011010110")
|
||||
BitParser.add(WHITE, 1152, "011010111")
|
||||
BitParser.add(WHITE, 1216, "011011000")
|
||||
BitParser.add(WHITE, 1280, "011011001")
|
||||
BitParser.add(WHITE, 1344, "011011010")
|
||||
BitParser.add(WHITE, 1408, "011011011")
|
||||
BitParser.add(WHITE, 1472, "010011000")
|
||||
BitParser.add(WHITE, 1536, "010011001")
|
||||
BitParser.add(WHITE, 1600, "010011010")
|
||||
BitParser.add(WHITE, 1664, "011000")
|
||||
BitParser.add(WHITE, 1728, "010011011")
|
||||
BitParser.add(WHITE, 1792, "00000001000")
|
||||
BitParser.add(WHITE, 1856, "00000001100")
|
||||
BitParser.add(WHITE, 1920, "00000001101")
|
||||
BitParser.add(WHITE, 1984, "000000010010")
|
||||
BitParser.add(WHITE, 2048, "000000010011")
|
||||
BitParser.add(WHITE, 2112, "000000010100")
|
||||
BitParser.add(WHITE, 2176, "000000010101")
|
||||
BitParser.add(WHITE, 2240, "000000010110")
|
||||
BitParser.add(WHITE, 2304, "000000010111")
|
||||
BitParser.add(WHITE, 2368, "000000011100")
|
||||
BitParser.add(WHITE, 2432, "000000011101")
|
||||
BitParser.add(WHITE, 2496, "000000011110")
|
||||
BitParser.add(WHITE, 2560, "000000011111")
|
||||
|
||||
BLACK = [None, None]
|
||||
BitParser.add(BLACK, 0, '0000110111')
|
||||
BitParser.add(BLACK, 1, '010')
|
||||
BitParser.add(BLACK, 2, '11')
|
||||
BitParser.add(BLACK, 3, '10')
|
||||
BitParser.add(BLACK, 4, '011')
|
||||
BitParser.add(BLACK, 5, '0011')
|
||||
BitParser.add(BLACK, 6, '0010')
|
||||
BitParser.add(BLACK, 7, '00011')
|
||||
BitParser.add(BLACK, 8, '000101')
|
||||
BitParser.add(BLACK, 9, '000100')
|
||||
BitParser.add(BLACK, 10, '0000100')
|
||||
BitParser.add(BLACK, 11, '0000101')
|
||||
BitParser.add(BLACK, 12, '0000111')
|
||||
BitParser.add(BLACK, 13, '00000100')
|
||||
BitParser.add(BLACK, 14, '00000111')
|
||||
BitParser.add(BLACK, 15, '000011000')
|
||||
BitParser.add(BLACK, 16, '0000010111')
|
||||
BitParser.add(BLACK, 17, '0000011000')
|
||||
BitParser.add(BLACK, 18, '0000001000')
|
||||
BitParser.add(BLACK, 19, '00001100111')
|
||||
BitParser.add(BLACK, 20, '00001101000')
|
||||
BitParser.add(BLACK, 21, '00001101100')
|
||||
BitParser.add(BLACK, 22, '00000110111')
|
||||
BitParser.add(BLACK, 23, '00000101000')
|
||||
BitParser.add(BLACK, 24, '00000010111')
|
||||
BitParser.add(BLACK, 25, '00000011000')
|
||||
BitParser.add(BLACK, 26, '000011001010')
|
||||
BitParser.add(BLACK, 27, '000011001011')
|
||||
BitParser.add(BLACK, 28, '000011001100')
|
||||
BitParser.add(BLACK, 29, '000011001101')
|
||||
BitParser.add(BLACK, 30, '000001101000')
|
||||
BitParser.add(BLACK, 31, '000001101001')
|
||||
BitParser.add(BLACK, 32, '000001101010')
|
||||
BitParser.add(BLACK, 33, '000001101011')
|
||||
BitParser.add(BLACK, 34, '000011010010')
|
||||
BitParser.add(BLACK, 35, '000011010011')
|
||||
BitParser.add(BLACK, 36, '000011010100')
|
||||
BitParser.add(BLACK, 37, '000011010101')
|
||||
BitParser.add(BLACK, 38, '000011010110')
|
||||
BitParser.add(BLACK, 39, '000011010111')
|
||||
BitParser.add(BLACK, 40, '000001101100')
|
||||
BitParser.add(BLACK, 41, '000001101101')
|
||||
BitParser.add(BLACK, 42, '000011011010')
|
||||
BitParser.add(BLACK, 43, '000011011011')
|
||||
BitParser.add(BLACK, 44, '000001010100')
|
||||
BitParser.add(BLACK, 45, '000001010101')
|
||||
BitParser.add(BLACK, 46, '000001010110')
|
||||
BitParser.add(BLACK, 47, '000001010111')
|
||||
BitParser.add(BLACK, 48, '000001100100')
|
||||
BitParser.add(BLACK, 49, '000001100101')
|
||||
BitParser.add(BLACK, 50, '000001010010')
|
||||
BitParser.add(BLACK, 51, '000001010011')
|
||||
BitParser.add(BLACK, 52, '000000100100')
|
||||
BitParser.add(BLACK, 53, '000000110111')
|
||||
BitParser.add(BLACK, 54, '000000111000')
|
||||
BitParser.add(BLACK, 55, '000000100111')
|
||||
BitParser.add(BLACK, 56, '000000101000')
|
||||
BitParser.add(BLACK, 57, '000001011000')
|
||||
BitParser.add(BLACK, 58, '000001011001')
|
||||
BitParser.add(BLACK, 59, '000000101011')
|
||||
BitParser.add(BLACK, 60, '000000101100')
|
||||
BitParser.add(BLACK, 61, '000001011010')
|
||||
BitParser.add(BLACK, 62, '000001100110')
|
||||
BitParser.add(BLACK, 63, '000001100111')
|
||||
BitParser.add(BLACK, 64, '0000001111')
|
||||
BitParser.add(BLACK, 128, '000011001000')
|
||||
BitParser.add(BLACK, 192, '000011001001')
|
||||
BitParser.add(BLACK, 256, '000001011011')
|
||||
BitParser.add(BLACK, 320, '000000110011')
|
||||
BitParser.add(BLACK, 384, '000000110100')
|
||||
BitParser.add(BLACK, 448, '000000110101')
|
||||
BitParser.add(BLACK, 512, '0000001101100')
|
||||
BitParser.add(BLACK, 576, '0000001101101')
|
||||
BitParser.add(BLACK, 640, '0000001001010')
|
||||
BitParser.add(BLACK, 704, '0000001001011')
|
||||
BitParser.add(BLACK, 768, '0000001001100')
|
||||
BitParser.add(BLACK, 832, '0000001001101')
|
||||
BitParser.add(BLACK, 896, '0000001110010')
|
||||
BitParser.add(BLACK, 960, '0000001110011')
|
||||
BitParser.add(BLACK, 1024, '0000001110100')
|
||||
BitParser.add(BLACK, 1088, '0000001110101')
|
||||
BitParser.add(BLACK, 1152, '0000001110110')
|
||||
BitParser.add(BLACK, 1216, '0000001110111')
|
||||
BitParser.add(BLACK, 1280, '0000001010010')
|
||||
BitParser.add(BLACK, 1344, '0000001010011')
|
||||
BitParser.add(BLACK, 1408, '0000001010100')
|
||||
BitParser.add(BLACK, 1472, '0000001010101')
|
||||
BitParser.add(BLACK, 1536, '0000001011010')
|
||||
BitParser.add(BLACK, 1600, '0000001011011')
|
||||
BitParser.add(BLACK, 1664, '0000001100100')
|
||||
BitParser.add(BLACK, 1728, '0000001100101')
|
||||
BitParser.add(BLACK, 1792, '00000001000')
|
||||
BitParser.add(BLACK, 1856, '00000001100')
|
||||
BitParser.add(BLACK, 1920, '00000001101')
|
||||
BitParser.add(BLACK, 1984, '000000010010')
|
||||
BitParser.add(BLACK, 2048, '000000010011')
|
||||
BitParser.add(BLACK, 2112, '000000010100')
|
||||
BitParser.add(BLACK, 2176, '000000010101')
|
||||
BitParser.add(BLACK, 2240, '000000010110')
|
||||
BitParser.add(BLACK, 2304, '000000010111')
|
||||
BitParser.add(BLACK, 2368, '000000011100')
|
||||
BitParser.add(BLACK, 2432, '000000011101')
|
||||
BitParser.add(BLACK, 2496, '000000011110')
|
||||
BitParser.add(BLACK, 2560, '000000011111')
|
||||
BitParser.add(BLACK, 0, "0000110111")
|
||||
BitParser.add(BLACK, 1, "010")
|
||||
BitParser.add(BLACK, 2, "11")
|
||||
BitParser.add(BLACK, 3, "10")
|
||||
BitParser.add(BLACK, 4, "011")
|
||||
BitParser.add(BLACK, 5, "0011")
|
||||
BitParser.add(BLACK, 6, "0010")
|
||||
BitParser.add(BLACK, 7, "00011")
|
||||
BitParser.add(BLACK, 8, "000101")
|
||||
BitParser.add(BLACK, 9, "000100")
|
||||
BitParser.add(BLACK, 10, "0000100")
|
||||
BitParser.add(BLACK, 11, "0000101")
|
||||
BitParser.add(BLACK, 12, "0000111")
|
||||
BitParser.add(BLACK, 13, "00000100")
|
||||
BitParser.add(BLACK, 14, "00000111")
|
||||
BitParser.add(BLACK, 15, "000011000")
|
||||
BitParser.add(BLACK, 16, "0000010111")
|
||||
BitParser.add(BLACK, 17, "0000011000")
|
||||
BitParser.add(BLACK, 18, "0000001000")
|
||||
BitParser.add(BLACK, 19, "00001100111")
|
||||
BitParser.add(BLACK, 20, "00001101000")
|
||||
BitParser.add(BLACK, 21, "00001101100")
|
||||
BitParser.add(BLACK, 22, "00000110111")
|
||||
BitParser.add(BLACK, 23, "00000101000")
|
||||
BitParser.add(BLACK, 24, "00000010111")
|
||||
BitParser.add(BLACK, 25, "00000011000")
|
||||
BitParser.add(BLACK, 26, "000011001010")
|
||||
BitParser.add(BLACK, 27, "000011001011")
|
||||
BitParser.add(BLACK, 28, "000011001100")
|
||||
BitParser.add(BLACK, 29, "000011001101")
|
||||
BitParser.add(BLACK, 30, "000001101000")
|
||||
BitParser.add(BLACK, 31, "000001101001")
|
||||
BitParser.add(BLACK, 32, "000001101010")
|
||||
BitParser.add(BLACK, 33, "000001101011")
|
||||
BitParser.add(BLACK, 34, "000011010010")
|
||||
BitParser.add(BLACK, 35, "000011010011")
|
||||
BitParser.add(BLACK, 36, "000011010100")
|
||||
BitParser.add(BLACK, 37, "000011010101")
|
||||
BitParser.add(BLACK, 38, "000011010110")
|
||||
BitParser.add(BLACK, 39, "000011010111")
|
||||
BitParser.add(BLACK, 40, "000001101100")
|
||||
BitParser.add(BLACK, 41, "000001101101")
|
||||
BitParser.add(BLACK, 42, "000011011010")
|
||||
BitParser.add(BLACK, 43, "000011011011")
|
||||
BitParser.add(BLACK, 44, "000001010100")
|
||||
BitParser.add(BLACK, 45, "000001010101")
|
||||
BitParser.add(BLACK, 46, "000001010110")
|
||||
BitParser.add(BLACK, 47, "000001010111")
|
||||
BitParser.add(BLACK, 48, "000001100100")
|
||||
BitParser.add(BLACK, 49, "000001100101")
|
||||
BitParser.add(BLACK, 50, "000001010010")
|
||||
BitParser.add(BLACK, 51, "000001010011")
|
||||
BitParser.add(BLACK, 52, "000000100100")
|
||||
BitParser.add(BLACK, 53, "000000110111")
|
||||
BitParser.add(BLACK, 54, "000000111000")
|
||||
BitParser.add(BLACK, 55, "000000100111")
|
||||
BitParser.add(BLACK, 56, "000000101000")
|
||||
BitParser.add(BLACK, 57, "000001011000")
|
||||
BitParser.add(BLACK, 58, "000001011001")
|
||||
BitParser.add(BLACK, 59, "000000101011")
|
||||
BitParser.add(BLACK, 60, "000000101100")
|
||||
BitParser.add(BLACK, 61, "000001011010")
|
||||
BitParser.add(BLACK, 62, "000001100110")
|
||||
BitParser.add(BLACK, 63, "000001100111")
|
||||
BitParser.add(BLACK, 64, "0000001111")
|
||||
BitParser.add(BLACK, 128, "000011001000")
|
||||
BitParser.add(BLACK, 192, "000011001001")
|
||||
BitParser.add(BLACK, 256, "000001011011")
|
||||
BitParser.add(BLACK, 320, "000000110011")
|
||||
BitParser.add(BLACK, 384, "000000110100")
|
||||
BitParser.add(BLACK, 448, "000000110101")
|
||||
BitParser.add(BLACK, 512, "0000001101100")
|
||||
BitParser.add(BLACK, 576, "0000001101101")
|
||||
BitParser.add(BLACK, 640, "0000001001010")
|
||||
BitParser.add(BLACK, 704, "0000001001011")
|
||||
BitParser.add(BLACK, 768, "0000001001100")
|
||||
BitParser.add(BLACK, 832, "0000001001101")
|
||||
BitParser.add(BLACK, 896, "0000001110010")
|
||||
BitParser.add(BLACK, 960, "0000001110011")
|
||||
BitParser.add(BLACK, 1024, "0000001110100")
|
||||
BitParser.add(BLACK, 1088, "0000001110101")
|
||||
BitParser.add(BLACK, 1152, "0000001110110")
|
||||
BitParser.add(BLACK, 1216, "0000001110111")
|
||||
BitParser.add(BLACK, 1280, "0000001010010")
|
||||
BitParser.add(BLACK, 1344, "0000001010011")
|
||||
BitParser.add(BLACK, 1408, "0000001010100")
|
||||
BitParser.add(BLACK, 1472, "0000001010101")
|
||||
BitParser.add(BLACK, 1536, "0000001011010")
|
||||
BitParser.add(BLACK, 1600, "0000001011011")
|
||||
BitParser.add(BLACK, 1664, "0000001100100")
|
||||
BitParser.add(BLACK, 1728, "0000001100101")
|
||||
BitParser.add(BLACK, 1792, "00000001000")
|
||||
BitParser.add(BLACK, 1856, "00000001100")
|
||||
BitParser.add(BLACK, 1920, "00000001101")
|
||||
BitParser.add(BLACK, 1984, "000000010010")
|
||||
BitParser.add(BLACK, 2048, "000000010011")
|
||||
BitParser.add(BLACK, 2112, "000000010100")
|
||||
BitParser.add(BLACK, 2176, "000000010101")
|
||||
BitParser.add(BLACK, 2240, "000000010110")
|
||||
BitParser.add(BLACK, 2304, "000000010111")
|
||||
BitParser.add(BLACK, 2368, "000000011100")
|
||||
BitParser.add(BLACK, 2432, "000000011101")
|
||||
BitParser.add(BLACK, 2496, "000000011110")
|
||||
BitParser.add(BLACK, 2560, "000000011111")
|
||||
|
||||
UNCOMPRESSED = [None, None]
|
||||
BitParser.add(UNCOMPRESSED, '1', '1')
|
||||
BitParser.add(UNCOMPRESSED, '01', '01')
|
||||
BitParser.add(UNCOMPRESSED, '001', '001')
|
||||
BitParser.add(UNCOMPRESSED, '0001', '0001')
|
||||
BitParser.add(UNCOMPRESSED, '00001', '00001')
|
||||
BitParser.add(UNCOMPRESSED, '00000', '000001')
|
||||
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
|
||||
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
|
||||
BitParser.add(UNCOMPRESSED, "1", "1")
|
||||
BitParser.add(UNCOMPRESSED, "01", "01")
|
||||
BitParser.add(UNCOMPRESSED, "001", "001")
|
||||
BitParser.add(UNCOMPRESSED, "0001", "0001")
|
||||
BitParser.add(UNCOMPRESSED, "00001", "00001")
|
||||
BitParser.add(UNCOMPRESSED, "00000", "000001")
|
||||
BitParser.add(UNCOMPRESSED, "T00", "00000011")
|
||||
BitParser.add(UNCOMPRESSED, "T10", "00000010")
|
||||
BitParser.add(UNCOMPRESSED, "T000", "000000011")
|
||||
BitParser.add(UNCOMPRESSED, "T100", "000000010")
|
||||
BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
|
||||
BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
|
||||
BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
|
||||
BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
|
||||
|
||||
class EOFB(Exception):
|
||||
pass
|
||||
|
@ -352,21 +362,21 @@ class CCITTG4Parser(BitParser):
|
|||
return
|
||||
|
||||
def _parse_mode(self, mode: object) -> BitParserState:
|
||||
if mode == 'p':
|
||||
if mode == "p":
|
||||
self._do_pass()
|
||||
self._flush_line()
|
||||
return self.MODE
|
||||
elif mode == 'h':
|
||||
elif mode == "h":
|
||||
self._n1 = 0
|
||||
self._accept = self._parse_horiz1
|
||||
if self._color:
|
||||
return self.WHITE
|
||||
else:
|
||||
return self.BLACK
|
||||
elif mode == 'u':
|
||||
elif mode == "u":
|
||||
self._accept = self._parse_uncompressed
|
||||
return self.UNCOMPRESSED
|
||||
elif mode == 'e':
|
||||
elif mode == "e":
|
||||
raise self.EOFB
|
||||
elif isinstance(mode, int):
|
||||
self._do_vertical(mode)
|
||||
|
@ -406,7 +416,7 @@ class CCITTG4Parser(BitParser):
|
|||
def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
|
||||
if not bits:
|
||||
raise self.InvalidData
|
||||
if bits.startswith('T'):
|
||||
if bits.startswith("T"):
|
||||
self._accept = self._parse_mode
|
||||
self._color = int(bits[1])
|
||||
self._do_uncompressed(bits[2:])
|
||||
|
@ -416,33 +426,37 @@ class CCITTG4Parser(BitParser):
|
|||
return self.UNCOMPRESSED
|
||||
|
||||
def _get_bits(self) -> str:
|
||||
return ''.join(str(b) for b in self._curline[:self._curpos])
|
||||
return "".join(str(b) for b in self._curline[: self._curpos])
|
||||
|
||||
def _get_refline(self, i: int) -> str:
|
||||
if i < 0:
|
||||
return '[]'+''.join(str(b) for b in self._refline)
|
||||
return "[]" + "".join(str(b) for b in self._refline)
|
||||
elif len(self._refline) <= i:
|
||||
return ''.join(str(b) for b in self._refline)+'[]'
|
||||
return "".join(str(b) for b in self._refline) + "[]"
|
||||
else:
|
||||
return (''.join(str(b) for b in self._refline[:i]) +
|
||||
'['+str(self._refline[i])+']' +
|
||||
''.join(str(b) for b in self._refline[i+1:]))
|
||||
return (
|
||||
"".join(str(b) for b in self._refline[:i])
|
||||
+ "["
|
||||
+ str(self._refline[i])
|
||||
+ "]"
|
||||
+ "".join(str(b) for b in self._refline[i + 1 :])
|
||||
)
|
||||
|
||||
def reset(self) -> None:
|
||||
self._y = 0
|
||||
self._curline = array.array('b', [1]*self.width)
|
||||
self._curline = array.array("b", [1] * self.width)
|
||||
self._reset_line()
|
||||
self._accept = self._parse_mode
|
||||
self._state = self.MODE
|
||||
return
|
||||
|
||||
def output_line(self, y: int, bits: Sequence[int]) -> None:
|
||||
print(y, ''.join(str(b) for b in bits))
|
||||
print(y, "".join(str(b) for b in bits))
|
||||
return
|
||||
|
||||
def _reset_line(self) -> None:
|
||||
self._refline = self._curline
|
||||
self._curline = array.array('b', [1]*self.width)
|
||||
self._curline = array.array("b", [1] * self.width)
|
||||
self._curpos = -1
|
||||
self._color = 1
|
||||
return
|
||||
|
@ -460,12 +474,14 @@ class CCITTG4Parser(BitParser):
|
|||
x1 = self._curpos + 1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 1 and self._refline[x1] != self._color):
|
||||
if self._color == 1 and self._refline[x1] != self._color:
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] == self._color and
|
||||
self._refline[x1] != self._color):
|
||||
elif (
|
||||
self._refline[x1 - 1] == self._color
|
||||
and self._refline[x1] != self._color
|
||||
):
|
||||
break
|
||||
x1 += 1
|
||||
x1 += dx
|
||||
|
@ -485,22 +501,26 @@ class CCITTG4Parser(BitParser):
|
|||
x1 = self._curpos + 1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 1 and self._refline[x1] != self._color):
|
||||
if self._color == 1 and self._refline[x1] != self._color:
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] == self._color and
|
||||
self._refline[x1] != self._color):
|
||||
elif (
|
||||
self._refline[x1 - 1] == self._color
|
||||
and self._refline[x1] != self._color
|
||||
):
|
||||
break
|
||||
x1 += 1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 0 and self._refline[x1] == self._color):
|
||||
if self._color == 0 and self._refline[x1] == self._color:
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] != self._color and
|
||||
self._refline[x1] == self._color):
|
||||
elif (
|
||||
self._refline[x1 - 1] != self._color
|
||||
and self._refline[x1] == self._color
|
||||
):
|
||||
break
|
||||
x1 += 1
|
||||
for x in range(self._curpos, x1):
|
||||
|
@ -534,19 +554,19 @@ class CCITTG4Parser(BitParser):
|
|||
|
||||
|
||||
class CCITTFaxDecoder(CCITTG4Parser):
|
||||
|
||||
def __init__(self, width: int, bytealign: bool = False,
|
||||
reversed: bool = False) -> None:
|
||||
def __init__(
|
||||
self, width: int, bytealign: bool = False, reversed: bool = False
|
||||
) -> None:
|
||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||
self.reversed = reversed
|
||||
self._buf = b''
|
||||
self._buf = b""
|
||||
return
|
||||
|
||||
def close(self) -> bytes:
|
||||
return self._buf
|
||||
|
||||
def output_line(self, y: int, bits: Sequence[int]) -> None:
|
||||
arr = array.array('B', [0]*((len(bits)+7)//8))
|
||||
arr = array.array("B", [0] * ((len(bits) + 7) // 8))
|
||||
if self.reversed:
|
||||
bits = [1 - b for b in bits]
|
||||
for (i, b) in enumerate(bits):
|
||||
|
@ -557,11 +577,11 @@ class CCITTFaxDecoder(CCITTG4Parser):
|
|||
|
||||
|
||||
def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
|
||||
K = params.get('K')
|
||||
K = params.get("K")
|
||||
if K == -1:
|
||||
cols = cast(int, params.get('Columns'))
|
||||
bytealign = cast(bool, params.get('EncodedByteAlign'))
|
||||
reversed = cast(bool, params.get('BlackIs1'))
|
||||
cols = cast(int, params.get("Columns"))
|
||||
bytealign = cast(bool, params.get("EncodedByteAlign"))
|
||||
reversed = cast(bool, params.get("BlackIs1"))
|
||||
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
|
||||
else:
|
||||
raise ValueError(K)
|
||||
|
@ -573,12 +593,14 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
|
|||
def main(argv: List[str]) -> None:
|
||||
if not argv[1:]:
|
||||
import unittest
|
||||
|
||||
unittest.main()
|
||||
return
|
||||
|
||||
class Parser(CCITTG4Parser):
|
||||
def __init__(self, width: int, bytealign: bool = False) -> None:
|
||||
import pygame # type: ignore[import]
|
||||
|
||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||
self.img = pygame.Surface((self.width, 1000))
|
||||
return
|
||||
|
@ -593,11 +615,13 @@ def main(argv: List[str]) -> None:
|
|||
|
||||
def close(self) -> None:
|
||||
import pygame
|
||||
pygame.image.save(self.img, 'out.bmp')
|
||||
|
||||
pygame.image.save(self.img, "out.bmp")
|
||||
return
|
||||
|
||||
for path in argv[1:]:
|
||||
fp = open(path, 'rb')
|
||||
(_, _, k, w, h, _) = path.split('.')
|
||||
fp = open(path, "rb")
|
||||
(_, _, k, w, h, _) = path.split(".")
|
||||
parser = Parser(int(w))
|
||||
parser.feedbytes(fp.read())
|
||||
parser.close()
|
||||
|
|
|
@ -16,8 +16,20 @@ import os.path
|
|||
import pickle as pickle
|
||||
import struct
|
||||
import sys
|
||||
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List,
|
||||
MutableMapping, Optional, TextIO, Tuple, Union, cast)
|
||||
from typing import (
|
||||
Any,
|
||||
BinaryIO,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
MutableMapping,
|
||||
Optional,
|
||||
TextIO,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
from .encodingdb import name2unicode
|
||||
from .psparser import KWD
|
||||
|
@ -45,7 +57,7 @@ class CMapBase:
|
|||
self.attrs: MutableMapping[str, object] = kwargs.copy()
|
||||
|
||||
def is_vertical(self) -> bool:
|
||||
return self.attrs.get('WMode', 0) != 0
|
||||
return self.attrs.get("WMode", 0) != 0
|
||||
|
||||
def set_attr(self, k: str, v: object) -> None:
|
||||
self.attrs[k] = v
|
||||
|
@ -53,8 +65,7 @@ class CMapBase:
|
|||
def add_code2cid(self, code: str, cid: int) -> None:
|
||||
pass
|
||||
|
||||
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
|
||||
) -> None:
|
||||
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
|
||||
pass
|
||||
|
||||
def use_cmap(self, cmap: "CMapBase") -> None:
|
||||
|
@ -65,13 +76,12 @@ class CMapBase:
|
|||
|
||||
|
||||
class CMap(CMapBase):
|
||||
|
||||
def __init__(self, **kwargs: Union[str, int]) -> None:
|
||||
CMapBase.__init__(self, **kwargs)
|
||||
self.code2cid: Dict[int, object] = {}
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<CMap: %s>' % self.attrs.get('CMapName')
|
||||
return "<CMap: %s>" % self.attrs.get("CMapName")
|
||||
|
||||
def use_cmap(self, cmap: CMapBase) -> None:
|
||||
assert isinstance(cmap, CMap), str(type(cmap))
|
||||
|
@ -84,10 +94,11 @@ class CMap(CMapBase):
|
|||
copy(d, v)
|
||||
else:
|
||||
dst[k] = v
|
||||
|
||||
copy(self.code2cid, cmap.code2cid)
|
||||
|
||||
def decode(self, code: bytes) -> Iterator[int]:
|
||||
log.debug('decode: %r, %r', self, code)
|
||||
log.debug("decode: %r, %r", self, code)
|
||||
d = self.code2cid
|
||||
for i in iter(code):
|
||||
if i in d:
|
||||
|
@ -100,70 +111,70 @@ class CMap(CMapBase):
|
|||
else:
|
||||
d = self.code2cid
|
||||
|
||||
def dump(self, out: TextIO = sys.stdout,
|
||||
def dump(
|
||||
self,
|
||||
out: TextIO = sys.stdout,
|
||||
code2cid: Optional[Dict[int, object]] = None,
|
||||
code: Tuple[int, ...] = ()) -> None:
|
||||
code: Tuple[int, ...] = (),
|
||||
) -> None:
|
||||
if code2cid is None:
|
||||
code2cid = self.code2cid
|
||||
code = ()
|
||||
for (k, v) in sorted(code2cid.items()):
|
||||
c = code + (k,)
|
||||
if isinstance(v, int):
|
||||
out.write('code %r = cid %d\n' % (c, v))
|
||||
out.write("code %r = cid %d\n" % (c, v))
|
||||
else:
|
||||
self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)
|
||||
|
||||
|
||||
class IdentityCMap(CMapBase):
|
||||
|
||||
def decode(self, code: bytes) -> Tuple[int, ...]:
|
||||
n = len(code) // 2
|
||||
if n:
|
||||
return struct.unpack('>%dH' % n, code)
|
||||
return struct.unpack(">%dH" % n, code)
|
||||
else:
|
||||
return ()
|
||||
|
||||
|
||||
class IdentityCMapByte(IdentityCMap):
|
||||
|
||||
def decode(self, code: bytes) -> Tuple[int, ...]:
|
||||
n = len(code)
|
||||
if n:
|
||||
return struct.unpack('>%dB' % n, code)
|
||||
return struct.unpack(">%dB" % n, code)
|
||||
else:
|
||||
return ()
|
||||
|
||||
|
||||
class UnicodeMap(CMapBase):
|
||||
|
||||
def __init__(self, **kwargs: Union[str, int]) -> None:
|
||||
CMapBase.__init__(self, **kwargs)
|
||||
self.cid2unichr: Dict[int, str] = {}
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
|
||||
return "<UnicodeMap: %s>" % self.attrs.get("CMapName")
|
||||
|
||||
def get_unichr(self, cid: int) -> str:
|
||||
log.debug('get_unichr: %r, %r', self, cid)
|
||||
log.debug("get_unichr: %r, %r", self, cid)
|
||||
return self.cid2unichr[cid]
|
||||
|
||||
def dump(self, out: TextIO = sys.stdout) -> None:
|
||||
for (k, v) in sorted(self.cid2unichr.items()):
|
||||
out.write('cid %d = unicode %r\n' % (k, v))
|
||||
out.write("cid %d = unicode %r\n" % (k, v))
|
||||
|
||||
|
||||
class IdentityUnicodeMap(UnicodeMap):
|
||||
def get_unichr(self, cid: int) -> str:
|
||||
"""Interpret character id as unicode codepoint"""
|
||||
log.debug('get_unichr: %r, %r', self, cid)
|
||||
log.debug("get_unichr: %r, %r", self, cid)
|
||||
return chr(cid)
|
||||
|
||||
|
||||
class FileCMap(CMap):
|
||||
|
||||
def add_code2cid(self, code: str, cid: int) -> None:
|
||||
assert isinstance(code, str) and isinstance(cid, int),\
|
||||
str((type(code), type(cid)))
|
||||
assert isinstance(code, str) and isinstance(cid, int), str(
|
||||
(type(code), type(cid))
|
||||
)
|
||||
d = self.code2cid
|
||||
for c in code[:-1]:
|
||||
ci = ord(c)
|
||||
|
@ -178,9 +189,7 @@ class FileCMap(CMap):
|
|||
|
||||
|
||||
class FileUnicodeMap(UnicodeMap):
|
||||
|
||||
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
|
||||
) -> None:
|
||||
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
|
||||
assert isinstance(cid, int), str(type(cid))
|
||||
if isinstance(code, PSLiteral):
|
||||
# Interpret as an Adobe glyph name.
|
||||
|
@ -188,7 +197,7 @@ class FileUnicodeMap(UnicodeMap):
|
|||
self.cid2unichr[cid] = name2unicode(code.name)
|
||||
elif isinstance(code, bytes):
|
||||
# Interpret as UTF-16BE.
|
||||
self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
|
||||
self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore")
|
||||
elif isinstance(code, int):
|
||||
self.cid2unichr[cid] = chr(code)
|
||||
else:
|
||||
|
@ -196,21 +205,19 @@ class FileUnicodeMap(UnicodeMap):
|
|||
|
||||
|
||||
class PyCMap(CMap):
|
||||
|
||||
def __init__(self, name: str, module: Any) -> None:
|
||||
super().__init__(CMapName=name)
|
||||
self.code2cid = module.CODE2CID
|
||||
if module.IS_VERTICAL:
|
||||
self.attrs['WMode'] = 1
|
||||
self.attrs["WMode"] = 1
|
||||
|
||||
|
||||
class PyUnicodeMap(UnicodeMap):
|
||||
|
||||
def __init__(self, name: str, module: Any, vertical: bool) -> None:
|
||||
super().__init__(CMapName=name)
|
||||
if vertical:
|
||||
self.cid2unichr = module.CID2UNICHR_V
|
||||
self.attrs['WMode'] = 1
|
||||
self.attrs["WMode"] = 1
|
||||
else:
|
||||
self.cid2unichr = module.CID2UNICHR_H
|
||||
|
||||
|
@ -226,10 +233,12 @@ class CMapDB:
|
|||
@classmethod
|
||||
def _load_data(cls, name: str) -> Any:
|
||||
name = name.replace("\0", "")
|
||||
filename = '%s.pickle.gz' % name
|
||||
log.debug('loading: %r', name)
|
||||
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||
os.path.join(os.path.dirname(__file__), 'cmap'),)
|
||||
filename = "%s.pickle.gz" % name
|
||||
log.debug("loading: %r", name)
|
||||
cmap_paths = (
|
||||
os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
|
||||
os.path.join(os.path.dirname(__file__), "cmap"),
|
||||
)
|
||||
for directory in cmap_paths:
|
||||
path = os.path.join(directory, filename)
|
||||
if os.path.exists(path):
|
||||
|
@ -243,13 +252,13 @@ class CMapDB:
|
|||
|
||||
@classmethod
|
||||
def get_cmap(cls, name: str) -> CMapBase:
|
||||
if name == 'Identity-H':
|
||||
if name == "Identity-H":
|
||||
return IdentityCMap(WMode=0)
|
||||
elif name == 'Identity-V':
|
||||
elif name == "Identity-V":
|
||||
return IdentityCMap(WMode=1)
|
||||
elif name == 'OneByteIdentityH':
|
||||
elif name == "OneByteIdentityH":
|
||||
return IdentityCMapByte(WMode=0)
|
||||
elif name == 'OneByteIdentityV':
|
||||
elif name == "OneByteIdentityV":
|
||||
return IdentityCMapByte(WMode=1)
|
||||
try:
|
||||
return cls._cmap_cache[name]
|
||||
|
@ -265,14 +274,12 @@ class CMapDB:
|
|||
return cls._umap_cache[name][vertical]
|
||||
except KeyError:
|
||||
pass
|
||||
data = cls._load_data('to-unicode-%s' % name)
|
||||
cls._umap_cache[name] = [PyUnicodeMap(name, data, v)
|
||||
for v in (False, True)]
|
||||
data = cls._load_data("to-unicode-%s" % name)
|
||||
cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
|
||||
return cls._umap_cache[name][vertical]
|
||||
|
||||
|
||||
class CMapParser(PSStackParser[PSKeyword]):
|
||||
|
||||
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
|
||||
PSStackParser.__init__(self, fp)
|
||||
self.cmap = cmap
|
||||
|
@ -287,22 +294,22 @@ class CMapParser(PSStackParser[PSKeyword]):
|
|||
pass
|
||||
return
|
||||
|
||||
KEYWORD_BEGINCMAP = KWD(b'begincmap')
|
||||
KEYWORD_ENDCMAP = KWD(b'endcmap')
|
||||
KEYWORD_USECMAP = KWD(b'usecmap')
|
||||
KEYWORD_DEF = KWD(b'def')
|
||||
KEYWORD_BEGINCODESPACERANGE = KWD(b'begincodespacerange')
|
||||
KEYWORD_ENDCODESPACERANGE = KWD(b'endcodespacerange')
|
||||
KEYWORD_BEGINCIDRANGE = KWD(b'begincidrange')
|
||||
KEYWORD_ENDCIDRANGE = KWD(b'endcidrange')
|
||||
KEYWORD_BEGINCIDCHAR = KWD(b'begincidchar')
|
||||
KEYWORD_ENDCIDCHAR = KWD(b'endcidchar')
|
||||
KEYWORD_BEGINBFRANGE = KWD(b'beginbfrange')
|
||||
KEYWORD_ENDBFRANGE = KWD(b'endbfrange')
|
||||
KEYWORD_BEGINBFCHAR = KWD(b'beginbfchar')
|
||||
KEYWORD_ENDBFCHAR = KWD(b'endbfchar')
|
||||
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
|
||||
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')
|
||||
KEYWORD_BEGINCMAP = KWD(b"begincmap")
|
||||
KEYWORD_ENDCMAP = KWD(b"endcmap")
|
||||
KEYWORD_USECMAP = KWD(b"usecmap")
|
||||
KEYWORD_DEF = KWD(b"def")
|
||||
KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
|
||||
KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
|
||||
KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
|
||||
KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
|
||||
KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
|
||||
KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
|
||||
KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
|
||||
KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
|
||||
KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
|
||||
KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
|
||||
KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
|
||||
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
|
||||
|
||||
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||
if token is self.KEYWORD_BEGINCMAP:
|
||||
|
@ -346,8 +353,12 @@ class CMapParser(PSStackParser[PSKeyword]):
|
|||
if token is self.KEYWORD_ENDCIDRANGE:
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, cid) in choplist(3, objs):
|
||||
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
|
||||
not isinstance(cid, int) or len(s) != len(e)):
|
||||
if (
|
||||
not isinstance(s, bytes)
|
||||
or not isinstance(e, bytes)
|
||||
or not isinstance(cid, int)
|
||||
or len(s) != len(e)
|
||||
):
|
||||
continue
|
||||
sprefix = s[:-4]
|
||||
eprefix = e[:-4]
|
||||
|
@ -359,7 +370,7 @@ class CMapParser(PSStackParser[PSKeyword]):
|
|||
e1 = nunpack(evar)
|
||||
vlen = len(svar)
|
||||
for i in range(e1 - s1 + 1):
|
||||
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
|
||||
x = sprefix + struct.pack(">L", s1 + i)[-vlen:]
|
||||
self.cmap.add_cid2unichr(cid + i, x)
|
||||
return
|
||||
|
||||
|
@ -379,8 +390,11 @@ class CMapParser(PSStackParser[PSKeyword]):
|
|||
if token is self.KEYWORD_ENDBFRANGE:
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, code) in choplist(3, objs):
|
||||
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
|
||||
len(s) != len(e)):
|
||||
if (
|
||||
not isinstance(s, bytes)
|
||||
or not isinstance(e, bytes)
|
||||
or len(s) != len(e)
|
||||
):
|
||||
continue
|
||||
s1 = nunpack(s)
|
||||
e1 = nunpack(e)
|
||||
|
@ -394,7 +408,7 @@ class CMapParser(PSStackParser[PSKeyword]):
|
|||
prefix = code[:-4]
|
||||
vlen = len(var)
|
||||
for i in range(e1 - s1 + 1):
|
||||
x = prefix+struct.pack('>L', base+i)[-vlen:]
|
||||
x = prefix + struct.pack(">L", base + i)[-vlen:]
|
||||
self.cmap.add_cid2unichr(s1 + i, x)
|
||||
return
|
||||
|
||||
|
@ -422,7 +436,7 @@ class CMapParser(PSStackParser[PSKeyword]):
|
|||
def main(argv: List[str]) -> None:
|
||||
args = argv[1:]
|
||||
for fname in args:
|
||||
fp = open(fname, 'rb')
|
||||
fp = open(fname, "rb")
|
||||
cmap = FileUnicodeMap()
|
||||
CMapParser(cmap, fp).run()
|
||||
fp.close()
|
||||
|
@ -430,5 +444,5 @@ def main(argv: List[str]) -> None:
|
|||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv)
|
||||
|
|
|
@ -1,8 +1,19 @@
|
|||
import io
|
||||
import logging
|
||||
import re
|
||||
from typing import (BinaryIO, Dict, Generic, List, Optional, Sequence, TextIO,
|
||||
Tuple, TypeVar, Union, cast)
|
||||
from typing import (
|
||||
BinaryIO,
|
||||
Dict,
|
||||
Generic,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
TextIO,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
from pdfminer.pdfcolor import PDFColorSpace
|
||||
from . import utils
|
||||
|
@ -46,7 +57,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
self,
|
||||
rsrcmgr: PDFResourceManager,
|
||||
pageno: int = 1,
|
||||
laparams: Optional[LAParams] = None
|
||||
laparams: Optional[LAParams] = None,
|
||||
) -> None:
|
||||
PDFTextDevice.__init__(self, rsrcmgr)
|
||||
self.pageno = pageno
|
||||
|
@ -80,9 +91,11 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
|
||||
def render_image(self, name: str, stream: PDFStream) -> None:
|
||||
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
||||
item = LTImage(name, stream,
|
||||
(self.cur_item.x0, self.cur_item.y0,
|
||||
self.cur_item.x1, self.cur_item.y1))
|
||||
item = LTImage(
|
||||
name,
|
||||
stream,
|
||||
(self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
|
||||
)
|
||||
self.cur_item.add(item)
|
||||
|
||||
def paint_path(
|
||||
|
@ -91,14 +104,14 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
stroke: bool,
|
||||
fill: bool,
|
||||
evenodd: bool,
|
||||
path: Sequence[PathSegment]
|
||||
path: Sequence[PathSegment],
|
||||
) -> None:
|
||||
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
||||
shape = ''.join(x[0] for x in path)
|
||||
shape = "".join(x[0] for x in path)
|
||||
|
||||
if shape.count('m') > 1:
|
||||
if shape.count("m") > 1:
|
||||
# recurse if there are multiple m's in this shape
|
||||
for m in re.finditer(r'm[^m]+', shape):
|
||||
for m in re.finditer(r"m[^m]+", shape):
|
||||
subpath = path[m.start(0) : m.end(0)]
|
||||
self.paint_path(gstate, stroke, fill, evenodd, subpath)
|
||||
|
||||
|
@ -110,38 +123,68 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
# And, per Section 4.4's Table 4.9, all other path commands place
|
||||
# their point-position in their final two arguments. (Any preceding
|
||||
# arguments represent control points on Bézier curves.)
|
||||
raw_pts = [cast(Point, p[-2:] if p[0] != 'h' else path[0][-2:])
|
||||
for p in path]
|
||||
raw_pts = [
|
||||
cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
|
||||
]
|
||||
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
||||
|
||||
if shape in {'mlh', 'ml'}:
|
||||
if shape in {"mlh", "ml"}:
|
||||
# single line segment
|
||||
#
|
||||
# Note: 'ml', in conditional above, is a frequent anomaly
|
||||
# that we want to support.
|
||||
line = LTLine(gstate.linewidth, pts[0], pts[1], stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
line = LTLine(
|
||||
gstate.linewidth,
|
||||
pts[0],
|
||||
pts[1],
|
||||
stroke,
|
||||
fill,
|
||||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
)
|
||||
self.cur_item.add(line)
|
||||
|
||||
elif shape in {'mlllh', 'mllll'}:
|
||||
elif shape in {"mlllh", "mllll"}:
|
||||
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
|
||||
|
||||
is_closed_loop = (pts[0] == pts[4])
|
||||
has_square_coordinates = \
|
||||
(x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) \
|
||||
or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
|
||||
is_closed_loop = pts[0] == pts[4]
|
||||
has_square_coordinates = (
|
||||
x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
|
||||
) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
|
||||
if is_closed_loop and has_square_coordinates:
|
||||
rect = LTRect(gstate.linewidth, (*pts[0], *pts[2]), stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
rect = LTRect(
|
||||
gstate.linewidth,
|
||||
(*pts[0], *pts[2]),
|
||||
stroke,
|
||||
fill,
|
||||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
)
|
||||
self.cur_item.add(rect)
|
||||
else:
|
||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill,
|
||||
evenodd, gstate.scolor, gstate.ncolor)
|
||||
curve = LTCurve(
|
||||
gstate.linewidth,
|
||||
pts,
|
||||
stroke,
|
||||
fill,
|
||||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
else:
|
||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||
gstate.scolor, gstate.ncolor)
|
||||
curve = LTCurve(
|
||||
gstate.linewidth,
|
||||
pts,
|
||||
stroke,
|
||||
fill,
|
||||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
def render_char(
|
||||
|
@ -153,7 +196,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
rise: float,
|
||||
cid: int,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: PDFGraphicState
|
||||
graphicstate: PDFGraphicState,
|
||||
) -> float:
|
||||
try:
|
||||
text = font.to_unichr(cid)
|
||||
|
@ -162,14 +205,24 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
text = self.handle_undefined_char(font, cid)
|
||||
textwidth = font.char_width(cid)
|
||||
textdisp = font.char_disp(cid)
|
||||
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth,
|
||||
textdisp, ncs, graphicstate)
|
||||
item = LTChar(
|
||||
matrix,
|
||||
font,
|
||||
fontsize,
|
||||
scaling,
|
||||
rise,
|
||||
text,
|
||||
textwidth,
|
||||
textdisp,
|
||||
ncs,
|
||||
graphicstate,
|
||||
)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
||||
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
|
||||
log.debug('undefined: %r, %r', font, cid)
|
||||
return '(cid:%d)' % cid
|
||||
log.debug("undefined: %r, %r", font, cid)
|
||||
return "(cid:%d)" % cid
|
||||
|
||||
def receive_layout(self, ltpage: LTPage) -> None:
|
||||
pass
|
||||
|
@ -180,10 +233,9 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
|
|||
self,
|
||||
rsrcmgr: PDFResourceManager,
|
||||
pageno: int = 1,
|
||||
laparams: Optional[LAParams] = None
|
||||
laparams: Optional[LAParams] = None,
|
||||
) -> None:
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
||||
laparams=laparams)
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||
self.result: Optional[LTPage] = None
|
||||
|
||||
def receive_layout(self, ltpage: LTPage) -> None:
|
||||
|
@ -195,7 +247,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
|
|||
|
||||
|
||||
# Some PDFConverter children support only binary I/O
|
||||
IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO)
|
||||
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
|
||||
|
||||
|
||||
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
|
||||
|
@ -203,12 +255,11 @@ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
|
|||
self,
|
||||
rsrcmgr: PDFResourceManager,
|
||||
outfp: IOType,
|
||||
codec: str = 'utf-8',
|
||||
codec: str = "utf-8",
|
||||
pageno: int = 1,
|
||||
laparams: Optional[LAParams] = None
|
||||
laparams: Optional[LAParams] = None,
|
||||
) -> None:
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
||||
laparams=laparams)
|
||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||
self.outfp: IOType = outfp
|
||||
self.codec = codec
|
||||
self.outfp_binary = self._is_binary_stream(self.outfp)
|
||||
|
@ -216,9 +267,9 @@ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
|
|||
@staticmethod
|
||||
def _is_binary_stream(outfp: AnyIO) -> bool:
|
||||
"""Test if an stream is binary or not"""
|
||||
if 'b' in getattr(outfp, 'mode', ''):
|
||||
if "b" in getattr(outfp, "mode", ""):
|
||||
return True
|
||||
elif hasattr(outfp, 'mode'):
|
||||
elif hasattr(outfp, "mode"):
|
||||
# output stream has a mode, but it does not contain 'b'
|
||||
return False
|
||||
elif isinstance(outfp, io.BytesIO):
|
||||
|
@ -236,19 +287,18 @@ class TextConverter(PDFConverter[AnyIO]):
|
|||
self,
|
||||
rsrcmgr: PDFResourceManager,
|
||||
outfp: AnyIO,
|
||||
codec: str = 'utf-8',
|
||||
codec: str = "utf-8",
|
||||
pageno: int = 1,
|
||||
laparams: Optional[LAParams] = None,
|
||||
showpageno: bool = False,
|
||||
imagewriter: Optional[ImageWriter] = None
|
||||
imagewriter: Optional[ImageWriter] = None,
|
||||
) -> None:
|
||||
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||
laparams=laparams)
|
||||
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
self.imagewriter = imagewriter
|
||||
|
||||
def write_text(self, text: str) -> None:
|
||||
text = utils.compatible_encode_method(text, self.codec, 'ignore')
|
||||
text = utils.compatible_encode_method(text, self.codec, "ignore")
|
||||
if self.outfp_binary:
|
||||
cast(BinaryIO, self.outfp).write(text.encode())
|
||||
else:
|
||||
|
@ -262,14 +312,15 @@ class TextConverter(PDFConverter[AnyIO]):
|
|||
elif isinstance(item, LTText):
|
||||
self.write_text(item.get_text())
|
||||
if isinstance(item, LTTextBox):
|
||||
self.write_text('\n')
|
||||
self.write_text("\n")
|
||||
elif isinstance(item, LTImage):
|
||||
if self.imagewriter is not None:
|
||||
self.imagewriter.export_image(item)
|
||||
|
||||
if self.showpageno:
|
||||
self.write_text('Page %s\n' % ltpage.pageid)
|
||||
self.write_text("Page %s\n" % ltpage.pageid)
|
||||
render(ltpage)
|
||||
self.write_text('\f')
|
||||
self.write_text("\f")
|
||||
|
||||
# Some dummy functions to save memory/CPU when all that is wanted
|
||||
# is text. This stops all the image and drawing output from being
|
||||
|
@ -286,54 +337,55 @@ class TextConverter(PDFConverter[AnyIO]):
|
|||
stroke: bool,
|
||||
fill: bool,
|
||||
evenodd: bool,
|
||||
path: Sequence[PathSegment]
|
||||
path: Sequence[PathSegment],
|
||||
) -> None:
|
||||
return
|
||||
|
||||
|
||||
class HTMLConverter(PDFConverter[AnyIO]):
|
||||
RECT_COLORS = {
|
||||
'figure': 'yellow',
|
||||
'textline': 'magenta',
|
||||
'textbox': 'cyan',
|
||||
'textgroup': 'red',
|
||||
'curve': 'black',
|
||||
'page': 'gray',
|
||||
"figure": "yellow",
|
||||
"textline": "magenta",
|
||||
"textbox": "cyan",
|
||||
"textgroup": "red",
|
||||
"curve": "black",
|
||||
"page": "gray",
|
||||
}
|
||||
|
||||
TEXT_COLORS = {
|
||||
'textbox': 'blue',
|
||||
'char': 'black',
|
||||
"textbox": "blue",
|
||||
"char": "black",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
rsrcmgr: PDFResourceManager,
|
||||
outfp: AnyIO,
|
||||
codec: str = 'utf-8',
|
||||
codec: str = "utf-8",
|
||||
pageno: int = 1,
|
||||
laparams: Optional[LAParams] = None,
|
||||
scale: float = 1,
|
||||
fontscale: float = 1.0,
|
||||
layoutmode: str = 'normal',
|
||||
layoutmode: str = "normal",
|
||||
showpageno: bool = True,
|
||||
pagemargin: int = 50,
|
||||
imagewriter: Optional[ImageWriter] = None,
|
||||
debug: int = 0,
|
||||
rect_colors: Optional[Dict[str, str]] = None,
|
||||
text_colors: Optional[Dict[str, str]] = None
|
||||
text_colors: Optional[Dict[str, str]] = None,
|
||||
) -> None:
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||
laparams=laparams)
|
||||
PDFConverter.__init__(
|
||||
self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
|
||||
)
|
||||
|
||||
# write() assumes a codec for binary I/O, or no codec for text I/O.
|
||||
if self.outfp_binary == (not self.codec):
|
||||
raise ValueError("Codec is required for a binary I/O output")
|
||||
|
||||
if text_colors is None:
|
||||
text_colors = {'char': 'black'}
|
||||
text_colors = {"char": "black"}
|
||||
if rect_colors is None:
|
||||
rect_colors = {'curve': 'black', 'page': 'gray'}
|
||||
rect_colors = {"curve": "black", "page": "gray"}
|
||||
|
||||
self.scale = scale
|
||||
self.fontscale = fontscale
|
||||
|
@ -360,23 +412,27 @@ class HTMLConverter(PDFConverter[AnyIO]):
|
|||
return
|
||||
|
||||
def write_header(self) -> None:
|
||||
self.write('<html><head>\n')
|
||||
self.write("<html><head>\n")
|
||||
if self.codec:
|
||||
s = '<meta http-equiv="Content-Type" content="text/html; ' \
|
||||
s = (
|
||||
'<meta http-equiv="Content-Type" content="text/html; '
|
||||
'charset=%s">\n' % self.codec
|
||||
)
|
||||
else:
|
||||
s = '<meta http-equiv="Content-Type" content="text/html">\n'
|
||||
self.write(s)
|
||||
self.write('</head><body>\n')
|
||||
self.write("</head><body>\n")
|
||||
return
|
||||
|
||||
def write_footer(self) -> None:
    """Write the page-link index and close the HTML document.

    One anchor link is produced for every number from 1 up to, but not
    including, ``self.pageno``; the links are joined with ``", "``.
    """
    page_links = [
        '<a href="#{}">{}</a>'.format(i, i) for i in range(1, self.pageno)
    ]
    s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
        page_links
    )
    self.write(s)
    self.write("</body></html>\n")
    return
|
||||
|
||||
def write_text(self, text: str) -> None:
|
||||
|
@ -384,71 +440,67 @@ class HTMLConverter(PDFConverter[AnyIO]):
|
|||
return
|
||||
|
||||
def place_rect(
    self, color: str, borderwidth: int, x: float, y: float, w: float, h: float
) -> None:
    """Write an absolutely positioned, bordered ``<span>``.

    ``color`` is a key into ``self.rect_colors``; when the key is missing
    nothing is written. All coordinates are multiplied by ``self.scale``
    and the top offset is computed from ``self._yoffset - y``.

    :param color: key looked up in ``self.rect_colors``.
    :param borderwidth: border width in pixels.
    :param x: left coordinate before scaling.
    :param y: vertical coordinate, subtracted from ``self._yoffset``.
    :param w: width before scaling.
    :param h: height before scaling.
    """
    color2 = self.rect_colors.get(color)
    if color2 is not None:
        # ``%d`` truncates the scaled float values to whole pixels.
        s = (
            '<span style="position:absolute; border: %s %dpx solid; '
            'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
            % (
                color2,
                borderwidth,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)
    return
|
||||
|
||||
def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
    """Draw a border rectangle around ``item``.

    Delegates to ``place_rect`` using the item's ``x0``, ``y1``,
    ``width`` and ``height`` attributes.

    :param color: colour key forwarded to ``place_rect``.
    :param borderwidth: border width forwarded to ``place_rect``.
    :param item: layout component supplying the rectangle geometry.
    """
    self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
    return
|
||||
|
||||
def place_image(
    self, item: LTImage, borderwidth: int, x: float, y: float, w: float, h: float
) -> None:
    """Export ``item`` and write an ``<img>`` tag that references it.

    Nothing is written when ``self.imagewriter`` is ``None``. Otherwise the
    image is exported through ``self.imagewriter.export_image`` and the
    returned name, passed through ``enc``, is used as the ``src``.

    :param item: image to export.
    :param borderwidth: value of the ``border`` attribute.
    :param x: left coordinate before scaling.
    :param y: vertical coordinate, subtracted from ``self._yoffset``.
    :param w: width before scaling.
    :param h: height before scaling.
    """
    if self.imagewriter is not None:
        name = self.imagewriter.export_image(item)
        s = (
            '<img src="%s" border="%d" style="position:absolute; '
            'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
            % (
                enc(name),
                borderwidth,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                w * self.scale,
                h * self.scale,
            )
        )
        self.write(s)
    return
|
||||
|
||||
def place_text(
    self, color: str, text: str, x: float, y: float, size: float
) -> None:
    """Write ``text`` inside an absolutely positioned ``<span>``.

    ``color`` is a key into ``self.text_colors``; when the key is missing
    nothing is written. The font size is
    ``size * self.scale * self.fontscale``.

    :param color: key looked up in ``self.text_colors``.
    :param text: text handed to ``self.write_text``.
    :param x: left coordinate before scaling.
    :param y: vertical coordinate, subtracted from ``self._yoffset``.
    :param size: font size before scaling.
    """
    color2 = self.text_colors.get(color)
    if color2 is not None:
        s = (
            '<span style="position:absolute; color:%s; left:%dpx; '
            'top:%dpx; font-size:%dpx;">'
            % (
                color2,
                x * self.scale,
                (self._yoffset - y) * self.scale,
                size * self.scale * self.fontscale,
            )
        )
        self.write(s)
        self.write_text(text)
        self.write("</span>\n")
    return
|
||||
|
||||
def begin_div(
|
||||
|
@ -459,47 +511,57 @@ class HTMLConverter(PDFConverter[AnyIO]):
|
|||
y: float,
|
||||
w: float,
|
||||
h: float,
|
||||
writing_mode: str = 'False'
|
||||
writing_mode: str = "False",
|
||||
) -> None:
|
||||
self._fontstack.append(self._font)
|
||||
self._font = None
|
||||
s = '<div style="position:absolute; border: %s %dpx solid; ' \
|
||||
'writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; ' \
|
||||
'height:%dpx;">' % \
|
||||
(color, borderwidth, writing_mode, x * self.scale,
|
||||
(self._yoffset - y) * self.scale, w * self.scale, h * self.scale)
|
||||
s = (
|
||||
'<div style="position:absolute; border: %s %dpx solid; '
|
||||
"writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
|
||||
'height:%dpx;">'
|
||||
% (
|
||||
color,
|
||||
borderwidth,
|
||||
writing_mode,
|
||||
x * self.scale,
|
||||
(self._yoffset - y) * self.scale,
|
||||
w * self.scale,
|
||||
h * self.scale,
|
||||
)
|
||||
)
|
||||
self.write(s)
|
||||
return
|
||||
|
||||
def end_div(self, color: str) -> None:
    """Close the current ``<div>`` and restore the previous font.

    If a font span is open (``self._font`` is not ``None``) it is closed
    first. The font is then replaced by the value popped from
    ``self._fontstack``.

    :param color: accepted for symmetry with ``begin_div``; not used here.
    """
    if self._font is not None:
        self.write("</span>")
    self._font = self._fontstack.pop()
    self.write("</div>")
    return
|
||||
|
||||
def put_text(self, text: str, fontname: str, fontsize: float) -> None:
    """Write ``text``, opening a new font ``<span>`` when the font changes.

    The current font is tracked in ``self._font`` as a
    ``(fontname, fontsize)`` tuple. When it differs from the requested
    font, any open span is closed and a new one is opened.

    :param text: text handed to ``self.write_text``.
    :param fontname: font name, possibly prefixed with a subset tag.
    :param fontsize: size before ``self.scale`` and ``self.fontscale``.
    """
    font = (fontname, fontsize)
    if font != self._font:
        if self._font is not None:
            self.write("</span>")
        # Remove subset tag from fontname, see PDF Reference 5.5.3
        fontname_without_subset_tag = fontname.split("+")[-1]
        self.write(
            '<span style="font-family: %s; font-size:%dpx">'
            % (fontname_without_subset_tag, fontsize * self.scale * self.fontscale)
        )
        self._font = font
    self.write_text(text)
    return
|
||||
|
||||
def put_newline(self) -> None:
    """Write an HTML line break (``<br>``)."""
    self.write("<br>")
    return
|
||||
|
||||
def receive_layout(self, ltpage: LTPage) -> None:
|
||||
def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
|
||||
if isinstance(item, LTTextGroup):
|
||||
self.place_border('textgroup', 1, item)
|
||||
self.place_border("textgroup", 1, item)
|
||||
for child in item:
|
||||
show_group(child)
|
||||
return
|
||||
|
@ -508,63 +570,74 @@ class HTMLConverter(PDFConverter[AnyIO]):
|
|||
child: LTItem
|
||||
if isinstance(item, LTPage):
|
||||
self._yoffset += item.y1
|
||||
self.place_border('page', 1, item)
|
||||
self.place_border("page", 1, item)
|
||||
if self.showpageno:
|
||||
self.write('<div style="position:absolute; top:%dpx;">' %
|
||||
((self._yoffset-item.y1)*self.scale))
|
||||
self.write('<a name="{}">Page {}</a></div>\n'
|
||||
.format(item.pageid, item.pageid))
|
||||
self.write(
|
||||
'<div style="position:absolute; top:%dpx;">'
|
||||
% ((self._yoffset - item.y1) * self.scale)
|
||||
)
|
||||
self.write(
|
||||
'<a name="{}">Page {}</a></div>\n'.format(
|
||||
item.pageid, item.pageid
|
||||
)
|
||||
)
|
||||
for child in item:
|
||||
render(child)
|
||||
if item.groups is not None:
|
||||
for group in item.groups:
|
||||
show_group(group)
|
||||
elif isinstance(item, LTCurve):
|
||||
self.place_border('curve', 1, item)
|
||||
self.place_border("curve", 1, item)
|
||||
elif isinstance(item, LTFigure):
|
||||
self.begin_div('figure', 1, item.x0, item.y1, item.width,
|
||||
item.height)
|
||||
self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
render(child)
|
||||
self.end_div('figure')
|
||||
self.end_div("figure")
|
||||
elif isinstance(item, LTImage):
|
||||
self.place_image(item, 1, item.x0, item.y1, item.width,
|
||||
item.height)
|
||||
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
|
||||
else:
|
||||
if self.layoutmode == 'exact':
|
||||
if self.layoutmode == "exact":
|
||||
if isinstance(item, LTTextLine):
|
||||
self.place_border('textline', 1, item)
|
||||
self.place_border("textline", 1, item)
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.place_border('textbox', 1, item)
|
||||
self.place_text('textbox', str(item.index+1), item.x0,
|
||||
item.y1, 20)
|
||||
self.place_border("textbox", 1, item)
|
||||
self.place_text(
|
||||
"textbox", str(item.index + 1), item.x0, item.y1, 20
|
||||
)
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTChar):
|
||||
self.place_border('char', 1, item)
|
||||
self.place_text('char', item.get_text(), item.x0,
|
||||
item.y1, item.size)
|
||||
self.place_border("char", 1, item)
|
||||
self.place_text(
|
||||
"char", item.get_text(), item.x0, item.y1, item.size
|
||||
)
|
||||
else:
|
||||
if isinstance(item, LTTextLine):
|
||||
for child in item:
|
||||
render(child)
|
||||
if self.layoutmode != 'loose':
|
||||
if self.layoutmode != "loose":
|
||||
self.put_newline()
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.begin_div('textbox', 1, item.x0, item.y1,
|
||||
item.width, item.height,
|
||||
item.get_writing_mode())
|
||||
self.begin_div(
|
||||
"textbox",
|
||||
1,
|
||||
item.x0,
|
||||
item.y1,
|
||||
item.width,
|
||||
item.height,
|
||||
item.get_writing_mode(),
|
||||
)
|
||||
for child in item:
|
||||
render(child)
|
||||
self.end_div('textbox')
|
||||
self.end_div("textbox")
|
||||
elif isinstance(item, LTChar):
|
||||
self.put_text(item.get_text(), item.fontname,
|
||||
item.size)
|
||||
self.put_text(item.get_text(), item.fontname, item.size)
|
||||
elif isinstance(item, LTText):
|
||||
self.write_text(item.get_text())
|
||||
return
|
||||
|
||||
render(ltpage)
|
||||
self._yoffset += self.pagemargin
|
||||
return
|
||||
|
@ -576,20 +649,21 @@ class HTMLConverter(PDFConverter[AnyIO]):
|
|||
|
||||
class XMLConverter(PDFConverter[AnyIO]):
|
||||
|
||||
CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||
CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
rsrcmgr: PDFResourceManager,
|
||||
outfp: AnyIO,
|
||||
codec: str = 'utf-8',
|
||||
codec: str = "utf-8",
|
||||
pageno: int = 1,
|
||||
laparams: Optional[LAParams] = None,
|
||||
imagewriter: Optional[ImageWriter] = None,
|
||||
stripcontrol: bool = False
|
||||
stripcontrol: bool = False,
|
||||
) -> None:
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||
laparams=laparams)
|
||||
PDFConverter.__init__(
|
||||
self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
|
||||
)
|
||||
|
||||
# write() assumes a codec for binary I/O, or no codec for text I/O.
|
||||
if self.outfp_binary == (not self.codec):
|
||||
|
@ -612,100 +686,125 @@ class XMLConverter(PDFConverter[AnyIO]):
|
|||
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
|
||||
else:
|
||||
self.write('<?xml version="1.0" ?>\n')
|
||||
self.write('<pages>\n')
|
||||
self.write("<pages>\n")
|
||||
return
|
||||
|
||||
def write_footer(self) -> None:
    """Write the closing ``</pages>`` tag of the XML document."""
    self.write("</pages>\n")
    return
|
||||
|
||||
def write_text(self, text: str) -> None:
    """Write ``text`` after passing it through ``enc``.

    When ``self.stripcontrol`` is set, every character matched by the
    class-level ``CONTROL`` pattern is removed first.

    :param text: text to write.
    """
    if self.stripcontrol:
        text = self.CONTROL.sub("", text)
    # NOTE(review): ``enc`` is defined outside this block; presumably it
    # escapes markup-significant characters - confirm.
    self.write(enc(text))
    return
|
||||
|
||||
def receive_layout(self, ltpage: LTPage) -> None:
    """Serialise an analysed page layout as XML.

    The layout tree rooted at ``ltpage`` is walked recursively and one
    element is written per item. If the page has ``groups``, they are
    written afterwards inside a ``<layout>`` element.

    :param ltpage: page layout to serialise.
    """

    def show_group(item: LTItem) -> None:
        """Write the text-group hierarchy: boxes as leaves, groups nested."""
        if isinstance(item, LTTextBox):
            self.write(
                '<textbox id="%d" bbox="%s" />\n'
                % (item.index, bbox2str(item.bbox))
            )
        elif isinstance(item, LTTextGroup):
            self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
            for child in item:
                show_group(child)
            self.write("</textgroup>\n")
        return

    def render(item: LTItem) -> None:
        """Write the XML element for ``item`` and recurse into containers."""
        child: LTItem
        if isinstance(item, LTPage):
            s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                item.pageid,
                bbox2str(item.bbox),
                item.rotate,
            )
            self.write(s)
            for child in item:
                render(child)
            if item.groups is not None:
                self.write("<layout>\n")
                for group in item.groups:
                    show_group(group)
                self.write("</layout>\n")
            self.write("</page>\n")
        # NOTE(review): LTLine and LTRect are tested before LTCurve;
        # presumably they are LTCurve subclasses, so order matters - confirm.
        elif isinstance(item, LTLine):
            s = '<line linewidth="%d" bbox="%s" />\n' % (
                item.linewidth,
                bbox2str(item.bbox),
            )
            self.write(s)
        elif isinstance(item, LTRect):
            s = '<rect linewidth="%d" bbox="%s" />\n' % (
                item.linewidth,
                bbox2str(item.bbox),
            )
            self.write(s)
        elif isinstance(item, LTCurve):
            s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                item.linewidth,
                bbox2str(item.bbox),
                item.get_pts(),
            )
            self.write(s)
        elif isinstance(item, LTFigure):
            s = '<figure name="%s" bbox="%s">\n' % (item.name, bbox2str(item.bbox))
            self.write(s)
            for child in item:
                render(child)
            self.write("</figure>\n")
        elif isinstance(item, LTTextLine):
            self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
            for child in item:
                render(child)
            self.write("</textline>\n")
        elif isinstance(item, LTTextBox):
            wmode = ""
            if isinstance(item, LTTextBoxVertical):
                wmode = ' wmode="vertical"'
            s = '<textbox id="%d" bbox="%s"%s>\n' % (
                item.index,
                bbox2str(item.bbox),
                wmode,
            )
            self.write(s)
            for child in item:
                render(child)
            self.write("</textbox>\n")
        elif isinstance(item, LTChar):
            s = (
                '<text font="%s" bbox="%s" colourspace="%s" '
                'ncolour="%s" size="%.3f">'
                % (
                    enc(item.fontname),
                    bbox2str(item.bbox),
                    item.ncs.name,
                    item.graphicstate.ncolor,
                    item.size,
                )
            )
            self.write(s)
            self.write_text(item.get_text())
            self.write("</text>\n")
        elif isinstance(item, LTText):
            # Written directly with self.write, not through write_text.
            self.write("<text>%s</text>\n" % item.get_text())
        elif isinstance(item, LTImage):
            if self.imagewriter is not None:
                name = self.imagewriter.export_image(item)
                self.write(
                    '<image src="%s" width="%d" height="%d" />\n'
                    % (enc(name), item.width, item.height)
                )
            else:
                self.write(
                    '<image width="%d" height="%d" />\n' % (item.width, item.height)
                )
        else:
            assert False, str(("Unhandled", item))
        return

    render(ltpage)
    return
|
||||
|
||||
|
|
|
@ -11,18 +11,19 @@ class NumberTree:
|
|||
|
||||
See Section 3.8.6 of the PDF Reference.
|
||||
"""
|
||||
|
||||
def __init__(self, obj: Any):
    """Read the ``Nums``, ``Kids`` and ``Limits`` entries of a tree node.

    ``obj`` is converted with ``dict_value``. Each of the three entries is
    converted with ``list_value`` when present; an absent entry leaves the
    corresponding attribute as ``None``.

    :param obj: object to interpret as the number tree node dictionary.
    """
    self._obj = dict_value(obj)
    self.nums: Optional[Iterable[Any]] = None
    self.kids: Optional[Iterable[Any]] = None
    self.limits: Optional[Iterable[Any]] = None

    if "Nums" in self._obj:
        self.nums = list_value(self._obj["Nums"])
    if "Kids" in self._obj:
        self.kids = list_value(self._obj["Kids"])
    if "Limits" in self._obj:
        self.limits = list_value(self._obj["Limits"])
|
||||
|
||||
def _parse(self) -> List[Tuple[int, Any]]:
|
||||
items = []
|
||||
|
@ -44,7 +45,7 @@ class NumberTree:
|
|||
|
||||
if settings.STRICT:
|
||||
if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
|
||||
raise PDFSyntaxError('Number tree elements are out of order')
|
||||
raise PDFSyntaxError("Number tree elements are out of order")
|
||||
else:
|
||||
values.sort(key=lambda t: t[0])
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from .glyphlist import glyphname2unicode
|
|||
from .latin_enc import ENCODING
|
||||
from .psparser import PSLiteral
|
||||
|
||||
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -25,39 +25,41 @@ def name2unicode(name: str) -> str:
|
|||
:returns unicode character if name resembles something,
|
||||
otherwise a KeyError
|
||||
"""
|
||||
name = name.split('.')[0]
|
||||
components = name.split('_')
|
||||
name = name.split(".")[0]
|
||||
components = name.split("_")
|
||||
|
||||
if len(components) > 1:
|
||||
return ''.join(map(name2unicode, components))
|
||||
return "".join(map(name2unicode, components))
|
||||
|
||||
else:
|
||||
if name in glyphname2unicode:
|
||||
return glyphname2unicode[name]
|
||||
|
||||
elif name.startswith('uni'):
|
||||
name_without_uni = name.strip('uni')
|
||||
elif name.startswith("uni"):
|
||||
name_without_uni = name.strip("uni")
|
||||
|
||||
if HEXADECIMAL.match(name_without_uni) and \
|
||||
len(name_without_uni) % 4 == 0:
|
||||
unicode_digits = [int(name_without_uni[i:i + 4], base=16)
|
||||
for i in range(0, len(name_without_uni), 4)]
|
||||
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
|
||||
unicode_digits = [
|
||||
int(name_without_uni[i : i + 4], base=16)
|
||||
for i in range(0, len(name_without_uni), 4)
|
||||
]
|
||||
for digit in unicode_digits:
|
||||
raise_key_error_for_invalid_unicode(digit)
|
||||
characters = map(chr, unicode_digits)
|
||||
return ''.join(characters)
|
||||
return "".join(characters)
|
||||
|
||||
elif name.startswith('u'):
|
||||
name_without_u = name.strip('u')
|
||||
elif name.startswith("u"):
|
||||
name_without_u = name.strip("u")
|
||||
|
||||
if HEXADECIMAL.match(name_without_u) and \
|
||||
4 <= len(name_without_u) <= 6:
|
||||
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
|
||||
unicode_digit = int(name_without_u, base=16)
|
||||
raise_key_error_for_invalid_unicode(unicode_digit)
|
||||
return chr(unicode_digit)
|
||||
|
||||
raise KeyError('Could not convert unicode name "%s" to character because '
|
||||
'it does not match specification' % name)
|
||||
raise KeyError(
|
||||
'Could not convert unicode name "%s" to character because '
|
||||
"it does not match specification" % name
|
||||
)
|
||||
|
||||
|
||||
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
|
||||
|
@ -67,8 +69,10 @@ def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
|
|||
:raises KeyError if unicode digit is invalid
|
||||
"""
|
||||
if 55295 < unicode_digit < 57344:
|
||||
raise KeyError('Unicode digit %d is invalid because '
|
||||
'it is in the range D800 through DFFF' % unicode_digit)
|
||||
raise KeyError(
|
||||
"Unicode digit %d is invalid because "
|
||||
"it is in the range D800 through DFFF" % unicode_digit
|
||||
)
|
||||
|
||||
|
||||
class EncodingDB:
|
||||
|
@ -89,17 +93,15 @@ class EncodingDB:
|
|||
pdf2unicode[pdf] = c
|
||||
|
||||
encodings = {
|
||||
'StandardEncoding': std2unicode,
|
||||
'MacRomanEncoding': mac2unicode,
|
||||
'WinAnsiEncoding': win2unicode,
|
||||
'PDFDocEncoding': pdf2unicode,
|
||||
"StandardEncoding": std2unicode,
|
||||
"MacRomanEncoding": mac2unicode,
|
||||
"WinAnsiEncoding": win2unicode,
|
||||
"PDFDocEncoding": pdf2unicode,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def get_encoding(
|
||||
cls,
|
||||
name: str,
|
||||
diff: Optional[Iterable[object]] = None
|
||||
cls, name: str, diff: Optional[Iterable[object]] = None
|
||||
) -> Dict[int, str]:
|
||||
cid2unicode = cls.encodings.get(name, cls.std2unicode)
|
||||
if diff:
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -5,8 +5,7 @@ import sys
|
|||
from io import StringIO
|
||||
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
|
||||
|
||||
from .converter import XMLConverter, HTMLConverter, TextConverter, \
|
||||
PDFPageAggregator
|
||||
from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
|
||||
from .image import ImageWriter
|
||||
from .layout import LAParams, LTPage
|
||||
from .pdfdevice import PDFDevice, TagExtractor
|
||||
|
@ -18,20 +17,20 @@ from .utils import open_filename, FileOrName, AnyIO
|
|||
def extract_text_to_fp(
|
||||
inf: BinaryIO,
|
||||
outfp: AnyIO,
|
||||
output_type: str = 'text',
|
||||
codec: str = 'utf-8',
|
||||
output_type: str = "text",
|
||||
codec: str = "utf-8",
|
||||
laparams: Optional[LAParams] = None,
|
||||
maxpages: int = 0,
|
||||
page_numbers: Optional[Container[int]] = None,
|
||||
password: str = "",
|
||||
scale: float = 1.0,
|
||||
rotation: int = 0,
|
||||
layoutmode: str = 'normal',
|
||||
layoutmode: str = "normal",
|
||||
output_dir: Optional[str] = None,
|
||||
strip_control: bool = False,
|
||||
debug: bool = False,
|
||||
disable_caching: bool = False,
|
||||
**kwargs: Any
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Parses text from inf-file and writes to outfp file-like object.
|
||||
|
||||
|
@ -72,39 +71,52 @@ def extract_text_to_fp(
|
|||
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
||||
device: Optional[PDFDevice] = None
|
||||
|
||||
if output_type != 'text' and outfp == sys.stdout:
|
||||
if output_type != "text" and outfp == sys.stdout:
|
||||
outfp = sys.stdout.buffer
|
||||
|
||||
if output_type == 'text':
|
||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||
imagewriter=imagewriter)
|
||||
if output_type == "text":
|
||||
device = TextConverter(
|
||||
rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter
|
||||
)
|
||||
|
||||
elif output_type == 'xml':
|
||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||
elif output_type == "xml":
|
||||
device = XMLConverter(
|
||||
rsrcmgr,
|
||||
outfp,
|
||||
codec=codec,
|
||||
laparams=laparams,
|
||||
imagewriter=imagewriter,
|
||||
stripcontrol=strip_control)
|
||||
stripcontrol=strip_control,
|
||||
)
|
||||
|
||||
elif output_type == 'html':
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||
layoutmode=layoutmode, laparams=laparams,
|
||||
imagewriter=imagewriter)
|
||||
elif output_type == "html":
|
||||
device = HTMLConverter(
|
||||
rsrcmgr,
|
||||
outfp,
|
||||
codec=codec,
|
||||
scale=scale,
|
||||
layoutmode=layoutmode,
|
||||
laparams=laparams,
|
||||
imagewriter=imagewriter,
|
||||
)
|
||||
|
||||
elif output_type == 'tag':
|
||||
elif output_type == "tag":
|
||||
# Binary I/O is required, but we have no good way to test it here.
|
||||
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
|
||||
|
||||
else:
|
||||
msg = f"Output type can be text, html, xml or tag but is " \
|
||||
f"{output_type}"
|
||||
msg = f"Output type can be text, html, xml or tag but is " f"{output_type}"
|
||||
raise ValueError(msg)
|
||||
|
||||
assert device is not None
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
for page in PDFPage.get_pages(inf,
|
||||
for page in PDFPage.get_pages(
|
||||
inf,
|
||||
page_numbers,
|
||||
maxpages=maxpages,
|
||||
password=password,
|
||||
caching=not disable_caching):
|
||||
caching=not disable_caching,
|
||||
):
|
||||
page.rotate = (page.rotate + rotation) % 360
|
||||
interpreter.process_page(page)
|
||||
|
||||
|
@ -113,12 +125,12 @@ def extract_text_to_fp(
|
|||
|
||||
def extract_text(
|
||||
pdf_file: FileOrName,
|
||||
password: str = '',
|
||||
password: str = "",
|
||||
page_numbers: Optional[Container[int]] = None,
|
||||
maxpages: int = 0,
|
||||
caching: bool = True,
|
||||
codec: str = 'utf-8',
|
||||
laparams: Optional[LAParams] = None
|
||||
codec: str = "utf-8",
|
||||
laparams: Optional[LAParams] = None,
|
||||
) -> str:
|
||||
"""Parse and return the text contained in a PDF file.
|
||||
|
||||
|
@ -139,8 +151,7 @@ def extract_text(
|
|||
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
|
||||
fp = cast(BinaryIO, fp) # we opened in binary mode
|
||||
rsrcmgr = PDFResourceManager(caching=caching)
|
||||
device = TextConverter(rsrcmgr, output_string, codec=codec,
|
||||
laparams=laparams)
|
||||
device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
|
||||
for page in PDFPage.get_pages(
|
||||
|
@ -157,11 +168,11 @@ def extract_text(
|
|||
|
||||
def extract_pages(
|
||||
pdf_file: FileOrName,
|
||||
password: str = '',
|
||||
password: str = "",
|
||||
page_numbers: Optional[Container[int]] = None,
|
||||
maxpages: int = 0,
|
||||
caching: bool = True,
|
||||
laparams: Optional[LAParams] = None
|
||||
laparams: Optional[LAParams] = None,
|
||||
) -> Iterator[LTPage]:
|
||||
"""Extract and yield LTPage objects
|
||||
|
||||
|
@ -183,8 +194,9 @@ def extract_pages(
|
|||
resource_manager = PDFResourceManager(caching=caching)
|
||||
device = PDFPageAggregator(resource_manager, laparams=laparams)
|
||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||
for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
|
||||
password=password, caching=caching):
|
||||
for page in PDFPage.get_pages(
|
||||
fp, page_numbers, maxpages=maxpages, password=password, caching=caching
|
||||
):
|
||||
interpreter.process_page(page)
|
||||
layout = device.get_result()
|
||||
yield layout
|
||||
|
|
|
@ -9,8 +9,7 @@ from .layout import LTImage
|
|||
from .pdfcolor import LITERAL_DEVICE_CMYK
|
||||
from .pdfcolor import LITERAL_DEVICE_GRAY
|
||||
from .pdfcolor import LITERAL_DEVICE_RGB
|
||||
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, \
|
||||
LITERALS_JPX_DECODE
|
||||
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_JPX_DECODE
|
||||
|
||||
|
||||
def align32(x: int) -> int:
|
||||
|
@ -18,13 +17,7 @@ def align32(x: int) -> int:
|
|||
|
||||
|
||||
class BMPWriter:
|
||||
def __init__(
|
||||
self,
|
||||
fp: BinaryIO,
|
||||
bits: int,
|
||||
width: int,
|
||||
height: int
|
||||
) -> None:
|
||||
def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
|
||||
self.fp = fp
|
||||
self.bits = bits
|
||||
self.width = width
|
||||
|
@ -40,22 +33,35 @@ class BMPWriter:
|
|||
self.linesize = align32((self.width * self.bits + 7) // 8)
|
||||
self.datasize = self.linesize * self.height
|
||||
headersize = 14 + 40 + ncols * 4
|
||||
info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height,
|
||||
1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
|
||||
info = struct.pack(
|
||||
"<IiiHHIIIIII",
|
||||
40,
|
||||
self.width,
|
||||
self.height,
|
||||
1,
|
||||
self.bits,
|
||||
0,
|
||||
self.datasize,
|
||||
0,
|
||||
0,
|
||||
ncols,
|
||||
0,
|
||||
)
|
||||
assert len(info) == 40, str(len(info))
|
||||
header = struct.pack('<ccIHHI', b'B', b'M',
|
||||
headersize+self.datasize, 0, 0, headersize)
|
||||
header = struct.pack(
|
||||
"<ccIHHI", b"B", b"M", headersize + self.datasize, 0, 0, headersize
|
||||
)
|
||||
assert len(header) == 14, str(len(header))
|
||||
self.fp.write(header)
|
||||
self.fp.write(info)
|
||||
if ncols == 2:
|
||||
# B&W color table
|
||||
for i in (0, 255):
|
||||
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||
self.fp.write(struct.pack("BBBx", i, i, i))
|
||||
elif ncols == 256:
|
||||
# grayscale color table
|
||||
for i in range(256):
|
||||
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||
self.fp.write(struct.pack("BBBx", i, i, i))
|
||||
self.pos0 = self.fp.tell()
|
||||
self.pos1 = self.pos0 + self.datasize
|
||||
|
||||
|
@ -80,43 +86,46 @@ class ImageWriter:
|
|||
|
||||
is_jbig2 = self.is_jbig2_image(image)
|
||||
ext = self._get_image_extension(image, width, height, is_jbig2)
|
||||
name, path = self._create_unique_image_name(self.outdir,
|
||||
image.name, ext)
|
||||
name, path = self._create_unique_image_name(self.outdir, image.name, ext)
|
||||
|
||||
fp = open(path, 'wb')
|
||||
if ext == '.jpg':
|
||||
fp = open(path, "wb")
|
||||
if ext == ".jpg":
|
||||
raw_data = image.stream.get_rawdata()
|
||||
assert raw_data is not None
|
||||
if LITERAL_DEVICE_CMYK in image.colorspace:
|
||||
from PIL import Image # type: ignore[import]
|
||||
from PIL import ImageChops
|
||||
|
||||
ifp = BytesIO(raw_data)
|
||||
i = Image.open(ifp)
|
||||
i = ImageChops.invert(i)
|
||||
i = i.convert('RGB')
|
||||
i.save(fp, 'JPEG')
|
||||
i = i.convert("RGB")
|
||||
i.save(fp, "JPEG")
|
||||
else:
|
||||
fp.write(raw_data)
|
||||
elif ext == '.jp2':
|
||||
elif ext == ".jp2":
|
||||
# if we just write the raw data, most image programs
|
||||
# that I have tried cannot open the file. However,
|
||||
# open and saving with PIL produces a file that
|
||||
# seems to be easily opened by other programs
|
||||
from PIL import Image
|
||||
|
||||
raw_data = image.stream.get_rawdata()
|
||||
assert raw_data is not None
|
||||
ifp = BytesIO(raw_data)
|
||||
i = Image.open(ifp)
|
||||
i.save(fp, 'JPEG2000')
|
||||
i.save(fp, "JPEG2000")
|
||||
elif is_jbig2:
|
||||
input_stream = BytesIO()
|
||||
global_streams = self.jbig2_global(image)
|
||||
if len(global_streams) > 1:
|
||||
msg = 'There should never be more than one JBIG2Globals ' \
|
||||
'associated with a JBIG2 embedded image'
|
||||
msg = (
|
||||
"There should never be more than one JBIG2Globals "
|
||||
"associated with a JBIG2 embedded image"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
if len(global_streams) == 1:
|
||||
input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
|
||||
input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
|
||||
input_stream.write(image.stream.get_data())
|
||||
input_stream.seek(0)
|
||||
reader = JBIG2StreamReader(input_stream)
|
||||
|
@ -168,43 +177,42 @@ class ImageWriter:
|
|||
filters = image.stream.get_filters()
|
||||
for filter_name, params in filters:
|
||||
if filter_name in LITERALS_JBIG2_DECODE:
|
||||
global_streams.append(params['JBIG2Globals'].resolve())
|
||||
global_streams.append(params["JBIG2Globals"].resolve())
|
||||
return global_streams
|
||||
|
||||
@staticmethod
def _get_image_extension(
    image: LTImage, width: int, height: int, is_jbig2: bool
) -> str:
    """Choose the file extension used when exporting ``image``.

    The result depends on the stream filters, the ``is_jbig2`` flag, the
    bit depth and the colour space; the bitmap and raw fallbacks embed the
    dimensions in the extension.

    :param image: image whose stream filters, bits and colorspace are read.
    :param width: width embedded in ``.bmp`` / ``.img`` extensions.
    :param height: height embedded in ``.bmp`` / ``.img`` extensions.
    :param is_jbig2: whether the caller identified the image as JBIG2.
    :return: extension string including the leading dot.
    """
    filters = image.stream.get_filters()
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        ext = ".jpg"
    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        ext = ".jp2"
    elif is_jbig2:
        ext = ".jb2"
    elif (
        # ``and`` binds tighter than ``or``: 1-bit images always match,
        # 8-bit images only for the RGB or gray colour spaces.
        image.bits == 1
        or image.bits == 8
        and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_DEVICE_GRAY in image.colorspace
        )
    ):
        ext = ".%dx%d.bmp" % (width, height)
    else:
        ext = ".%d.%dx%d.img" % (image.bits, width, height)
    return ext
|
||||
|
||||
@staticmethod
|
||||
def _create_unique_image_name(
|
||||
dirname: str,
|
||||
image_name: str,
|
||||
ext: str
|
||||
dirname: str, image_name: str, ext: str
|
||||
) -> Tuple[str, str]:
|
||||
name = image_name + ext
|
||||
path = os.path.join(dirname, name)
|
||||
img_index = 0
|
||||
while os.path.exists(path):
|
||||
name = '%s.%d%s' % (image_name, img_index, ext)
|
||||
name = "%s.%d%s" % (image_name, img_index, ext)
|
||||
path = os.path.join(dirname, name)
|
||||
img_index += 1
|
||||
return name, path
|
||||
|
|
|
@ -19,10 +19,10 @@ HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000
|
|||
SEG_TYPE_MASK = 0b00111111
|
||||
|
||||
REF_COUNT_SHORT_MASK = 0b11100000
|
||||
REF_COUNT_LONG_MASK = 0x1fffffff
|
||||
REF_COUNT_LONG_MASK = 0x1FFFFFFF
|
||||
REF_COUNT_LONG = 7
|
||||
|
||||
DATA_LEN_UNKNOWN = 0xffffffff
|
||||
DATA_LEN_UNKNOWN = 0xFFFFFFFF
|
||||
|
||||
# segment types
|
||||
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
|
||||
|
@ -30,7 +30,7 @@ SEG_TYPE_END_OF_PAGE = 49
|
|||
SEG_TYPE_END_OF_FILE = 51
|
||||
|
||||
# file literals
|
||||
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
|
||||
FILE_HEADER_ID = b"\x97\x4A\x42\x32\x0D\x0A\x1A\x0A"
|
||||
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
|
||||
|
||||
|
||||
|
@ -66,12 +66,14 @@ def unpack_int(format: str, buffer: bytes) -> int:
|
|||
|
||||
JBIG2SegmentFlags = Dict[str, Union[int, bool]]
|
||||
JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
|
||||
JBIG2Segment = Dict[str, Union[bool, int, bytes, JBIG2SegmentFlags,
|
||||
JBIG2RetentionFlags]]
|
||||
JBIG2Segment = Dict[
|
||||
str, Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags]
|
||||
]
|
||||
|
||||
|
||||
class JBIG2StreamReader:
|
||||
"""Read segments from a JBIG2 byte stream"""
|
||||
|
||||
def __init__(self, stream: BinaryIO) -> None:
|
||||
self.stream = stream
|
||||
|
||||
|
@ -96,29 +98,23 @@ class JBIG2StreamReader:
|
|||
return segments
|
||||
|
||||
def is_eof(self) -> bool:
|
||||
if self.stream.read(1) == b'':
|
||||
if self.stream.read(1) == b"":
|
||||
return True
|
||||
else:
|
||||
self.stream.seek(-1, os.SEEK_CUR)
|
||||
return False
|
||||
|
||||
def parse_flags(
|
||||
self,
|
||||
segment: JBIG2Segment,
|
||||
flags: int,
|
||||
field: bytes
|
||||
self, segment: JBIG2Segment, flags: int, field: bytes
|
||||
) -> JBIG2SegmentFlags:
|
||||
return {
|
||||
"deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
|
||||
"page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
|
||||
"type": masked_value(SEG_TYPE_MASK, flags)
|
||||
"type": masked_value(SEG_TYPE_MASK, flags),
|
||||
}
|
||||
|
||||
def parse_retention_flags(
|
||||
self,
|
||||
segment: JBIG2Segment,
|
||||
flags: int,
|
||||
field: bytes
|
||||
self, segment: JBIG2Segment, flags: int, field: bytes
|
||||
) -> JBIG2RetentionFlags:
|
||||
ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
|
||||
retain_segments = []
|
||||
|
@ -159,31 +155,23 @@ class JBIG2StreamReader:
|
|||
"ref_segments": ref_segments,
|
||||
}
|
||||
|
||||
def parse_page_assoc(
|
||||
self,
|
||||
segment: JBIG2Segment,
|
||||
page: int,
|
||||
field: bytes
|
||||
) -> int:
|
||||
def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
|
||||
if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
|
||||
field += self.stream.read(3)
|
||||
page = unpack_int(">L", field)
|
||||
return page
|
||||
|
||||
def parse_data_length(
|
||||
self,
|
||||
segment: JBIG2Segment,
|
||||
length: int,
|
||||
field: bytes
|
||||
self, segment: JBIG2Segment, length: int, field: bytes
|
||||
) -> int:
|
||||
if length:
|
||||
if (cast(JBIG2SegmentFlags, segment["flags"])["type"] ==
|
||||
SEG_TYPE_IMMEDIATE_GEN_REGION) \
|
||||
and (length == DATA_LEN_UNKNOWN):
|
||||
if (
|
||||
cast(JBIG2SegmentFlags, segment["flags"])["type"]
|
||||
== SEG_TYPE_IMMEDIATE_GEN_REGION
|
||||
) and (length == DATA_LEN_UNKNOWN):
|
||||
|
||||
raise NotImplementedError(
|
||||
"Working with unknown segment length "
|
||||
"is not implemented yet"
|
||||
"Working with unknown segment length " "is not implemented yet"
|
||||
)
|
||||
else:
|
||||
segment["raw_data"] = self.stream.read(length)
|
||||
|
@ -195,18 +183,16 @@ class JBIG2StreamWriter:
|
|||
"""Write JBIG2 segments to a file in JBIG2 format"""
|
||||
|
||||
EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
|
||||
'ref_count': 0,
|
||||
'ref_segments': cast(List[int], []),
|
||||
'retain_segments': cast(List[bool], [])
|
||||
"ref_count": 0,
|
||||
"ref_segments": cast(List[int], []),
|
||||
"retain_segments": cast(List[bool], []),
|
||||
}
|
||||
|
||||
def __init__(self, stream: BinaryIO) -> None:
|
||||
self.stream = stream
|
||||
|
||||
def write_segments(
|
||||
self,
|
||||
segments: Iterable[JBIG2Segment],
|
||||
fix_last_page: bool = True
|
||||
self, segments: Iterable[JBIG2Segment], fix_last_page: bool = True
|
||||
) -> int:
|
||||
data_len = 0
|
||||
current_page: Optional[int] = None
|
||||
|
@ -222,8 +208,10 @@ class JBIG2StreamWriter:
|
|||
if fix_last_page:
|
||||
seg_page = cast(int, segment.get("page_assoc"))
|
||||
|
||||
if cast(JBIG2SegmentFlags, segment["flags"])["type"] == \
|
||||
SEG_TYPE_END_OF_PAGE:
|
||||
if (
|
||||
cast(JBIG2SegmentFlags, segment["flags"])["type"]
|
||||
== SEG_TYPE_END_OF_PAGE
|
||||
):
|
||||
current_page = None
|
||||
elif seg_page:
|
||||
current_page = seg_page
|
||||
|
@ -237,9 +225,7 @@ class JBIG2StreamWriter:
|
|||
return data_len
|
||||
|
||||
def write_file(
|
||||
self,
|
||||
segments: Iterable[JBIG2Segment],
|
||||
fix_last_page: bool = True
|
||||
self, segments: Iterable[JBIG2Segment], fix_last_page: bool = True
|
||||
) -> int:
|
||||
header = FILE_HEADER_ID
|
||||
header_flags = FILE_HEAD_FLAG_SEQUENTIAL
|
||||
|
@ -270,7 +256,7 @@ class JBIG2StreamWriter:
|
|||
return data_len
|
||||
|
||||
def encode_segment(self, segment: JBIG2Segment) -> bytes:
|
||||
data = b''
|
||||
data = b""
|
||||
for field_format, name in SEG_STRUCT:
|
||||
value = segment.get(name)
|
||||
encoder = getattr(self, "encode_%s" % name, None)
|
||||
|
@ -281,27 +267,26 @@ class JBIG2StreamWriter:
|
|||
data += field
|
||||
return data
|
||||
|
||||
def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment
|
||||
) -> bytes:
|
||||
def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
|
||||
flags = 0
|
||||
if value.get("deferred"):
|
||||
flags |= HEADER_FLAG_DEFERRED
|
||||
|
||||
if "page_assoc_long" in value:
|
||||
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \
|
||||
if value["page_assoc_long"] else flags
|
||||
flags |= HEADER_FLAG_PAGE_ASSOC_LONG if value["page_assoc_long"] else flags
|
||||
else:
|
||||
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \
|
||||
if cast(int, segment.get("page", 0)) > 255 else flags
|
||||
flags |= (
|
||||
HEADER_FLAG_PAGE_ASSOC_LONG
|
||||
if cast(int, segment.get("page", 0)) > 255
|
||||
else flags
|
||||
)
|
||||
|
||||
flags |= mask_value(SEG_TYPE_MASK, value["type"])
|
||||
|
||||
return pack(">B", flags)
|
||||
|
||||
def encode_retention_flags(
|
||||
self,
|
||||
value: JBIG2RetentionFlags,
|
||||
segment: JBIG2Segment
|
||||
self, value: JBIG2RetentionFlags, segment: JBIG2Segment
|
||||
) -> bytes:
|
||||
flags = []
|
||||
flags_format = ">B"
|
||||
|
@ -318,10 +303,7 @@ class JBIG2StreamWriter:
|
|||
else:
|
||||
bytes_count = math.ceil((ref_count + 1) / 8)
|
||||
flags_format = ">L" + ("B" * bytes_count)
|
||||
flags_dword = mask_value(
|
||||
REF_COUNT_SHORT_MASK,
|
||||
REF_COUNT_LONG
|
||||
) << 24
|
||||
flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
|
||||
flags.append(flags_dword)
|
||||
|
||||
for byte_index in range(bytes_count):
|
||||
|
@ -353,26 +335,22 @@ class JBIG2StreamWriter:
|
|||
data += cast(bytes, segment["raw_data"])
|
||||
return data
|
||||
|
||||
def get_eop_segment(
|
||||
self,
|
||||
seg_number: int,
|
||||
page_number: int
|
||||
) -> JBIG2Segment:
|
||||
def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
|
||||
return {
|
||||
'data_length': 0,
|
||||
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
|
||||
'number': seg_number,
|
||||
'page_assoc': page_number,
|
||||
'raw_data': b'',
|
||||
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS
|
||||
"data_length": 0,
|
||||
"flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
|
||||
"number": seg_number,
|
||||
"page_assoc": page_number,
|
||||
"raw_data": b"",
|
||||
"retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
|
||||
}
|
||||
|
||||
def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
|
||||
return {
|
||||
'data_length': 0,
|
||||
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
|
||||
'number': seg_number,
|
||||
'page_assoc': 0,
|
||||
'raw_data': b'',
|
||||
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS
|
||||
"data_length": 0,
|
||||
"flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
|
||||
"number": seg_number,
|
||||
"page_assoc": 0,
|
||||
"raw_data": b"",
|
||||
"retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
|
||||
}
|
||||
|
|
|
@ -7,241 +7,240 @@ This table is extracted from PDF Reference Manual 1.6, pp.925
|
|||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
EncodingRow = \
|
||||
Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
|
||||
EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
|
||||
|
||||
ENCODING: List[EncodingRow] = [
|
||||
# (name, std, mac, win, pdf)
|
||||
('A', 65, 65, 65, 65),
|
||||
('AE', 225, 174, 198, 198),
|
||||
('Aacute', None, 231, 193, 193),
|
||||
('Acircumflex', None, 229, 194, 194),
|
||||
('Adieresis', None, 128, 196, 196),
|
||||
('Agrave', None, 203, 192, 192),
|
||||
('Aring', None, 129, 197, 197),
|
||||
('Atilde', None, 204, 195, 195),
|
||||
('B', 66, 66, 66, 66),
|
||||
('C', 67, 67, 67, 67),
|
||||
('Ccedilla', None, 130, 199, 199),
|
||||
('D', 68, 68, 68, 68),
|
||||
('E', 69, 69, 69, 69),
|
||||
('Eacute', None, 131, 201, 201),
|
||||
('Ecircumflex', None, 230, 202, 202),
|
||||
('Edieresis', None, 232, 203, 203),
|
||||
('Egrave', None, 233, 200, 200),
|
||||
('Eth', None, None, 208, 208),
|
||||
('Euro', None, None, 128, 160),
|
||||
('F', 70, 70, 70, 70),
|
||||
('G', 71, 71, 71, 71),
|
||||
('H', 72, 72, 72, 72),
|
||||
('I', 73, 73, 73, 73),
|
||||
('Iacute', None, 234, 205, 205),
|
||||
('Icircumflex', None, 235, 206, 206),
|
||||
('Idieresis', None, 236, 207, 207),
|
||||
('Igrave', None, 237, 204, 204),
|
||||
('J', 74, 74, 74, 74),
|
||||
('K', 75, 75, 75, 75),
|
||||
('L', 76, 76, 76, 76),
|
||||
('Lslash', 232, None, None, 149),
|
||||
('M', 77, 77, 77, 77),
|
||||
('N', 78, 78, 78, 78),
|
||||
('Ntilde', None, 132, 209, 209),
|
||||
('O', 79, 79, 79, 79),
|
||||
('OE', 234, 206, 140, 150),
|
||||
('Oacute', None, 238, 211, 211),
|
||||
('Ocircumflex', None, 239, 212, 212),
|
||||
('Odieresis', None, 133, 214, 214),
|
||||
('Ograve', None, 241, 210, 210),
|
||||
('Oslash', 233, 175, 216, 216),
|
||||
('Otilde', None, 205, 213, 213),
|
||||
('P', 80, 80, 80, 80),
|
||||
('Q', 81, 81, 81, 81),
|
||||
('R', 82, 82, 82, 82),
|
||||
('S', 83, 83, 83, 83),
|
||||
('Scaron', None, None, 138, 151),
|
||||
('T', 84, 84, 84, 84),
|
||||
('Thorn', None, None, 222, 222),
|
||||
('U', 85, 85, 85, 85),
|
||||
('Uacute', None, 242, 218, 218),
|
||||
('Ucircumflex', None, 243, 219, 219),
|
||||
('Udieresis', None, 134, 220, 220),
|
||||
('Ugrave', None, 244, 217, 217),
|
||||
('V', 86, 86, 86, 86),
|
||||
('W', 87, 87, 87, 87),
|
||||
('X', 88, 88, 88, 88),
|
||||
('Y', 89, 89, 89, 89),
|
||||
('Yacute', None, None, 221, 221),
|
||||
('Ydieresis', None, 217, 159, 152),
|
||||
('Z', 90, 90, 90, 90),
|
||||
('Zcaron', None, None, 142, 153),
|
||||
('a', 97, 97, 97, 97),
|
||||
('aacute', None, 135, 225, 225),
|
||||
('acircumflex', None, 137, 226, 226),
|
||||
('acute', 194, 171, 180, 180),
|
||||
('adieresis', None, 138, 228, 228),
|
||||
('ae', 241, 190, 230, 230),
|
||||
('agrave', None, 136, 224, 224),
|
||||
('ampersand', 38, 38, 38, 38),
|
||||
('aring', None, 140, 229, 229),
|
||||
('asciicircum', 94, 94, 94, 94),
|
||||
('asciitilde', 126, 126, 126, 126),
|
||||
('asterisk', 42, 42, 42, 42),
|
||||
('at', 64, 64, 64, 64),
|
||||
('atilde', None, 139, 227, 227),
|
||||
('b', 98, 98, 98, 98),
|
||||
('backslash', 92, 92, 92, 92),
|
||||
('bar', 124, 124, 124, 124),
|
||||
('braceleft', 123, 123, 123, 123),
|
||||
('braceright', 125, 125, 125, 125),
|
||||
('bracketleft', 91, 91, 91, 91),
|
||||
('bracketright', 93, 93, 93, 93),
|
||||
('breve', 198, 249, None, 24),
|
||||
('brokenbar', None, None, 166, 166),
|
||||
('bullet', 183, 165, 149, 128),
|
||||
('c', 99, 99, 99, 99),
|
||||
('caron', 207, 255, None, 25),
|
||||
('ccedilla', None, 141, 231, 231),
|
||||
('cedilla', 203, 252, 184, 184),
|
||||
('cent', 162, 162, 162, 162),
|
||||
('circumflex', 195, 246, 136, 26),
|
||||
('colon', 58, 58, 58, 58),
|
||||
('comma', 44, 44, 44, 44),
|
||||
('copyright', None, 169, 169, 169),
|
||||
('currency', 168, 219, 164, 164),
|
||||
('d', 100, 100, 100, 100),
|
||||
('dagger', 178, 160, 134, 129),
|
||||
('daggerdbl', 179, 224, 135, 130),
|
||||
('degree', None, 161, 176, 176),
|
||||
('dieresis', 200, 172, 168, 168),
|
||||
('divide', None, 214, 247, 247),
|
||||
('dollar', 36, 36, 36, 36),
|
||||
('dotaccent', 199, 250, None, 27),
|
||||
('dotlessi', 245, 245, None, 154),
|
||||
('e', 101, 101, 101, 101),
|
||||
('eacute', None, 142, 233, 233),
|
||||
('ecircumflex', None, 144, 234, 234),
|
||||
('edieresis', None, 145, 235, 235),
|
||||
('egrave', None, 143, 232, 232),
|
||||
('eight', 56, 56, 56, 56),
|
||||
('ellipsis', 188, 201, 133, 131),
|
||||
('emdash', 208, 209, 151, 132),
|
||||
('endash', 177, 208, 150, 133),
|
||||
('equal', 61, 61, 61, 61),
|
||||
('eth', None, None, 240, 240),
|
||||
('exclam', 33, 33, 33, 33),
|
||||
('exclamdown', 161, 193, 161, 161),
|
||||
('f', 102, 102, 102, 102),
|
||||
('fi', 174, 222, None, 147),
|
||||
('five', 53, 53, 53, 53),
|
||||
('fl', 175, 223, None, 148),
|
||||
('florin', 166, 196, 131, 134),
|
||||
('four', 52, 52, 52, 52),
|
||||
('fraction', 164, 218, None, 135),
|
||||
('g', 103, 103, 103, 103),
|
||||
('germandbls', 251, 167, 223, 223),
|
||||
('grave', 193, 96, 96, 96),
|
||||
('greater', 62, 62, 62, 62),
|
||||
('guillemotleft', 171, 199, 171, 171),
|
||||
('guillemotright', 187, 200, 187, 187),
|
||||
('guilsinglleft', 172, 220, 139, 136),
|
||||
('guilsinglright', 173, 221, 155, 137),
|
||||
('h', 104, 104, 104, 104),
|
||||
('hungarumlaut', 205, 253, None, 28),
|
||||
('hyphen', 45, 45, 45, 45),
|
||||
('i', 105, 105, 105, 105),
|
||||
('iacute', None, 146, 237, 237),
|
||||
('icircumflex', None, 148, 238, 238),
|
||||
('idieresis', None, 149, 239, 239),
|
||||
('igrave', None, 147, 236, 236),
|
||||
('j', 106, 106, 106, 106),
|
||||
('k', 107, 107, 107, 107),
|
||||
('l', 108, 108, 108, 108),
|
||||
('less', 60, 60, 60, 60),
|
||||
('logicalnot', None, 194, 172, 172),
|
||||
('lslash', 248, None, None, 155),
|
||||
('m', 109, 109, 109, 109),
|
||||
('macron', 197, 248, 175, 175),
|
||||
('minus', None, None, None, 138),
|
||||
('mu', None, 181, 181, 181),
|
||||
('multiply', None, None, 215, 215),
|
||||
('n', 110, 110, 110, 110),
|
||||
('nbspace', None, 202, 160, None),
|
||||
('nine', 57, 57, 57, 57),
|
||||
('ntilde', None, 150, 241, 241),
|
||||
('numbersign', 35, 35, 35, 35),
|
||||
('o', 111, 111, 111, 111),
|
||||
('oacute', None, 151, 243, 243),
|
||||
('ocircumflex', None, 153, 244, 244),
|
||||
('odieresis', None, 154, 246, 246),
|
||||
('oe', 250, 207, 156, 156),
|
||||
('ogonek', 206, 254, None, 29),
|
||||
('ograve', None, 152, 242, 242),
|
||||
('one', 49, 49, 49, 49),
|
||||
('onehalf', None, None, 189, 189),
|
||||
('onequarter', None, None, 188, 188),
|
||||
('onesuperior', None, None, 185, 185),
|
||||
('ordfeminine', 227, 187, 170, 170),
|
||||
('ordmasculine', 235, 188, 186, 186),
|
||||
('oslash', 249, 191, 248, 248),
|
||||
('otilde', None, 155, 245, 245),
|
||||
('p', 112, 112, 112, 112),
|
||||
('paragraph', 182, 166, 182, 182),
|
||||
('parenleft', 40, 40, 40, 40),
|
||||
('parenright', 41, 41, 41, 41),
|
||||
('percent', 37, 37, 37, 37),
|
||||
('period', 46, 46, 46, 46),
|
||||
('periodcentered', 180, 225, 183, 183),
|
||||
('perthousand', 189, 228, 137, 139),
|
||||
('plus', 43, 43, 43, 43),
|
||||
('plusminus', None, 177, 177, 177),
|
||||
('q', 113, 113, 113, 113),
|
||||
('question', 63, 63, 63, 63),
|
||||
('questiondown', 191, 192, 191, 191),
|
||||
('quotedbl', 34, 34, 34, 34),
|
||||
('quotedblbase', 185, 227, 132, 140),
|
||||
('quotedblleft', 170, 210, 147, 141),
|
||||
('quotedblright', 186, 211, 148, 142),
|
||||
('quoteleft', 96, 212, 145, 143),
|
||||
('quoteright', 39, 213, 146, 144),
|
||||
('quotesinglbase', 184, 226, 130, 145),
|
||||
('quotesingle', 169, 39, 39, 39),
|
||||
('r', 114, 114, 114, 114),
|
||||
('registered', None, 168, 174, 174),
|
||||
('ring', 202, 251, None, 30),
|
||||
('s', 115, 115, 115, 115),
|
||||
('scaron', None, None, 154, 157),
|
||||
('section', 167, 164, 167, 167),
|
||||
('semicolon', 59, 59, 59, 59),
|
||||
('seven', 55, 55, 55, 55),
|
||||
('six', 54, 54, 54, 54),
|
||||
('slash', 47, 47, 47, 47),
|
||||
('space', 32, 32, 32, 32),
|
||||
('space', None, 202, 160, None),
|
||||
('space', None, 202, 173, None),
|
||||
('sterling', 163, 163, 163, 163),
|
||||
('t', 116, 116, 116, 116),
|
||||
('thorn', None, None, 254, 254),
|
||||
('three', 51, 51, 51, 51),
|
||||
('threequarters', None, None, 190, 190),
|
||||
('threesuperior', None, None, 179, 179),
|
||||
('tilde', 196, 247, 152, 31),
|
||||
('trademark', None, 170, 153, 146),
|
||||
('two', 50, 50, 50, 50),
|
||||
('twosuperior', None, None, 178, 178),
|
||||
('u', 117, 117, 117, 117),
|
||||
('uacute', None, 156, 250, 250),
|
||||
('ucircumflex', None, 158, 251, 251),
|
||||
('udieresis', None, 159, 252, 252),
|
||||
('ugrave', None, 157, 249, 249),
|
||||
('underscore', 95, 95, 95, 95),
|
||||
('v', 118, 118, 118, 118),
|
||||
('w', 119, 119, 119, 119),
|
||||
('x', 120, 120, 120, 120),
|
||||
('y', 121, 121, 121, 121),
|
||||
('yacute', None, None, 253, 253),
|
||||
('ydieresis', None, 216, 255, 255),
|
||||
('yen', 165, 180, 165, 165),
|
||||
('z', 122, 122, 122, 122),
|
||||
('zcaron', None, None, 158, 158),
|
||||
('zero', 48, 48, 48, 48),
|
||||
("A", 65, 65, 65, 65),
|
||||
("AE", 225, 174, 198, 198),
|
||||
("Aacute", None, 231, 193, 193),
|
||||
("Acircumflex", None, 229, 194, 194),
|
||||
("Adieresis", None, 128, 196, 196),
|
||||
("Agrave", None, 203, 192, 192),
|
||||
("Aring", None, 129, 197, 197),
|
||||
("Atilde", None, 204, 195, 195),
|
||||
("B", 66, 66, 66, 66),
|
||||
("C", 67, 67, 67, 67),
|
||||
("Ccedilla", None, 130, 199, 199),
|
||||
("D", 68, 68, 68, 68),
|
||||
("E", 69, 69, 69, 69),
|
||||
("Eacute", None, 131, 201, 201),
|
||||
("Ecircumflex", None, 230, 202, 202),
|
||||
("Edieresis", None, 232, 203, 203),
|
||||
("Egrave", None, 233, 200, 200),
|
||||
("Eth", None, None, 208, 208),
|
||||
("Euro", None, None, 128, 160),
|
||||
("F", 70, 70, 70, 70),
|
||||
("G", 71, 71, 71, 71),
|
||||
("H", 72, 72, 72, 72),
|
||||
("I", 73, 73, 73, 73),
|
||||
("Iacute", None, 234, 205, 205),
|
||||
("Icircumflex", None, 235, 206, 206),
|
||||
("Idieresis", None, 236, 207, 207),
|
||||
("Igrave", None, 237, 204, 204),
|
||||
("J", 74, 74, 74, 74),
|
||||
("K", 75, 75, 75, 75),
|
||||
("L", 76, 76, 76, 76),
|
||||
("Lslash", 232, None, None, 149),
|
||||
("M", 77, 77, 77, 77),
|
||||
("N", 78, 78, 78, 78),
|
||||
("Ntilde", None, 132, 209, 209),
|
||||
("O", 79, 79, 79, 79),
|
||||
("OE", 234, 206, 140, 150),
|
||||
("Oacute", None, 238, 211, 211),
|
||||
("Ocircumflex", None, 239, 212, 212),
|
||||
("Odieresis", None, 133, 214, 214),
|
||||
("Ograve", None, 241, 210, 210),
|
||||
("Oslash", 233, 175, 216, 216),
|
||||
("Otilde", None, 205, 213, 213),
|
||||
("P", 80, 80, 80, 80),
|
||||
("Q", 81, 81, 81, 81),
|
||||
("R", 82, 82, 82, 82),
|
||||
("S", 83, 83, 83, 83),
|
||||
("Scaron", None, None, 138, 151),
|
||||
("T", 84, 84, 84, 84),
|
||||
("Thorn", None, None, 222, 222),
|
||||
("U", 85, 85, 85, 85),
|
||||
("Uacute", None, 242, 218, 218),
|
||||
("Ucircumflex", None, 243, 219, 219),
|
||||
("Udieresis", None, 134, 220, 220),
|
||||
("Ugrave", None, 244, 217, 217),
|
||||
("V", 86, 86, 86, 86),
|
||||
("W", 87, 87, 87, 87),
|
||||
("X", 88, 88, 88, 88),
|
||||
("Y", 89, 89, 89, 89),
|
||||
("Yacute", None, None, 221, 221),
|
||||
("Ydieresis", None, 217, 159, 152),
|
||||
("Z", 90, 90, 90, 90),
|
||||
("Zcaron", None, None, 142, 153),
|
||||
("a", 97, 97, 97, 97),
|
||||
("aacute", None, 135, 225, 225),
|
||||
("acircumflex", None, 137, 226, 226),
|
||||
("acute", 194, 171, 180, 180),
|
||||
("adieresis", None, 138, 228, 228),
|
||||
("ae", 241, 190, 230, 230),
|
||||
("agrave", None, 136, 224, 224),
|
||||
("ampersand", 38, 38, 38, 38),
|
||||
("aring", None, 140, 229, 229),
|
||||
("asciicircum", 94, 94, 94, 94),
|
||||
("asciitilde", 126, 126, 126, 126),
|
||||
("asterisk", 42, 42, 42, 42),
|
||||
("at", 64, 64, 64, 64),
|
||||
("atilde", None, 139, 227, 227),
|
||||
("b", 98, 98, 98, 98),
|
||||
("backslash", 92, 92, 92, 92),
|
||||
("bar", 124, 124, 124, 124),
|
||||
("braceleft", 123, 123, 123, 123),
|
||||
("braceright", 125, 125, 125, 125),
|
||||
("bracketleft", 91, 91, 91, 91),
|
||||
("bracketright", 93, 93, 93, 93),
|
||||
("breve", 198, 249, None, 24),
|
||||
("brokenbar", None, None, 166, 166),
|
||||
("bullet", 183, 165, 149, 128),
|
||||
("c", 99, 99, 99, 99),
|
||||
("caron", 207, 255, None, 25),
|
||||
("ccedilla", None, 141, 231, 231),
|
||||
("cedilla", 203, 252, 184, 184),
|
||||
("cent", 162, 162, 162, 162),
|
||||
("circumflex", 195, 246, 136, 26),
|
||||
("colon", 58, 58, 58, 58),
|
||||
("comma", 44, 44, 44, 44),
|
||||
("copyright", None, 169, 169, 169),
|
||||
("currency", 168, 219, 164, 164),
|
||||
("d", 100, 100, 100, 100),
|
||||
("dagger", 178, 160, 134, 129),
|
||||
("daggerdbl", 179, 224, 135, 130),
|
||||
("degree", None, 161, 176, 176),
|
||||
("dieresis", 200, 172, 168, 168),
|
||||
("divide", None, 214, 247, 247),
|
||||
("dollar", 36, 36, 36, 36),
|
||||
("dotaccent", 199, 250, None, 27),
|
||||
("dotlessi", 245, 245, None, 154),
|
||||
("e", 101, 101, 101, 101),
|
||||
("eacute", None, 142, 233, 233),
|
||||
("ecircumflex", None, 144, 234, 234),
|
||||
("edieresis", None, 145, 235, 235),
|
||||
("egrave", None, 143, 232, 232),
|
||||
("eight", 56, 56, 56, 56),
|
||||
("ellipsis", 188, 201, 133, 131),
|
||||
("emdash", 208, 209, 151, 132),
|
||||
("endash", 177, 208, 150, 133),
|
||||
("equal", 61, 61, 61, 61),
|
||||
("eth", None, None, 240, 240),
|
||||
("exclam", 33, 33, 33, 33),
|
||||
("exclamdown", 161, 193, 161, 161),
|
||||
("f", 102, 102, 102, 102),
|
||||
("fi", 174, 222, None, 147),
|
||||
("five", 53, 53, 53, 53),
|
||||
("fl", 175, 223, None, 148),
|
||||
("florin", 166, 196, 131, 134),
|
||||
("four", 52, 52, 52, 52),
|
||||
("fraction", 164, 218, None, 135),
|
||||
("g", 103, 103, 103, 103),
|
||||
("germandbls", 251, 167, 223, 223),
|
||||
("grave", 193, 96, 96, 96),
|
||||
("greater", 62, 62, 62, 62),
|
||||
("guillemotleft", 171, 199, 171, 171),
|
||||
("guillemotright", 187, 200, 187, 187),
|
||||
("guilsinglleft", 172, 220, 139, 136),
|
||||
("guilsinglright", 173, 221, 155, 137),
|
||||
("h", 104, 104, 104, 104),
|
||||
("hungarumlaut", 205, 253, None, 28),
|
||||
("hyphen", 45, 45, 45, 45),
|
||||
("i", 105, 105, 105, 105),
|
||||
("iacute", None, 146, 237, 237),
|
||||
("icircumflex", None, 148, 238, 238),
|
||||
("idieresis", None, 149, 239, 239),
|
||||
("igrave", None, 147, 236, 236),
|
||||
("j", 106, 106, 106, 106),
|
||||
("k", 107, 107, 107, 107),
|
||||
("l", 108, 108, 108, 108),
|
||||
("less", 60, 60, 60, 60),
|
||||
("logicalnot", None, 194, 172, 172),
|
||||
("lslash", 248, None, None, 155),
|
||||
("m", 109, 109, 109, 109),
|
||||
("macron", 197, 248, 175, 175),
|
||||
("minus", None, None, None, 138),
|
||||
("mu", None, 181, 181, 181),
|
||||
("multiply", None, None, 215, 215),
|
||||
("n", 110, 110, 110, 110),
|
||||
("nbspace", None, 202, 160, None),
|
||||
("nine", 57, 57, 57, 57),
|
||||
("ntilde", None, 150, 241, 241),
|
||||
("numbersign", 35, 35, 35, 35),
|
||||
("o", 111, 111, 111, 111),
|
||||
("oacute", None, 151, 243, 243),
|
||||
("ocircumflex", None, 153, 244, 244),
|
||||
("odieresis", None, 154, 246, 246),
|
||||
("oe", 250, 207, 156, 156),
|
||||
("ogonek", 206, 254, None, 29),
|
||||
("ograve", None, 152, 242, 242),
|
||||
("one", 49, 49, 49, 49),
|
||||
("onehalf", None, None, 189, 189),
|
||||
("onequarter", None, None, 188, 188),
|
||||
("onesuperior", None, None, 185, 185),
|
||||
("ordfeminine", 227, 187, 170, 170),
|
||||
("ordmasculine", 235, 188, 186, 186),
|
||||
("oslash", 249, 191, 248, 248),
|
||||
("otilde", None, 155, 245, 245),
|
||||
("p", 112, 112, 112, 112),
|
||||
("paragraph", 182, 166, 182, 182),
|
||||
("parenleft", 40, 40, 40, 40),
|
||||
("parenright", 41, 41, 41, 41),
|
||||
("percent", 37, 37, 37, 37),
|
||||
("period", 46, 46, 46, 46),
|
||||
("periodcentered", 180, 225, 183, 183),
|
||||
("perthousand", 189, 228, 137, 139),
|
||||
("plus", 43, 43, 43, 43),
|
||||
("plusminus", None, 177, 177, 177),
|
||||
("q", 113, 113, 113, 113),
|
||||
("question", 63, 63, 63, 63),
|
||||
("questiondown", 191, 192, 191, 191),
|
||||
("quotedbl", 34, 34, 34, 34),
|
||||
("quotedblbase", 185, 227, 132, 140),
|
||||
("quotedblleft", 170, 210, 147, 141),
|
||||
("quotedblright", 186, 211, 148, 142),
|
||||
("quoteleft", 96, 212, 145, 143),
|
||||
("quoteright", 39, 213, 146, 144),
|
||||
("quotesinglbase", 184, 226, 130, 145),
|
||||
("quotesingle", 169, 39, 39, 39),
|
||||
("r", 114, 114, 114, 114),
|
||||
("registered", None, 168, 174, 174),
|
||||
("ring", 202, 251, None, 30),
|
||||
("s", 115, 115, 115, 115),
|
||||
("scaron", None, None, 154, 157),
|
||||
("section", 167, 164, 167, 167),
|
||||
("semicolon", 59, 59, 59, 59),
|
||||
("seven", 55, 55, 55, 55),
|
||||
("six", 54, 54, 54, 54),
|
||||
("slash", 47, 47, 47, 47),
|
||||
("space", 32, 32, 32, 32),
|
||||
("space", None, 202, 160, None),
|
||||
("space", None, 202, 173, None),
|
||||
("sterling", 163, 163, 163, 163),
|
||||
("t", 116, 116, 116, 116),
|
||||
("thorn", None, None, 254, 254),
|
||||
("three", 51, 51, 51, 51),
|
||||
("threequarters", None, None, 190, 190),
|
||||
("threesuperior", None, None, 179, 179),
|
||||
("tilde", 196, 247, 152, 31),
|
||||
("trademark", None, 170, 153, 146),
|
||||
("two", 50, 50, 50, 50),
|
||||
("twosuperior", None, None, 178, 178),
|
||||
("u", 117, 117, 117, 117),
|
||||
("uacute", None, 156, 250, 250),
|
||||
("ucircumflex", None, 158, 251, 251),
|
||||
("udieresis", None, 159, 252, 252),
|
||||
("ugrave", None, 157, 249, 249),
|
||||
("underscore", 95, 95, 95, 95),
|
||||
("v", 118, 118, 118, 118),
|
||||
("w", 119, 119, 119, 119),
|
||||
("x", 120, 120, 120, 120),
|
||||
("y", 121, 121, 121, 121),
|
||||
("yacute", None, None, 253, 253),
|
||||
("ydieresis", None, 216, 255, 255),
|
||||
("yen", 165, 180, 165, 165),
|
||||
("z", 122, 122, 122, 122),
|
||||
("zcaron", None, None, 158, 158),
|
||||
("zero", 48, 48, 48, 48),
|
||||
]
|
||||
|
|
|
@ -1,7 +1,19 @@
|
|||
import heapq
|
||||
import logging
|
||||
from typing import (Dict, Generic, Iterable, Iterator, List, Optional,
|
||||
Sequence, Set, Tuple, TypeVar, Union, cast)
|
||||
from typing import (
|
||||
Dict,
|
||||
Generic,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Set,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
from .pdfcolor import PDFColorSpace
|
||||
from .pdffont import PDFFont
|
||||
|
@ -25,7 +37,6 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class IndexAssigner:
|
||||
|
||||
def __init__(self, index: int = 0) -> None:
|
||||
self.index = index
|
||||
|
||||
|
@ -74,7 +85,7 @@ class LAParams:
|
|||
word_margin: float = 0.1,
|
||||
boxes_flow: Optional[float] = 0.5,
|
||||
detect_vertical: bool = False,
|
||||
all_texts: bool = False
|
||||
all_texts: bool = False,
|
||||
) -> None:
|
||||
self.line_overlap = line_overlap
|
||||
self.char_margin = char_margin
|
||||
|
@ -88,19 +99,22 @@ class LAParams:
|
|||
|
||||
def _validate(self) -> None:
|
||||
if self.boxes_flow is not None:
|
||||
boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
|
||||
"number between -1 and +1")
|
||||
if not (isinstance(self.boxes_flow, int) or
|
||||
isinstance(self.boxes_flow, float)):
|
||||
boxes_flow_err_msg = (
|
||||
"LAParam boxes_flow should be None, or a " "number between -1 and +1"
|
||||
)
|
||||
if not (
|
||||
isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float)
|
||||
):
|
||||
raise TypeError(boxes_flow_err_msg)
|
||||
if not -1 <= self.boxes_flow <= 1:
|
||||
raise ValueError(boxes_flow_err_msg)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
||||
'word_margin=%.1f all_texts=%r>' % \
|
||||
(self.char_margin, self.line_margin, self.word_margin,
|
||||
self.all_texts)
|
||||
return (
|
||||
"<LAParams: char_margin=%.1f, line_margin=%.1f, "
|
||||
"word_margin=%.1f all_texts=%r>"
|
||||
% (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
|
||||
)
|
||||
|
||||
|
||||
class LTItem:
|
||||
|
@ -115,8 +129,7 @@ class LTText:
|
|||
"""Interface for things that have text"""
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s %r>' %
|
||||
(self.__class__.__name__, self.get_text()))
|
||||
return "<%s %r>" % (self.__class__.__name__, self.get_text())
|
||||
|
||||
def get_text(self) -> str:
|
||||
"""Text contained in this object"""
|
||||
|
@ -131,8 +144,7 @@ class LTComponent(LTItem):
|
|||
self.set_bbox(bbox)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s %s>' %
|
||||
(self.__class__.__name__, bbox2str(self.bbox)))
|
||||
return "<%s %s>" % (self.__class__.__name__, bbox2str(self.bbox))
|
||||
|
||||
# Disable comparison.
|
||||
def __lt__(self, _: object) -> bool:
|
||||
|
@ -208,7 +220,7 @@ class LTCurve(LTComponent):
|
|||
fill: bool = False,
|
||||
evenodd: bool = False,
|
||||
stroking_color: Optional[Color] = None,
|
||||
non_stroking_color: Optional[Color] = None
|
||||
non_stroking_color: Optional[Color] = None,
|
||||
) -> None:
|
||||
LTComponent.__init__(self, get_bound(pts))
|
||||
self.pts = pts
|
||||
|
@ -220,7 +232,7 @@ class LTCurve(LTComponent):
|
|||
self.non_stroking_color = non_stroking_color
|
||||
|
||||
def get_pts(self) -> str:
|
||||
return ','.join('%.3f,%.3f' % p for p in self.pts)
|
||||
return ",".join("%.3f,%.3f" % p for p in self.pts)
|
||||
|
||||
|
||||
class LTLine(LTCurve):
|
||||
|
@ -238,10 +250,18 @@ class LTLine(LTCurve):
|
|||
fill: bool = False,
|
||||
evenodd: bool = False,
|
||||
stroking_color: Optional[Color] = None,
|
||||
non_stroking_color: Optional[Color] = None
|
||||
non_stroking_color: Optional[Color] = None,
|
||||
) -> None:
|
||||
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd,
|
||||
stroking_color, non_stroking_color)
|
||||
LTCurve.__init__(
|
||||
self,
|
||||
linewidth,
|
||||
[p0, p1],
|
||||
stroke,
|
||||
fill,
|
||||
evenodd,
|
||||
stroking_color,
|
||||
non_stroking_color,
|
||||
)
|
||||
|
||||
|
||||
class LTRect(LTCurve):
|
||||
|
@ -258,12 +278,19 @@ class LTRect(LTCurve):
|
|||
fill: bool = False,
|
||||
evenodd: bool = False,
|
||||
stroking_color: Optional[Color] = None,
|
||||
non_stroking_color: Optional[Color] = None
|
||||
non_stroking_color: Optional[Color] = None,
|
||||
) -> None:
|
||||
(x0, y0, x1, y1) = bbox
|
||||
LTCurve.__init__(self, linewidth,
|
||||
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke,
|
||||
fill, evenodd, stroking_color, non_stroking_color)
|
||||
LTCurve.__init__(
|
||||
self,
|
||||
linewidth,
|
||||
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)],
|
||||
stroke,
|
||||
fill,
|
||||
evenodd,
|
||||
stroking_color,
|
||||
non_stroking_color,
|
||||
)
|
||||
|
||||
|
||||
class LTImage(LTComponent):
|
||||
|
@ -276,18 +303,20 @@ class LTImage(LTComponent):
|
|||
LTComponent.__init__(self, bbox)
|
||||
self.name = name
|
||||
self.stream = stream
|
||||
self.srcsize = (stream.get_any(('W', 'Width')),
|
||||
stream.get_any(('H', 'Height')))
|
||||
self.imagemask = stream.get_any(('IM', 'ImageMask'))
|
||||
self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
|
||||
self.colorspace = stream.get_any(('CS', 'ColorSpace'))
|
||||
self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height")))
|
||||
self.imagemask = stream.get_any(("IM", "ImageMask"))
|
||||
self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
|
||||
self.colorspace = stream.get_any(("CS", "ColorSpace"))
|
||||
if not isinstance(self.colorspace, list):
|
||||
self.colorspace = [self.colorspace]
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s(%s) %s %r>' %
|
||||
(self.__class__.__name__, self.name,
|
||||
bbox2str(self.bbox), self.srcsize))
|
||||
return "<%s(%s) %s %r>" % (
|
||||
self.__class__.__name__,
|
||||
self.name,
|
||||
bbox2str(self.bbox),
|
||||
self.srcsize,
|
||||
)
|
||||
|
||||
|
||||
class LTAnno(LTItem, LTText):
|
||||
|
@ -320,7 +349,7 @@ class LTChar(LTComponent, LTText):
|
|||
textwidth: float,
|
||||
textdisp: Union[float, Tuple[Optional[float], float]],
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: PDFGraphicState
|
||||
graphicstate: PDFGraphicState,
|
||||
) -> None:
|
||||
LTText.__init__(self)
|
||||
self._text = text
|
||||
|
@ -337,8 +366,8 @@ class LTChar(LTComponent, LTText):
|
|||
if vx is None:
|
||||
vx = fontsize * 0.5
|
||||
else:
|
||||
vx = vx * fontsize * .001
|
||||
vy = (1000 - vy) * fontsize * .001
|
||||
vx = vx * fontsize * 0.001
|
||||
vy = (1000 - vy) * fontsize * 0.001
|
||||
bbox_lower_left = (-vx, vy + rise + self.adv)
|
||||
bbox_upper_right = (-vx + fontsize, vy + rise)
|
||||
else:
|
||||
|
@ -347,7 +376,7 @@ class LTChar(LTComponent, LTText):
|
|||
bbox_lower_left = (0, descent + rise)
|
||||
bbox_upper_right = (self.adv, descent + rise + fontsize)
|
||||
(a, b, c, d, e, f) = self.matrix
|
||||
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||
self.upright = 0 < a * d * scaling and b * c <= 0
|
||||
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
|
||||
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
|
||||
if x1 < x0:
|
||||
|
@ -362,10 +391,14 @@ class LTChar(LTComponent, LTText):
|
|||
return
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
||||
(self.__class__.__name__, bbox2str(self.bbox),
|
||||
matrix2str(self.matrix), self.fontname, self.adv,
|
||||
self.get_text()))
|
||||
return "<%s %s matrix=%s font=%r adv=%s text=%r>" % (
|
||||
self.__class__.__name__,
|
||||
bbox2str(self.bbox),
|
||||
matrix2str(self.matrix),
|
||||
self.fontname,
|
||||
self.adv,
|
||||
self.get_text(),
|
||||
)
|
||||
|
||||
def get_text(self) -> str:
|
||||
return self._text
|
||||
|
@ -375,7 +408,7 @@ class LTChar(LTComponent, LTText):
|
|||
return True
|
||||
|
||||
|
||||
LTItemT = TypeVar('LTItemT', bound=LTItem)
|
||||
LTItemT = TypeVar("LTItemT", bound=LTItem)
|
||||
|
||||
|
||||
class LTContainer(LTComponent, Generic[LTItemT]):
|
||||
|
@ -416,8 +449,14 @@ class LTExpandableContainer(LTContainer[LTItemT]):
|
|||
# super() LTContainer only considers LTItem (no bounding box).
|
||||
def add(self, obj: LTComponent) -> None: # type: ignore[override]
|
||||
LTContainer.add(self, cast(LTItemT, obj))
|
||||
self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0),
|
||||
max(self.x1, obj.x1), max(self.y1, obj.y1)))
|
||||
self.set_bbox(
|
||||
(
|
||||
min(self.x0, obj.x0),
|
||||
min(self.y0, obj.y0),
|
||||
max(self.x1, obj.x1),
|
||||
max(self.y1, obj.y1),
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
|
@ -428,8 +467,9 @@ class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
|
|||
return
|
||||
|
||||
def get_text(self) -> str:
|
||||
return ''.join(cast(LTText, obj).get_text() for obj in self
|
||||
if isinstance(obj, LTText))
|
||||
return "".join(
|
||||
cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText)
|
||||
)
|
||||
|
||||
|
||||
TextLineElement = Union[LTChar, LTAnno]
|
||||
|
@ -448,16 +488,19 @@ class LTTextLine(LTTextContainer[TextLineElement]):
|
|||
return
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s %s %r>' %
|
||||
(self.__class__.__name__, bbox2str(self.bbox),
|
||||
self.get_text()))
|
||||
return "<%s %s %r>" % (
|
||||
self.__class__.__name__,
|
||||
bbox2str(self.bbox),
|
||||
self.get_text(),
|
||||
)
|
||||
|
||||
def analyze(self, laparams: LAParams) -> None:
|
||||
LTTextContainer.analyze(self, laparams)
|
||||
LTContainer.add(self, LTAnno('\n'))
|
||||
LTContainer.add(self, LTAnno("\n"))
|
||||
return
|
||||
|
||||
def find_neighbors(self, plane: Plane[LTComponentT], ratio: float
|
||||
def find_neighbors(
|
||||
self, plane: Plane[LTComponentT], ratio: float
|
||||
) -> List["LTTextLine"]:
|
||||
raise NotImplementedError
|
||||
|
||||
|
@ -474,15 +517,13 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
if isinstance(obj, LTChar) and self.word_margin:
|
||||
margin = self.word_margin * max(obj.width, obj.height)
|
||||
if self._x1 < obj.x0 - margin:
|
||||
LTContainer.add(self, LTAnno(' '))
|
||||
LTContainer.add(self, LTAnno(" "))
|
||||
self._x1 = obj.x1
|
||||
super().add(obj)
|
||||
return
|
||||
|
||||
def find_neighbors(
|
||||
self,
|
||||
plane: Plane[LTComponentT],
|
||||
ratio: float
|
||||
self, plane: Plane[LTComponentT], ratio: float
|
||||
) -> List[LTTextLine]:
|
||||
"""
|
||||
Finds neighboring LTTextLineHorizontals in the plane.
|
||||
|
@ -494,49 +535,41 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
"""
|
||||
d = ratio * self.height
|
||||
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
|
||||
return [obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineHorizontal) and
|
||||
self._is_same_height_as(obj, tolerance=d) and
|
||||
(self._is_left_aligned_with(obj, tolerance=d) or
|
||||
self._is_right_aligned_with(obj, tolerance=d) or
|
||||
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||
return [
|
||||
obj
|
||||
for obj in objs
|
||||
if (
|
||||
isinstance(obj, LTTextLineHorizontal)
|
||||
and self._is_same_height_as(obj, tolerance=d)
|
||||
and (
|
||||
self._is_left_aligned_with(obj, tolerance=d)
|
||||
or self._is_right_aligned_with(obj, tolerance=d)
|
||||
or self._is_centrally_aligned_with(obj, tolerance=d)
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
def _is_left_aligned_with(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
) -> bool:
|
||||
def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
|
||||
"""
|
||||
Whether the left-hand edge of `other` is within `tolerance`.
|
||||
"""
|
||||
return abs(other.x0 - self.x0) <= tolerance
|
||||
|
||||
def _is_right_aligned_with(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
) -> bool:
|
||||
def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
|
||||
"""
|
||||
Whether the right-hand edge of `other` is within `tolerance`.
|
||||
"""
|
||||
return abs(other.x1 - self.x1) <= tolerance
|
||||
|
||||
def _is_centrally_aligned_with(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
self, other: LTComponent, tolerance: float = 0
|
||||
) -> bool:
|
||||
"""
|
||||
Whether the horizontal center of `other` is within `tolerance`.
|
||||
"""
|
||||
return abs(
|
||||
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
|
||||
return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
|
||||
|
||||
def _is_same_height_as(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
) -> bool:
|
||||
def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
|
||||
return abs(other.height - self.height) <= tolerance
|
||||
|
||||
|
||||
|
@ -552,15 +585,13 @@ class LTTextLineVertical(LTTextLine):
|
|||
if isinstance(obj, LTChar) and self.word_margin:
|
||||
margin = self.word_margin * max(obj.width, obj.height)
|
||||
if obj.y1 + margin < self._y0:
|
||||
LTContainer.add(self, LTAnno(' '))
|
||||
LTContainer.add(self, LTAnno(" "))
|
||||
self._y0 = obj.y0
|
||||
super().add(obj)
|
||||
return
|
||||
|
||||
def find_neighbors(
|
||||
self,
|
||||
plane: Plane[LTComponentT],
|
||||
ratio: float
|
||||
self, plane: Plane[LTComponentT], ratio: float
|
||||
) -> List[LTTextLine]:
|
||||
"""
|
||||
Finds neighboring LTTextLineVerticals in the plane.
|
||||
|
@ -572,43 +603,39 @@ class LTTextLineVertical(LTTextLine):
|
|||
"""
|
||||
d = ratio * self.width
|
||||
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
|
||||
return [obj for obj in objs
|
||||
if (isinstance(obj, LTTextLineVertical) and
|
||||
self._is_same_width_as(obj, tolerance=d) and
|
||||
(self._is_lower_aligned_with(obj, tolerance=d) or
|
||||
self._is_upper_aligned_with(obj, tolerance=d) or
|
||||
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||
return [
|
||||
obj
|
||||
for obj in objs
|
||||
if (
|
||||
isinstance(obj, LTTextLineVertical)
|
||||
and self._is_same_width_as(obj, tolerance=d)
|
||||
and (
|
||||
self._is_lower_aligned_with(obj, tolerance=d)
|
||||
or self._is_upper_aligned_with(obj, tolerance=d)
|
||||
or self._is_centrally_aligned_with(obj, tolerance=d)
|
||||
)
|
||||
)
|
||||
]
|
||||
|
||||
def _is_lower_aligned_with(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
) -> bool:
|
||||
def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
|
||||
"""
|
||||
Whether the lower edge of `other` is within `tolerance`.
|
||||
"""
|
||||
return abs(other.y0 - self.y0) <= tolerance
|
||||
|
||||
def _is_upper_aligned_with(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
) -> bool:
|
||||
def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
|
||||
"""
|
||||
Whether the upper edge of `other` is within `tolerance`.
|
||||
"""
|
||||
return abs(other.y1 - self.y1) <= tolerance
|
||||
|
||||
def _is_centrally_aligned_with(
|
||||
self,
|
||||
other: LTComponent,
|
||||
tolerance: float = 0
|
||||
self, other: LTComponent, tolerance: float = 0
|
||||
) -> bool:
|
||||
"""
|
||||
Whether the vertical center of `other` is within `tolerance`.
|
||||
"""
|
||||
return abs(
|
||||
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
|
||||
return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
|
||||
|
||||
def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
|
||||
return abs(other.width - self.width) <= tolerance
|
||||
|
@ -628,9 +655,12 @@ class LTTextBox(LTTextContainer[LTTextLine]):
|
|||
return
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s(%s) %s %r>' %
|
||||
(self.__class__.__name__,
|
||||
self.index, bbox2str(self.bbox), self.get_text()))
|
||||
return "<%s(%s) %s %r>" % (
|
||||
self.__class__.__name__,
|
||||
self.index,
|
||||
bbox2str(self.bbox),
|
||||
self.get_text(),
|
||||
)
|
||||
|
||||
def get_writing_mode(self) -> str:
|
||||
raise NotImplementedError
|
||||
|
@ -643,7 +673,7 @@ class LTTextBoxHorizontal(LTTextBox):
|
|||
return
|
||||
|
||||
def get_writing_mode(self) -> str:
|
||||
return 'lr-tb'
|
||||
return "lr-tb"
|
||||
|
||||
|
||||
class LTTextBoxVertical(LTTextBox):
|
||||
|
@ -653,7 +683,7 @@ class LTTextBoxVertical(LTTextBox):
|
|||
return
|
||||
|
||||
def get_writing_mode(self) -> str:
|
||||
return 'tb-rl'
|
||||
return "tb-rl"
|
||||
|
||||
|
||||
TextGroupElement = Union[LTTextBox, "LTTextGroup"]
|
||||
|
@ -674,7 +704,8 @@ class LTTextGroupLRTB(LTTextGroup):
|
|||
# reorder the objects from top-left to bottom-right.
|
||||
self._objs.sort(
|
||||
key=lambda obj: (1 - boxes_flow) * obj.x0
|
||||
- (1 + boxes_flow) * (obj.y0 + obj.y1))
|
||||
- (1 + boxes_flow) * (obj.y0 + obj.y1)
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
|
@ -686,7 +717,8 @@ class LTTextGroupTBRL(LTTextGroup):
|
|||
# reorder the objects from top-right to bottom-left.
|
||||
self._objs.sort(
|
||||
key=lambda obj: -(1 + boxes_flow) * (obj.x0 + obj.x1)
|
||||
- (1 - boxes_flow) * obj.y1)
|
||||
- (1 - boxes_flow) * obj.y1
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
|
@ -698,9 +730,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
|
||||
# group_objects: group text object to textlines.
|
||||
def group_objects(
|
||||
self,
|
||||
laparams: LAParams,
|
||||
objs: Iterable[LTComponent]
|
||||
self, laparams: LAParams, objs: Iterable[LTComponent]
|
||||
) -> Iterator[LTTextLine]:
|
||||
obj0 = None
|
||||
line = None
|
||||
|
@ -716,13 +746,14 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
#
|
||||
# |<--->|
|
||||
# (char_margin)
|
||||
halign = \
|
||||
obj0.is_compatible(obj1) \
|
||||
and obj0.is_voverlap(obj1) \
|
||||
and min(obj0.height, obj1.height) * laparams.line_overlap \
|
||||
< obj0.voverlap(obj1) \
|
||||
and obj0.hdistance(obj1) \
|
||||
halign = (
|
||||
obj0.is_compatible(obj1)
|
||||
and obj0.is_voverlap(obj1)
|
||||
and min(obj0.height, obj1.height) * laparams.line_overlap
|
||||
< obj0.voverlap(obj1)
|
||||
and obj0.hdistance(obj1)
|
||||
< max(obj0.width, obj1.width) * laparams.char_margin
|
||||
)
|
||||
|
||||
# valign: obj0 and obj1 is vertically aligned.
|
||||
#
|
||||
|
@ -738,17 +769,19 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
#
|
||||
# |<-->|
|
||||
# (line_overlap)
|
||||
valign = \
|
||||
laparams.detect_vertical \
|
||||
and obj0.is_compatible(obj1) \
|
||||
and obj0.is_hoverlap(obj1) \
|
||||
and min(obj0.width, obj1.width) * laparams.line_overlap \
|
||||
< obj0.hoverlap(obj1) \
|
||||
and obj0.vdistance(obj1) \
|
||||
valign = (
|
||||
laparams.detect_vertical
|
||||
and obj0.is_compatible(obj1)
|
||||
and obj0.is_hoverlap(obj1)
|
||||
and min(obj0.width, obj1.width) * laparams.line_overlap
|
||||
< obj0.hoverlap(obj1)
|
||||
and obj0.vdistance(obj1)
|
||||
< max(obj0.height, obj1.height) * laparams.char_margin
|
||||
)
|
||||
|
||||
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
||||
(valign and isinstance(line, LTTextLineVertical))):
|
||||
if (halign and isinstance(line, LTTextLineHorizontal)) or (
|
||||
valign and isinstance(line, LTTextLineVertical)
|
||||
):
|
||||
|
||||
line.add(obj1)
|
||||
elif line is not None:
|
||||
|
@ -777,9 +810,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
return
|
||||
|
||||
def group_textlines(
|
||||
self,
|
||||
laparams: LAParams,
|
||||
lines: Iterable[LTTextLine]
|
||||
self, laparams: LAParams, lines: Iterable[LTTextLine]
|
||||
) -> Iterator[LTTextBox]:
|
||||
"""Group neighboring lines to textboxes"""
|
||||
plane: Plane[LTTextLine] = Plane(self.bbox)
|
||||
|
@ -812,9 +843,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
return
|
||||
|
||||
def group_textboxes(
|
||||
self,
|
||||
laparams: LAParams,
|
||||
boxes: Sequence[LTTextBox]
|
||||
self, laparams: LAParams, boxes: Sequence[LTTextBox]
|
||||
) -> List[LTTextGroup]:
|
||||
"""Group textboxes hierarchically.
|
||||
|
||||
|
@ -853,8 +882,11 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
y0 = min(obj1.y0, obj2.y0)
|
||||
x1 = max(obj1.x1, obj2.x1)
|
||||
y1 = max(obj1.y1, obj2.y1)
|
||||
return (x1 - x0) * (y1 - y0) \
|
||||
- obj1.width*obj1.height - obj2.width*obj2.height
|
||||
return (
|
||||
(x1 - x0) * (y1 - y0)
|
||||
- obj1.width * obj1.height
|
||||
- obj2.width * obj2.height
|
||||
)
|
||||
|
||||
def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
|
||||
"""Check if there's any other object between obj1 and obj2."""
|
||||
|
@ -870,8 +902,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
box1 = boxes[i]
|
||||
for j in range(i + 1, len(boxes)):
|
||||
box2 = boxes[j]
|
||||
dists.append((False, dist(box1, box2), id(box1), id(box2),
|
||||
box1, box2))
|
||||
dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
|
||||
heapq.heapify(dists)
|
||||
|
||||
plane.extend(boxes)
|
||||
|
@ -883,8 +914,9 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
if not skip_isany and isany(obj1, obj2):
|
||||
heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
|
||||
continue
|
||||
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
||||
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
|
||||
obj2, (LTTextBoxVertical, LTTextGroupTBRL)
|
||||
):
|
||||
group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
|
||||
else:
|
||||
group = LTTextGroupLRTB([obj1, obj2])
|
||||
|
@ -893,8 +925,10 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
done.update([id1, id2])
|
||||
|
||||
for other in plane:
|
||||
heapq.heappush(dists, (False, dist(group, other),
|
||||
id(group), id(other), group, other))
|
||||
heapq.heappush(
|
||||
dists,
|
||||
(False, dist(group, other), id(group), id(other), group, other),
|
||||
)
|
||||
plane.add(group)
|
||||
# By now only groups are in the plane
|
||||
return list(cast(LTTextGroup, g) for g in plane)
|
||||
|
@ -902,8 +936,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
def analyze(self, laparams: LAParams) -> None:
|
||||
# textobjs is a list of LTChar objects, i.e.
|
||||
# it has all the individual characters in the page.
|
||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
|
||||
self)
|
||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
|
||||
for obj in otherobjs:
|
||||
obj.analyze(laparams)
|
||||
if not textobjs:
|
||||
|
@ -922,6 +955,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
return (0, -box.x1, -box.y0)
|
||||
else:
|
||||
return (1, -box.y0, box.x0)
|
||||
|
||||
textboxes.sort(key=getkey)
|
||||
else:
|
||||
self.groups = self.group_textboxes(laparams, textboxes)
|
||||
|
@ -930,8 +964,11 @@ class LTLayoutContainer(LTContainer[LTComponent]):
|
|||
group.analyze(laparams)
|
||||
assigner.run(group)
|
||||
textboxes.sort(key=lambda box: box.index)
|
||||
self._objs = (cast(List[LTComponent], textboxes) + otherobjs
|
||||
+ cast(List[LTComponent], empties))
|
||||
self._objs = (
|
||||
cast(List[LTComponent], textboxes)
|
||||
+ otherobjs
|
||||
+ cast(List[LTComponent], empties)
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
|
@ -953,9 +990,12 @@ class LTFigure(LTLayoutContainer):
|
|||
return
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s(%s) %s matrix=%s>' %
|
||||
(self.__class__.__name__, self.name,
|
||||
bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||
return "<%s(%s) %s matrix=%s>" % (
|
||||
self.__class__.__name__,
|
||||
self.name,
|
||||
bbox2str(self.bbox),
|
||||
matrix2str(self.matrix),
|
||||
)
|
||||
|
||||
def analyze(self, laparams: LAParams) -> None:
|
||||
if not laparams.all_texts:
|
||||
|
@ -978,6 +1018,9 @@ class LTPage(LTLayoutContainer):
|
|||
return
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<%s(%r) %s rotate=%r>' %
|
||||
(self.__class__.__name__, self.pageid,
|
||||
bbox2str(self.bbox), self.rotate))
|
||||
return "<%s(%r) %s rotate=%r>" % (
|
||||
self.__class__.__name__,
|
||||
self.pageid,
|
||||
bbox2str(self.bbox),
|
||||
self.rotate,
|
||||
)
|
||||
|
|
|
@ -10,7 +10,6 @@ class CorruptDataError(Exception):
|
|||
|
||||
|
||||
class LZWDecoder:
|
||||
|
||||
def __init__(self, fp: BinaryIO) -> None:
|
||||
self.fp = fp
|
||||
self.buff = 0
|
||||
|
@ -46,12 +45,12 @@ class LZWDecoder:
|
|||
return v
|
||||
|
||||
def feed(self, code: int) -> bytes:
|
||||
x = b''
|
||||
x = b""
|
||||
if code == 256:
|
||||
self.table = [bytes((c,)) for c in range(256)] # 0-255
|
||||
self.table.append(None) # 256
|
||||
self.table.append(None) # 257
|
||||
self.prevbuf = b''
|
||||
self.prevbuf = b""
|
||||
self.nbits = 9
|
||||
elif code == 257:
|
||||
pass
|
||||
|
@ -91,11 +90,13 @@ class LZWDecoder:
|
|||
break
|
||||
yield x
|
||||
assert self.table is not None
|
||||
logger.debug('nbits=%d, code=%d, output=%r, table=%r'
|
||||
% (self.nbits, code, x, self.table[258:]))
|
||||
logger.debug(
|
||||
"nbits=%d, code=%d, output=%r, table=%r"
|
||||
% (self.nbits, code, x, self.table[258:])
|
||||
)
|
||||
|
||||
|
||||
def lzwdecode(data: bytes) -> bytes:
|
||||
fp = BytesIO(data)
|
||||
s = LZWDecoder(fp).run()
|
||||
return b''.join(s)
|
||||
return b"".join(s)
|
||||
|
|
|
@ -3,33 +3,31 @@ from typing import Dict
|
|||
|
||||
from .psparser import LIT
|
||||
|
||||
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||
LITERAL_DEVICE_GRAY = LIT("DeviceGray")
|
||||
LITERAL_DEVICE_RGB = LIT("DeviceRGB")
|
||||
LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
|
||||
|
||||
|
||||
class PDFColorSpace:
|
||||
|
||||
def __init__(self, name: str, ncomponents: int) -> None:
|
||||
self.name = name
|
||||
self.ncomponents = ncomponents
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFColorSpace: %s, ncomponents=%d>' % \
|
||||
(self.name, self.ncomponents)
|
||||
return "<PDFColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)
|
||||
|
||||
|
||||
PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
|
||||
|
||||
for (name, n) in [
|
||||
('DeviceGray', 1), # default value first
|
||||
('CalRGB', 3),
|
||||
('CalGray', 1),
|
||||
('Lab', 3),
|
||||
('DeviceRGB', 3),
|
||||
('DeviceCMYK', 4),
|
||||
('Separation', 1),
|
||||
('Indexed', 1),
|
||||
('Pattern', 1),
|
||||
("DeviceGray", 1), # default value first
|
||||
("CalRGB", 3),
|
||||
("CalGray", 1),
|
||||
("Lab", 3),
|
||||
("DeviceRGB", 3),
|
||||
("DeviceCMYK", 4),
|
||||
("Separation", 1),
|
||||
("Indexed", 1),
|
||||
("Pattern", 1),
|
||||
]:
|
||||
PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)
|
||||
|
|
|
@ -1,5 +1,13 @@
|
|||
from typing import (BinaryIO, Iterable, List, Optional, Sequence,
|
||||
TYPE_CHECKING, Union, cast)
|
||||
from typing import (
|
||||
BinaryIO,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
TYPE_CHECKING,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
from pdfminer.psparser import PSLiteral
|
||||
from . import utils
|
||||
|
@ -21,25 +29,19 @@ PDFTextSeq = Iterable[Union[int, float, bytes]]
|
|||
|
||||
|
||||
class PDFDevice:
|
||||
"""Translate the output of PDFPageInterpreter to the output that is needed
|
||||
"""
|
||||
"""Translate the output of PDFPageInterpreter to the output that is needed"""
|
||||
|
||||
def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
|
||||
self.rsrcmgr = rsrcmgr
|
||||
self.ctm: Optional[Matrix] = None
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFDevice>'
|
||||
return "<PDFDevice>"
|
||||
|
||||
def __enter__(self) -> "PDFDevice":
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: object,
|
||||
exc_val: object,
|
||||
exc_tb: object
|
||||
) -> None:
|
||||
def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
|
||||
self.close()
|
||||
|
||||
def close(self) -> None:
|
||||
|
@ -48,21 +50,13 @@ class PDFDevice:
|
|||
def set_ctm(self, ctm: Matrix) -> None:
|
||||
self.ctm = ctm
|
||||
|
||||
def begin_tag(
|
||||
self,
|
||||
tag: PSLiteral,
|
||||
props: Optional["PDFStackT"] = None
|
||||
) -> None:
|
||||
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
|
||||
pass
|
||||
|
||||
def end_tag(self) -> None:
|
||||
pass
|
||||
|
||||
def do_tag(
|
||||
self,
|
||||
tag: PSLiteral,
|
||||
props: Optional["PDFStackT"] = None
|
||||
) -> None:
|
||||
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
|
||||
pass
|
||||
|
||||
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
|
||||
|
@ -83,7 +77,7 @@ class PDFDevice:
|
|||
stroke: bool,
|
||||
fill: bool,
|
||||
evenodd: bool,
|
||||
path: Sequence[PathSegment]
|
||||
path: Sequence[PathSegment],
|
||||
) -> None:
|
||||
pass
|
||||
|
||||
|
@ -95,42 +89,61 @@ class PDFDevice:
|
|||
textstate: "PDFTextState",
|
||||
seq: PDFTextSeq,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: "PDFGraphicState"
|
||||
graphicstate: "PDFGraphicState",
|
||||
) -> None:
|
||||
pass
|
||||
|
||||
|
||||
class PDFTextDevice(PDFDevice):
|
||||
|
||||
def render_string(
|
||||
self,
|
||||
textstate: "PDFTextState",
|
||||
seq: PDFTextSeq,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: "PDFGraphicState"
|
||||
graphicstate: "PDFGraphicState",
|
||||
) -> None:
|
||||
assert self.ctm is not None
|
||||
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
|
||||
font = textstate.font
|
||||
fontsize = textstate.fontsize
|
||||
scaling = textstate.scaling * .01
|
||||
scaling = textstate.scaling * 0.01
|
||||
charspace = textstate.charspace * scaling
|
||||
wordspace = textstate.wordspace * scaling
|
||||
rise = textstate.rise
|
||||
assert font is not None
|
||||
if font.is_multibyte():
|
||||
wordspace = 0
|
||||
dxscale = .001 * fontsize * scaling
|
||||
dxscale = 0.001 * fontsize * scaling
|
||||
if font.is_vertical():
|
||||
textstate.linematrix = self.render_string_vertical(
|
||||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale, ncs,
|
||||
graphicstate)
|
||||
seq,
|
||||
matrix,
|
||||
textstate.linematrix,
|
||||
font,
|
||||
fontsize,
|
||||
scaling,
|
||||
charspace,
|
||||
wordspace,
|
||||
rise,
|
||||
dxscale,
|
||||
ncs,
|
||||
graphicstate,
|
||||
)
|
||||
else:
|
||||
textstate.linematrix = self.render_string_horizontal(
|
||||
seq, matrix, textstate.linematrix, font, fontsize,
|
||||
scaling, charspace, wordspace, rise, dxscale, ncs,
|
||||
graphicstate)
|
||||
seq,
|
||||
matrix,
|
||||
textstate.linematrix,
|
||||
font,
|
||||
fontsize,
|
||||
scaling,
|
||||
charspace,
|
||||
wordspace,
|
||||
rise,
|
||||
dxscale,
|
||||
ncs,
|
||||
graphicstate,
|
||||
)
|
||||
|
||||
def render_string_horizontal(
|
||||
self,
|
||||
|
@ -145,7 +158,7 @@ class PDFTextDevice(PDFDevice):
|
|||
rise: float,
|
||||
dxscale: float,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: "PDFGraphicState"
|
||||
graphicstate: "PDFGraphicState",
|
||||
) -> Point:
|
||||
(x, y) = pos
|
||||
needcharspace = False
|
||||
|
@ -158,8 +171,15 @@ class PDFTextDevice(PDFDevice):
|
|||
if needcharspace:
|
||||
x += charspace
|
||||
x += self.render_char(
|
||||
utils.translate_matrix(matrix, (x, y)), font,
|
||||
fontsize, scaling, rise, cid, ncs, graphicstate)
|
||||
utils.translate_matrix(matrix, (x, y)),
|
||||
font,
|
||||
fontsize,
|
||||
scaling,
|
||||
rise,
|
||||
cid,
|
||||
ncs,
|
||||
graphicstate,
|
||||
)
|
||||
if cid == 32 and wordspace:
|
||||
x += wordspace
|
||||
needcharspace = True
|
||||
|
@ -178,7 +198,7 @@ class PDFTextDevice(PDFDevice):
|
|||
rise: float,
|
||||
dxscale: float,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: "PDFGraphicState"
|
||||
graphicstate: "PDFGraphicState",
|
||||
) -> Point:
|
||||
(x, y) = pos
|
||||
needcharspace = False
|
||||
|
@ -191,8 +211,15 @@ class PDFTextDevice(PDFDevice):
|
|||
if needcharspace:
|
||||
y += charspace
|
||||
y += self.render_char(
|
||||
utils.translate_matrix(matrix, (x, y)), font, fontsize,
|
||||
scaling, rise, cid, ncs, graphicstate)
|
||||
utils.translate_matrix(matrix, (x, y)),
|
||||
font,
|
||||
fontsize,
|
||||
scaling,
|
||||
rise,
|
||||
cid,
|
||||
ncs,
|
||||
graphicstate,
|
||||
)
|
||||
if cid == 32 and wordspace:
|
||||
y += wordspace
|
||||
needcharspace = True
|
||||
|
@ -207,18 +234,14 @@ class PDFTextDevice(PDFDevice):
|
|||
rise: float,
|
||||
cid: int,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: "PDFGraphicState"
|
||||
graphicstate: "PDFGraphicState",
|
||||
) -> float:
|
||||
return 0
|
||||
|
||||
|
||||
class TagExtractor(PDFDevice):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
rsrcmgr: "PDFResourceManager",
|
||||
outfp: BinaryIO,
|
||||
codec: str = 'utf-8'
|
||||
self, rsrcmgr: "PDFResourceManager", outfp: BinaryIO, codec: str = "utf-8"
|
||||
) -> None:
|
||||
PDFDevice.__init__(self, rsrcmgr)
|
||||
self.outfp = outfp
|
||||
|
@ -231,11 +254,11 @@ class TagExtractor(PDFDevice):
|
|||
textstate: "PDFTextState",
|
||||
seq: PDFTextSeq,
|
||||
ncs: PDFColorSpace,
|
||||
graphicstate: "PDFGraphicState"
|
||||
graphicstate: "PDFGraphicState",
|
||||
) -> None:
|
||||
font = textstate.font
|
||||
assert font is not None
|
||||
text = ''
|
||||
text = ""
|
||||
for obj in seq:
|
||||
if isinstance(obj, str):
|
||||
obj = utils.make_compat_bytes(obj)
|
||||
|
@ -251,25 +274,29 @@ class TagExtractor(PDFDevice):
|
|||
self._write(utils.enc(text))
|
||||
|
||||
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
|
||||
output = '<page id="%s" bbox="%s" rotate="%d">' %\
|
||||
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
||||
output = '<page id="%s" bbox="%s" rotate="%d">' % (
|
||||
self.pageno,
|
||||
utils.bbox2str(page.mediabox),
|
||||
page.rotate,
|
||||
)
|
||||
self._write(output)
|
||||
return
|
||||
|
||||
def end_page(self, page: PDFPage) -> None:
|
||||
self._write('</page>\n')
|
||||
self._write("</page>\n")
|
||||
self.pageno += 1
|
||||
return
|
||||
|
||||
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None
|
||||
) -> None:
|
||||
s = ''
|
||||
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
|
||||
s = ""
|
||||
if isinstance(props, dict):
|
||||
s = ''.join([
|
||||
s = "".join(
|
||||
[
|
||||
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
|
||||
for (k, v) in sorted(props.items())
|
||||
])
|
||||
out_s = '<{}{}>'.format(utils.enc(cast(str, tag.name)), s)
|
||||
]
|
||||
)
|
||||
out_s = "<{}{}>".format(utils.enc(cast(str, tag.name)), s)
|
||||
self._write(out_s)
|
||||
self._stack.append(tag)
|
||||
return
|
||||
|
@ -277,12 +304,11 @@ class TagExtractor(PDFDevice):
|
|||
def end_tag(self) -> None:
|
||||
assert self._stack, str(self.pageno)
|
||||
tag = self._stack.pop(-1)
|
||||
out_s = '</%s>' % utils.enc(cast(str, tag.name))
|
||||
out_s = "</%s>" % utils.enc(cast(str, tag.name))
|
||||
self._write(out_s)
|
||||
return
|
||||
|
||||
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None
|
||||
) -> None:
|
||||
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
|
||||
self.begin_tag(tag, props)
|
||||
self._stack.pop(-1)
|
||||
return
|
||||
|
|
|
@ -3,8 +3,21 @@ import logging
|
|||
import re
|
||||
import struct
|
||||
from hashlib import sha256, md5, sha384, sha512
|
||||
from typing import (Any, Callable, Dict, Iterable, Iterator, KeysView, List,
|
||||
Optional, Sequence, Tuple, Type, Union, cast)
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
KeysView,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
from cryptography.hazmat.backends import default_backend
|
||||
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||
|
@ -13,12 +26,22 @@ from . import settings
|
|||
from .arcfour import Arcfour
|
||||
from .data_structures import NumberTree
|
||||
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
|
||||
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, \
|
||||
PDFStream, PDFObjectNotFound, decipher_all, int_value, str_value, \
|
||||
list_value, uint_value, dict_value, stream_value
|
||||
from .pdftypes import (
|
||||
DecipherCallable,
|
||||
PDFException,
|
||||
PDFTypeError,
|
||||
PDFStream,
|
||||
PDFObjectNotFound,
|
||||
decipher_all,
|
||||
int_value,
|
||||
str_value,
|
||||
list_value,
|
||||
uint_value,
|
||||
dict_value,
|
||||
stream_value,
|
||||
)
|
||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||
from .utils import choplist, decode_text, nunpack, format_int_roman, \
|
||||
format_int_alpha
|
||||
from .utils import choplist, decode_text, nunpack, format_int_roman, format_int_alpha
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -32,6 +55,7 @@ class PDFNoValidXRefWarning(SyntaxWarning):
|
|||
|
||||
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
|
@ -60,6 +84,7 @@ class PDFEncryptionWarning(UserWarning):
|
|||
|
||||
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
|
@ -68,6 +93,7 @@ class PDFTextExtractionNotAllowedWarning(UserWarning):
|
|||
|
||||
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
|
@ -78,15 +104,19 @@ class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
|||
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
|
||||
def __init__(self, *args: object) -> None:
|
||||
from warnings import warn
|
||||
warn('PDFTextExtractionNotAllowedError will be removed in the future. '
|
||||
'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning)
|
||||
|
||||
warn(
|
||||
"PDFTextExtractionNotAllowedError will be removed in the future. "
|
||||
"Use PDFTextExtractionNotAllowed instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
super().__init__(*args)
|
||||
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_OBJSTM = LIT('ObjStm')
|
||||
LITERAL_XREF = LIT('XRef')
|
||||
LITERAL_CATALOG = LIT('Catalog')
|
||||
LITERAL_OBJSTM = LIT("ObjStm")
|
||||
LITERAL_XREF = LIT("XRef")
|
||||
LITERAL_CATALOG = LIT("Catalog")
|
||||
|
||||
|
||||
class PDFBaseXRef:
|
||||
|
@ -107,13 +137,12 @@ class PDFBaseXRef:
|
|||
|
||||
|
||||
class PDFXRef(PDFBaseXRef):
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
|
||||
self.trailer: Dict[str, Any] = {}
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
|
||||
return "<PDFXRef: offsets=%r>" % (self.offsets.keys())
|
||||
|
||||
def load(self, parser: PDFParser) -> None:
|
||||
while True:
|
||||
|
@ -123,51 +152,50 @@ class PDFXRef(PDFBaseXRef):
|
|||
if not line:
|
||||
continue
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||
if line.startswith(b'trailer'):
|
||||
raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
|
||||
if line.startswith(b"trailer"):
|
||||
parser.seek(pos)
|
||||
break
|
||||
f = line.split(b' ')
|
||||
f = line.split(b" ")
|
||||
if len(f) != 2:
|
||||
error_msg = 'Trailer not found: {!r}: line={!r}'\
|
||||
.format(parser, line)
|
||||
error_msg = "Trailer not found: {!r}: line={!r}".format(parser, line)
|
||||
raise PDFNoValidXRef(error_msg)
|
||||
try:
|
||||
(start, nobjs) = map(int, f)
|
||||
except ValueError:
|
||||
error_msg = 'Invalid line: {!r}: line={!r}'\
|
||||
.format(parser, line)
|
||||
error_msg = "Invalid line: {!r}: line={!r}".format(parser, line)
|
||||
raise PDFNoValidXRef(error_msg)
|
||||
for objid in range(start, start + nobjs):
|
||||
try:
|
||||
(_, line) = parser.nextline()
|
||||
line = line.strip()
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||
f = line.split(b' ')
|
||||
raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
|
||||
f = line.split(b" ")
|
||||
if len(f) != 3:
|
||||
error_msg = 'Invalid XRef format: {!r}, line={!r}'\
|
||||
.format(parser, line)
|
||||
error_msg = "Invalid XRef format: {!r}, line={!r}".format(
|
||||
parser, line
|
||||
)
|
||||
raise PDFNoValidXRef(error_msg)
|
||||
(pos_b, genno_b, use_b) = f
|
||||
if use_b != b'n':
|
||||
if use_b != b"n":
|
||||
continue
|
||||
self.offsets[objid] = (None, int(pos_b), int(genno_b))
|
||||
log.debug('xref objects: %r', self.offsets)
|
||||
log.debug("xref objects: %r", self.offsets)
|
||||
self.load_trailer(parser)
|
||||
|
||||
def load_trailer(self, parser: PDFParser) -> None:
|
||||
try:
|
||||
(_, kwd) = parser.nexttoken()
|
||||
assert kwd is KWD(b'trailer'), str(kwd)
|
||||
assert kwd is KWD(b"trailer"), str(kwd)
|
||||
(_, dic) = parser.nextobject()
|
||||
except PSEOF:
|
||||
x = parser.pop(1)
|
||||
if not x:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||
raise PDFNoValidXRef("Unexpected EOF - file corrupted")
|
||||
(_, dic) = x[0]
|
||||
self.trailer.update(dict_value(dic))
|
||||
log.debug('trailer=%r', self.trailer)
|
||||
log.debug("trailer=%r", self.trailer)
|
||||
|
||||
def get_trailer(self) -> Dict[str, Any]:
|
||||
return self.trailer
|
||||
|
@ -183,11 +211,10 @@ class PDFXRef(PDFBaseXRef):
|
|||
|
||||
|
||||
class PDFXRefFallback(PDFXRef):
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
||||
return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())
|
||||
|
||||
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||
PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
|
||||
|
||||
def load(self, parser: PDFParser) -> None:
|
||||
parser.seek(0)
|
||||
|
@ -196,12 +223,12 @@ class PDFXRefFallback(PDFXRef):
|
|||
(pos, line_bytes) = parser.nextline()
|
||||
except PSEOF:
|
||||
break
|
||||
if line_bytes.startswith(b'trailer'):
|
||||
if line_bytes.startswith(b"trailer"):
|
||||
parser.seek(pos)
|
||||
self.load_trailer(parser)
|
||||
log.debug('trailer: %r', self.trailer)
|
||||
log.debug("trailer: %r", self.trailer)
|
||||
break
|
||||
line = line_bytes.decode('latin-1') # default pdf encoding
|
||||
line = line_bytes.decode("latin-1") # default pdf encoding
|
||||
m = self.PDFOBJ_CUE.match(line)
|
||||
if not m:
|
||||
continue
|
||||
|
@ -212,14 +239,13 @@ class PDFXRefFallback(PDFXRef):
|
|||
# expand ObjStm.
|
||||
parser.seek(pos)
|
||||
(_, obj) = parser.nextobject()
|
||||
if isinstance(obj, PDFStream) \
|
||||
and obj.get('Type') is LITERAL_OBJSTM:
|
||||
if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
|
||||
stream = stream_value(obj)
|
||||
try:
|
||||
n = stream['N']
|
||||
n = stream["N"]
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||
raise PDFSyntaxError("N is not defined: %r" % stream)
|
||||
n = 0
|
||||
parser1 = PDFStreamParser(stream.get_data())
|
||||
objs: List[int] = []
|
||||
|
@ -236,7 +262,6 @@ class PDFXRefFallback(PDFXRef):
|
|||
|
||||
|
||||
class PDFXRefStream(PDFBaseXRef):
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.data: Optional[bytes] = None
|
||||
self.entlen: Optional[int] = None
|
||||
|
@ -246,31 +271,32 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
self.ranges: List[Tuple[int, int]] = []
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
||||
return "<PDFXRefStream: ranges=%r>" % (self.ranges)
|
||||
|
||||
def load(self, parser: PDFParser) -> None:
|
||||
(_, objid) = parser.nexttoken() # ignored
|
||||
(_, genno) = parser.nexttoken() # ignored
|
||||
(_, kwd) = parser.nexttoken()
|
||||
(_, stream) = parser.nextobject()
|
||||
if not isinstance(stream, PDFStream) \
|
||||
or stream.get('Type') is not LITERAL_XREF:
|
||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||
size = stream['Size']
|
||||
index_array = stream.get('Index', (0, size))
|
||||
if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
|
||||
raise PDFNoValidXRef("Invalid PDF stream spec.")
|
||||
size = stream["Size"]
|
||||
index_array = stream.get("Index", (0, size))
|
||||
if len(index_array) % 2 != 0:
|
||||
raise PDFSyntaxError('Invalid index number')
|
||||
self.ranges.extend(cast(Iterator[Tuple[int, int]],
|
||||
choplist(2, index_array)))
|
||||
(self.fl1, self.fl2, self.fl3) = stream['W']
|
||||
assert (self.fl1 is not None and self.fl2 is not None
|
||||
and self.fl3 is not None)
|
||||
raise PDFSyntaxError("Invalid index number")
|
||||
self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
|
||||
(self.fl1, self.fl2, self.fl3) = stream["W"]
|
||||
assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
|
||||
self.data = stream.get_data()
|
||||
self.entlen = self.fl1 + self.fl2 + self.fl3
|
||||
self.trailer = stream.attrs
|
||||
log.debug('xref stream: objid=%s, fields=%d,%d,%d',
|
||||
', '.join(map(repr, self.ranges)),
|
||||
self.fl1, self.fl2, self.fl3)
|
||||
log.debug(
|
||||
"xref stream: objid=%s, fields=%d,%d,%d",
|
||||
", ".join(map(repr, self.ranges)),
|
||||
self.fl1,
|
||||
self.fl2,
|
||||
self.fl3,
|
||||
)
|
||||
return
|
||||
|
||||
def get_trailer(self) -> Dict[str, Any]:
|
||||
|
@ -300,8 +326,7 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
raise KeyError(objid)
|
||||
assert self.entlen is not None
|
||||
assert self.data is not None
|
||||
assert (self.fl1 is not None and self.fl2 is not None
|
||||
and self.fl3 is not None)
|
||||
assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
|
||||
offset = self.entlen * index
|
||||
ent = self.data[offset : offset + self.entlen]
|
||||
f1 = nunpack(ent[: self.fl1], 1)
|
||||
|
@ -318,15 +343,14 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
|
||||
class PDFStandardSecurityHandler:
|
||||
|
||||
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
|
||||
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
|
||||
PASSWORD_PADDING = (
|
||||
b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
|
||||
b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
|
||||
)
|
||||
supported_revisions: Tuple[int, ...] = (2, 3)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
docid: Sequence[bytes],
|
||||
param: Dict[str, Any],
|
||||
password: str = ''
|
||||
self, docid: Sequence[bytes], param: Dict[str, Any], password: str = ""
|
||||
) -> None:
|
||||
self.docid = docid
|
||||
self.param = param
|
||||
|
@ -337,18 +361,18 @@ class PDFStandardSecurityHandler:
|
|||
def init(self) -> None:
|
||||
self.init_params()
|
||||
if self.r not in self.supported_revisions:
|
||||
error_msg = 'Unsupported revision: param=%r' % self.param
|
||||
error_msg = "Unsupported revision: param=%r" % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
self.init_key()
|
||||
return
|
||||
|
||||
def init_params(self) -> None:
|
||||
self.v = int_value(self.param.get('V', 0))
|
||||
self.r = int_value(self.param['R'])
|
||||
self.p = uint_value(self.param['P'], 32)
|
||||
self.o = str_value(self.param['O'])
|
||||
self.u = str_value(self.param['U'])
|
||||
self.length = int_value(self.param.get('Length', 40))
|
||||
self.v = int_value(self.param.get("V", 0))
|
||||
self.r = int_value(self.param["R"])
|
||||
self.p = uint_value(self.param["P"], 32)
|
||||
self.o = str_value(self.param["O"])
|
||||
self.u = str_value(self.param["U"])
|
||||
self.length = int_value(self.param.get("Length", 40))
|
||||
return
|
||||
|
||||
def init_key(self) -> None:
|
||||
|
@ -376,7 +400,7 @@ class PDFStandardSecurityHandler:
|
|||
hash.update(self.docid[0]) # 3
|
||||
result = Arcfour(key).encrypt(hash.digest()) # 4
|
||||
for i in range(1, 20): # 5
|
||||
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
||||
k = b"".join(bytes((c ^ i,)) for c in iter(key))
|
||||
result = Arcfour(k).encrypt(result)
|
||||
result += result # 6
|
||||
return result
|
||||
|
@ -387,11 +411,11 @@ class PDFStandardSecurityHandler:
|
|||
hash = md5(password) # 2
|
||||
hash.update(self.o) # 3
|
||||
# See https://github.com/pdfminer/pdfminer.six/issues/186
|
||||
hash.update(struct.pack('<L', self.p)) # 4
|
||||
hash.update(struct.pack("<L", self.p)) # 4
|
||||
hash.update(self.docid[0]) # 5
|
||||
if self.r >= 4:
|
||||
if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
|
||||
hash.update(b'\xff\xff\xff\xff')
|
||||
hash.update(b"\xff\xff\xff\xff")
|
||||
result = hash.digest()
|
||||
n = 5
|
||||
if self.r >= 3:
|
||||
|
@ -437,7 +461,7 @@ class PDFStandardSecurityHandler:
|
|||
else:
|
||||
user_password = self.o
|
||||
for i in range(19, -1, -1):
|
||||
k = b''.join(bytes((c ^ i,)) for c in iter(key))
|
||||
k = b"".join(bytes((c ^ i,)) for c in iter(key))
|
||||
user_password = Arcfour(k).decrypt(user_password)
|
||||
return self.authenticate_user_password(user_password)
|
||||
|
||||
|
@ -446,14 +470,13 @@ class PDFStandardSecurityHandler:
|
|||
objid: int,
|
||||
genno: int,
|
||||
data: bytes,
|
||||
attrs: Optional[Dict[str, Any]] = None
|
||||
attrs: Optional[Dict[str, Any]] = None,
|
||||
) -> bytes:
|
||||
return self.decrypt_rc4(objid, genno, data)
|
||||
|
||||
def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
|
||||
assert self.key is not None
|
||||
key = self.key + struct.pack('<L', objid)[:3] \
|
||||
+ struct.pack('<L', genno)[:2]
|
||||
key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
|
||||
hash = md5(key)
|
||||
key = hash.digest()[: min(len(key), 16)]
|
||||
return Arcfour(key).decrypt(data)
|
||||
|
@ -466,34 +489,30 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
|||
def init_params(self) -> None:
|
||||
super().init_params()
|
||||
self.length = 128
|
||||
self.cf = dict_value(self.param.get('CF'))
|
||||
self.stmf = literal_name(self.param['StmF'])
|
||||
self.strf = literal_name(self.param['StrF'])
|
||||
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
|
||||
self.cf = dict_value(self.param.get("CF"))
|
||||
self.stmf = literal_name(self.param["StmF"])
|
||||
self.strf = literal_name(self.param["StrF"])
|
||||
self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
|
||||
if self.stmf != self.strf:
|
||||
error_msg = 'Unsupported crypt filter: param=%r' % self.param
|
||||
error_msg = "Unsupported crypt filter: param=%r" % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
self.cfm = {}
|
||||
for k, v in self.cf.items():
|
||||
f = self.get_cfm(literal_name(v['CFM']))
|
||||
f = self.get_cfm(literal_name(v["CFM"]))
|
||||
if f is None:
|
||||
error_msg = 'Unknown crypt filter method: param=%r' \
|
||||
% self.param
|
||||
error_msg = "Unknown crypt filter method: param=%r" % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
self.cfm[k] = f
|
||||
self.cfm['Identity'] = self.decrypt_identity
|
||||
self.cfm["Identity"] = self.decrypt_identity
|
||||
if self.strf not in self.cfm:
|
||||
error_msg = 'Undefined crypt filter: param=%r' % self.param
|
||||
error_msg = "Undefined crypt filter: param=%r" % self.param
|
||||
raise PDFEncryptionError(error_msg)
|
||||
return
|
||||
|
||||
def get_cfm(
|
||||
self,
|
||||
name: str
|
||||
) -> Optional[Callable[[int, int, bytes], bytes]]:
|
||||
if name == 'V2':
|
||||
def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
|
||||
if name == "V2":
|
||||
return self.decrypt_rc4
|
||||
elif name == 'AESV2':
|
||||
elif name == "AESV2":
|
||||
return self.decrypt_aes128
|
||||
else:
|
||||
return None
|
||||
|
@ -504,11 +523,11 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
|||
genno: int,
|
||||
data: bytes,
|
||||
attrs: Optional[Dict[str, Any]] = None,
|
||||
name: Optional[str] = None
|
||||
name: Optional[str] = None,
|
||||
) -> bytes:
|
||||
if not self.encrypt_metadata and attrs is not None:
|
||||
t = attrs.get('Type')
|
||||
if t is not None and literal_name(t) == 'Metadata':
|
||||
t = attrs.get("Type")
|
||||
if t is not None and literal_name(t) == "Metadata":
|
||||
return data
|
||||
if name is None:
|
||||
name = self.strf
|
||||
|
@ -519,15 +538,21 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
|||
|
||||
def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
|
||||
assert self.key is not None
|
||||
key = self.key + struct.pack('<L', objid)[:3] \
|
||||
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
||||
key = (
|
||||
self.key
|
||||
+ struct.pack("<L", objid)[:3]
|
||||
+ struct.pack("<L", genno)[:2]
|
||||
+ b"sAlT"
|
||||
)
|
||||
hash = md5(key)
|
||||
key = hash.digest()[: min(len(key), 16)]
|
||||
initialization_vector = data[:16]
|
||||
ciphertext = data[16:]
|
||||
cipher = Cipher(algorithms.AES(key),
|
||||
cipher = Cipher(
|
||||
algorithms.AES(key),
|
||||
modes.CBC(initialization_vector),
|
||||
backend=default_backend()) # type: ignore
|
||||
backend=default_backend(),
|
||||
) # type: ignore
|
||||
return cipher.decryptor().update(ciphertext) # type: ignore
|
||||
|
||||
|
||||
|
@ -538,8 +563,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
def init_params(self) -> None:
|
||||
super().init_params()
|
||||
self.length = 256
|
||||
self.oe = str_value(self.param['OE'])
|
||||
self.ue = str_value(self.param['UE'])
|
||||
self.oe = str_value(self.param["OE"])
|
||||
self.ue = str_value(self.param["UE"])
|
||||
self.o_hash = self.o[:32]
|
||||
self.o_validation_salt = self.o[32:40]
|
||||
self.o_key_salt = self.o[40:]
|
||||
|
@ -548,11 +573,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
self.u_key_salt = self.u[40:]
|
||||
return
|
||||
|
||||
def get_cfm(
|
||||
self,
|
||||
name: str
|
||||
) -> Optional[Callable[[int, int, bytes], bytes]]:
|
||||
if name == 'AESV3':
|
||||
def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
|
||||
if name == "AESV3":
|
||||
return self.decrypt_aes256
|
||||
else:
|
||||
return None
|
||||
|
@ -562,16 +584,16 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
hash = self._password_hash(password_b, self.o_validation_salt, self.u)
|
||||
if hash == self.o_hash:
|
||||
hash = self._password_hash(password_b, self.o_key_salt, self.u)
|
||||
cipher = Cipher(algorithms.AES(hash),
|
||||
modes.CBC(b'\0' * 16),
|
||||
backend=default_backend()) # type: ignore
|
||||
cipher = Cipher(
|
||||
algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend()
|
||||
) # type: ignore
|
||||
return cipher.decryptor().update(self.oe) # type: ignore
|
||||
hash = self._password_hash(password_b, self.u_validation_salt)
|
||||
if hash == self.u_hash:
|
||||
hash = self._password_hash(password_b, self.u_key_salt)
|
||||
cipher = Cipher(algorithms.AES(hash),
|
||||
modes.CBC(b'\0' * 16),
|
||||
backend=default_backend()) # type: ignore
|
||||
cipher = Cipher(
|
||||
algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend()
|
||||
) # type: ignore
|
||||
return cipher.decryptor().update(self.ue) # type: ignore
|
||||
return None
|
||||
|
||||
|
@ -579,16 +601,14 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
if self.r == 6:
|
||||
# saslprep expects non-empty strings, apparently
|
||||
if not password:
|
||||
return b''
|
||||
return b""
|
||||
from ._saslprep import saslprep
|
||||
|
||||
password = saslprep(password)
|
||||
return password.encode('utf-8')[:127]
|
||||
return password.encode("utf-8")[:127]
|
||||
|
||||
def _password_hash(
|
||||
self,
|
||||
password: bytes,
|
||||
salt: bytes,
|
||||
vector: Optional[bytes] = None
|
||||
self, password: bytes, salt: bytes, vector: Optional[bytes] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Compute password hash depending on revision number
|
||||
|
@ -598,10 +618,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
return self._r6_password(password, salt[0:8], vector)
|
||||
|
||||
def _r5_password(
|
||||
self,
|
||||
password: bytes,
|
||||
salt: bytes,
|
||||
vector: Optional[bytes] = None
|
||||
self, password: bytes, salt: bytes, vector: Optional[bytes] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Compute the password for revision 5
|
||||
|
@ -613,10 +630,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
return hash.digest()
|
||||
|
||||
def _r6_password(
|
||||
self,
|
||||
password: bytes,
|
||||
salt: bytes,
|
||||
vector: Optional[bytes] = None
|
||||
self, password: bytes, salt: bytes, vector: Optional[bytes] = None
|
||||
) -> bytes:
|
||||
"""
|
||||
Compute the password for revision 6
|
||||
|
@ -629,10 +643,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
hashes = (sha256, sha384, sha512)
|
||||
round_no = last_byte_val = 0
|
||||
while round_no < 64 or last_byte_val > round_no - 32:
|
||||
k1 = (password + k + (vector or b'')) * 64
|
||||
e = self._aes_cbc_encrypt(
|
||||
key=k[:16], iv=k[16:32], data=k1
|
||||
)
|
||||
k1 = (password + k + (vector or b"")) * 64
|
||||
e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
|
||||
# compute the first 16 bytes of e,
|
||||
# interpreted as an unsigned integer mod 3
|
||||
next_hash = hashes[self._bytes_mod_3(e[:16])]
|
||||
|
@ -646,12 +658,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
# 256 is 1 mod 3, so we can just sum 'em
|
||||
return sum(b % 3 for b in input_bytes) % 3
|
||||
|
||||
def _aes_cbc_encrypt(
|
||||
self,
|
||||
key: bytes,
|
||||
iv: bytes,
|
||||
data: bytes
|
||||
) -> bytes:
|
||||
def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
|
||||
cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
|
||||
encryptor = cipher.encryptor() # type: ignore
|
||||
return encryptor.update(data) + encryptor.finalize() # type: ignore
|
||||
|
@ -660,9 +667,11 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
|||
initialization_vector = data[:16]
|
||||
ciphertext = data[16:]
|
||||
assert self.key is not None
|
||||
cipher = Cipher(algorithms.AES(self.key),
|
||||
cipher = Cipher(
|
||||
algorithms.AES(self.key),
|
||||
modes.CBC(initialization_vector),
|
||||
backend=default_backend()) # type: ignore
|
||||
backend=default_backend(),
|
||||
) # type: ignore
|
||||
return cipher.decryptor().update(ciphertext) # type: ignore
|
||||
|
||||
|
||||
|
@ -689,9 +698,9 @@ class PDFDocument:
|
|||
def __init__(
|
||||
self,
|
||||
parser: PDFParser,
|
||||
password: str = '',
|
||||
password: str = "",
|
||||
caching: bool = True,
|
||||
fallback: bool = True
|
||||
fallback: bool = True,
|
||||
) -> None:
|
||||
"Set the document to use a given PDFParser object."
|
||||
self.caching = caching
|
||||
|
@ -723,43 +732,42 @@ class PDFDocument:
|
|||
if not trailer:
|
||||
continue
|
||||
# If there's an encryption info, remember it.
|
||||
if 'Encrypt' in trailer:
|
||||
if 'ID' in trailer:
|
||||
id_value = list_value(trailer['ID'])
|
||||
if "Encrypt" in trailer:
|
||||
if "ID" in trailer:
|
||||
id_value = list_value(trailer["ID"])
|
||||
else:
|
||||
# Some documents may not have a /ID, use two empty
|
||||
# byte strings instead. Solves
|
||||
# https://github.com/pdfminer/pdfminer.six/issues/594
|
||||
id_value = (b'', b'')
|
||||
self.encryption = (id_value,
|
||||
dict_value(trailer['Encrypt']))
|
||||
id_value = (b"", b"")
|
||||
self.encryption = (id_value, dict_value(trailer["Encrypt"]))
|
||||
self._initialize_password(password)
|
||||
if 'Info' in trailer:
|
||||
self.info.append(dict_value(trailer['Info']))
|
||||
if 'Root' in trailer:
|
||||
if "Info" in trailer:
|
||||
self.info.append(dict_value(trailer["Info"]))
|
||||
if "Root" in trailer:
|
||||
# Every PDF file must have exactly one /Root dictionary.
|
||||
self.catalog = dict_value(trailer['Root'])
|
||||
self.catalog = dict_value(trailer["Root"])
|
||||
break
|
||||
else:
|
||||
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
|
||||
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
||||
raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
|
||||
if self.catalog.get("Type") is not LITERAL_CATALOG:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('Catalog not found!')
|
||||
raise PDFSyntaxError("Catalog not found!")
|
||||
return
|
||||
|
||||
KEYWORD_OBJ = KWD(b'obj')
|
||||
KEYWORD_OBJ = KWD(b"obj")
|
||||
|
||||
# _initialize_password(password=b'')
|
||||
# Perform the initialization with a given password.
|
||||
def _initialize_password(self, password: str = '') -> None:
|
||||
def _initialize_password(self, password: str = "") -> None:
|
||||
assert self.encryption is not None
|
||||
(docid, param) = self.encryption
|
||||
if literal_name(param.get('Filter')) != 'Standard':
|
||||
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
||||
v = int_value(param.get('V', 0))
|
||||
if literal_name(param.get("Filter")) != "Standard":
|
||||
raise PDFEncryptionError("Unknown filter: param=%r" % param)
|
||||
v = int_value(param.get("V", 0))
|
||||
factory = self.security_handler_registry.get(v)
|
||||
if factory is None:
|
||||
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
|
||||
raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
|
||||
handler = factory(docid, param, password)
|
||||
self.decipher = handler.decrypt
|
||||
self.is_printable = handler.is_printable()
|
||||
|
@ -769,12 +777,7 @@ class PDFDocument:
|
|||
self._parser.fallback = False # need to read streams with exact length
|
||||
return
|
||||
|
||||
def _getobj_objstm(
|
||||
self,
|
||||
stream: PDFStream,
|
||||
index: int,
|
||||
objid: int
|
||||
) -> object:
|
||||
def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
|
||||
if stream.objid in self._parsed_objs:
|
||||
(objs, n) = self._parsed_objs[stream.objid]
|
||||
else:
|
||||
|
@ -786,18 +789,18 @@ class PDFDocument:
|
|||
try:
|
||||
obj = objs[i]
|
||||
except IndexError:
|
||||
raise PDFSyntaxError('index too big: %r' % index)
|
||||
raise PDFSyntaxError("index too big: %r" % index)
|
||||
return obj
|
||||
|
||||
def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
|
||||
if stream.get('Type') is not LITERAL_OBJSTM:
|
||||
if stream.get("Type") is not LITERAL_OBJSTM:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||
raise PDFSyntaxError("Not a stream object: %r" % stream)
|
||||
try:
|
||||
n = cast(int, stream['N'])
|
||||
n = cast(int, stream["N"])
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||
raise PDFSyntaxError("N is not defined: %r" % stream)
|
||||
n = 0
|
||||
parser = PDFStreamParser(stream.get_data())
|
||||
parser.set_document(self)
|
||||
|
@ -830,11 +833,10 @@ class PDFDocument:
|
|||
objid1 = x[-2]
|
||||
# #### end hack around malformed pdf files
|
||||
if objid1 != objid:
|
||||
raise PDFSyntaxError('objid mismatch: {!r}={!r}'
|
||||
.format(objid1, objid))
|
||||
raise PDFSyntaxError("objid mismatch: {!r}={!r}".format(objid1, objid))
|
||||
|
||||
if kwd != KWD(b'obj'):
|
||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||
if kwd != KWD(b"obj"):
|
||||
raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
|
||||
(_, obj) = self._parser.nextobject()
|
||||
return obj
|
||||
|
||||
|
@ -846,8 +848,8 @@ class PDFDocument:
|
|||
:raises PDFObjectNotFound if objid does not exist in PDF
|
||||
"""
|
||||
if not self.xrefs:
|
||||
raise PDFException('PDFDocument is not initialized')
|
||||
log.debug('getobj: objid=%r', objid)
|
||||
raise PDFException("PDFDocument is not initialized")
|
||||
log.debug("getobj: objid=%r", objid)
|
||||
if objid in self._cached_objs:
|
||||
(obj, genno) = self._cached_objs[objid]
|
||||
else:
|
||||
|
@ -863,8 +865,7 @@ class PDFDocument:
|
|||
else:
|
||||
obj = self._getobj_parse(index, objid)
|
||||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno,
|
||||
obj)
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
|
||||
if isinstance(obj, PDFStream):
|
||||
obj.set_objid(objid, genno)
|
||||
|
@ -873,7 +874,7 @@ class PDFDocument:
|
|||
continue
|
||||
else:
|
||||
raise PDFObjectNotFound(objid)
|
||||
log.debug('register: objid=%r: %r', objid, obj)
|
||||
log.debug("register: objid=%r: %r", objid, obj)
|
||||
if self.caching:
|
||||
self._cached_objs[objid] = (obj, genno)
|
||||
return obj
|
||||
|
@ -881,25 +882,25 @@ class PDFDocument:
|
|||
OutlineType = Tuple[Any, Any, Any, Any, Any]
|
||||
|
||||
def get_outlines(self) -> Iterator[OutlineType]:
|
||||
if 'Outlines' not in self.catalog:
|
||||
if "Outlines" not in self.catalog:
|
||||
raise PDFNoOutlines
|
||||
|
||||
def search(entry: object, level: int
|
||||
) -> Iterator[PDFDocument.OutlineType]:
|
||||
def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
|
||||
entry = dict_value(entry)
|
||||
if 'Title' in entry:
|
||||
if 'A' in entry or 'Dest' in entry:
|
||||
title = decode_text(str_value(entry['Title']))
|
||||
dest = entry.get('Dest')
|
||||
action = entry.get('A')
|
||||
se = entry.get('SE')
|
||||
if "Title" in entry:
|
||||
if "A" in entry or "Dest" in entry:
|
||||
title = decode_text(str_value(entry["Title"]))
|
||||
dest = entry.get("Dest")
|
||||
action = entry.get("A")
|
||||
se = entry.get("SE")
|
||||
yield (level, title, dest, action, se)
|
||||
if 'First' in entry and 'Last' in entry:
|
||||
yield from search(entry['First'], level+1)
|
||||
if 'Next' in entry:
|
||||
yield from search(entry['Next'], level)
|
||||
if "First" in entry and "Last" in entry:
|
||||
yield from search(entry["First"], level + 1)
|
||||
if "Next" in entry:
|
||||
yield from search(entry["Next"], level)
|
||||
return
|
||||
return search(self.catalog['Outlines'], 0)
|
||||
|
||||
return search(self.catalog["Outlines"], 0)
|
||||
|
||||
def get_page_labels(self) -> Iterator[str]:
|
||||
"""
|
||||
|
@ -913,51 +914,49 @@ class PDFDocument:
|
|||
assert self.catalog is not None
|
||||
|
||||
try:
|
||||
page_labels = PageLabels(self.catalog['PageLabels'])
|
||||
page_labels = PageLabels(self.catalog["PageLabels"])
|
||||
except (PDFTypeError, KeyError):
|
||||
raise PDFNoPageLabels
|
||||
|
||||
return page_labels.labels
|
||||
|
||||
def lookup_name(
|
||||
self,
|
||||
cat: str,
|
||||
key: Union[str, bytes]
|
||||
) -> Any:
|
||||
def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
|
||||
try:
|
||||
names = dict_value(self.catalog['Names'])
|
||||
names = dict_value(self.catalog["Names"])
|
||||
except (PDFTypeError, KeyError):
|
||||
raise KeyError((cat, key))
|
||||
# may raise KeyError
|
||||
d0 = dict_value(names[cat])
|
||||
|
||||
def lookup(d: Dict[str, Any]) -> Any:
|
||||
if 'Limits' in d:
|
||||
(k1, k2) = list_value(d['Limits'])
|
||||
if "Limits" in d:
|
||||
(k1, k2) = list_value(d["Limits"])
|
||||
if key < k1 or k2 < key:
|
||||
return None
|
||||
if 'Names' in d:
|
||||
objs = list_value(d['Names'])
|
||||
names = dict(cast(Iterator[Tuple[Union[str, bytes], Any]],
|
||||
choplist(2, objs)))
|
||||
if "Names" in d:
|
||||
objs = list_value(d["Names"])
|
||||
names = dict(
|
||||
cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs))
|
||||
)
|
||||
return names[key]
|
||||
if 'Kids' in d:
|
||||
for c in list_value(d['Kids']):
|
||||
if "Kids" in d:
|
||||
for c in list_value(d["Kids"]):
|
||||
v = lookup(dict_value(c))
|
||||
if v:
|
||||
return v
|
||||
raise KeyError((cat, key))
|
||||
|
||||
return lookup(d0)
|
||||
|
||||
def get_dest(self, name: Union[str, bytes]) -> Any:
|
||||
try:
|
||||
# PDF-1.2 or later
|
||||
obj = self.lookup_name('Dests', name)
|
||||
obj = self.lookup_name("Dests", name)
|
||||
except KeyError:
|
||||
# PDF-1.1 or prior
|
||||
if 'Dests' not in self.catalog:
|
||||
if "Dests" not in self.catalog:
|
||||
raise PDFDestinationNotFound(name)
|
||||
d0 = dict_value(self.catalog['Dests'])
|
||||
d0 = dict_value(self.catalog["Dests"])
|
||||
if name not in d0:
|
||||
raise PDFDestinationNotFound(name)
|
||||
obj = d0[name]
|
||||
|
@ -970,23 +969,20 @@ class PDFDocument:
|
|||
prev = None
|
||||
for line in parser.revreadlines():
|
||||
line = line.strip()
|
||||
log.debug('find_xref: %r', line)
|
||||
if line == b'startxref':
|
||||
log.debug("find_xref: %r", line)
|
||||
if line == b"startxref":
|
||||
break
|
||||
if line:
|
||||
prev = line
|
||||
else:
|
||||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
log.debug('xref found: pos=%r', prev)
|
||||
raise PDFNoValidXRef("Unexpected EOF")
|
||||
log.debug("xref found: pos=%r", prev)
|
||||
assert prev is not None
|
||||
return int(prev)
|
||||
|
||||
# read xref table
|
||||
def read_xref_from(
|
||||
self,
|
||||
parser: PDFParser,
|
||||
start: int,
|
||||
xrefs: List[PDFBaseXRef]
|
||||
self, parser: PDFParser, start: int, xrefs: List[PDFBaseXRef]
|
||||
) -> None:
|
||||
"""Reads XRefs from the given location."""
|
||||
parser.seek(start)
|
||||
|
@ -994,8 +990,8 @@ class PDFDocument:
|
|||
try:
|
||||
(pos, token) = parser.nexttoken()
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
log.debug('read_xref_from: start=%d, token=%r', start, token)
|
||||
raise PDFNoValidXRef("Unexpected EOF")
|
||||
log.debug("read_xref_from: start=%d, token=%r", start, token)
|
||||
if isinstance(token, int):
|
||||
# XRefStream: PDF-1.5
|
||||
parser.seek(pos)
|
||||
|
@ -1009,13 +1005,13 @@ class PDFDocument:
|
|||
xref.load(parser)
|
||||
xrefs.append(xref)
|
||||
trailer = xref.get_trailer()
|
||||
log.debug('trailer: %r', trailer)
|
||||
if 'XRefStm' in trailer:
|
||||
pos = int_value(trailer['XRefStm'])
|
||||
log.debug("trailer: %r", trailer)
|
||||
if "XRefStm" in trailer:
|
||||
pos = int_value(trailer["XRefStm"])
|
||||
self.read_xref_from(parser, pos, xrefs)
|
||||
if 'Prev' in trailer:
|
||||
if "Prev" in trailer:
|
||||
# find previous xref
|
||||
pos = int_value(trailer['Prev'])
|
||||
pos = int_value(trailer["Prev"])
|
||||
self.read_xref_from(parser, pos, xrefs)
|
||||
return
|
||||
|
||||
|
@ -1033,16 +1029,16 @@ class PageLabels(NumberTree):
|
|||
# The tree must begin with page index 0
|
||||
if len(ranges) == 0 or ranges[0][0] != 0:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('PageLabels is missing page index 0')
|
||||
raise PDFSyntaxError("PageLabels is missing page index 0")
|
||||
else:
|
||||
# Try to cope, by assuming empty labels for the initial pages
|
||||
ranges.insert(0, (0, {}))
|
||||
|
||||
for (next, (start, label_dict_unchecked)) in enumerate(ranges, 1):
|
||||
label_dict = dict_value(label_dict_unchecked)
|
||||
style = label_dict.get('S')
|
||||
prefix = decode_text(str_value(label_dict.get('P', b'')))
|
||||
first_value = int_value(label_dict.get('St', 1))
|
||||
style = label_dict.get("S")
|
||||
prefix = decode_text(str_value(label_dict.get("P", b"")))
|
||||
first_value = int_value(label_dict.get("St", 1))
|
||||
|
||||
if next == len(ranges):
|
||||
# This is the last specified range. It continues until the end
|
||||
|
@ -1061,18 +1057,18 @@ class PageLabels(NumberTree):
|
|||
def _format_page_label(value: int, style: Any) -> str:
|
||||
"""Format page label value in a specific style"""
|
||||
if style is None:
|
||||
label = ''
|
||||
elif style is LIT('D'): # Decimal arabic numerals
|
||||
label = ""
|
||||
elif style is LIT("D"): # Decimal arabic numerals
|
||||
label = str(value)
|
||||
elif style is LIT('R'): # Uppercase roman numerals
|
||||
elif style is LIT("R"): # Uppercase roman numerals
|
||||
label = format_int_roman(value).upper()
|
||||
elif style is LIT('r'): # Lowercase roman numerals
|
||||
elif style is LIT("r"): # Lowercase roman numerals
|
||||
label = format_int_roman(value)
|
||||
elif style is LIT('A'): # Uppercase letters A-Z, AA-ZZ...
|
||||
elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ...
|
||||
label = format_int_alpha(value).upper()
|
||||
elif style is LIT('a'): # Lowercase letters a-z, aa-zz...
|
||||
elif style is LIT("a"): # Lowercase letters a-z, aa-zz...
|
||||
label = format_int_alpha(value)
|
||||
else:
|
||||
log.warning('Unknown page label style: %r', style)
|
||||
label = ''
|
||||
log.warning("Unknown page label style: %r", style)
|
||||
label = ""
|
||||
return label
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -50,11 +50,11 @@ class PDFInterpreterError(PDFException):
|
|||
pass
|
||||
|
||||
|
||||
LITERAL_PDF = LIT('PDF')
|
||||
LITERAL_TEXT = LIT('Text')
|
||||
LITERAL_FONT = LIT('Font')
|
||||
LITERAL_FORM = LIT('Form')
|
||||
LITERAL_IMAGE = LIT('Image')
|
||||
LITERAL_PDF = LIT("PDF")
|
||||
LITERAL_TEXT = LIT("Text")
|
||||
LITERAL_FONT = LIT("Font")
|
||||
LITERAL_FORM = LIT("Form")
|
||||
LITERAL_IMAGE = LIT("Image")
|
||||
|
||||
|
||||
class PDFTextState:
|
||||
|
@ -75,12 +75,23 @@ class PDFTextState:
|
|||
# self.linematrix is set
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \
|
||||
'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \
|
||||
'matrix=%r, linematrix=%r>' \
|
||||
% (self.font, self.fontsize, self.charspace, self.wordspace,
|
||||
self.scaling, self.leading, self.render, self.rise,
|
||||
self.matrix, self.linematrix)
|
||||
return (
|
||||
"<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
|
||||
"wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
|
||||
"matrix=%r, linematrix=%r>"
|
||||
% (
|
||||
self.font,
|
||||
self.fontsize,
|
||||
self.charspace,
|
||||
self.wordspace,
|
||||
self.scaling,
|
||||
self.leading,
|
||||
self.render,
|
||||
self.rise,
|
||||
self.matrix,
|
||||
self.linematrix,
|
||||
)
|
||||
)
|
||||
|
||||
def copy(self) -> "PDFTextState":
|
||||
obj = PDFTextState()
|
||||
|
@ -104,11 +115,11 @@ class PDFTextState:
|
|||
Color = Union[
|
||||
float, # Greyscale
|
||||
Tuple[float, float, float], # R, G, B
|
||||
Tuple[float, float, float, float]] # C, M, Y, K
|
||||
Tuple[float, float, float, float],
|
||||
] # C, M, Y, K
|
||||
|
||||
|
||||
class PDFGraphicState:
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.linewidth: float = 0
|
||||
self.linecap: Optional[object] = None
|
||||
|
@ -138,12 +149,22 @@ class PDFGraphicState:
|
|||
return obj
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
|
||||
' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
|
||||
' stroking color=%r, non stroking color=%r>' %
|
||||
(self.linewidth, self.linecap, self.linejoin,
|
||||
self.miterlimit, self.dash, self.intent, self.flatness,
|
||||
self.scolor, self.ncolor))
|
||||
return (
|
||||
"<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
|
||||
" miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
|
||||
" stroking color=%r, non stroking color=%r>"
|
||||
% (
|
||||
self.linewidth,
|
||||
self.linecap,
|
||||
self.linejoin,
|
||||
self.miterlimit,
|
||||
self.dash,
|
||||
self.intent,
|
||||
self.flatness,
|
||||
self.scolor,
|
||||
self.ncolor,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class PDFResourceManager:
|
||||
|
@ -179,41 +200,41 @@ class PDFResourceManager:
|
|||
if objid and objid in self._cached_fonts:
|
||||
font = self._cached_fonts[objid]
|
||||
else:
|
||||
log.debug('get_font: create: objid=%r, spec=%r', objid, spec)
|
||||
log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
|
||||
if settings.STRICT:
|
||||
if spec['Type'] is not LITERAL_FONT:
|
||||
raise PDFFontError('Type is not /Font')
|
||||
if spec["Type"] is not LITERAL_FONT:
|
||||
raise PDFFontError("Type is not /Font")
|
||||
# Create a Font object.
|
||||
if 'Subtype' in spec:
|
||||
subtype = literal_name(spec['Subtype'])
|
||||
if "Subtype" in spec:
|
||||
subtype = literal_name(spec["Subtype"])
|
||||
else:
|
||||
if settings.STRICT:
|
||||
raise PDFFontError('Font Subtype is not specified.')
|
||||
subtype = 'Type1'
|
||||
if subtype in ('Type1', 'MMType1'):
|
||||
raise PDFFontError("Font Subtype is not specified.")
|
||||
subtype = "Type1"
|
||||
if subtype in ("Type1", "MMType1"):
|
||||
# Type1 Font
|
||||
font = PDFType1Font(self, spec)
|
||||
elif subtype == 'TrueType':
|
||||
elif subtype == "TrueType":
|
||||
# TrueType Font
|
||||
font = PDFTrueTypeFont(self, spec)
|
||||
elif subtype == 'Type3':
|
||||
elif subtype == "Type3":
|
||||
# Type3 Font
|
||||
font = PDFType3Font(self, spec)
|
||||
elif subtype in ('CIDFontType0', 'CIDFontType2'):
|
||||
elif subtype in ("CIDFontType0", "CIDFontType2"):
|
||||
# CID Font
|
||||
font = PDFCIDFont(self, spec)
|
||||
elif subtype == 'Type0':
|
||||
elif subtype == "Type0":
|
||||
# Type0 Font
|
||||
dfonts = list_value(spec['DescendantFonts'])
|
||||
dfonts = list_value(spec["DescendantFonts"])
|
||||
assert dfonts
|
||||
subspec = dict_value(dfonts[0]).copy()
|
||||
for k in ('Encoding', 'ToUnicode'):
|
||||
for k in ("Encoding", "ToUnicode"):
|
||||
if k in spec:
|
||||
subspec[k] = resolve1(spec[k])
|
||||
font = self.get_font(None, subspec)
|
||||
else:
|
||||
if settings.STRICT:
|
||||
raise PDFFontError('Invalid Font spec: %r' % spec)
|
||||
raise PDFFontError("Invalid Font spec: %r" % spec)
|
||||
font = PDFType1Font(self, spec) # this is so wrong!
|
||||
if objid and self.caching:
|
||||
self._cached_fonts[objid] = font
|
||||
|
@ -221,7 +242,6 @@ class PDFResourceManager:
|
|||
|
||||
|
||||
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
|
||||
|
||||
def __init__(self, streams: Sequence[object]) -> None:
|
||||
self.streams = streams
|
||||
self.istream = 0
|
||||
|
@ -236,7 +256,7 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
|
|||
strm = stream_value(self.streams[self.istream])
|
||||
self.istream += 1
|
||||
else:
|
||||
raise PSEOF('Unexpected EOF, file truncated?')
|
||||
raise PSEOF("Unexpected EOF, file truncated?")
|
||||
self.fp = BytesIO(strm.get_data())
|
||||
|
||||
def seek(self, pos: int) -> None:
|
||||
|
@ -255,14 +275,10 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
|
|||
self.fp = None # type: ignore[assignment]
|
||||
self.charpos = 0
|
||||
|
||||
def get_inline_data(
|
||||
self,
|
||||
pos: int,
|
||||
target: bytes = b'EI'
|
||||
) -> Tuple[int, bytes]:
|
||||
def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
|
||||
self.seek(pos)
|
||||
i = 0
|
||||
data = b''
|
||||
data = b""
|
||||
while i <= len(target):
|
||||
self.fillbuf()
|
||||
if i:
|
||||
|
@ -286,29 +302,28 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
|
|||
data += self.buf[self.charpos :]
|
||||
self.charpos = len(self.buf)
|
||||
data = data[: -(len(target) + 1)] # strip the last part
|
||||
data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
|
||||
data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
|
||||
return (pos, data)
|
||||
|
||||
def flush(self) -> None:
|
||||
self.add_results(*self.popall())
|
||||
|
||||
KEYWORD_BI = KWD(b'BI')
|
||||
KEYWORD_ID = KWD(b'ID')
|
||||
KEYWORD_EI = KWD(b'EI')
|
||||
KEYWORD_BI = KWD(b"BI")
|
||||
KEYWORD_ID = KWD(b"ID")
|
||||
KEYWORD_EI = KWD(b"EI")
|
||||
|
||||
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||
if token is self.KEYWORD_BI:
|
||||
# inline image within a content stream
|
||||
self.start_type(pos, 'inline')
|
||||
self.start_type(pos, "inline")
|
||||
elif token is self.KEYWORD_ID:
|
||||
try:
|
||||
(_, objs) = self.end_type('inline')
|
||||
(_, objs) = self.end_type("inline")
|
||||
if len(objs) % 2 != 0:
|
||||
error_msg = 'Invalid dictionary construct: {!r}' \
|
||||
.format(objs)
|
||||
error_msg = "Invalid dictionary construct: {!r}".format(objs)
|
||||
raise PSTypeError(error_msg)
|
||||
d = {literal_name(k): v for (k, v) in choplist(2, objs)}
|
||||
(pos, data) = self.get_inline_data(pos+len(b'ID '))
|
||||
(pos, data) = self.get_inline_data(pos + len(b"ID "))
|
||||
obj = PDFStream(d, data)
|
||||
self.push((pos, obj))
|
||||
self.push((pos, self.KEYWORD_EI))
|
||||
|
@ -351,32 +366,30 @@ class PDFPageInterpreter:
|
|||
name = literal_name(spec[0])
|
||||
else:
|
||||
name = literal_name(spec)
|
||||
if name == 'ICCBased' and isinstance(spec, list) \
|
||||
and 2 <= len(spec):
|
||||
return PDFColorSpace(name, stream_value(spec[1])['N'])
|
||||
elif name == 'DeviceN' and isinstance(spec, list) \
|
||||
and 2 <= len(spec):
|
||||
if name == "ICCBased" and isinstance(spec, list) and 2 <= len(spec):
|
||||
return PDFColorSpace(name, stream_value(spec[1])["N"])
|
||||
elif name == "DeviceN" and isinstance(spec, list) and 2 <= len(spec):
|
||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE.get(name)
|
||||
|
||||
for (k, v) in dict_value(resources).items():
|
||||
log.debug('Resource: %r: %r', k, v)
|
||||
if k == 'Font':
|
||||
log.debug("Resource: %r: %r", k, v)
|
||||
if k == "Font":
|
||||
for (fontid, spec) in dict_value(v).items():
|
||||
objid = None
|
||||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
elif k == "ColorSpace":
|
||||
for (csid, spec) in dict_value(v).items():
|
||||
colorspace = get_colorspace(resolve1(spec))
|
||||
if colorspace is not None:
|
||||
self.csmap[csid] = colorspace
|
||||
elif k == 'ProcSet':
|
||||
elif k == "ProcSet":
|
||||
self.rsrcmgr.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
elif k == "XObject":
|
||||
for (xobjid, xobjstrm) in dict_value(v).items():
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
return
|
||||
|
@ -410,14 +423,11 @@ class PDFPageInterpreter:
|
|||
self.argstack = self.argstack[:-n]
|
||||
return x
|
||||
|
||||
def get_current_state(
|
||||
self
|
||||
) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
|
||||
def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
|
||||
return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
|
||||
|
||||
def set_current_state(
|
||||
self,
|
||||
state: Tuple[Matrix, PDFTextState, PDFGraphicState]
|
||||
self, state: Tuple[Matrix, PDFTextState, PDFGraphicState]
|
||||
) -> None:
|
||||
(self.ctm, self.textstate, self.graphicstate) = state
|
||||
self.device.set_ctm(self.ctm)
|
||||
|
@ -441,11 +451,10 @@ class PDFPageInterpreter:
|
|||
c1: PDFStackT,
|
||||
d1: PDFStackT,
|
||||
e1: PDFStackT,
|
||||
f1: PDFStackT
|
||||
f1: PDFStackT,
|
||||
) -> None:
|
||||
"""Concatenate matrix to current transformation matrix"""
|
||||
self.ctm = \
|
||||
mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
|
||||
self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
|
||||
self.device.set_ctm(self.ctm)
|
||||
return
|
||||
|
||||
|
@ -491,12 +500,12 @@ class PDFPageInterpreter:
|
|||
|
||||
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
|
||||
"""Begin new subpath"""
|
||||
self.curpath.append(('m', cast(float, x), cast(float, y)))
|
||||
self.curpath.append(("m", cast(float, x), cast(float, y)))
|
||||
return
|
||||
|
||||
def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
|
||||
"""Append straight line segment to path"""
|
||||
self.curpath.append(('l', cast(float, x), cast(float, y)))
|
||||
self.curpath.append(("l", cast(float, x), cast(float, y)))
|
||||
return
|
||||
|
||||
def do_c(
|
||||
|
@ -506,66 +515,57 @@ class PDFPageInterpreter:
|
|||
x2: PDFStackT,
|
||||
y2: PDFStackT,
|
||||
x3: PDFStackT,
|
||||
y3: PDFStackT
|
||||
y3: PDFStackT,
|
||||
) -> None:
|
||||
"""Append curved segment to path (three control points)"""
|
||||
self.curpath.append(('c', cast(float, x1), cast(float, y1),
|
||||
cast(float, x2), cast(float, y2),
|
||||
cast(float, x3), cast(float, y3)))
|
||||
self.curpath.append(
|
||||
(
|
||||
"c",
|
||||
cast(float, x1),
|
||||
cast(float, y1),
|
||||
cast(float, x2),
|
||||
cast(float, y2),
|
||||
cast(float, x3),
|
||||
cast(float, y3),
|
||||
)
|
||||
)
|
||||
return
|
||||
|
||||
def do_v(
|
||||
self,
|
||||
x2: PDFStackT,
|
||||
y2: PDFStackT,
|
||||
x3: PDFStackT,
|
||||
y3: PDFStackT
|
||||
) -> None:
|
||||
def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
|
||||
"""Append curved segment to path (initial point replicated)"""
|
||||
self.curpath.append(('v', cast(float, x2), cast(float, y2),
|
||||
cast(float, x3), cast(float, y3)))
|
||||
self.curpath.append(
|
||||
("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3))
|
||||
)
|
||||
return
|
||||
|
||||
def do_y(
|
||||
self,
|
||||
x1: PDFStackT,
|
||||
y1: PDFStackT,
|
||||
x3: PDFStackT,
|
||||
y3: PDFStackT
|
||||
) -> None:
|
||||
def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
|
||||
"""Append curved segment to path (final point replicated)"""
|
||||
self.curpath.append(('y', cast(float, x1), cast(float, y1),
|
||||
cast(float, x3), cast(float, y3)))
|
||||
self.curpath.append(
|
||||
("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3))
|
||||
)
|
||||
return
|
||||
|
||||
def do_h(self) -> None:
|
||||
"""Close subpath"""
|
||||
self.curpath.append(('h',))
|
||||
self.curpath.append(("h",))
|
||||
return
|
||||
|
||||
def do_re(
|
||||
self,
|
||||
x: PDFStackT,
|
||||
y: PDFStackT,
|
||||
w: PDFStackT,
|
||||
h: PDFStackT
|
||||
) -> None:
|
||||
def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
|
||||
"""Append rectangle to path"""
|
||||
x = cast(float, x)
|
||||
y = cast(float, y)
|
||||
w = cast(float, w)
|
||||
h = cast(float, h)
|
||||
self.curpath.append(('m', x, y))
|
||||
self.curpath.append(('l', x+w, y))
|
||||
self.curpath.append(('l', x+w, y+h))
|
||||
self.curpath.append(('l', x, y+h))
|
||||
self.curpath.append(('h',))
|
||||
self.curpath.append(("m", x, y))
|
||||
self.curpath.append(("l", x + w, y))
|
||||
self.curpath.append(("l", x + w, y + h))
|
||||
self.curpath.append(("l", x, y + h))
|
||||
self.curpath.append(("h",))
|
||||
return
|
||||
|
||||
def do_S(self) -> None:
|
||||
"""Stroke path"""
|
||||
self.device.paint_path(self.graphicstate, True, False, False,
|
||||
self.curpath)
|
||||
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
|
@ -577,8 +577,7 @@ class PDFPageInterpreter:
|
|||
|
||||
def do_f(self) -> None:
|
||||
"""Fill path using nonzero winding number rule"""
|
||||
self.device.paint_path(self.graphicstate, False, True, False,
|
||||
self.curpath)
|
||||
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
|
@ -588,22 +587,19 @@ class PDFPageInterpreter:
|
|||
|
||||
def do_f_a(self) -> None:
|
||||
"""Fill path using even-odd rule"""
|
||||
self.device.paint_path(self.graphicstate, False, True, True,
|
||||
self.curpath)
|
||||
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
def do_B(self) -> None:
|
||||
"""Fill and stroke path using nonzero winding number rule"""
|
||||
self.device.paint_path(self.graphicstate, True, True, False,
|
||||
self.curpath)
|
||||
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
def do_B_a(self) -> None:
|
||||
"""Fill and stroke path using even-odd rule"""
|
||||
self.device.paint_path(self.graphicstate, True, True, True,
|
||||
self.curpath)
|
||||
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
|
@ -641,7 +637,7 @@ class PDFPageInterpreter:
|
|||
self.scs = self.csmap[literal_name(name)]
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
||||
raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
|
||||
return
|
||||
|
||||
def do_cs(self, name: PDFStackT) -> None:
|
||||
|
@ -650,7 +646,7 @@ class PDFPageInterpreter:
|
|||
self.ncs = self.csmap[literal_name(name)]
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
||||
raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
|
||||
return
|
||||
|
||||
def do_G(self, gray: PDFStackT) -> None:
|
||||
|
@ -665,38 +661,32 @@ class PDFPageInterpreter:
|
|||
|
||||
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
|
||||
"""Set RGB color for stroking operations"""
|
||||
self.graphicstate.scolor = \
|
||||
(cast(float, r), cast(float, g), cast(float, b))
|
||||
self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
|
||||
return
|
||||
|
||||
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
|
||||
"""Set RGB color for nonstroking operations"""
|
||||
self.graphicstate.ncolor = \
|
||||
(cast(float, r), cast(float, g), cast(float, b))
|
||||
self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
|
||||
return
|
||||
|
||||
def do_K(
|
||||
self,
|
||||
c: PDFStackT,
|
||||
m: PDFStackT,
|
||||
y: PDFStackT,
|
||||
k: PDFStackT
|
||||
) -> None:
|
||||
def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
|
||||
"""Set CMYK color for stroking operations"""
|
||||
self.graphicstate.scolor = \
|
||||
(cast(float, c), cast(float, m), cast(float, y), cast(float, k))
|
||||
self.graphicstate.scolor = (
|
||||
cast(float, c),
|
||||
cast(float, m),
|
||||
cast(float, y),
|
||||
cast(float, k),
|
||||
)
|
||||
return
|
||||
|
||||
def do_k(
|
||||
self,
|
||||
c: PDFStackT,
|
||||
m: PDFStackT,
|
||||
y: PDFStackT,
|
||||
k: PDFStackT
|
||||
) -> None:
|
||||
def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
|
||||
"""Set CMYK color for nonstroking operations"""
|
||||
self.graphicstate.ncolor = \
|
||||
(cast(float, c), cast(float, m), cast(float, y), cast(float, k))
|
||||
self.graphicstate.ncolor = (
|
||||
cast(float, c),
|
||||
cast(float, m),
|
||||
cast(float, y),
|
||||
cast(float, k),
|
||||
)
|
||||
return
|
||||
|
||||
def do_SCN(self) -> None:
|
||||
|
@ -705,7 +695,7 @@ class PDFPageInterpreter:
|
|||
n = self.scs.ncomponents
|
||||
else:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('No colorspace specified!')
|
||||
raise PDFInterpreterError("No colorspace specified!")
|
||||
n = 1
|
||||
self.graphicstate.scolor = cast(Color, self.pop(n))
|
||||
return
|
||||
|
@ -716,7 +706,7 @@ class PDFPageInterpreter:
|
|||
n = self.ncs.ncomponents
|
||||
else:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('No colorspace specified!')
|
||||
raise PDFInterpreterError("No colorspace specified!")
|
||||
n = 1
|
||||
self.graphicstate.ncolor = cast(Color, self.pop(n))
|
||||
return
|
||||
|
@ -831,7 +821,7 @@ class PDFPageInterpreter:
|
|||
self.textstate.font = self.fontmap[literal_name(fontid)]
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
||||
raise PDFInterpreterError("Undefined Font id: %r" % fontid)
|
||||
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||
self.textstate.fontsize = cast(float, fontsize)
|
||||
return
|
||||
|
@ -875,7 +865,7 @@ class PDFPageInterpreter:
|
|||
c: PDFStackT,
|
||||
d: PDFStackT,
|
||||
e: PDFStackT,
|
||||
f: PDFStackT
|
||||
f: PDFStackT,
|
||||
) -> None:
|
||||
"""Set text matrix and text line matrix"""
|
||||
self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
|
||||
|
@ -885,8 +875,14 @@ class PDFPageInterpreter:
|
|||
def do_T_a(self) -> None:
|
||||
"""Move to start of next text line"""
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e,
|
||||
self.textstate.leading*d+f)
|
||||
self.textstate.matrix = (
|
||||
a,
|
||||
b,
|
||||
c,
|
||||
d,
|
||||
self.textstate.leading * c + e,
|
||||
self.textstate.leading * d + f,
|
||||
)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
|
@ -894,11 +890,12 @@ class PDFPageInterpreter:
|
|||
"""Show text, allowing individual glyph positioning"""
|
||||
if self.textstate.font is None:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('No font specified!')
|
||||
raise PDFInterpreterError("No font specified!")
|
||||
return
|
||||
assert self.ncs is not None
|
||||
self.device.render_string(self.textstate, cast(PDFTextSeq, seq),
|
||||
self.ncs, self.graphicstate.copy())
|
||||
self.device.render_string(
|
||||
self.textstate, cast(PDFTextSeq, seq), self.ncs, self.graphicstate.copy()
|
||||
)
|
||||
return
|
||||
|
||||
def do_Tj(self, s: PDFStackT) -> None:
|
||||
|
@ -935,7 +932,7 @@ class PDFPageInterpreter:
|
|||
|
||||
def do_EI(self, obj: PDFStackT) -> None:
|
||||
"""End inline image object"""
|
||||
if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj:
|
||||
if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
|
||||
iobjid = str(id(obj))
|
||||
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||
self.device.render_image(iobjid, obj)
|
||||
|
@ -949,28 +946,28 @@ class PDFPageInterpreter:
|
|||
xobj = stream_value(self.xobjmap[xobjid])
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||
raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
|
||||
return
|
||||
log.debug('Processing xobj: %r', xobj)
|
||||
subtype = xobj.get('Subtype')
|
||||
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
||||
log.debug("Processing xobj: %r", xobj)
|
||||
subtype = xobj.get("Subtype")
|
||||
if subtype is LITERAL_FORM and "BBox" in xobj:
|
||||
interpreter = self.dup()
|
||||
bbox = cast(Rect, list_value(xobj['BBox']))
|
||||
matrix = cast(Matrix, list_value(
|
||||
xobj.get('Matrix', MATRIX_IDENTITY)))
|
||||
bbox = cast(Rect, list_value(xobj["BBox"]))
|
||||
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
|
||||
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
||||
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
||||
# instead of having their own Resources entry.
|
||||
xobjres = xobj.get('Resources')
|
||||
xobjres = xobj.get("Resources")
|
||||
if xobjres:
|
||||
resources = dict_value(xobjres)
|
||||
else:
|
||||
resources = self.resources.copy()
|
||||
self.device.begin_figure(xobjid, bbox, matrix)
|
||||
interpreter.render_contents(resources, [xobj],
|
||||
ctm=mult_matrix(matrix, self.ctm))
|
||||
interpreter.render_contents(
|
||||
resources, [xobj], ctm=mult_matrix(matrix, self.ctm)
|
||||
)
|
||||
self.device.end_figure(xobjid)
|
||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
||||
elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
|
||||
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
|
||||
self.device.render_image(xobjid, xobj)
|
||||
self.device.end_figure(xobjid)
|
||||
|
@ -980,7 +977,7 @@ class PDFPageInterpreter:
|
|||
return
|
||||
|
||||
def process_page(self, page: PDFPage) -> None:
|
||||
log.debug('Processing page: %r', page)
|
||||
log.debug("Processing page: %r", page)
|
||||
(x0, y0, x1, y1) = page.mediabox
|
||||
if page.rotate == 90:
|
||||
ctm = (0, -1, 1, 0, -y0, x1)
|
||||
|
@ -999,14 +996,15 @@ class PDFPageInterpreter:
|
|||
self,
|
||||
resources: Dict[object, object],
|
||||
streams: Sequence[object],
|
||||
ctm: Matrix = MATRIX_IDENTITY
|
||||
ctm: Matrix = MATRIX_IDENTITY,
|
||||
) -> None:
|
||||
"""Render the content streams.
|
||||
|
||||
This method may be called recursively.
|
||||
"""
|
||||
log.debug('render_contents: resources=%r, streams=%r, ctm=%r',
|
||||
resources, streams, ctm)
|
||||
log.debug(
|
||||
"render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
|
||||
)
|
||||
self.init_resources(resources)
|
||||
self.init_state(ctm)
|
||||
self.execute(list_value(streams))
|
||||
|
@ -1025,22 +1023,23 @@ class PDFPageInterpreter:
|
|||
break
|
||||
if isinstance(obj, PSKeyword):
|
||||
name = keyword_name(obj)
|
||||
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w')\
|
||||
.replace("'", '_q')
|
||||
method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
|
||||
"'", "_q"
|
||||
)
|
||||
if hasattr(self, method):
|
||||
func = getattr(self, method)
|
||||
nargs = func.__code__.co_argcount - 1
|
||||
if nargs:
|
||||
args = self.pop(nargs)
|
||||
log.debug('exec: %s %r', name, args)
|
||||
log.debug("exec: %s %r", name, args)
|
||||
if len(args) == nargs:
|
||||
func(*args)
|
||||
else:
|
||||
log.debug('exec: %s', name)
|
||||
log.debug("exec: %s", name)
|
||||
func()
|
||||
else:
|
||||
if settings.STRICT:
|
||||
error_msg = 'Unknown operator: %r' % name
|
||||
error_msg = "Unknown operator: %r" % name
|
||||
raise PDFInterpreterError(error_msg)
|
||||
else:
|
||||
self.push(obj)
|
||||
|
|
|
@ -4,8 +4,7 @@ from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
|
|||
|
||||
from pdfminer.utils import Rect
|
||||
from . import settings
|
||||
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
|
||||
PDFNoPageLabels
|
||||
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, PDFNoPageLabels
|
||||
from .pdfparser import PDFParser
|
||||
from .pdftypes import PDFObjectNotFound
|
||||
from .pdftypes import dict_value
|
||||
|
@ -17,8 +16,8 @@ from .psparser import LIT
|
|||
log = logging.getLogger(__name__)
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
LITERAL_PAGE = LIT("Page")
|
||||
LITERAL_PAGES = LIT("Pages")
|
||||
|
||||
|
||||
class PDFPage:
|
||||
|
@ -44,11 +43,7 @@ class PDFPage:
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
doc: PDFDocument,
|
||||
pageid: object,
|
||||
attrs: object,
|
||||
label: Optional[str]
|
||||
self, doc: PDFDocument, pageid: object, attrs: object, label: Optional[str]
|
||||
) -> None:
|
||||
"""Initialize a page object.
|
||||
|
||||
|
@ -61,19 +56,20 @@ class PDFPage:
|
|||
self.pageid = pageid
|
||||
self.attrs = dict_value(attrs)
|
||||
self.label = label
|
||||
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
||||
self.resources: Dict[object, object] = \
|
||||
resolve1(self.attrs.get('Resources', dict()))
|
||||
self.mediabox: Rect = resolve1(self.attrs['MediaBox'])
|
||||
if 'CropBox' in self.attrs:
|
||||
self.cropbox: Rect = resolve1(self.attrs['CropBox'])
|
||||
self.lastmod = resolve1(self.attrs.get("LastModified"))
|
||||
self.resources: Dict[object, object] = resolve1(
|
||||
self.attrs.get("Resources", dict())
|
||||
)
|
||||
self.mediabox: Rect = resolve1(self.attrs["MediaBox"])
|
||||
if "CropBox" in self.attrs:
|
||||
self.cropbox: Rect = resolve1(self.attrs["CropBox"])
|
||||
else:
|
||||
self.cropbox = self.mediabox
|
||||
self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
|
||||
self.annots = self.attrs.get('Annots')
|
||||
self.beads = self.attrs.get('B')
|
||||
if 'Contents' in self.attrs:
|
||||
contents = resolve1(self.attrs['Contents'])
|
||||
self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
|
||||
self.annots = self.attrs.get("Annots")
|
||||
self.beads = self.attrs.get("B")
|
||||
if "Contents" in self.attrs:
|
||||
contents = resolve1(self.attrs["Contents"])
|
||||
else:
|
||||
contents = []
|
||||
if not isinstance(contents, list):
|
||||
|
@ -81,16 +77,16 @@ class PDFPage:
|
|||
self.contents: List[object] = contents
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
|
||||
.format(self.resources, self.mediabox)
|
||||
return "<PDFPage: Resources={!r}, MediaBox={!r}>".format(
|
||||
self.resources, self.mediabox
|
||||
)
|
||||
|
||||
INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}
|
||||
INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
|
||||
|
||||
@classmethod
|
||||
def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
|
||||
def search(
|
||||
obj: object,
|
||||
parent: Dict[str, object]
|
||||
obj: object, parent: Dict[str, object]
|
||||
) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]:
|
||||
if isinstance(obj, int):
|
||||
objid = obj
|
||||
|
@ -104,16 +100,16 @@ class PDFPage:
|
|||
if k in cls.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
|
||||
tree_type = tree.get('Type')
|
||||
tree_type = tree.get("Type")
|
||||
if tree_type is None and not settings.STRICT: # See #64
|
||||
tree_type = tree.get('type')
|
||||
tree_type = tree.get("type")
|
||||
|
||||
if tree_type is LITERAL_PAGES and 'Kids' in tree:
|
||||
log.debug('Pages: Kids=%r', tree['Kids'])
|
||||
for c in list_value(tree['Kids']):
|
||||
if tree_type is LITERAL_PAGES and "Kids" in tree:
|
||||
log.debug("Pages: Kids=%r", tree["Kids"])
|
||||
for c in list_value(tree["Kids"]):
|
||||
yield from search(c, tree)
|
||||
elif tree_type is LITERAL_PAGE:
|
||||
log.debug('Page: %r', tree)
|
||||
log.debug("Page: %r", tree)
|
||||
yield (objid, tree)
|
||||
|
||||
try:
|
||||
|
@ -122,8 +118,8 @@ class PDFPage:
|
|||
page_labels = itertools.repeat(None)
|
||||
|
||||
pages = False
|
||||
if 'Pages' in document.catalog:
|
||||
objects = search(document.catalog['Pages'], document.catalog)
|
||||
if "Pages" in document.catalog:
|
||||
objects = search(document.catalog["Pages"], document.catalog)
|
||||
for (objid, tree) in objects:
|
||||
yield cls(document, objid, tree, next(page_labels))
|
||||
pages = True
|
||||
|
@ -133,8 +129,7 @@ class PDFPage:
|
|||
for objid in xref.get_objids():
|
||||
try:
|
||||
obj = document.getobj(objid)
|
||||
if isinstance(obj, dict) \
|
||||
and obj.get('Type') is LITERAL_PAGE:
|
||||
if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
|
||||
yield cls(document, objid, obj, next(page_labels))
|
||||
except PDFObjectNotFound:
|
||||
pass
|
||||
|
@ -146,9 +141,9 @@ class PDFPage:
|
|||
fp: BinaryIO,
|
||||
pagenos: Optional[Container[int]] = None,
|
||||
maxpages: int = 0,
|
||||
password: str = '',
|
||||
password: str = "",
|
||||
caching: bool = True,
|
||||
check_extractable: bool = False
|
||||
check_extractable: bool = False,
|
||||
) -> Iterator["PDFPage"]:
|
||||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(fp)
|
||||
|
@ -158,14 +153,16 @@ class PDFPage:
|
|||
# If not, warn the user and proceed.
|
||||
if not doc.is_extractable:
|
||||
if check_extractable:
|
||||
error_msg = 'Text extraction is not allowed: %r' % fp
|
||||
error_msg = "Text extraction is not allowed: %r" % fp
|
||||
raise PDFTextExtractionNotAllowed(error_msg)
|
||||
else:
|
||||
warning_msg = 'The PDF %r contains a metadata field '\
|
||||
'indicating that it should not allow ' \
|
||||
'text extraction. Ignoring this field ' \
|
||||
'and proceeding. Use the check_extractable ' \
|
||||
'if you want to raise an error in this case' % fp
|
||||
warning_msg = (
|
||||
"The PDF %r contains a metadata field "
|
||||
"indicating that it should not allow "
|
||||
"text extraction. Ignoring this field "
|
||||
"and proceeding. Use the check_extractable "
|
||||
"if you want to raise an error in this case" % fp
|
||||
)
|
||||
log.warning(warning_msg)
|
||||
# Process each page contained in the document.
|
||||
for (pageno, page) in enumerate(cls.create_pages(doc)):
|
||||
|
|
|
@ -51,12 +51,12 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
|
|||
"""Associates the parser with a PDFDocument object."""
|
||||
self.doc = doc
|
||||
|
||||
KEYWORD_R = KWD(b'R')
|
||||
KEYWORD_NULL = KWD(b'null')
|
||||
KEYWORD_ENDOBJ = KWD(b'endobj')
|
||||
KEYWORD_STREAM = KWD(b'stream')
|
||||
KEYWORD_XREF = KWD(b'xref')
|
||||
KEYWORD_STARTXREF = KWD(b'startxref')
|
||||
KEYWORD_R = KWD(b"R")
|
||||
KEYWORD_NULL = KWD(b"null")
|
||||
KEYWORD_ENDOBJ = KWD(b"endobj")
|
||||
KEYWORD_STREAM = KWD(b"stream")
|
||||
KEYWORD_XREF = KWD(b"xref")
|
||||
KEYWORD_STARTXREF = KWD(b"startxref")
|
||||
|
||||
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||
"""Handles PDF-related keywords."""
|
||||
|
@ -76,8 +76,7 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
|
|||
if len(self.curstack) >= 2:
|
||||
try:
|
||||
((_, objid), (_, genno)) = self.pop(2)
|
||||
(objid, genno) = (
|
||||
int(objid), int(genno)) # type: ignore[arg-type]
|
||||
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
|
||||
assert self.doc is not None
|
||||
obj = PDFObjRef(self.doc, objid, genno)
|
||||
self.push((pos, obj))
|
||||
|
@ -90,16 +89,16 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
|
|||
objlen = 0
|
||||
if not self.fallback:
|
||||
try:
|
||||
objlen = int_value(dic['Length'])
|
||||
objlen = int_value(dic["Length"])
|
||||
except KeyError:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('/Length is undefined: %r' % dic)
|
||||
raise PDFSyntaxError("/Length is undefined: %r" % dic)
|
||||
self.seek(pos)
|
||||
try:
|
||||
(_, line) = self.nextline() # 'stream'
|
||||
except PSEOF:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
raise PDFSyntaxError("Unexpected EOF")
|
||||
return
|
||||
pos += len(line)
|
||||
self.fp.seek(pos)
|
||||
|
@ -110,10 +109,10 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
|
|||
(linepos, line) = self.nextline()
|
||||
except PSEOF:
|
||||
if settings.STRICT:
|
||||
raise PDFSyntaxError('Unexpected EOF')
|
||||
raise PDFSyntaxError("Unexpected EOF")
|
||||
break
|
||||
if b'endstream' in line:
|
||||
i = line.index(b'endstream')
|
||||
if b"endstream" in line:
|
||||
i = line.index(b"endstream")
|
||||
objlen += i
|
||||
if self.fallback:
|
||||
data += line[:i]
|
||||
|
@ -123,8 +122,13 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
|
|||
data += line
|
||||
self.seek(pos + objlen)
|
||||
# XXX limit objlen not to exceed object boundary
|
||||
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
|
||||
objlen, dic, data[:10])
|
||||
log.debug(
|
||||
"Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
|
||||
pos,
|
||||
objlen,
|
||||
dic,
|
||||
data[:10],
|
||||
)
|
||||
assert self.doc is not None
|
||||
stream = PDFStream(dic, bytes(data), self.doc.decipher)
|
||||
self.push((pos, stream))
|
||||
|
@ -149,15 +153,14 @@ class PDFStreamParser(PDFParser):
|
|||
def flush(self) -> None:
|
||||
self.add_results(*self.popall())
|
||||
|
||||
KEYWORD_OBJ = KWD(b'obj')
|
||||
KEYWORD_OBJ = KWD(b"obj")
|
||||
|
||||
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||
if token is self.KEYWORD_R:
|
||||
# reference to indirect object
|
||||
try:
|
||||
((_, objid), (_, genno)) = self.pop(2)
|
||||
(objid, genno) = (
|
||||
int(objid), int(genno)) # type: ignore[arg-type]
|
||||
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
|
||||
obj = PDFObjRef(self.doc, objid, genno)
|
||||
self.push((pos, obj))
|
||||
except PSSyntaxError:
|
||||
|
@ -167,7 +170,7 @@ class PDFStreamParser(PDFParser):
|
|||
if settings.STRICT:
|
||||
# See PDF Spec 3.4.6: Only the object values are stored in the
|
||||
# stream; the obj and endobj keywords are not used.
|
||||
raise PDFSyntaxError('Keyword endobj found in stream')
|
||||
raise PDFSyntaxError("Keyword endobj found in stream")
|
||||
return
|
||||
# others
|
||||
self.push((pos, token))
|
||||
|
|
|
@ -2,8 +2,17 @@ import io
|
|||
import logging
|
||||
import sys
|
||||
import zlib
|
||||
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List,
|
||||
Tuple, cast)
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Optional,
|
||||
Union,
|
||||
List,
|
||||
Tuple,
|
||||
cast,
|
||||
)
|
||||
|
||||
from . import settings
|
||||
from .ascii85 import ascii85decode
|
||||
|
@ -21,18 +30,18 @@ if TYPE_CHECKING:
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
LITERAL_CRYPT = LIT('Crypt')
|
||||
LITERAL_CRYPT = LIT("Crypt")
|
||||
|
||||
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
|
||||
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
|
||||
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
|
||||
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
|
||||
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
|
||||
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
|
||||
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
|
||||
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
|
||||
LITERALS_JPX_DECODE = (LIT('JPXDecode'),)
|
||||
LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
|
||||
LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
|
||||
LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
|
||||
LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
|
||||
LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
|
||||
LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
|
||||
LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
|
||||
LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
|
||||
LITERALS_JPX_DECODE = (LIT("JPXDecode"),)
|
||||
|
||||
|
||||
if sys.version_info >= (3, 8):
|
||||
|
@ -40,8 +49,14 @@ if sys.version_info >= (3, 8):
|
|||
|
||||
class DecipherCallable(Protocol):
|
||||
"""Fully typed a decipher callback, with optional parameter."""
|
||||
def __call__(self, objid: int, genno: int, data: bytes,
|
||||
attrs: Optional[Dict[str, Any]] = None) -> bytes:
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
objid: int,
|
||||
genno: int,
|
||||
data: bytes,
|
||||
attrs: Optional[Dict[str, Any]] = None,
|
||||
) -> bytes:
|
||||
raise NotImplementedError
|
||||
|
||||
else: # Fallback for older Python
|
||||
|
@ -75,21 +90,15 @@ class PDFNotImplementedError(PDFException):
|
|||
|
||||
|
||||
class PDFObjRef(PDFObject):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
doc: Optional["PDFDocument"],
|
||||
objid: int,
|
||||
_: object
|
||||
) -> None:
|
||||
def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object) -> None:
|
||||
if objid == 0:
|
||||
if settings.STRICT:
|
||||
raise PDFValueError('PDF object id cannot be 0.')
|
||||
raise PDFValueError("PDF object id cannot be 0.")
|
||||
self.doc = doc
|
||||
self.objid = objid
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<PDFObjRef:%d>' % (self.objid)
|
||||
return "<PDFObjRef:%d>" % (self.objid)
|
||||
|
||||
def resolve(self, default: object = None) -> Any:
|
||||
assert self.doc is not None
|
||||
|
@ -126,14 +135,8 @@ def resolve_all(x: object, default: object = None) -> Any:
|
|||
return x
|
||||
|
||||
|
||||
def decipher_all(
|
||||
decipher: DecipherCallable,
|
||||
objid: int,
|
||||
genno: int,
|
||||
x: object
|
||||
) -> Any:
|
||||
"""Recursively deciphers the given object.
|
||||
"""
|
||||
def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any:
|
||||
"""Recursively deciphers the given object."""
|
||||
if isinstance(x, bytes):
|
||||
return decipher(objid, genno, x)
|
||||
if isinstance(x, list):
|
||||
|
@ -148,7 +151,7 @@ def int_value(x: object) -> int:
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, int):
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('Integer required: %r' % x)
|
||||
raise PDFTypeError("Integer required: %r" % x)
|
||||
return 0
|
||||
return x
|
||||
|
||||
|
@ -157,7 +160,7 @@ def float_value(x: object) -> float:
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, float):
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('Float required: %r' % x)
|
||||
raise PDFTypeError("Float required: %r" % x)
|
||||
return 0.0
|
||||
return x
|
||||
|
||||
|
@ -166,7 +169,7 @@ def num_value(x: object) -> float:
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, (int, float)): # == utils.isnumber(x)
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('Int or Float required: %r' % x)
|
||||
raise PDFTypeError("Int or Float required: %r" % x)
|
||||
return 0
|
||||
return x
|
||||
|
||||
|
@ -184,8 +187,8 @@ def str_value(x: object) -> bytes:
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, bytes):
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('String required: %r' % x)
|
||||
return b''
|
||||
raise PDFTypeError("String required: %r" % x)
|
||||
return b""
|
||||
return x
|
||||
|
||||
|
||||
|
@ -193,7 +196,7 @@ def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, (list, tuple)):
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('List required: %r' % x)
|
||||
raise PDFTypeError("List required: %r" % x)
|
||||
return []
|
||||
return x
|
||||
|
||||
|
@ -202,8 +205,8 @@ def dict_value(x: object) -> Dict[Any, Any]:
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, dict):
|
||||
if settings.STRICT:
|
||||
logger.error('PDFTypeError : Dict required: %r', x)
|
||||
raise PDFTypeError('Dict required: %r' % x)
|
||||
logger.error("PDFTypeError : Dict required: %r", x)
|
||||
raise PDFTypeError("Dict required: %r" % x)
|
||||
return {}
|
||||
return x
|
||||
|
||||
|
@ -212,8 +215,8 @@ def stream_value(x: object) -> "PDFStream":
|
|||
x = resolve1(x)
|
||||
if not isinstance(x, PDFStream):
|
||||
if settings.STRICT:
|
||||
raise PDFTypeError('PDFStream required: %r' % x)
|
||||
return PDFStream({}, b'')
|
||||
raise PDFTypeError("PDFStream required: %r" % x)
|
||||
return PDFStream({}, b"")
|
||||
return x
|
||||
|
||||
|
||||
|
@ -223,7 +226,7 @@ def decompress_corrupted(data: bytes) -> bytes:
|
|||
"""
|
||||
d = zlib.decompressobj()
|
||||
f = io.BytesIO(data)
|
||||
result_str = b''
|
||||
result_str = b""
|
||||
buffer = f.read(1)
|
||||
i = 0
|
||||
try:
|
||||
|
@ -239,12 +242,11 @@ def decompress_corrupted(data: bytes) -> bytes:
|
|||
|
||||
|
||||
class PDFStream(PDFObject):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
attrs: Dict[str, Any],
|
||||
rawdata: bytes,
|
||||
decipher: Optional[DecipherCallable] = None
|
||||
decipher: Optional[DecipherCallable] = None,
|
||||
) -> None:
|
||||
assert isinstance(attrs, dict), str(type(attrs))
|
||||
self.attrs = attrs
|
||||
|
@ -261,12 +263,18 @@ class PDFStream(PDFObject):
|
|||
def __repr__(self) -> str:
|
||||
if self.data is None:
|
||||
assert self.rawdata is not None
|
||||
return '<PDFStream(%r): raw=%d, %r>' % \
|
||||
(self.objid, len(self.rawdata), self.attrs)
|
||||
return "<PDFStream(%r): raw=%d, %r>" % (
|
||||
self.objid,
|
||||
len(self.rawdata),
|
||||
self.attrs,
|
||||
)
|
||||
else:
|
||||
assert self.data is not None
|
||||
return '<PDFStream(%r): len=%d, %r>' % \
|
||||
(self.objid, len(self.data), self.attrs)
|
||||
return "<PDFStream(%r): len=%d, %r>" % (
|
||||
self.objid,
|
||||
len(self.data),
|
||||
self.attrs,
|
||||
)
|
||||
|
||||
def __contains__(self, name: object) -> bool:
|
||||
return name in self.attrs
|
||||
|
@ -284,8 +292,8 @@ class PDFStream(PDFObject):
|
|||
return default
|
||||
|
||||
def get_filters(self) -> List[Tuple[Any, Any]]:
|
||||
filters = self.get_any(('F', 'Filter'))
|
||||
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
||||
filters = self.get_any(("F", "Filter"))
|
||||
params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
|
||||
if not filters:
|
||||
return []
|
||||
if not isinstance(filters, list):
|
||||
|
@ -298,15 +306,16 @@ class PDFStream(PDFObject):
|
|||
# resolve filter if possible
|
||||
_filters = []
|
||||
for fltr in filters:
|
||||
if hasattr(fltr, 'resolve'):
|
||||
if hasattr(fltr, "resolve"):
|
||||
fltr = fltr.resolve()[0]
|
||||
_filters.append(fltr)
|
||||
# return list solves https://github.com/pdfminer/pdfminer.six/issues/15
|
||||
return list(zip(_filters, params))
|
||||
|
||||
def decode(self) -> None:
|
||||
assert self.data is None \
|
||||
and self.rawdata is not None, str((self.data, self.rawdata))
|
||||
assert self.data is None and self.rawdata is not None, str(
|
||||
(self.data, self.rawdata)
|
||||
)
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
|
@ -326,14 +335,13 @@ class PDFStream(PDFObject):
|
|||
|
||||
except zlib.error as e:
|
||||
if settings.STRICT:
|
||||
error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
|
||||
.format(e, data)
|
||||
error_msg = "Invalid zlib bytes: {!r}, {!r}".format(e, data)
|
||||
raise PDFException(error_msg)
|
||||
|
||||
try:
|
||||
data = decompress_corrupted(data)
|
||||
except zlib.error:
|
||||
data = b''
|
||||
data = b""
|
||||
|
||||
elif f in LITERALS_LZW_DECODE:
|
||||
data = lzwdecode(data)
|
||||
|
@ -356,25 +364,26 @@ class PDFStream(PDFObject):
|
|||
pass
|
||||
elif f == LITERAL_CRYPT:
|
||||
# not yet..
|
||||
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
||||
raise PDFNotImplementedError("/Crypt filter is unsupported")
|
||||
else:
|
||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||
raise PDFNotImplementedError("Unsupported filter: %r" % f)
|
||||
# apply predictors
|
||||
if params and 'Predictor' in params:
|
||||
pred = int_value(params['Predictor'])
|
||||
if params and "Predictor" in params:
|
||||
pred = int_value(params["Predictor"])
|
||||
if pred == 1:
|
||||
# no predictor
|
||||
pass
|
||||
elif 10 <= pred:
|
||||
# PNG predictor
|
||||
colors = int_value(params.get('Colors', 1))
|
||||
columns = int_value(params.get('Columns', 1))
|
||||
raw_bits_per_component = params.get('BitsPerComponent', 8)
|
||||
colors = int_value(params.get("Colors", 1))
|
||||
columns = int_value(params.get("Columns", 1))
|
||||
raw_bits_per_component = params.get("BitsPerComponent", 8)
|
||||
bitspercomponent = int_value(raw_bits_per_component)
|
||||
data = apply_png_predictor(pred, colors, columns,
|
||||
bitspercomponent, data)
|
||||
data = apply_png_predictor(
|
||||
pred, colors, columns, bitspercomponent, data
|
||||
)
|
||||
else:
|
||||
error_msg = 'Unsupported predictor: %r' % pred
|
||||
error_msg = "Unsupported predictor: %r" % pred
|
||||
raise PDFNotImplementedError(error_msg)
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
|
|
|
@ -4,8 +4,19 @@
|
|||
|
||||
import logging
|
||||
import re
|
||||
from typing import (Any, BinaryIO, Dict, Generic, Iterator, List,
|
||||
Optional, Tuple, Type, TypeVar, Union)
|
||||
from typing import (
|
||||
Any,
|
||||
BinaryIO,
|
||||
Dict,
|
||||
Generic,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
from . import settings
|
||||
from .utils import choplist
|
||||
|
@ -59,7 +70,7 @@ class PSLiteral(PSObject):
|
|||
|
||||
def __repr__(self) -> str:
|
||||
name = self.name
|
||||
return '/%r' % name
|
||||
return "/%r" % name
|
||||
|
||||
|
||||
class PSKeyword(PSObject):
|
||||
|
@ -79,10 +90,10 @@ class PSKeyword(PSObject):
|
|||
|
||||
def __repr__(self) -> str:
|
||||
name = self.name
|
||||
return '/%r' % name
|
||||
return "/%r" % name
|
||||
|
||||
|
||||
_SymbolT = TypeVar('_SymbolT', PSLiteral, PSKeyword)
|
||||
_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)
|
||||
|
||||
|
||||
class PSSymbolTable(Generic[_SymbolT]):
|
||||
|
@ -110,25 +121,25 @@ PSLiteralTable = PSSymbolTable(PSLiteral)
|
|||
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||
LIT = PSLiteralTable.intern
|
||||
KWD = PSKeywordTable.intern
|
||||
KEYWORD_PROC_BEGIN = KWD(b'{')
|
||||
KEYWORD_PROC_END = KWD(b'}')
|
||||
KEYWORD_ARRAY_BEGIN = KWD(b'[')
|
||||
KEYWORD_ARRAY_END = KWD(b']')
|
||||
KEYWORD_DICT_BEGIN = KWD(b'<<')
|
||||
KEYWORD_DICT_END = KWD(b'>>')
|
||||
KEYWORD_PROC_BEGIN = KWD(b"{")
|
||||
KEYWORD_PROC_END = KWD(b"}")
|
||||
KEYWORD_ARRAY_BEGIN = KWD(b"[")
|
||||
KEYWORD_ARRAY_END = KWD(b"]")
|
||||
KEYWORD_DICT_BEGIN = KWD(b"<<")
|
||||
KEYWORD_DICT_END = KWD(b">>")
|
||||
|
||||
|
||||
def literal_name(x: object) -> Any:
|
||||
if not isinstance(x, PSLiteral):
|
||||
if settings.STRICT:
|
||||
raise PSTypeError('Literal required: {!r}'.format(x))
|
||||
raise PSTypeError("Literal required: {!r}".format(x))
|
||||
else:
|
||||
name = x
|
||||
else:
|
||||
name = x.name
|
||||
if not isinstance(name, str):
|
||||
try:
|
||||
name = str(name, 'utf-8')
|
||||
name = str(name, "utf-8")
|
||||
except Exception:
|
||||
pass
|
||||
return name
|
||||
|
@ -137,34 +148,34 @@ def literal_name(x: object) -> Any:
|
|||
def keyword_name(x: object) -> Any:
|
||||
if not isinstance(x, PSKeyword):
|
||||
if settings.STRICT:
|
||||
raise PSTypeError('Keyword required: %r' % x)
|
||||
raise PSTypeError("Keyword required: %r" % x)
|
||||
else:
|
||||
name = x
|
||||
else:
|
||||
name = str(x.name, 'utf-8', 'ignore')
|
||||
name = str(x.name, "utf-8", "ignore")
|
||||
return name
|
||||
|
||||
|
||||
EOL = re.compile(br'[\r\n]')
|
||||
SPC = re.compile(br'\s')
|
||||
NONSPC = re.compile(br'\S')
|
||||
HEX = re.compile(br'[0-9a-fA-F]')
|
||||
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
|
||||
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
|
||||
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
|
||||
END_NUMBER = re.compile(br'[^0-9]')
|
||||
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
|
||||
END_STRING = re.compile(br'[()\134]')
|
||||
OCT_STRING = re.compile(br'[0-7]')
|
||||
EOL = re.compile(rb"[\r\n]")
|
||||
SPC = re.compile(rb"\s")
|
||||
NONSPC = re.compile(rb"\S")
|
||||
HEX = re.compile(rb"[0-9a-fA-F]")
|
||||
END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
|
||||
END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
|
||||
HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
|
||||
END_NUMBER = re.compile(rb"[^0-9]")
|
||||
END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")
|
||||
END_STRING = re.compile(rb"[()\134]")
|
||||
OCT_STRING = re.compile(rb"[0-7]")
|
||||
ESC_STRING = {
|
||||
b'b': 8,
|
||||
b't': 9,
|
||||
b'n': 10,
|
||||
b'f': 12,
|
||||
b'r': 13,
|
||||
b'(': 40,
|
||||
b')': 41,
|
||||
b'\\': 92
|
||||
b"b": 8,
|
||||
b"t": 9,
|
||||
b"n": 10,
|
||||
b"f": 12,
|
||||
b"r": 13,
|
||||
b"(": 40,
|
||||
b")": 41,
|
||||
b"\\": 92,
|
||||
}
|
||||
|
||||
|
||||
|
@ -173,8 +184,8 @@ PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
|
|||
|
||||
class PSBaseParser:
|
||||
|
||||
"""Most basic PostScript parser that performs only tokenization.
|
||||
"""
|
||||
"""Most basic PostScript parser that performs only tokenization."""
|
||||
|
||||
BUFSIZ = 4096
|
||||
|
||||
def __init__(self, fp: BinaryIO) -> None:
|
||||
|
@ -182,8 +193,7 @@ class PSBaseParser:
|
|||
self.seek(0)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
|
||||
self.bufpos)
|
||||
return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)
|
||||
|
||||
def flush(self) -> None:
|
||||
return
|
||||
|
@ -200,22 +210,21 @@ class PSBaseParser:
|
|||
if not pos:
|
||||
pos = self.bufpos + self.charpos
|
||||
self.fp.seek(pos)
|
||||
log.debug('poll(%d): %r', pos, self.fp.read(n))
|
||||
log.debug("poll(%d): %r", pos, self.fp.read(n))
|
||||
self.fp.seek(pos0)
|
||||
return
|
||||
|
||||
def seek(self, pos: int) -> None:
|
||||
"""Seeks the parser to the given position.
|
||||
"""
|
||||
log.debug('seek: %r', pos)
|
||||
"""Seeks the parser to the given position."""
|
||||
log.debug("seek: %r", pos)
|
||||
self.fp.seek(pos)
|
||||
# reset the status for nextline()
|
||||
self.bufpos = pos
|
||||
self.buf = b''
|
||||
self.buf = b""
|
||||
self.charpos = 0
|
||||
# reset the status for nexttoken()
|
||||
self._parse1 = self._parse_main
|
||||
self._curtoken = b''
|
||||
self._curtoken = b""
|
||||
self._curtokenpos = 0
|
||||
self._tokens: List[Tuple[int, PSBaseParserToken]] = []
|
||||
return
|
||||
|
@ -227,14 +236,13 @@ class PSBaseParser:
|
|||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
if not self.buf:
|
||||
raise PSEOF('Unexpected EOF')
|
||||
raise PSEOF("Unexpected EOF")
|
||||
self.charpos = 0
|
||||
return
|
||||
|
||||
def nextline(self) -> Tuple[int, bytes]:
|
||||
"""Fetches a next line that ends either with \\r or \\n.
|
||||
"""
|
||||
linebuf = b''
|
||||
"""Fetches a next line that ends either with \\r or \\n."""
|
||||
linebuf = b""
|
||||
linepos = self.bufpos + self.charpos
|
||||
eol = False
|
||||
while 1:
|
||||
|
@ -242,7 +250,7 @@ class PSBaseParser:
|
|||
if eol:
|
||||
c = self.buf[self.charpos : self.charpos + 1]
|
||||
# handle b'\r\n'
|
||||
if c == b'\n':
|
||||
if c == b"\n":
|
||||
linebuf += c
|
||||
self.charpos += 1
|
||||
break
|
||||
|
@ -250,14 +258,14 @@ class PSBaseParser:
|
|||
if m:
|
||||
linebuf += self.buf[self.charpos : m.end(0)]
|
||||
self.charpos = m.end(0)
|
||||
if linebuf[-1:] == b'\r':
|
||||
if linebuf[-1:] == b"\r":
|
||||
eol = True
|
||||
else:
|
||||
break
|
||||
else:
|
||||
linebuf += self.buf[self.charpos :]
|
||||
self.charpos = len(self.buf)
|
||||
log.debug('nextline: %r, %r', linepos, linebuf)
|
||||
log.debug("nextline: %r, %r", linepos, linebuf)
|
||||
|
||||
return (linepos, linebuf)
|
||||
|
||||
|
@ -268,7 +276,7 @@ class PSBaseParser:
|
|||
"""
|
||||
self.fp.seek(0, 2)
|
||||
pos = self.fp.tell()
|
||||
buf = b''
|
||||
buf = b""
|
||||
while 0 < pos:
|
||||
prevpos = pos
|
||||
pos = max(0, pos - self.BUFSIZ)
|
||||
|
@ -277,13 +285,13 @@ class PSBaseParser:
|
|||
if not s:
|
||||
break
|
||||
while 1:
|
||||
n = max(s.rfind(b'\r'), s.rfind(b'\n'))
|
||||
n = max(s.rfind(b"\r"), s.rfind(b"\n"))
|
||||
if n == -1:
|
||||
buf = s + buf
|
||||
break
|
||||
yield s[n:] + buf
|
||||
s = s[:n]
|
||||
buf = b''
|
||||
buf = b""
|
||||
return
|
||||
|
||||
def _parse_main(self, s: bytes, i: int) -> int:
|
||||
|
@ -293,19 +301,19 @@ class PSBaseParser:
|
|||
j = m.start(0)
|
||||
c = s[j : j + 1]
|
||||
self._curtokenpos = self.bufpos + j
|
||||
if c == b'%':
|
||||
self._curtoken = b'%'
|
||||
if c == b"%":
|
||||
self._curtoken = b"%"
|
||||
self._parse1 = self._parse_comment
|
||||
return j + 1
|
||||
elif c == b'/':
|
||||
self._curtoken = b''
|
||||
elif c == b"/":
|
||||
self._curtoken = b""
|
||||
self._parse1 = self._parse_literal
|
||||
return j + 1
|
||||
elif c in b'-+' or c.isdigit():
|
||||
elif c in b"-+" or c.isdigit():
|
||||
self._curtoken = c
|
||||
self._parse1 = self._parse_number
|
||||
return j + 1
|
||||
elif c == b'.':
|
||||
elif c == b".":
|
||||
self._curtoken = c
|
||||
self._parse1 = self._parse_float
|
||||
return j + 1
|
||||
|
@ -313,17 +321,17 @@ class PSBaseParser:
|
|||
self._curtoken = c
|
||||
self._parse1 = self._parse_keyword
|
||||
return j + 1
|
||||
elif c == b'(':
|
||||
self._curtoken = b''
|
||||
elif c == b"(":
|
||||
self._curtoken = b""
|
||||
self.paren = 1
|
||||
self._parse1 = self._parse_string
|
||||
return j + 1
|
||||
elif c == b'<':
|
||||
self._curtoken = b''
|
||||
elif c == b"<":
|
||||
self._curtoken = b""
|
||||
self._parse1 = self._parse_wopen
|
||||
return j + 1
|
||||
elif c == b'>':
|
||||
self._curtoken = b''
|
||||
elif c == b">":
|
||||
self._curtoken = b""
|
||||
self._parse1 = self._parse_wclose
|
||||
return j + 1
|
||||
else:
|
||||
|
@ -354,12 +362,12 @@ class PSBaseParser:
|
|||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j : j + 1]
|
||||
if c == b'#':
|
||||
self.hex = b''
|
||||
if c == b"#":
|
||||
self.hex = b""
|
||||
self._parse1 = self._parse_literal_hex
|
||||
return j + 1
|
||||
try:
|
||||
name: Union[str, bytes] = str(self._curtoken, 'utf-8')
|
||||
name: Union[str, bytes] = str(self._curtoken, "utf-8")
|
||||
except Exception:
|
||||
name = self._curtoken
|
||||
self._add_token(LIT(name))
|
||||
|
@ -384,7 +392,7 @@ class PSBaseParser:
|
|||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j : j + 1]
|
||||
if c == b'.':
|
||||
if c == b".":
|
||||
self._curtoken += c
|
||||
self._parse1 = self._parse_float
|
||||
return j + 1
|
||||
|
@ -416,9 +424,9 @@ class PSBaseParser:
|
|||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
if self._curtoken == b'true':
|
||||
if self._curtoken == b"true":
|
||||
token: Union[bool, PSKeyword] = True
|
||||
elif self._curtoken == b'false':
|
||||
elif self._curtoken == b"false":
|
||||
token = False
|
||||
else:
|
||||
token = KWD(self._curtoken)
|
||||
|
@ -434,15 +442,15 @@ class PSBaseParser:
|
|||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j : j + 1]
|
||||
if c == b'\\':
|
||||
self.oct = b''
|
||||
if c == b"\\":
|
||||
self.oct = b""
|
||||
self._parse1 = self._parse_string_1
|
||||
return j + 1
|
||||
if c == b'(':
|
||||
if c == b"(":
|
||||
self.paren += 1
|
||||
self._curtoken += c
|
||||
return j + 1
|
||||
if c == b')':
|
||||
if c == b")":
|
||||
self.paren -= 1
|
||||
if self.paren:
|
||||
# WTF, they said balanced parens need no special treatment.
|
||||
|
@ -470,7 +478,7 @@ class PSBaseParser:
|
|||
elif c in ESC_STRING:
|
||||
self._curtoken += bytes((ESC_STRING[c],))
|
||||
|
||||
elif c == b'\r' and len(s) > i+1 and s[i+1:i+2] == b'\n':
|
||||
elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
|
||||
# If current and next character is \r\n skip both because enters
|
||||
# after a \ are ignored
|
||||
i += 1
|
||||
|
@ -481,7 +489,7 @@ class PSBaseParser:
|
|||
|
||||
def _parse_wopen(self, s: bytes, i: int) -> int:
|
||||
c = s[i : i + 1]
|
||||
if c == b'<':
|
||||
if c == b"<":
|
||||
self._add_token(KEYWORD_DICT_BEGIN)
|
||||
self._parse1 = self._parse_main
|
||||
i += 1
|
||||
|
@ -491,7 +499,7 @@ class PSBaseParser:
|
|||
|
||||
def _parse_wclose(self, s: bytes, i: int) -> int:
|
||||
c = s[i : i + 1]
|
||||
if c == b'>':
|
||||
if c == b">":
|
||||
self._add_token(KEYWORD_DICT_END)
|
||||
i += 1
|
||||
self._parse1 = self._parse_main
|
||||
|
@ -504,8 +512,9 @@ class PSBaseParser:
|
|||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)),
|
||||
SPC.sub(b'', self._curtoken))
|
||||
token = HEX_PAIR.sub(
|
||||
lambda m: bytes((int(m.group(0), 16),)), SPC.sub(b"", self._curtoken)
|
||||
)
|
||||
self._add_token(token)
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
@ -515,7 +524,7 @@ class PSBaseParser:
|
|||
self.fillbuf()
|
||||
self.charpos = self._parse1(self.buf, self.charpos)
|
||||
token = self._tokens.pop(0)
|
||||
log.debug('nexttoken: %r', token)
|
||||
log.debug("nexttoken: %r", token)
|
||||
return token
|
||||
|
||||
|
||||
|
@ -530,15 +539,13 @@ PSStackEntry = Tuple[int, PSStackType[ExtraT]]
|
|||
|
||||
|
||||
class PSStackParser(PSBaseParser, Generic[ExtraT]):
|
||||
|
||||
def __init__(self, fp: BinaryIO) -> None:
|
||||
PSBaseParser.__init__(self, fp)
|
||||
self.reset()
|
||||
return
|
||||
|
||||
def reset(self) -> None:
|
||||
self.context: List[Tuple[int, Optional[str],
|
||||
List[PSStackEntry[ExtraT]]]] = []
|
||||
self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = []
|
||||
self.curtype: Optional[str] = None
|
||||
self.curstack: List[PSStackEntry[ExtraT]] = []
|
||||
self.results: List[PSStackEntry[ExtraT]] = []
|
||||
|
@ -565,25 +572,24 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
|
|||
|
||||
def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
|
||||
try:
|
||||
log.debug('add_results: %r', objs)
|
||||
log.debug("add_results: %r", objs)
|
||||
except Exception:
|
||||
log.debug('add_results: (unprintable object)')
|
||||
log.debug("add_results: (unprintable object)")
|
||||
self.results.extend(objs)
|
||||
return
|
||||
|
||||
def start_type(self, pos: int, type: str) -> None:
|
||||
self.context.append((pos, self.curtype, self.curstack))
|
||||
(self.curtype, self.curstack) = (type, [])
|
||||
log.debug('start_type: pos=%r, type=%r', pos, type)
|
||||
log.debug("start_type: pos=%r, type=%r", pos, type)
|
||||
return
|
||||
|
||||
def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
|
||||
if self.curtype != type:
|
||||
raise PSTypeError('Type mismatch: {!r} != {!r}'
|
||||
.format(self.curtype, type))
|
||||
raise PSTypeError("Type mismatch: {!r} != {!r}".format(self.curtype, type))
|
||||
objs = [obj for (_, obj) in self.curstack]
|
||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||
log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
|
||||
log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
|
||||
return (pos, objs)
|
||||
|
||||
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||
|
@ -604,47 +610,55 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
|
|||
self.push((pos, token))
|
||||
elif token == KEYWORD_ARRAY_BEGIN:
|
||||
# begin array
|
||||
self.start_type(pos, 'a')
|
||||
self.start_type(pos, "a")
|
||||
elif token == KEYWORD_ARRAY_END:
|
||||
# end array
|
||||
try:
|
||||
self.push(self.end_type('a'))
|
||||
self.push(self.end_type("a"))
|
||||
except PSTypeError:
|
||||
if settings.STRICT:
|
||||
raise
|
||||
elif token == KEYWORD_DICT_BEGIN:
|
||||
# begin dictionary
|
||||
self.start_type(pos, 'd')
|
||||
self.start_type(pos, "d")
|
||||
elif token == KEYWORD_DICT_END:
|
||||
# end dictionary
|
||||
try:
|
||||
(pos, objs) = self.end_type('d')
|
||||
(pos, objs) = self.end_type("d")
|
||||
if len(objs) % 2 != 0:
|
||||
error_msg = 'Invalid dictionary construct: %r' % objs
|
||||
error_msg = "Invalid dictionary construct: %r" % objs
|
||||
raise PSSyntaxError(error_msg)
|
||||
d = {literal_name(k): v
|
||||
for (k, v) in choplist(2, objs) if v is not None}
|
||||
d = {
|
||||
literal_name(k): v
|
||||
for (k, v) in choplist(2, objs)
|
||||
if v is not None
|
||||
}
|
||||
self.push((pos, d))
|
||||
except PSTypeError:
|
||||
if settings.STRICT:
|
||||
raise
|
||||
elif token == KEYWORD_PROC_BEGIN:
|
||||
# begin proc
|
||||
self.start_type(pos, 'p')
|
||||
self.start_type(pos, "p")
|
||||
elif token == KEYWORD_PROC_END:
|
||||
# end proc
|
||||
try:
|
||||
self.push(self.end_type('p'))
|
||||
self.push(self.end_type("p"))
|
||||
except PSTypeError:
|
||||
if settings.STRICT:
|
||||
raise
|
||||
elif isinstance(token, PSKeyword):
|
||||
log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
|
||||
token, self.curstack)
|
||||
log.debug(
|
||||
"do_keyword: pos=%r, token=%r, stack=%r", pos, token, self.curstack
|
||||
)
|
||||
self.do_keyword(pos, token)
|
||||
else:
|
||||
log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
|
||||
token, self.curstack)
|
||||
log.error(
|
||||
"unknown token: pos=%r, token=%r, stack=%r",
|
||||
pos,
|
||||
token,
|
||||
self.curstack,
|
||||
)
|
||||
self.do_keyword(pos, token)
|
||||
raise
|
||||
if self.context:
|
||||
|
@ -653,7 +667,7 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
|
|||
self.flush()
|
||||
obj = self.results.pop(0)
|
||||
try:
|
||||
log.debug('nextobject: %r', obj)
|
||||
log.debug("nextobject: %r", obj)
|
||||
except Exception:
|
||||
log.debug('nextobject: (unprintable object)')
|
||||
log.debug("nextobject: (unprintable object)")
|
||||
return obj
|
||||
|
|
|
@ -20,7 +20,7 @@ def rldecode(data: bytes) -> bytes:
|
|||
(2 to 128) times during decompression. A length value of 128
|
||||
denotes EOD.
|
||||
"""
|
||||
decoded = b''
|
||||
decoded = b""
|
||||
i = 0
|
||||
while i < len(data):
|
||||
length = data[i]
|
||||
|
|
|
@ -6,9 +6,24 @@ import pathlib
|
|||
import string
|
||||
import struct
|
||||
from html import escape
|
||||
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
|
||||
List, Optional, Set, TextIO, Tuple, TypeVar, Union,
|
||||
TYPE_CHECKING, cast)
|
||||
from typing import (
|
||||
Any,
|
||||
BinaryIO,
|
||||
Callable,
|
||||
Dict,
|
||||
Generic,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Set,
|
||||
TextIO,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
TYPE_CHECKING,
|
||||
cast,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .layout import LTComponent
|
||||
|
@ -30,12 +45,8 @@ class open_filename(object):
|
|||
(str or pathlib.PurePath type is supported) and closes it on exit,
|
||||
(just like `open`), but does nothing for file-like objects.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
filename: FileOrName,
|
||||
*args: Any,
|
||||
**kwargs: Any
|
||||
) -> None:
|
||||
|
||||
def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
|
||||
if isinstance(filename, pathlib.PurePath):
|
||||
filename = str(filename)
|
||||
if isinstance(filename, str):
|
||||
|
@ -45,17 +56,12 @@ class open_filename(object):
|
|||
self.file_handler = cast(AnyIO, filename)
|
||||
self.closing = False
|
||||
else:
|
||||
raise TypeError('Unsupported input type: %s' % type(filename))
|
||||
raise TypeError("Unsupported input type: %s" % type(filename))
|
||||
|
||||
def __enter__(self) -> AnyIO:
|
||||
return self.file_handler
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: object,
|
||||
exc_val: object,
|
||||
exc_tb: object
|
||||
) -> None:
|
||||
def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
|
||||
if self.closing:
|
||||
self.file_handler.close()
|
||||
|
||||
|
@ -70,7 +76,7 @@ def make_compat_str(o: object) -> str:
|
|||
"""Converts everything to string, if bytes guessing the encoding."""
|
||||
if isinstance(o, bytes):
|
||||
enc = chardet.detect(o)
|
||||
return o.decode(enc['encoding'])
|
||||
return o.decode(enc["encoding"])
|
||||
else:
|
||||
return str(o)
|
||||
|
||||
|
@ -80,15 +86,13 @@ def shorten_str(s: str, size: int) -> str:
|
|||
return s[:size]
|
||||
if len(s) > size:
|
||||
length = (size - 5) // 2
|
||||
return '{} ... {}'.format(s[:length], s[-length:])
|
||||
return "{} ... {}".format(s[:length], s[-length:])
|
||||
else:
|
||||
return s
|
||||
|
||||
|
||||
def compatible_encode_method(
|
||||
bytesorstring: Union[bytes, str],
|
||||
encoding: str = 'utf-8',
|
||||
erraction: str = 'ignore'
|
||||
bytesorstring: Union[bytes, str], encoding: str = "utf-8", erraction: str = "ignore"
|
||||
) -> str:
|
||||
"""When Py2 str.encode is called, it often means bytes.encode in Py3.
|
||||
|
||||
|
@ -119,11 +123,7 @@ def paeth_predictor(left: int, above: int, upper_left: int) -> int:
|
|||
|
||||
|
||||
def apply_png_predictor(
|
||||
pred: int,
|
||||
colors: int,
|
||||
columns: int,
|
||||
bitspercomponent: int,
|
||||
data: bytes
|
||||
pred: int, colors: int, columns: int, bitspercomponent: int, data: bytes
|
||||
) -> bytes:
|
||||
"""Reverse the effect of the PNG predictor
|
||||
|
||||
|
@ -135,12 +135,12 @@ def apply_png_predictor(
|
|||
|
||||
nbytes = colors * columns * bitspercomponent // 8
|
||||
bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel
|
||||
buf = b''
|
||||
line_above = b'\x00' * columns
|
||||
buf = b""
|
||||
line_above = b"\x00" * columns
|
||||
for scanline_i in range(0, len(data), nbytes + 1):
|
||||
filter_type = data[scanline_i]
|
||||
line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
|
||||
raw = b''
|
||||
raw = b""
|
||||
|
||||
if filter_type == 0:
|
||||
# Filter type 0: None
|
||||
|
@ -226,7 +226,8 @@ PathSegment = Union[
|
|||
Tuple[str], # Literal['h']
|
||||
Tuple[str, float, float], # Literal['m', 'l']
|
||||
Tuple[str, float, float, float, float], # Literal['v', 'y']
|
||||
Tuple[str, float, float, float, float, float, float]] # Literal['c']
|
||||
Tuple[str, float, float, float, float, float, float],
|
||||
] # Literal['c']
|
||||
|
||||
# Matrix operations
|
||||
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
|
||||
|
@ -236,9 +237,14 @@ def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
|
|||
(a1, b1, c1, d1, e1, f1) = m1
|
||||
(a0, b0, c0, d0, e0, f0) = m0
|
||||
"""Returns the multiplication of two matrices."""
|
||||
return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1,
|
||||
a0 * c1 + c0 * d1, b0 * c1 + d0 * d1,
|
||||
a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
|
||||
return (
|
||||
a0 * a1 + c0 * b1,
|
||||
b0 * a1 + d0 * b1,
|
||||
a0 * c1 + c0 * d1,
|
||||
b0 * c1 + d0 * d1,
|
||||
a0 * e1 + c0 * f1 + e0,
|
||||
b0 * e1 + d0 * f1 + f0,
|
||||
)
|
||||
|
||||
|
||||
def translate_matrix(m: Matrix, v: Point) -> Matrix:
|
||||
|
@ -264,11 +270,12 @@ def apply_matrix_norm(m: Matrix, v: Point) -> Point:
|
|||
|
||||
# Utility functions
|
||||
|
||||
|
||||
def isnumber(x: object) -> bool:
|
||||
return isinstance(x, (int, float))
|
||||
|
||||
|
||||
_T = TypeVar('_T')
|
||||
_T = TypeVar("_T")
|
||||
|
||||
|
||||
def uniq(objs: Iterable[_T]) -> Iterator[_T]:
|
||||
|
@ -282,10 +289,7 @@ def uniq(objs: Iterable[_T]) -> Iterator[_T]:
|
|||
return
|
||||
|
||||
|
||||
def fsplit(
|
||||
pred: Callable[[_T], bool],
|
||||
objs: Iterable[_T]
|
||||
) -> Tuple[List[_T], List[_T]]:
|
||||
def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
|
||||
"""Split a list into two classes according to the predicate."""
|
||||
t = []
|
||||
f = []
|
||||
|
@ -315,9 +319,7 @@ def get_bound(pts: Iterable[Point]) -> Rect:
|
|||
|
||||
|
||||
def pick(
|
||||
seq: Iterable[_T],
|
||||
func: Callable[[_T], float],
|
||||
maxobj: Optional[_T] = None
|
||||
seq: Iterable[_T], func: Callable[[_T], float], maxobj: Optional[_T] = None
|
||||
) -> Optional[_T]:
|
||||
"""Picks the object obj where func(obj) has the highest value."""
|
||||
maxscore = None
|
||||
|
@ -347,77 +349,303 @@ def nunpack(s: bytes, default: int = 0) -> int:
|
|||
elif length == 1:
|
||||
return ord(s)
|
||||
elif length == 2:
|
||||
return cast(int, struct.unpack('>H', s)[0])
|
||||
return cast(int, struct.unpack(">H", s)[0])
|
||||
elif length == 3:
|
||||
return cast(int, struct.unpack('>L', b'\x00' + s)[0])
|
||||
return cast(int, struct.unpack(">L", b"\x00" + s)[0])
|
||||
elif length == 4:
|
||||
return cast(int, struct.unpack('>L', s)[0])
|
||||
return cast(int, struct.unpack(">L", s)[0])
|
||||
elif length == 8:
|
||||
return cast(int, struct.unpack('>Q', s)[0])
|
||||
return cast(int, struct.unpack(">Q", s)[0])
|
||||
else:
|
||||
raise TypeError('invalid length: %d' % length)
|
||||
raise TypeError("invalid length: %d" % length)
|
||||
|
||||
|
||||
PDFDocEncoding = ''.join(chr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
|
||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
|
||||
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
|
||||
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
|
||||
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
|
||||
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
|
||||
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
|
||||
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
|
||||
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
|
||||
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
|
||||
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
|
||||
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
|
||||
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
|
||||
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
|
||||
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
|
||||
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
|
||||
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
|
||||
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
||||
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
|
||||
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
||||
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
|
||||
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
|
||||
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
|
||||
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
|
||||
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
|
||||
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
|
||||
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
|
||||
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||
))
|
||||
PDFDocEncoding = "".join(
|
||||
chr(x)
|
||||
for x in (
|
||||
0x0000,
|
||||
0x0001,
|
||||
0x0002,
|
||||
0x0003,
|
||||
0x0004,
|
||||
0x0005,
|
||||
0x0006,
|
||||
0x0007,
|
||||
0x0008,
|
||||
0x0009,
|
||||
0x000A,
|
||||
0x000B,
|
||||
0x000C,
|
||||
0x000D,
|
||||
0x000E,
|
||||
0x000F,
|
||||
0x0010,
|
||||
0x0011,
|
||||
0x0012,
|
||||
0x0013,
|
||||
0x0014,
|
||||
0x0015,
|
||||
0x0017,
|
||||
0x0017,
|
||||
0x02D8,
|
||||
0x02C7,
|
||||
0x02C6,
|
||||
0x02D9,
|
||||
0x02DD,
|
||||
0x02DB,
|
||||
0x02DA,
|
||||
0x02DC,
|
||||
0x0020,
|
||||
0x0021,
|
||||
0x0022,
|
||||
0x0023,
|
||||
0x0024,
|
||||
0x0025,
|
||||
0x0026,
|
||||
0x0027,
|
||||
0x0028,
|
||||
0x0029,
|
||||
0x002A,
|
||||
0x002B,
|
||||
0x002C,
|
||||
0x002D,
|
||||
0x002E,
|
||||
0x002F,
|
||||
0x0030,
|
||||
0x0031,
|
||||
0x0032,
|
||||
0x0033,
|
||||
0x0034,
|
||||
0x0035,
|
||||
0x0036,
|
||||
0x0037,
|
||||
0x0038,
|
||||
0x0039,
|
||||
0x003A,
|
||||
0x003B,
|
||||
0x003C,
|
||||
0x003D,
|
||||
0x003E,
|
||||
0x003F,
|
||||
0x0040,
|
||||
0x0041,
|
||||
0x0042,
|
||||
0x0043,
|
||||
0x0044,
|
||||
0x0045,
|
||||
0x0046,
|
||||
0x0047,
|
||||
0x0048,
|
||||
0x0049,
|
||||
0x004A,
|
||||
0x004B,
|
||||
0x004C,
|
||||
0x004D,
|
||||
0x004E,
|
||||
0x004F,
|
||||
0x0050,
|
||||
0x0051,
|
||||
0x0052,
|
||||
0x0053,
|
||||
0x0054,
|
||||
0x0055,
|
||||
0x0056,
|
||||
0x0057,
|
||||
0x0058,
|
||||
0x0059,
|
||||
0x005A,
|
||||
0x005B,
|
||||
0x005C,
|
||||
0x005D,
|
||||
0x005E,
|
||||
0x005F,
|
||||
0x0060,
|
||||
0x0061,
|
||||
0x0062,
|
||||
0x0063,
|
||||
0x0064,
|
||||
0x0065,
|
||||
0x0066,
|
||||
0x0067,
|
||||
0x0068,
|
||||
0x0069,
|
||||
0x006A,
|
||||
0x006B,
|
||||
0x006C,
|
||||
0x006D,
|
||||
0x006E,
|
||||
0x006F,
|
||||
0x0070,
|
||||
0x0071,
|
||||
0x0072,
|
||||
0x0073,
|
||||
0x0074,
|
||||
0x0075,
|
||||
0x0076,
|
||||
0x0077,
|
||||
0x0078,
|
||||
0x0079,
|
||||
0x007A,
|
||||
0x007B,
|
||||
0x007C,
|
||||
0x007D,
|
||||
0x007E,
|
||||
0x0000,
|
||||
0x2022,
|
||||
0x2020,
|
||||
0x2021,
|
||||
0x2026,
|
||||
0x2014,
|
||||
0x2013,
|
||||
0x0192,
|
||||
0x2044,
|
||||
0x2039,
|
||||
0x203A,
|
||||
0x2212,
|
||||
0x2030,
|
||||
0x201E,
|
||||
0x201C,
|
||||
0x201D,
|
||||
0x2018,
|
||||
0x2019,
|
||||
0x201A,
|
||||
0x2122,
|
||||
0xFB01,
|
||||
0xFB02,
|
||||
0x0141,
|
||||
0x0152,
|
||||
0x0160,
|
||||
0x0178,
|
||||
0x017D,
|
||||
0x0131,
|
||||
0x0142,
|
||||
0x0153,
|
||||
0x0161,
|
||||
0x017E,
|
||||
0x0000,
|
||||
0x20AC,
|
||||
0x00A1,
|
||||
0x00A2,
|
||||
0x00A3,
|
||||
0x00A4,
|
||||
0x00A5,
|
||||
0x00A6,
|
||||
0x00A7,
|
||||
0x00A8,
|
||||
0x00A9,
|
||||
0x00AA,
|
||||
0x00AB,
|
||||
0x00AC,
|
||||
0x0000,
|
||||
0x00AE,
|
||||
0x00AF,
|
||||
0x00B0,
|
||||
0x00B1,
|
||||
0x00B2,
|
||||
0x00B3,
|
||||
0x00B4,
|
||||
0x00B5,
|
||||
0x00B6,
|
||||
0x00B7,
|
||||
0x00B8,
|
||||
0x00B9,
|
||||
0x00BA,
|
||||
0x00BB,
|
||||
0x00BC,
|
||||
0x00BD,
|
||||
0x00BE,
|
||||
0x00BF,
|
||||
0x00C0,
|
||||
0x00C1,
|
||||
0x00C2,
|
||||
0x00C3,
|
||||
0x00C4,
|
||||
0x00C5,
|
||||
0x00C6,
|
||||
0x00C7,
|
||||
0x00C8,
|
||||
0x00C9,
|
||||
0x00CA,
|
||||
0x00CB,
|
||||
0x00CC,
|
||||
0x00CD,
|
||||
0x00CE,
|
||||
0x00CF,
|
||||
0x00D0,
|
||||
0x00D1,
|
||||
0x00D2,
|
||||
0x00D3,
|
||||
0x00D4,
|
||||
0x00D5,
|
||||
0x00D6,
|
||||
0x00D7,
|
||||
0x00D8,
|
||||
0x00D9,
|
||||
0x00DA,
|
||||
0x00DB,
|
||||
0x00DC,
|
||||
0x00DD,
|
||||
0x00DE,
|
||||
0x00DF,
|
||||
0x00E0,
|
||||
0x00E1,
|
||||
0x00E2,
|
||||
0x00E3,
|
||||
0x00E4,
|
||||
0x00E5,
|
||||
0x00E6,
|
||||
0x00E7,
|
||||
0x00E8,
|
||||
0x00E9,
|
||||
0x00EA,
|
||||
0x00EB,
|
||||
0x00EC,
|
||||
0x00ED,
|
||||
0x00EE,
|
||||
0x00EF,
|
||||
0x00F0,
|
||||
0x00F1,
|
||||
0x00F2,
|
||||
0x00F3,
|
||||
0x00F4,
|
||||
0x00F5,
|
||||
0x00F6,
|
||||
0x00F7,
|
||||
0x00F8,
|
||||
0x00F9,
|
||||
0x00FA,
|
||||
0x00FB,
|
||||
0x00FC,
|
||||
0x00FD,
|
||||
0x00FE,
|
||||
0x00FF,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def decode_text(s: bytes) -> str:
|
||||
"""Decodes a PDFDocEncoding string to Unicode."""
|
||||
if s.startswith(b'\xfe\xff'):
|
||||
return str(s[2:], 'utf-16be', 'ignore')
|
||||
if s.startswith(b"\xfe\xff"):
|
||||
return str(s[2:], "utf-16be", "ignore")
|
||||
else:
|
||||
return ''.join(PDFDocEncoding[c] for c in s)
|
||||
return "".join(PDFDocEncoding[c] for c in s)
|
||||
|
||||
|
||||
def enc(x: str) -> str:
|
||||
"""Encodes a string for SGML/XML/HTML"""
|
||||
if isinstance(x, bytes):
|
||||
return ''
|
||||
return ""
|
||||
return escape(x)
|
||||
|
||||
|
||||
def bbox2str(bbox: Rect) -> str:
|
||||
(x0, y0, x1, y1) = bbox
|
||||
return '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x0, y0, x1, y1)
|
||||
return "{:.3f},{:.3f},{:.3f},{:.3f}".format(x0, y0, x1, y1)
|
||||
|
||||
|
||||
def matrix2str(m: Matrix) -> str:
|
||||
(a, b, c, d, e, f) = m
|
||||
return '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'\
|
||||
.format(a, b, c, d, e, f)
|
||||
return "[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]".format(a, b, c, d, e, f)
|
||||
|
||||
|
||||
def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
|
||||
|
@ -446,7 +674,7 @@ def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
|
|||
return max(0, iw), max(0, ih)
|
||||
|
||||
|
||||
LTComponentT = TypeVar('LTComponentT', bound='LTComponent')
|
||||
LTComponentT = TypeVar("LTComponentT", bound="LTComponent")
|
||||
|
||||
|
||||
class Plane(Generic[LTComponentT]):
|
||||
|
@ -465,7 +693,7 @@ class Plane(Generic[LTComponentT]):
|
|||
(self.x0, self.y0, self.x1, self.y1) = bbox
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return '<Plane objs=%r>' % list(self)
|
||||
return "<Plane objs=%r>" % list(self)
|
||||
|
||||
def __iter__(self) -> Iterator[LTComponentT]:
|
||||
return (obj for obj in self._seq if obj in self._objs)
|
||||
|
@ -524,14 +752,13 @@ class Plane(Generic[LTComponentT]):
|
|||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 \
|
||||
or y1 <= obj.y0:
|
||||
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
|
||||
continue
|
||||
yield obj
|
||||
|
||||
|
||||
ROMAN_ONES = ['i', 'x', 'c', 'm']
|
||||
ROMAN_FIVES = ['v', 'l', 'd']
|
||||
ROMAN_ONES = ["i", "x", "c", "m"]
|
||||
ROMAN_FIVES = ["v", "l", "d"]
|
||||
|
||||
|
||||
def format_int_roman(value: int) -> str:
|
||||
|
@ -557,7 +784,7 @@ def format_int_roman(value: int) -> str:
|
|||
result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder)
|
||||
index += 1
|
||||
|
||||
return ''.join(result)
|
||||
return "".join(result)
|
||||
|
||||
|
||||
def format_int_alpha(value: int) -> str:
|
||||
|
@ -571,4 +798,4 @@ def format_int_alpha(value: int) -> str:
|
|||
result.append(string.ascii_lowercase[remainder])
|
||||
|
||||
result.reverse()
|
||||
return ''.join(result)
|
||||
return "".join(result)
|
||||
|
|
62
setup.py
62
setup.py
|
@ -8,52 +8,52 @@ sys.path.append(str(Path(__file__).parent))
|
|||
import pdfminer as package
|
||||
|
||||
|
||||
with open(path.join(path.abspath(path.dirname(__file__)), 'README.md')) as f:
|
||||
with open(path.join(path.abspath(path.dirname(__file__)), "README.md")) as f:
|
||||
readme = f.read()
|
||||
|
||||
setup(
|
||||
name='pdfminer.six',
|
||||
name="pdfminer.six",
|
||||
version=package.__version__,
|
||||
packages=['pdfminer'],
|
||||
package_data={'pdfminer': ['cmap/*.pickle.gz', 'py.typed']},
|
||||
packages=["pdfminer"],
|
||||
package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]},
|
||||
install_requires=[
|
||||
'chardet ; python_version > "3.0"',
|
||||
'cryptography',
|
||||
"cryptography",
|
||||
],
|
||||
extras_require={
|
||||
"dev": ["pytest", "nox", "mypy == 0.931"],
|
||||
"dev": ["pytest", "nox", "black", "mypy == 0.931"],
|
||||
"docs": ["sphinx", "sphinx-argparse"],
|
||||
},
|
||||
description='PDF parser and analyzer',
|
||||
description="PDF parser and analyzer",
|
||||
long_description=readme,
|
||||
long_description_content_type='text/markdown',
|
||||
license='MIT/X',
|
||||
author='Yusuke Shinyama + Philippe Guglielmetti',
|
||||
author_email='pdfminer@goulu.net',
|
||||
url='https://github.com/pdfminer/pdfminer.six',
|
||||
long_description_content_type="text/markdown",
|
||||
license="MIT/X",
|
||||
author="Yusuke Shinyama + Philippe Guglielmetti",
|
||||
author_email="pdfminer@goulu.net",
|
||||
url="https://github.com/pdfminer/pdfminer.six",
|
||||
scripts=[
|
||||
'tools/pdf2txt.py',
|
||||
'tools/dumppdf.py',
|
||||
"tools/pdf2txt.py",
|
||||
"tools/dumppdf.py",
|
||||
],
|
||||
keywords=[
|
||||
'pdf parser',
|
||||
'pdf converter',
|
||||
'layout analysis',
|
||||
'text mining',
|
||||
"pdf parser",
|
||||
"pdf converter",
|
||||
"layout analysis",
|
||||
"text mining",
|
||||
],
|
||||
python_requires='>=3.6',
|
||||
python_requires=">=3.6",
|
||||
classifiers=[
|
||||
'Programming Language :: Python',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3 :: Only',
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Topic :: Text Processing',
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Topic :: Text Processing",
|
||||
],
|
||||
)
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
|
||||
|
||||
def absolute_sample_path(relative_sample_path):
|
||||
sample_dir = os.path.abspath(
|
||||
os.path.join(os.path.dirname(__file__), '../samples'))
|
||||
sample_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../samples"))
|
||||
sample_file = os.path.join(sample_dir, relative_sample_path)
|
||||
return sample_file
|
||||
|
|
|
@ -4,7 +4,7 @@ import tempfile
|
|||
import os
|
||||
|
||||
|
||||
class TemporaryFilePath():
|
||||
class TemporaryFilePath:
|
||||
"""Context manager class, which generates temporary file name
|
||||
|
||||
Contrary to standard tempfile.NamedTemporaryFile(), it does not
|
||||
|
@ -40,9 +40,9 @@ class TemporaryFilePath():
|
|||
`tempfile.NamedTemporaryFile` will create and delete a file, and
|
||||
this method only returns the filepath of the non-existing file.
|
||||
"""
|
||||
with tempfile.NamedTemporaryFile(suffix=self.suffix,
|
||||
prefix=self.prefix,
|
||||
dir=self.dir) as file:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=self.suffix, prefix=self.prefix, dir=self.dir
|
||||
) as file:
|
||||
self.temp_file_name = file.name
|
||||
|
||||
return self.temp_file_name
|
||||
|
|
|
@ -9,14 +9,14 @@ from pdfminer.pdfinterp import PDFGraphicState
|
|||
|
||||
class TestPaintPath:
|
||||
def test_paint_path(self):
|
||||
path = [('m', 6, 7), ('l', 7, 7)]
|
||||
path = [("m", 6, 7), ("l", 7, 7)]
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 100, 0, 100])
|
||||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||
assert len(analyzer.cur_item._objs) == 1
|
||||
|
||||
def test_paint_path_mlllh(self):
|
||||
path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)]
|
||||
path = [("m", 6, 7), ("l", 7, 7), ("l", 7, 91), ("l", 6, 91), ("h",)]
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 100, 0, 100])
|
||||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||
|
@ -25,9 +25,21 @@ class TestPaintPath:
|
|||
def test_paint_path_multiple_mlllh(self):
|
||||
"""Path from samples/contrib/issue-00369-excel.pdf"""
|
||||
path = [
|
||||
('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',),
|
||||
('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',),
|
||||
('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',)
|
||||
("m", 6, 7),
|
||||
("l", 7, 7),
|
||||
("l", 7, 91),
|
||||
("l", 6, 91),
|
||||
("h",),
|
||||
("m", 4, 7),
|
||||
("l", 6, 7),
|
||||
("l", 6, 91),
|
||||
("l", 4, 91),
|
||||
("h",),
|
||||
("m", 67, 2),
|
||||
("l", 68, 2),
|
||||
("l", 68, 3),
|
||||
("l", 67, 3),
|
||||
("h",),
|
||||
]
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 100, 0, 100])
|
||||
|
@ -177,34 +189,34 @@ class TestPaintPath:
|
|||
return analyzer.cur_item._objs
|
||||
|
||||
# "c" operator
|
||||
assert parse([
|
||||
assert parse(
|
||||
[
|
||||
("m", 72.41, 433.89),
|
||||
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
|
||||
])[0].pts == [
|
||||
]
|
||||
)[0].pts == [
|
||||
(72.41, 433.89),
|
||||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
# "v" operator
|
||||
assert parse([
|
||||
("m", 72.41, 433.89),
|
||||
("v", 71.96, 434.89, 71.41, 434.89),
|
||||
])[0].pts == [
|
||||
assert parse([("m", 72.41, 433.89), ("v", 71.96, 434.89, 71.41, 434.89)])[
|
||||
0
|
||||
].pts == [
|
||||
(72.41, 433.89),
|
||||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
# "y" operator
|
||||
assert parse([
|
||||
("m", 72.41, 433.89),
|
||||
("y", 72.41, 434.45, 71.41, 434.89),
|
||||
])[0].pts == [
|
||||
assert parse([("m", 72.41, 433.89), ("y", 72.41, 434.45, 71.41, 434.89)])[
|
||||
0
|
||||
].pts == [
|
||||
(72.41, 433.89),
|
||||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
|
||||
class TestBinaryDetector():
|
||||
class TestBinaryDetector:
|
||||
def test_stringio(self):
|
||||
assert not PDFConverter._is_binary_stream(io.StringIO())
|
||||
|
||||
|
@ -212,11 +224,11 @@ class TestBinaryDetector():
|
|||
assert PDFConverter._is_binary_stream(io.BytesIO())
|
||||
|
||||
def test_tmpfile(self):
|
||||
with TemporaryFile(mode='w') as f:
|
||||
with TemporaryFile(mode="w") as f:
|
||||
assert not PDFConverter._is_binary_stream(f)
|
||||
|
||||
def test_binary_tmpfile(self):
|
||||
with TemporaryFile(mode='wb') as f:
|
||||
with TemporaryFile(mode="wb") as f:
|
||||
assert PDFConverter._is_binary_stream(f)
|
||||
|
||||
def test_non_file_like_object_defaults_to_binary(self):
|
||||
|
|
|
@ -13,31 +13,31 @@ from pdfminer.psparser import PSLiteral
|
|||
def test_name2unicode_name_in_agl():
|
||||
"""The name "Lcommaaccent" has a single component,
|
||||
which is mapped to the string U+013B by AGL"""
|
||||
assert '\u013B' == name2unicode('Lcommaaccent')
|
||||
assert "\u013B" == name2unicode("Lcommaaccent")
|
||||
|
||||
|
||||
def test_name2unicode_uni():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B"
|
||||
all map to the string U+013B"""
|
||||
assert '\u013B' == name2unicode('uni013B')
|
||||
assert "\u013B" == name2unicode("uni013B")
|
||||
|
||||
|
||||
def test_name2unicode_uni_lowercase():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B"
|
||||
all map to the string U+013B"""
|
||||
assert '\u013B' == name2unicode('uni013b')
|
||||
assert "\u013B" == name2unicode("uni013b")
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits():
|
||||
"""The name "uni20AC0308" has a single component,
|
||||
which is mapped to the string U+20AC U+0308"""
|
||||
assert '\u20AC\u0308' == name2unicode('uni20AC0308')
|
||||
assert "\u20AC\u0308" == name2unicode("uni20AC0308")
|
||||
|
||||
|
||||
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
|
||||
"""The name "uni20AC0308" has a single component,
|
||||
which is mapped to the string U+20AC U+0308"""
|
||||
assert '\u20AC\u0308' == name2unicode('uni20ac0308')
|
||||
assert "\u20AC\u0308" == name2unicode("uni20ac0308")
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string():
|
||||
|
@ -46,7 +46,7 @@ def test_name2unicode_uni_empty_string():
|
|||
|
||||
According to the specification this should be mapped to an empty string,
|
||||
but we also want to support lowercase hexadecimals"""
|
||||
assert '\u20ac' == name2unicode('uni20ac')
|
||||
assert "\u20ac" == name2unicode("uni20ac")
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long():
|
||||
|
@ -60,7 +60,7 @@ def test_name2unicode_uni_empty_string_long():
|
|||
glyph name "u1040C".
|
||||
"""
|
||||
with pytest.raises(KeyError):
|
||||
name2unicode('uniD801DC0C')
|
||||
name2unicode("uniD801DC0C")
|
||||
|
||||
|
||||
def test_name2unicode_uni_empty_string_long_lowercase():
|
||||
|
@ -73,57 +73,59 @@ def test_name2unicode_uni_empty_string_long_lowercase():
|
|||
This character can be correctly mapped by using the
|
||||
glyph name "u1040C."""
|
||||
with pytest.raises(KeyError):
|
||||
name2unicode('uniD801DC0C')
|
||||
name2unicode("uniD801DC0C")
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua():
|
||||
""" "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to
|
||||
U+F6FB."""
|
||||
assert '\uF6FB' == name2unicode('uniF6FB')
|
||||
assert "\uF6FB" == name2unicode("uniF6FB")
|
||||
|
||||
|
||||
def test_name2unicode_uni_pua_lowercase():
|
||||
""" "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to
|
||||
U+F6FB."""
|
||||
assert '\uF6FB' == name2unicode('unif6fb')
|
||||
assert "\uF6FB" == name2unicode("unif6fb")
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the
|
||||
string U+013B"""
|
||||
assert '\u013B' == name2unicode('u013B')
|
||||
assert "\u013B" == name2unicode("u013B")
|
||||
|
||||
|
||||
def test_name2unicode_u_with_4_digits_lowercase():
|
||||
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the
|
||||
string U+013B"""
|
||||
assert '\u013B' == name2unicode('u013b')
|
||||
assert "\u013B" == name2unicode("u013b")
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits():
|
||||
"""The name "u1040C" has a single component, which is mapped to the string
|
||||
U+1040C"""
|
||||
assert '\U0001040C' == name2unicode('u1040C')
|
||||
assert "\U0001040C" == name2unicode("u1040C")
|
||||
|
||||
|
||||
def test_name2unicode_u_with_5_digits_lowercase():
|
||||
"""The name "u1040C" has a single component, which is mapped to the string
|
||||
U+1040C"""
|
||||
assert '\U0001040C' == name2unicode('u1040c')
|
||||
assert "\U0001040C" == name2unicode("u1040c")
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components():
|
||||
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
|
||||
string U+013B U+20AC U+0308 U+1040C"""
|
||||
assert '\u013B\u20AC\u0308\U0001040C' == \
|
||||
name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
|
||||
assert "\u013B\u20AC\u0308\U0001040C" == name2unicode(
|
||||
"Lcommaaccent_uni20AC0308_u1040C.alternate"
|
||||
)
|
||||
|
||||
|
||||
def test_name2unicode_multiple_components_lowercase():
|
||||
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
|
||||
string U+013B U+20AC U+0308 U+1040C"""
|
||||
assert '\u013B\u20AC\u0308\U0001040C' == \
|
||||
name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
|
||||
assert "\u013B\u20AC\u0308\U0001040C" == name2unicode(
|
||||
"Lcommaaccent_uni20ac0308_u1040c.alternate"
|
||||
)
|
||||
|
||||
|
||||
def test_name2unicode_foo():
|
||||
|
@ -131,26 +133,26 @@ def test_name2unicode_foo():
|
|||
because 'foo' is not in AGL,
|
||||
and because it does not start with a 'u.'"""
|
||||
with pytest.raises(KeyError):
|
||||
name2unicode('foo')
|
||||
name2unicode("foo")
|
||||
|
||||
|
||||
def test_name2unicode_notdef():
|
||||
"""The name ".notdef" is reduced to an empty string (step 1)
|
||||
and mapped to an empty string (step 3)"""
|
||||
with pytest.raises(KeyError):
|
||||
name2unicode('.notdef')
|
||||
name2unicode(".notdef")
|
||||
|
||||
|
||||
def test_name2unicode_pua_ogoneksmall():
|
||||
""" "
|
||||
Ogoneksmall" and "uniF6FB" both map to the string
|
||||
that corresponds to U+F6FB."""
|
||||
assert '\uF6FB' == name2unicode('Ogoneksmall')
|
||||
assert "\uF6FB" == name2unicode("Ogoneksmall")
|
||||
|
||||
|
||||
def test_name2unicode_overflow_error():
|
||||
with pytest.raises(KeyError):
|
||||
name2unicode('226215240241240240240240')
|
||||
name2unicode("226215240241240240240240")
|
||||
|
||||
|
||||
def test_get_encoding_with_invalid_differences():
|
||||
|
@ -158,5 +160,5 @@ def test_get_encoding_with_invalid_differences():
|
|||
|
||||
Regression test for https://github.com/pdfminer/pdfminer.six/issues/385
|
||||
"""
|
||||
invalid_differences = [PSLiteral('ubuntu'), PSLiteral('1234')]
|
||||
EncodingDB.get_encoding('StandardEncoding', invalid_differences)
|
||||
invalid_differences = [PSLiteral("ubuntu"), PSLiteral("1234")]
|
||||
EncodingDB.get_encoding("StandardEncoding", invalid_differences)
|
||||
|
|
|
@ -4,7 +4,7 @@ from pdfminer.layout import LTChar, LTTextBox
|
|||
|
||||
|
||||
def test_font_size():
|
||||
path = absolute_sample_path('font-size-test.pdf')
|
||||
path = absolute_sample_path("font-size-test.pdf")
|
||||
for page in extract_pages(path):
|
||||
for text_box in page:
|
||||
if isinstance(text_box, LTTextBox):
|
||||
|
|
|
@ -129,37 +129,43 @@ class TestExtractPages(unittest.TestCase):
|
|||
def test_line_margin(self):
|
||||
# The lines have margin 0.2 relative to the height.
|
||||
# Extract with line_margin 0.19 should break into 3 separate textboxes.
|
||||
pages = list(extract_pages(
|
||||
self._get_test_file_path(), laparams=LAParams(line_margin=0.19)))
|
||||
pages = list(
|
||||
extract_pages(
|
||||
self._get_test_file_path(), laparams=LAParams(line_margin=0.19)
|
||||
)
|
||||
)
|
||||
self.assertEqual(len(pages), 1)
|
||||
page = pages[0]
|
||||
|
||||
elements = [element for element in page
|
||||
if isinstance(element, LTTextContainer)]
|
||||
elements = [element for element in page if isinstance(element, LTTextContainer)]
|
||||
self.assertEqual(len(elements), 3)
|
||||
self.assertEqual(elements[0].get_text(), "Text1\n")
|
||||
self.assertEqual(elements[1].get_text(), "Text2\n")
|
||||
self.assertEqual(elements[2].get_text(), "Text3\n")
|
||||
|
||||
# Extract with line_margin 0.21 should merge into one textbox.
|
||||
pages = list(extract_pages(
|
||||
self._get_test_file_path(), laparams=LAParams(line_margin=0.21)))
|
||||
pages = list(
|
||||
extract_pages(
|
||||
self._get_test_file_path(), laparams=LAParams(line_margin=0.21)
|
||||
)
|
||||
)
|
||||
self.assertEqual(len(pages), 1)
|
||||
page = pages[0]
|
||||
|
||||
elements = [element for element in page
|
||||
if isinstance(element, LTTextContainer)]
|
||||
elements = [element for element in page if isinstance(element, LTTextContainer)]
|
||||
self.assertEqual(len(elements), 1)
|
||||
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
|
||||
|
||||
def test_no_boxes_flow(self):
|
||||
pages = list(extract_pages(
|
||||
self._get_test_file_path(), laparams=LAParams(boxes_flow=None)))
|
||||
pages = list(
|
||||
extract_pages(
|
||||
self._get_test_file_path(), laparams=LAParams(boxes_flow=None)
|
||||
)
|
||||
)
|
||||
self.assertEqual(len(pages), 1)
|
||||
page = pages[0]
|
||||
|
||||
elements = [element for element in page
|
||||
if isinstance(element, LTTextContainer)]
|
||||
elements = [element for element in page if isinstance(element, LTTextContainer)]
|
||||
self.assertEqual(len(elements), 1)
|
||||
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
|
||||
|
||||
|
|
|
@ -46,8 +46,7 @@ class TestFindNeigbors(unittest.TestCase):
|
|||
right_aligned_below.set_bbox((15, 2, 20, 4))
|
||||
plane.add(right_aligned_below)
|
||||
|
||||
centrally_aligned_overlapping = LTTextLineHorizontal(
|
||||
laparams.word_margin)
|
||||
centrally_aligned_overlapping = LTTextLineHorizontal(laparams.word_margin)
|
||||
centrally_aligned_overlapping.set_bbox((13, 5, 17, 7))
|
||||
plane.add(centrally_aligned_overlapping)
|
||||
|
||||
|
@ -86,8 +85,7 @@ class TestFindNeigbors(unittest.TestCase):
|
|||
top_aligned_left.set_bbox((2, 15, 4, 20))
|
||||
plane.add(top_aligned_left)
|
||||
|
||||
centrally_aligned_overlapping = LTTextLineVertical(
|
||||
laparams.word_margin)
|
||||
centrally_aligned_overlapping = LTTextLineVertical(laparams.word_margin)
|
||||
centrally_aligned_overlapping.set_bbox((5, 13, 7, 17))
|
||||
plane.add(centrally_aligned_overlapping)
|
||||
|
||||
|
|
|
@ -9,9 +9,8 @@ from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
|
|||
|
||||
|
||||
class TestPdfDocument(object):
|
||||
|
||||
def test_get_zero_objid_raises_pdfobjectnotfound(self):
|
||||
with open(absolute_sample_path('simple1.pdf'), 'rb') as in_file:
|
||||
with open(absolute_sample_path("simple1.pdf"), "rb") as in_file:
|
||||
parser = PDFParser(in_file)
|
||||
doc = PDFDocument(parser)
|
||||
with pytest.raises(PDFObjectNotFound):
|
||||
|
@ -21,24 +20,29 @@ class TestPdfDocument(object):
|
|||
# Some documents may be encrypted but not have an /ID key in
|
||||
# their trailer. Tests
|
||||
# https://github.com/pdfminer/pdfminer.six/issues/594
|
||||
path = absolute_sample_path('encryption/encrypted_doc_no_id.pdf')
|
||||
with open(path, 'rb') as fp:
|
||||
path = absolute_sample_path("encryption/encrypted_doc_no_id.pdf")
|
||||
with open(path, "rb") as fp:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser)
|
||||
assert doc.info == [{'Producer': b'European Patent Office'}]
|
||||
assert doc.info == [{"Producer": b"European Patent Office"}]
|
||||
|
||||
def test_page_labels(self):
|
||||
path = absolute_sample_path('contrib/pagelabels.pdf')
|
||||
with open(path, 'rb') as fp:
|
||||
path = absolute_sample_path("contrib/pagelabels.pdf")
|
||||
with open(path, "rb") as fp:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser)
|
||||
total_pages = int_value(dict_value(doc.catalog['Pages'])['Count'])
|
||||
assert list(itertools.islice(doc.get_page_labels(), total_pages)) \
|
||||
== ['iii', 'iv', '1', '2', '1']
|
||||
total_pages = int_value(dict_value(doc.catalog["Pages"])["Count"])
|
||||
assert list(itertools.islice(doc.get_page_labels(), total_pages)) == [
|
||||
"iii",
|
||||
"iv",
|
||||
"1",
|
||||
"2",
|
||||
"1",
|
||||
]
|
||||
|
||||
def test_no_page_labels(self):
|
||||
path = absolute_sample_path('simple1.pdf')
|
||||
with open(path, 'rb') as fp:
|
||||
path = absolute_sample_path("simple1.pdf")
|
||||
with open(path, "rb") as fp:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser)
|
||||
|
||||
|
|
|
@ -9,96 +9,95 @@ from pdfminer.psparser import PSLiteral
|
|||
|
||||
|
||||
class TestPDFEncoding:
|
||||
|
||||
def test_cmapname_onebyteidentityV(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("OneByteIdentityV")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMapByte)
|
||||
|
||||
def test_cmapname_onebyteidentityH(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("OneByteIdentityH")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMapByte)
|
||||
|
||||
def test_cmapname_V(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("V")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, CMap)
|
||||
|
||||
def test_cmapname_H(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('H')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("H")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, CMap)
|
||||
|
||||
def test_encoding_identityH(self):
|
||||
spec = {'Encoding': PSLiteral('Identity-H')}
|
||||
spec = {"Encoding": PSLiteral("Identity-H")}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityV(self):
|
||||
spec = {'Encoding': PSLiteral('Identity-V')}
|
||||
spec = {"Encoding": PSLiteral("Identity-V")}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityH_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('Identity-H')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("Identity-H")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityV_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('Identity-V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("Identity-V")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityH_as_stream(self):
|
||||
stream = PDFStream({'CMapName': 'Identity-H'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": "Identity-H"}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_identityV_as_stream(self):
|
||||
stream = PDFStream({'CMapName': 'Identity-V'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": "Identity-V"}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH(self):
|
||||
spec = {'Encoding': PSLiteral('DLIdent-H')}
|
||||
spec = {"Encoding": PSLiteral("DLIdent-H")}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV(self):
|
||||
spec = {'Encoding': PSLiteral('DLIdent-V')}
|
||||
spec = {"Encoding": PSLiteral("DLIdent-V")}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("DLIdent-H")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV_as_PSLiteral_stream(self):
|
||||
stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": PSLiteral("DLIdent-V")}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentH_as_stream(self):
|
||||
stream = PDFStream({'CMapName': 'DLIdent-H'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": "DLIdent-H"}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
def test_encoding_DLIdentV_as_stream(self):
|
||||
stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
|
||||
spec = {'Encoding': stream}
|
||||
stream = PDFStream({"CMapName": "DLIdent-V"}, "")
|
||||
spec = {"Encoding": stream}
|
||||
font = PDFCIDFont(None, spec)
|
||||
assert isinstance(font.cmap, IdentityCMap)
|
||||
|
||||
|
|
|
@ -8,12 +8,12 @@ def test_get_cmap_from_pickle():
|
|||
|
||||
Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
|
||||
"""
|
||||
cmap_name = 'UniGB-UCS2-H'
|
||||
spec = {'Encoding': PSLiteral(cmap_name)}
|
||||
cmap_name = "UniGB-UCS2-H"
|
||||
spec = {"Encoding": PSLiteral(cmap_name)}
|
||||
resource_manager = PDFResourceManager()
|
||||
font = PDFCIDFont(resource_manager, spec)
|
||||
|
||||
cmap = font.get_cmap_from_spec(spec, False)
|
||||
|
||||
assert cmap.attrs.get('CMapName') == cmap_name
|
||||
assert cmap.attrs.get("CMapName") == cmap_name
|
||||
assert len(cmap.code2cid) > 0
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from pdfminer.ccitt import CCITTG4Parser, CCITTFaxDecoder
|
||||
|
||||
|
||||
class TestCCITTG4Parser():
|
||||
class TestCCITTG4Parser:
|
||||
def get_parser(self, bits):
|
||||
parser = CCITTG4Parser(len(bits))
|
||||
parser._curline = [int(c) for c in bits]
|
||||
|
@ -9,60 +9,60 @@ class TestCCITTG4Parser():
|
|||
return parser
|
||||
|
||||
def test_b1(self):
|
||||
parser = self.get_parser('00000')
|
||||
parser = self.get_parser("00000")
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 0
|
||||
return
|
||||
|
||||
def test_b2(self):
|
||||
parser = self.get_parser('10000')
|
||||
parser = self.get_parser("10000")
|
||||
parser._do_vertical(-1)
|
||||
assert parser._curpos == 0
|
||||
return
|
||||
|
||||
def test_b3(self):
|
||||
parser = self.get_parser('000111')
|
||||
parser = self.get_parser("000111")
|
||||
parser._do_pass()
|
||||
assert parser._curpos == 3
|
||||
assert parser._get_bits() == '111'
|
||||
assert parser._get_bits() == "111"
|
||||
return
|
||||
|
||||
def test_b4(self):
|
||||
parser = self.get_parser('00000')
|
||||
parser = self.get_parser("00000")
|
||||
parser._do_vertical(+2)
|
||||
assert parser._curpos == 2
|
||||
assert parser._get_bits() == '11'
|
||||
assert parser._get_bits() == "11"
|
||||
return
|
||||
|
||||
def test_b5(self):
|
||||
parser = self.get_parser('11111111100')
|
||||
parser = self.get_parser("11111111100")
|
||||
parser._do_horizontal(0, 3)
|
||||
assert parser._curpos == 3
|
||||
parser._do_vertical(1)
|
||||
assert parser._curpos == 10
|
||||
assert parser._get_bits() == '0001111111'
|
||||
assert parser._get_bits() == "0001111111"
|
||||
return
|
||||
|
||||
def test_e1(self):
|
||||
parser = self.get_parser('10000')
|
||||
parser = self.get_parser("10000")
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 1
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 5
|
||||
assert parser._get_bits() == '10000'
|
||||
assert parser._get_bits() == "10000"
|
||||
return
|
||||
|
||||
def test_e2(self):
|
||||
parser = self.get_parser('10011')
|
||||
parser = self.get_parser("10011")
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 1
|
||||
parser._do_vertical(2)
|
||||
assert parser._curpos == 5
|
||||
assert parser._get_bits() == '10000'
|
||||
assert parser._get_bits() == "10000"
|
||||
return
|
||||
|
||||
def test_e3(self):
|
||||
parser = self.get_parser('011111')
|
||||
parser = self.get_parser("011111")
|
||||
parser._color = 0
|
||||
parser._do_vertical(0)
|
||||
assert parser._color == 1
|
||||
|
@ -72,90 +72,90 @@ class TestCCITTG4Parser():
|
|||
assert parser._curpos == 4
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 6
|
||||
assert parser._get_bits() == '011100'
|
||||
assert parser._get_bits() == "011100"
|
||||
return
|
||||
|
||||
def test_e4(self):
|
||||
parser = self.get_parser('10000')
|
||||
parser = self.get_parser("10000")
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 1
|
||||
parser._do_vertical(-2)
|
||||
assert parser._curpos == 3
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 5
|
||||
assert parser._get_bits() == '10011'
|
||||
assert parser._get_bits() == "10011"
|
||||
return
|
||||
|
||||
def test_e5(self):
|
||||
parser = self.get_parser('011000')
|
||||
parser = self.get_parser("011000")
|
||||
parser._color = 0
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 1
|
||||
parser._do_vertical(3)
|
||||
assert parser._curpos == 6
|
||||
assert parser._get_bits() == '011111'
|
||||
assert parser._get_bits() == "011111"
|
||||
return
|
||||
|
||||
def test_e6(self):
|
||||
parser = self.get_parser('11001')
|
||||
parser = self.get_parser("11001")
|
||||
parser._do_pass()
|
||||
assert parser._curpos == 4
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 5
|
||||
assert parser._get_bits() == '11111'
|
||||
assert parser._get_bits() == "11111"
|
||||
return
|
||||
|
||||
def test_e7(self):
|
||||
parser = self.get_parser('0000000000')
|
||||
parser = self.get_parser("0000000000")
|
||||
parser._curpos = 2
|
||||
parser._color = 1
|
||||
parser._do_horizontal(2, 6)
|
||||
assert parser._curpos == 10
|
||||
assert parser._get_bits() == '1111000000'
|
||||
assert parser._get_bits() == "1111000000"
|
||||
return
|
||||
|
||||
def test_e8(self):
|
||||
parser = self.get_parser('001100000')
|
||||
parser = self.get_parser("001100000")
|
||||
parser._curpos = 1
|
||||
parser._color = 0
|
||||
parser._do_vertical(0)
|
||||
assert parser._curpos == 2
|
||||
parser._do_horizontal(7, 0)
|
||||
assert parser._curpos == 9
|
||||
assert parser._get_bits() == '101111111'
|
||||
assert parser._get_bits() == "101111111"
|
||||
return
|
||||
|
||||
def test_m1(self):
|
||||
parser = self.get_parser('10101')
|
||||
parser = self.get_parser("10101")
|
||||
parser._do_pass()
|
||||
assert parser._curpos == 2
|
||||
parser._do_pass()
|
||||
assert parser._curpos == 4
|
||||
assert parser._get_bits() == '1111'
|
||||
assert parser._get_bits() == "1111"
|
||||
return
|
||||
|
||||
def test_m2(self):
|
||||
parser = self.get_parser('101011')
|
||||
parser = self.get_parser("101011")
|
||||
parser._do_vertical(-1)
|
||||
parser._do_vertical(-1)
|
||||
parser._do_vertical(1)
|
||||
parser._do_horizontal(1, 1)
|
||||
assert parser._get_bits() == '011101'
|
||||
assert parser._get_bits() == "011101"
|
||||
return
|
||||
|
||||
def test_m3(self):
|
||||
parser = self.get_parser('10111011')
|
||||
parser = self.get_parser("10111011")
|
||||
parser._do_vertical(-1)
|
||||
parser._do_pass()
|
||||
parser._do_vertical(1)
|
||||
parser._do_vertical(1)
|
||||
assert parser._get_bits() == '00000001'
|
||||
assert parser._get_bits() == "00000001"
|
||||
return
|
||||
|
||||
|
||||
class TestCCITTFaxDecoder:
|
||||
def test_b1(self):
|
||||
decoder = CCITTFaxDecoder(5)
|
||||
decoder.output_line(0, b'0')
|
||||
assert decoder.close() == b'\x80'
|
||||
decoder.output_line(0, b"0")
|
||||
assert decoder.close() == b"\x80"
|
||||
return
|
||||
|
|
|
@ -18,36 +18,37 @@ def dehex(b):
|
|||
return binascii.unhexlify(b)
|
||||
|
||||
|
||||
class TestAscii85():
|
||||
class TestAscii85:
|
||||
def test_ascii85decode(self):
|
||||
"""The sample string is taken from:
|
||||
http://en.wikipedia.org/w/index.php?title=Ascii85"""
|
||||
assert ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') \
|
||||
== b'Man is distinguished'
|
||||
assert ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
|
||||
assert ascii85decode(b"9jqo^BlbD-BleB1DJ+*+F(f,q") == b"Man is distinguished"
|
||||
assert ascii85decode(b"E,9)oF*2M7/c~>") == b"pleasure."
|
||||
|
||||
def test_asciihexdecode(self):
|
||||
assert asciihexdecode(b'61 62 2e6364 65') == b'ab.cde'
|
||||
assert asciihexdecode(b'61 62 2e6364 657>') == b'ab.cdep'
|
||||
assert asciihexdecode(b'7>') == b'p'
|
||||
assert asciihexdecode(b"61 62 2e6364 65") == b"ab.cde"
|
||||
assert asciihexdecode(b"61 62 2e6364 657>") == b"ab.cdep"
|
||||
assert asciihexdecode(b"7>") == b"p"
|
||||
|
||||
|
||||
class TestArcfour():
|
||||
class TestArcfour:
|
||||
def test(self):
|
||||
assert hex(Arcfour(b'Key').process(b'Plaintext')) \
|
||||
== b'bbf316e8d940af0ad3'
|
||||
assert hex(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420'
|
||||
assert hex(Arcfour(b'Secret').process(b'Attack at dawn')) \
|
||||
== b'45a01f645fc35b383552544b9bf5'
|
||||
assert hex(Arcfour(b"Key").process(b"Plaintext")) == b"bbf316e8d940af0ad3"
|
||||
assert hex(Arcfour(b"Wiki").process(b"pedia")) == b"1021bf0420"
|
||||
assert (
|
||||
hex(Arcfour(b"Secret").process(b"Attack at dawn"))
|
||||
== b"45a01f645fc35b383552544b9bf5"
|
||||
)
|
||||
|
||||
|
||||
class TestLzw():
|
||||
class TestLzw:
|
||||
def test_lzwdecode(self):
|
||||
assert lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') \
|
||||
== b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
||||
assert (
|
||||
lzwdecode(b"\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01")
|
||||
== b"\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42"
|
||||
)
|
||||
|
||||
|
||||
class TestRunlength():
|
||||
class TestRunlength:
|
||||
def test_rldecode(self):
|
||||
assert rldecode(b'\x05123456\xfa7\x04abcde\x80junk') \
|
||||
== b'1234567777777abcde'
|
||||
assert rldecode(b"\x05123456\xfa7\x04abcde\x80junk") == b"1234567777777abcde"
|
||||
|
|
|
@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
|
|||
class TestPSBaseParser:
|
||||
"""Simplistic Test cases"""
|
||||
|
||||
TESTDATA = br'''%!PS
|
||||
TESTDATA = rb"""%!PS
|
||||
begin end
|
||||
" @ #
|
||||
/a/BCD /Some_Name /foo#5f#xbaa
|
||||
|
@ -26,33 +26,83 @@ baa)
|
|||
func/a/b{(c)do*}def
|
||||
[ 1 (z) ! ]
|
||||
<< /foo (bar) >>
|
||||
'''
|
||||
"""
|
||||
|
||||
TOKENS = [
|
||||
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')),
|
||||
(19, KWD(b'@')), (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')),
|
||||
(30, LIT('Some_Name')), (41, LIT('foo_xbaa')), (54, 0), (56, 1),
|
||||
(59, -2), (62, 0.5), (65, 1.234), (71, b'abc'), (77, b''),
|
||||
(80, b'abc ( def ) ghi'), (98, b'def \x00 4ghi'),
|
||||
(118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'),
|
||||
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '),
|
||||
(211, b'\xab\xcd\x00\x124\x05'), (226, KWD(b'func')), (230, LIT('a')),
|
||||
(232, LIT('b')), (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')),
|
||||
(241, KWD(b'}')), (242, KWD(b'def')), (246, KWD(b'[')), (248, 1),
|
||||
(250, b'z'), (254, KWD(b'!')), (256, KWD(b']')), (258, KWD(b'<<')),
|
||||
(261, LIT('foo')), (266, b'bar'), (272, KWD(b'>>'))
|
||||
(5, KWD(b"begin")),
|
||||
(11, KWD(b"end")),
|
||||
(16, KWD(b'"')),
|
||||
(19, KWD(b"@")),
|
||||
(21, KWD(b"#")),
|
||||
(23, LIT("a")),
|
||||
(25, LIT("BCD")),
|
||||
(30, LIT("Some_Name")),
|
||||
(41, LIT("foo_xbaa")),
|
||||
(54, 0),
|
||||
(56, 1),
|
||||
(59, -2),
|
||||
(62, 0.5),
|
||||
(65, 1.234),
|
||||
(71, b"abc"),
|
||||
(77, b""),
|
||||
(80, b"abc ( def ) ghi"),
|
||||
(98, b"def \x00 4ghi"),
|
||||
(118, b"bach\\slask"),
|
||||
(132, b"foo\nbaa"),
|
||||
(143, b"this % is not a comment."),
|
||||
(170, b"foo\nbaa"),
|
||||
(180, b"foobaa"),
|
||||
(191, b""),
|
||||
(194, b" "),
|
||||
(199, b"@@ "),
|
||||
(211, b"\xab\xcd\x00\x124\x05"),
|
||||
(226, KWD(b"func")),
|
||||
(230, LIT("a")),
|
||||
(232, LIT("b")),
|
||||
(234, KWD(b"{")),
|
||||
(235, b"c"),
|
||||
(238, KWD(b"do*")),
|
||||
(241, KWD(b"}")),
|
||||
(242, KWD(b"def")),
|
||||
(246, KWD(b"[")),
|
||||
(248, 1),
|
||||
(250, b"z"),
|
||||
(254, KWD(b"!")),
|
||||
(256, KWD(b"]")),
|
||||
(258, KWD(b"<<")),
|
||||
(261, LIT("foo")),
|
||||
(266, b"bar"),
|
||||
(272, KWD(b">>")),
|
||||
]
|
||||
|
||||
OBJS = [
|
||||
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
||||
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'),
|
||||
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '),
|
||||
(211, b'\xab\xcd\x00\x124\x05'), (230, LIT('a')), (232, LIT('b')),
|
||||
(234, [b'c']), (246, [1, b'z']), (258, {'foo': b'bar'}),
|
||||
(23, LIT("a")),
|
||||
(25, LIT("BCD")),
|
||||
(30, LIT("Some_Name")),
|
||||
(41, LIT("foo_xbaa")),
|
||||
(54, 0),
|
||||
(56, 1),
|
||||
(59, -2),
|
||||
(62, 0.5),
|
||||
(65, 1.234),
|
||||
(71, b"abc"),
|
||||
(77, b""),
|
||||
(80, b"abc ( def ) ghi"),
|
||||
(98, b"def \x00 4ghi"),
|
||||
(118, b"bach\\slask"),
|
||||
(132, b"foo\nbaa"),
|
||||
(143, b"this % is not a comment."),
|
||||
(170, b"foo\nbaa"),
|
||||
(180, b"foobaa"),
|
||||
(191, b""),
|
||||
(194, b" "),
|
||||
(199, b"@@ "),
|
||||
(211, b"\xab\xcd\x00\x124\x05"),
|
||||
(230, LIT("a")),
|
||||
(232, LIT("b")),
|
||||
(234, [b"c"]),
|
||||
(246, [1, b"z"]),
|
||||
(258, {"foo": b"bar"}),
|
||||
]
|
||||
|
||||
def get_tokens(self, s):
|
||||
|
|
|
@ -6,10 +6,10 @@ from pdfminer.pdfparser import PDFParser
|
|||
|
||||
class TestPdfPage(object):
|
||||
def test_page_labels(self):
|
||||
path = absolute_sample_path('contrib/pagelabels.pdf')
|
||||
expected_labels = ['iii', 'iv', '1', '2', '1']
|
||||
path = absolute_sample_path("contrib/pagelabels.pdf")
|
||||
expected_labels = ["iii", "iv", "1", "2", "1"]
|
||||
|
||||
with open(path, 'rb') as fp:
|
||||
with open(path, "rb") as fp:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser)
|
||||
for (i, page) in enumerate(PDFPage.create_pages(doc)):
|
||||
|
|
|
@ -11,48 +11,47 @@ def run(filename, options=None):
|
|||
absolute_path = absolute_sample_path(filename)
|
||||
with TemporaryFilePath() as output_file_name:
|
||||
if options:
|
||||
s = 'dumppdf -o %s %s %s' % (output_file_name,
|
||||
options, absolute_path)
|
||||
s = "dumppdf -o %s %s %s" % (output_file_name, options, absolute_path)
|
||||
else:
|
||||
s = 'dumppdf -o %s %s' % (output_file_name, absolute_path)
|
||||
s = "dumppdf -o %s %s" % (output_file_name, absolute_path)
|
||||
|
||||
dumppdf.main(s.split(' ')[1:])
|
||||
dumppdf.main(s.split(" ")[1:])
|
||||
|
||||
|
||||
class TestDumpPDF(unittest.TestCase):
|
||||
def test_simple1(self):
|
||||
run('simple1.pdf', '-t -a')
|
||||
run("simple1.pdf", "-t -a")
|
||||
|
||||
def test_simple2(self):
|
||||
run('simple2.pdf', '-t -a')
|
||||
run("simple2.pdf", "-t -a")
|
||||
|
||||
def test_jo(self):
|
||||
run('jo.pdf', '-t -a')
|
||||
run("jo.pdf", "-t -a")
|
||||
|
||||
def test_simple3(self):
|
||||
run('simple3.pdf', '-t -a')
|
||||
run("simple3.pdf", "-t -a")
|
||||
|
||||
def test_2(self):
|
||||
run('nonfree/dmca.pdf', '-t -a')
|
||||
run("nonfree/dmca.pdf", "-t -a")
|
||||
|
||||
def test_3(self):
|
||||
run('nonfree/f1040nr.pdf')
|
||||
run("nonfree/f1040nr.pdf")
|
||||
|
||||
def test_4(self):
|
||||
run('nonfree/i1040nr.pdf')
|
||||
run("nonfree/i1040nr.pdf")
|
||||
|
||||
def test_5(self):
|
||||
run('nonfree/kampo.pdf', '-t -a')
|
||||
run("nonfree/kampo.pdf", "-t -a")
|
||||
|
||||
def test_6(self):
|
||||
run('nonfree/naacl06-shinyama.pdf', '-t -a')
|
||||
run("nonfree/naacl06-shinyama.pdf", "-t -a")
|
||||
|
||||
def test_simple1_raw(self):
|
||||
"""Known issue: crash in dumpxml writing binary to text stream."""
|
||||
with pytest.raises(TypeError):
|
||||
run('simple1.pdf', '-r -a')
|
||||
run("simple1.pdf", "-r -a")
|
||||
|
||||
def test_simple1_binary(self):
|
||||
"""Known issue: crash in dumpxml writing binary to text stream."""
|
||||
with pytest.raises(TypeError):
|
||||
run('simple1.pdf', '-b -a')
|
||||
run("simple1.pdf", "-b -a")
|
||||
|
|
|
@ -12,115 +12,119 @@ def run(sample_path, options=None):
|
|||
absolute_path = absolute_sample_path(sample_path)
|
||||
with TemporaryFilePath() as output_file_name:
|
||||
if options:
|
||||
s = 'pdf2txt -o{} {} {}' \
|
||||
.format(output_file_name, options, absolute_path)
|
||||
s = "pdf2txt -o{} {} {}".format(output_file_name, options, absolute_path)
|
||||
else:
|
||||
s = 'pdf2txt -o{} {}'.format(output_file_name, absolute_path)
|
||||
s = "pdf2txt -o{} {}".format(output_file_name, absolute_path)
|
||||
|
||||
pdf2txt.main(s.split(' ')[1:])
|
||||
pdf2txt.main(s.split(" ")[1:])
|
||||
|
||||
|
||||
class TestPdf2Txt():
|
||||
class TestPdf2Txt:
|
||||
def test_jo(self):
|
||||
run('jo.pdf')
|
||||
run("jo.pdf")
|
||||
|
||||
def test_simple1(self):
|
||||
run('simple1.pdf')
|
||||
run("simple1.pdf")
|
||||
|
||||
def test_simple2(self):
|
||||
run('simple2.pdf')
|
||||
run("simple2.pdf")
|
||||
|
||||
def test_simple3(self):
|
||||
run('simple3.pdf')
|
||||
run("simple3.pdf")
|
||||
|
||||
def test_sample_one_byte_identity_encode(self):
|
||||
run('sampleOneByteIdentityEncode.pdf')
|
||||
run("sampleOneByteIdentityEncode.pdf")
|
||||
|
||||
def test_nonfree_175(self):
|
||||
"""Regression test for:
|
||||
https://github.com/pdfminer/pdfminer.six/issues/65
|
||||
"""
|
||||
run('nonfree/175.pdf')
|
||||
run("nonfree/175.pdf")
|
||||
|
||||
def test_nonfree_dmca(self):
|
||||
run('nonfree/dmca.pdf')
|
||||
run("nonfree/dmca.pdf")
|
||||
|
||||
def test_nonfree_f1040nr(self):
|
||||
run('nonfree/f1040nr.pdf', '-p 1')
|
||||
run("nonfree/f1040nr.pdf", "-p 1")
|
||||
|
||||
def test_nonfree_i1040nr(self):
|
||||
run('nonfree/i1040nr.pdf', '-p 1')
|
||||
run("nonfree/i1040nr.pdf", "-p 1")
|
||||
|
||||
def test_nonfree_kampo(self):
|
||||
run('nonfree/kampo.pdf')
|
||||
run("nonfree/kampo.pdf")
|
||||
|
||||
def test_nonfree_naacl06_shinyama(self):
|
||||
run('nonfree/naacl06-shinyama.pdf')
|
||||
run("nonfree/naacl06-shinyama.pdf")
|
||||
|
||||
def test_nlp2004slides(self):
|
||||
run('nonfree/nlp2004slides.pdf', '-p 1')
|
||||
run("nonfree/nlp2004slides.pdf", "-p 1")
|
||||
|
||||
def test_contrib_2b(self):
|
||||
run('contrib/2b.pdf', '-A -t xml')
|
||||
run("contrib/2b.pdf", "-A -t xml")
|
||||
|
||||
def test_contrib_issue_350(self):
|
||||
"""Regression test for
|
||||
https://github.com/pdfminer/pdfminer.six/issues/350"""
|
||||
run('contrib/issue-00352-asw-oct96-p41.pdf')
|
||||
run("contrib/issue-00352-asw-oct96-p41.pdf")
|
||||
|
||||
def test_scancode_patchelf(self):
|
||||
"""Regression test for https://github.com/euske/pdfminer/issues/96"""
|
||||
run('scancode/patchelf.pdf')
|
||||
run("scancode/patchelf.pdf")
|
||||
|
||||
def test_contrib_hash_two_complement(self):
|
||||
"""Check that unsigned integer is added correctly to encryption hash.et
|
||||
|
||||
See https://github.com/pdfminer/pdfminer.six/issues/186
|
||||
"""
|
||||
run('contrib/issue-00352-hash-twos-complement.pdf')
|
||||
run("contrib/issue-00352-hash-twos-complement.pdf")
|
||||
|
||||
def test_contrib_excel(self):
|
||||
"""Regression test for
|
||||
https://github.com/pdfminer/pdfminer.six/issues/369
|
||||
"""
|
||||
run('contrib/issue-00369-excel.pdf', '-t html')
|
||||
run("contrib/issue-00369-excel.pdf", "-t html")
|
||||
|
||||
def test_encryption_aes128(self):
|
||||
run('encryption/aes-128.pdf', '-P foo')
|
||||
run("encryption/aes-128.pdf", "-P foo")
|
||||
|
||||
def test_encryption_aes128m(self):
|
||||
run('encryption/aes-128-m.pdf', '-P foo')
|
||||
run("encryption/aes-128-m.pdf", "-P foo")
|
||||
|
||||
def test_encryption_aes256(self):
|
||||
run('encryption/aes-256.pdf', '-P foo')
|
||||
run("encryption/aes-256.pdf", "-P foo")
|
||||
|
||||
def test_encryption_aes256m(self):
|
||||
run('encryption/aes-256-m.pdf', '-P foo')
|
||||
run("encryption/aes-256-m.pdf", "-P foo")
|
||||
|
||||
def test_encryption_aes256_r6_user(self):
|
||||
run('encryption/aes-256-r6.pdf', '-P usersecret')
|
||||
run("encryption/aes-256-r6.pdf", "-P usersecret")
|
||||
|
||||
def test_encryption_aes256_r6_owner(self):
|
||||
run('encryption/aes-256-r6.pdf', '-P ownersecret')
|
||||
run("encryption/aes-256-r6.pdf", "-P ownersecret")
|
||||
|
||||
def test_encryption_base(self):
|
||||
run('encryption/base.pdf', '-P foo')
|
||||
run("encryption/base.pdf", "-P foo")
|
||||
|
||||
def test_encryption_rc4_40(self):
|
||||
run('encryption/rc4-40.pdf', '-P foo')
|
||||
run("encryption/rc4-40.pdf", "-P foo")
|
||||
|
||||
def test_encryption_rc4_128(self):
|
||||
run('encryption/rc4-128.pdf', '-P foo')
|
||||
run("encryption/rc4-128.pdf", "-P foo")
|
||||
|
||||
|
||||
class TestDumpImages:
|
||||
|
||||
@staticmethod
|
||||
def extract_images(input_file, *args):
|
||||
output_dir = mkdtemp()
|
||||
with TemporaryFilePath() as output_file_name:
|
||||
commands = ['-o', output_file_name, '--output-dir',
|
||||
output_dir, input_file, *args]
|
||||
commands = [
|
||||
"-o",
|
||||
output_file_name,
|
||||
"--output-dir",
|
||||
output_dir,
|
||||
input_file,
|
||||
*args,
|
||||
]
|
||||
pdf2txt.main(commands)
|
||||
image_files = os.listdir(output_dir)
|
||||
rmtree(output_dir)
|
||||
|
@ -132,39 +136,38 @@ class TestDumpImages:
|
|||
Regression test for:
|
||||
https://github.com/pdfminer/pdfminer.six/issues/131
|
||||
"""
|
||||
filepath = absolute_sample_path('../samples/nonfree/dmca.pdf')
|
||||
image_files = self.extract_images(filepath, '-p', '1')
|
||||
assert image_files[0].endswith('bmp')
|
||||
filepath = absolute_sample_path("../samples/nonfree/dmca.pdf")
|
||||
image_files = self.extract_images(filepath, "-p", "1")
|
||||
assert image_files[0].endswith("bmp")
|
||||
|
||||
def test_nonfree_175(self):
|
||||
"""Extract images of pdf containing jpg images"""
|
||||
self.extract_images(absolute_sample_path('../samples/nonfree/175.pdf'))
|
||||
self.extract_images(absolute_sample_path("../samples/nonfree/175.pdf"))
|
||||
|
||||
def test_jbig2_image_export(self):
|
||||
"""Extract images of pdf containing jbig2 images
|
||||
|
||||
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
|
||||
"""
|
||||
input_file = absolute_sample_path(
|
||||
'../samples/contrib/pdf-with-jbig2.pdf')
|
||||
input_file = absolute_sample_path("../samples/contrib/pdf-with-jbig2.pdf")
|
||||
output_dir = mkdtemp()
|
||||
with TemporaryFilePath() as output_file_name:
|
||||
commands = ['-o', output_file_name, '--output-dir',
|
||||
output_dir, input_file]
|
||||
commands = ["-o", output_file_name, "--output-dir", output_dir, input_file]
|
||||
pdf2txt.main(commands)
|
||||
image_files = os.listdir(output_dir)
|
||||
try:
|
||||
assert image_files[0].endswith('.jb2')
|
||||
assert filecmp.cmp(output_dir + '/' + image_files[0],
|
||||
absolute_sample_path(
|
||||
'../samples/contrib/XIPLAYER0.jb2'))
|
||||
assert image_files[0].endswith(".jb2")
|
||||
assert filecmp.cmp(
|
||||
output_dir + "/" + image_files[0],
|
||||
absolute_sample_path("../samples/contrib/XIPLAYER0.jb2"),
|
||||
)
|
||||
finally:
|
||||
rmtree(output_dir)
|
||||
|
||||
def test_contrib_matplotlib(self):
|
||||
"""Test a pdf with Type3 font"""
|
||||
run('contrib/matplotlib.pdf')
|
||||
run("contrib/matplotlib.pdf")
|
||||
|
||||
def test_nonfree_cmp_itext_logo(self):
|
||||
"""Test a pdf with Type3 font"""
|
||||
run('nonfree/cmp_itext_logo.pdf')
|
||||
run("nonfree/cmp_itext_logo.pdf")
|
||||
|
|
|
@ -4,8 +4,13 @@ import pytest
|
|||
|
||||
from helpers import absolute_sample_path
|
||||
from pdfminer.layout import LTComponent
|
||||
from pdfminer.utils import open_filename, Plane, shorten_str, \
|
||||
format_int_roman, format_int_alpha
|
||||
from pdfminer.utils import (
|
||||
open_filename,
|
||||
Plane,
|
||||
shorten_str,
|
||||
format_int_roman,
|
||||
format_int_alpha,
|
||||
)
|
||||
|
||||
|
||||
class TestOpenFilename:
|
||||
|
@ -48,14 +53,12 @@ class TestPlane:
|
|||
assert result == [obj]
|
||||
|
||||
def test_find_if_object_is_smaller_than_gridsize(self):
|
||||
plane, obj = self.given_plane_with_one_object(object_size=1,
|
||||
gridsize=100)
|
||||
plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100)
|
||||
result = list(plane.find((0, 0, 100, 100)))
|
||||
assert result == [obj]
|
||||
|
||||
def test_find_object_if_much_larger_than_gridsize(self):
|
||||
plane, obj = self.given_plane_with_one_object(object_size=100,
|
||||
gridsize=10)
|
||||
plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10)
|
||||
result = list(plane.find((0, 0, 100, 100)))
|
||||
assert result == [obj]
|
||||
|
||||
|
@ -70,43 +73,43 @@ class TestPlane:
|
|||
|
||||
class TestFunctions(object):
|
||||
def test_shorten_str(self):
|
||||
s = shorten_str('Hello there World', 15)
|
||||
assert s == 'Hello ... World'
|
||||
s = shorten_str("Hello there World", 15)
|
||||
assert s == "Hello ... World"
|
||||
|
||||
def test_shorten_short_str_is_same(self):
|
||||
s = 'Hello World'
|
||||
s = "Hello World"
|
||||
assert shorten_str(s, 50) == s
|
||||
|
||||
def test_shorten_to_really_short(self):
|
||||
assert shorten_str('Hello World', 5) == 'Hello'
|
||||
assert shorten_str("Hello World", 5) == "Hello"
|
||||
|
||||
def test_format_int_alpha(self):
|
||||
assert format_int_alpha(1) == 'a'
|
||||
assert format_int_alpha(2) == 'b'
|
||||
assert format_int_alpha(26) == 'z'
|
||||
assert format_int_alpha(27) == 'aa'
|
||||
assert format_int_alpha(28) == 'ab'
|
||||
assert format_int_alpha(26 * 2) == 'az'
|
||||
assert format_int_alpha(26 * 2 + 1) == 'ba'
|
||||
assert format_int_alpha(26 * 27) == 'zz'
|
||||
assert format_int_alpha(26 * 27 + 1) == 'aaa'
|
||||
assert format_int_alpha(1) == "a"
|
||||
assert format_int_alpha(2) == "b"
|
||||
assert format_int_alpha(26) == "z"
|
||||
assert format_int_alpha(27) == "aa"
|
||||
assert format_int_alpha(28) == "ab"
|
||||
assert format_int_alpha(26 * 2) == "az"
|
||||
assert format_int_alpha(26 * 2 + 1) == "ba"
|
||||
assert format_int_alpha(26 * 27) == "zz"
|
||||
assert format_int_alpha(26 * 27 + 1) == "aaa"
|
||||
|
||||
def test_format_int_roman(self):
|
||||
assert format_int_roman(1) == 'i'
|
||||
assert format_int_roman(2) == 'ii'
|
||||
assert format_int_roman(3) == 'iii'
|
||||
assert format_int_roman(4) == 'iv'
|
||||
assert format_int_roman(5) == 'v'
|
||||
assert format_int_roman(6) == 'vi'
|
||||
assert format_int_roman(7) == 'vii'
|
||||
assert format_int_roman(8) == 'viii'
|
||||
assert format_int_roman(9) == 'ix'
|
||||
assert format_int_roman(10) == 'x'
|
||||
assert format_int_roman(11) == 'xi'
|
||||
assert format_int_roman(20) == 'xx'
|
||||
assert format_int_roman(40) == 'xl'
|
||||
assert format_int_roman(45) == 'xlv'
|
||||
assert format_int_roman(50) == 'l'
|
||||
assert format_int_roman(90) == 'xc'
|
||||
assert format_int_roman(91) == 'xci'
|
||||
assert format_int_roman(100) == 'c'
|
||||
assert format_int_roman(1) == "i"
|
||||
assert format_int_roman(2) == "ii"
|
||||
assert format_int_roman(3) == "iii"
|
||||
assert format_int_roman(4) == "iv"
|
||||
assert format_int_roman(5) == "v"
|
||||
assert format_int_roman(6) == "vi"
|
||||
assert format_int_roman(7) == "vii"
|
||||
assert format_int_roman(8) == "viii"
|
||||
assert format_int_roman(9) == "ix"
|
||||
assert format_int_roman(10) == "x"
|
||||
assert format_int_roman(11) == "xi"
|
||||
assert format_int_roman(20) == "xx"
|
||||
assert format_int_roman(40) == "xl"
|
||||
assert format_int_roman(45) == "xlv"
|
||||
assert format_int_roman(50) == "l"
|
||||
assert format_int_roman(90) == "xc"
|
||||
assert format_int_roman(91) == "xci"
|
||||
assert format_int_roman(100) == "c"
|
||||
|
|
|
@ -7,39 +7,38 @@ import fileinput
|
|||
def main(argv):
|
||||
fonts = {}
|
||||
for line in fileinput.input():
|
||||
f = line.strip().split(' ')
|
||||
f = line.strip().split(" ")
|
||||
if not f:
|
||||
continue
|
||||
k = f[0]
|
||||
if k == 'FontName':
|
||||
if k == "FontName":
|
||||
fontname = f[1]
|
||||
props = {'FontName': fontname, 'Flags': 0}
|
||||
props = {"FontName": fontname, "Flags": 0}
|
||||
chars = {}
|
||||
fonts[fontname] = (props, chars)
|
||||
elif k == 'C':
|
||||
elif k == "C":
|
||||
cid = int(f[1])
|
||||
if 0 <= cid and cid <= 255:
|
||||
width = int(f[4])
|
||||
chars[cid] = width
|
||||
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
|
||||
'Ascender', 'Descender'):
|
||||
k = {'Ascender': 'Ascent', 'Descender': 'Descent'}.get(k, k)
|
||||
elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"):
|
||||
k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k)
|
||||
props[k] = float(f[1])
|
||||
elif k in ('FontName', 'FamilyName', 'Weight'):
|
||||
k = {'FamilyName': 'FontFamily', 'Weight': 'FontWeight'}.get(k, k)
|
||||
elif k in ("FontName", "FamilyName", "Weight"):
|
||||
k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k)
|
||||
props[k] = f[1]
|
||||
elif k == 'IsFixedPitch':
|
||||
if f[1].lower() == 'true':
|
||||
props['Flags'] = 64
|
||||
elif k == 'FontBBox':
|
||||
elif k == "IsFixedPitch":
|
||||
if f[1].lower() == "true":
|
||||
props["Flags"] = 64
|
||||
elif k == "FontBBox":
|
||||
props[k] = tuple(map(float, f[1:5]))
|
||||
print('# -*- python -*-')
|
||||
print('FONT_METRICS = {')
|
||||
print("# -*- python -*-")
|
||||
print("FONT_METRICS = {")
|
||||
for (fontname, (props, chars)) in fonts.items():
|
||||
print(' {!r}: {!r},'.format(fontname, (props, chars)))
|
||||
print('}')
|
||||
print(" {!r}: {!r},".format(fontname, (props, chars)))
|
||||
print("}")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
|
||||
|
|
|
@ -6,7 +6,6 @@ import codecs
|
|||
|
||||
|
||||
class CMapConverter:
|
||||
|
||||
def __init__(self, enc2codec={}):
|
||||
self.enc2codec = enc2codec
|
||||
self.code2cid = {} # {'cmapname': ...}
|
||||
|
@ -19,12 +18,12 @@ class CMapConverter:
|
|||
return self.code2cid.keys()
|
||||
|
||||
def get_maps(self, enc):
|
||||
if enc.endswith('-H'):
|
||||
if enc.endswith("-H"):
|
||||
(hmapenc, vmapenc) = (enc, None)
|
||||
elif enc == 'H':
|
||||
(hmapenc, vmapenc) = ('H', 'V')
|
||||
elif enc == "H":
|
||||
(hmapenc, vmapenc) = ("H", "V")
|
||||
else:
|
||||
(hmapenc, vmapenc) = (enc+'-H', enc+'-V')
|
||||
(hmapenc, vmapenc) = (enc + "-H", enc + "-V")
|
||||
if hmapenc in self.code2cid:
|
||||
hmap = self.code2cid[hmapenc]
|
||||
else:
|
||||
|
@ -43,12 +42,12 @@ class CMapConverter:
|
|||
def load(self, fp):
|
||||
encs = None
|
||||
for line in fp:
|
||||
(line, _, _) = line.strip().partition('#')
|
||||
(line, _, _) = line.strip().partition("#")
|
||||
if not line:
|
||||
continue
|
||||
values = line.split('\t')
|
||||
values = line.split("\t")
|
||||
if encs is None:
|
||||
assert values[0] == 'CID', str(values)
|
||||
assert values[0] == "CID", str(values)
|
||||
encs = values
|
||||
continue
|
||||
|
||||
|
@ -68,7 +67,7 @@ class CMapConverter:
|
|||
def add(unimap, enc, code):
|
||||
try:
|
||||
codec = self.enc2codec[enc]
|
||||
c = code.decode(codec, 'strict')
|
||||
c = code.decode(codec, "strict")
|
||||
if len(c) == 1:
|
||||
if c not in unimap:
|
||||
unimap[c] = 0
|
||||
|
@ -89,20 +88,20 @@ class CMapConverter:
|
|||
unimap_h = {}
|
||||
unimap_v = {}
|
||||
for (enc, value) in zip(encs, values):
|
||||
if enc == 'CID':
|
||||
if enc == "CID":
|
||||
continue
|
||||
if value == '*':
|
||||
if value == "*":
|
||||
continue
|
||||
|
||||
# hcodes, vcodes: encoded bytes for each writing mode.
|
||||
hcodes = []
|
||||
vcodes = []
|
||||
for code in value.split(','):
|
||||
vertical = code.endswith('v')
|
||||
for code in value.split(","):
|
||||
vertical = code.endswith("v")
|
||||
if vertical:
|
||||
code = code[:-1]
|
||||
try:
|
||||
code = codecs.decode(code, 'hex_codec')
|
||||
code = codecs.decode(code, "hex_codec")
|
||||
except Exception:
|
||||
code = chr(int(code, 16))
|
||||
if vertical:
|
||||
|
@ -155,17 +154,19 @@ def main(argv):
|
|||
import os.path
|
||||
|
||||
def usage():
|
||||
print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]'
|
||||
% argv[0])
|
||||
print(
|
||||
"usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]" % argv[0]
|
||||
)
|
||||
return 100
|
||||
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'c:')
|
||||
(opts, args) = getopt.getopt(argv[1:], "c:")
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
enc2codec = {}
|
||||
for (k, v) in opts:
|
||||
if k == '-c':
|
||||
(enc, _, codec) = v.partition('=')
|
||||
if k == "-c":
|
||||
(enc, _, codec) = v.partition("=")
|
||||
enc2codec[enc] = codec
|
||||
if not args:
|
||||
return usage()
|
||||
|
@ -176,27 +177,27 @@ def main(argv):
|
|||
|
||||
converter = CMapConverter(enc2codec)
|
||||
for path in args:
|
||||
print('reading: %r...' % path)
|
||||
print("reading: %r..." % path)
|
||||
fp = open(path)
|
||||
converter.load(fp)
|
||||
fp.close()
|
||||
|
||||
for enc in converter.get_encs():
|
||||
fname = '%s.pickle.gz' % enc
|
||||
fname = "%s.pickle.gz" % enc
|
||||
path = os.path.join(outdir, fname)
|
||||
print('writing: %r...' % path)
|
||||
fp = gzip.open(path, 'wb')
|
||||
print("writing: %r..." % path)
|
||||
fp = gzip.open(path, "wb")
|
||||
converter.dump_cmap(fp, enc)
|
||||
fp.close()
|
||||
|
||||
fname = 'to-unicode-%s.pickle.gz' % regname
|
||||
fname = "to-unicode-%s.pickle.gz" % regname
|
||||
path = os.path.join(outdir, fname)
|
||||
print('writing: %r...' % path)
|
||||
fp = gzip.open(path, 'wb')
|
||||
print("writing: %r..." % path)
|
||||
fp = gzip.open(path, "wb")
|
||||
converter.dump_unicodemap(fp)
|
||||
fp.close()
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
|
||||
|
|
|
@ -8,20 +8,19 @@ def main(argv):
|
|||
state = 0
|
||||
for line in fileinput.input():
|
||||
line = line.strip()
|
||||
if not line or line.startswith('#'):
|
||||
if not line or line.startswith("#"):
|
||||
if state == 1:
|
||||
state = 2
|
||||
print('}\n')
|
||||
print("}\n")
|
||||
print(line)
|
||||
continue
|
||||
if state == 0:
|
||||
print('\nglyphname2unicode = {')
|
||||
print("\nglyphname2unicode = {")
|
||||
state = 1
|
||||
(name, x) = line.split(';')
|
||||
codes = x.split(' ')
|
||||
print(' {!r}: u\'{}\','
|
||||
.format(name, ''.join('\\u%s' % code for code in codes)))
|
||||
(name, x) = line.split(";")
|
||||
codes = x.split(" ")
|
||||
print(" {!r}: u'{}',".format(name, "".join("\\u%s" % code for code in codes)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
|
||||
|
|
320
tools/dumppdf.py
320
tools/dumppdf.py
|
@ -4,8 +4,7 @@ import logging
|
|||
import os.path
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \
|
||||
Union, cast
|
||||
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, Union, cast
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import pdfminer
|
||||
|
@ -25,33 +24,33 @@ ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
|||
|
||||
def escape(s: Union[str, bytes]) -> str:
|
||||
if isinstance(s, bytes):
|
||||
us = str(s, 'latin-1')
|
||||
us = str(s, "latin-1")
|
||||
else:
|
||||
us = s
|
||||
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), us)
|
||||
return ESC_PAT.sub(lambda m: "&#%d;" % ord(m.group(0)), us)
|
||||
|
||||
|
||||
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
|
||||
if obj is None:
|
||||
out.write('<null />')
|
||||
out.write("<null />")
|
||||
return
|
||||
|
||||
if isinstance(obj, dict):
|
||||
out.write('<dict size="%d">\n' % len(obj))
|
||||
for (k, v) in obj.items():
|
||||
out.write('<key>%s</key>\n' % k)
|
||||
out.write('<value>')
|
||||
out.write("<key>%s</key>\n" % k)
|
||||
out.write("<value>")
|
||||
dumpxml(out, v)
|
||||
out.write('</value>\n')
|
||||
out.write('</dict>')
|
||||
out.write("</value>\n")
|
||||
out.write("</dict>")
|
||||
return
|
||||
|
||||
if isinstance(obj, list):
|
||||
out.write('<list size="%d">\n' % len(obj))
|
||||
for v in obj:
|
||||
dumpxml(out, v)
|
||||
out.write('\n')
|
||||
out.write('</list>')
|
||||
out.write("\n")
|
||||
out.write("</list>")
|
||||
return
|
||||
|
||||
if isinstance(obj, (str, bytes)):
|
||||
|
@ -59,21 +58,20 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
|
|||
return
|
||||
|
||||
if isinstance(obj, PDFStream):
|
||||
if codec == 'raw':
|
||||
if codec == "raw":
|
||||
# Bug: writing bytes to text I/O. This will raise TypeError.
|
||||
out.write(obj.get_rawdata()) # type: ignore [arg-type]
|
||||
elif codec == 'binary':
|
||||
elif codec == "binary":
|
||||
# Bug: writing bytes to text I/O. This will raise TypeError.
|
||||
out.write(obj.get_data()) # type: ignore [arg-type]
|
||||
else:
|
||||
out.write('<stream>\n<props>\n')
|
||||
out.write("<stream>\n<props>\n")
|
||||
dumpxml(out, obj.attrs)
|
||||
out.write('\n</props>\n')
|
||||
if codec == 'text':
|
||||
out.write("\n</props>\n")
|
||||
if codec == "text":
|
||||
data = obj.get_data()
|
||||
out.write('<data size="%d">%s</data>\n'
|
||||
% (len(data), escape(data)))
|
||||
out.write('</stream>')
|
||||
out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
|
||||
out.write("</stream>")
|
||||
return
|
||||
|
||||
if isinstance(obj, PDFObjRef):
|
||||
|
@ -82,38 +80,36 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
|
|||
|
||||
if isinstance(obj, PSKeyword):
|
||||
# Likely bug: obj.name is bytes, not str
|
||||
out.write('<keyword>%s</keyword>'
|
||||
% obj.name) # type: ignore [str-bytes-safe]
|
||||
out.write("<keyword>%s</keyword>" % obj.name) # type: ignore [str-bytes-safe]
|
||||
return
|
||||
|
||||
if isinstance(obj, PSLiteral):
|
||||
# Likely bug: obj.name may be bytes, not str
|
||||
out.write('<literal>%s</literal>'
|
||||
% obj.name) # type: ignore [str-bytes-safe]
|
||||
out.write("<literal>%s</literal>" % obj.name) # type: ignore [str-bytes-safe]
|
||||
return
|
||||
|
||||
if isnumber(obj):
|
||||
out.write('<number>%s</number>' % obj)
|
||||
out.write("<number>%s</number>" % obj)
|
||||
return
|
||||
|
||||
raise TypeError(obj)
|
||||
|
||||
|
||||
def dumptrailers(
|
||||
out: TextIO,
|
||||
doc: PDFDocument,
|
||||
show_fallback_xref: bool = False
|
||||
out: TextIO, doc: PDFDocument, show_fallback_xref: bool = False
|
||||
) -> None:
|
||||
for xref in doc.xrefs:
|
||||
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
|
||||
out.write('<trailer>\n')
|
||||
out.write("<trailer>\n")
|
||||
dumpxml(out, xref.get_trailer())
|
||||
out.write('\n</trailer>\n\n')
|
||||
out.write("\n</trailer>\n\n")
|
||||
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
|
||||
if no_xrefs and not show_fallback_xref:
|
||||
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
|
||||
'you want to display the content of a fallback xref that ' \
|
||||
'contains all objects.'
|
||||
msg = (
|
||||
"This PDF does not have an xref. Use --show-fallback-xref if "
|
||||
"you want to display the content of a fallback xref that "
|
||||
"contains all objects."
|
||||
)
|
||||
logger.warning(msg)
|
||||
return
|
||||
|
||||
|
@ -122,10 +118,10 @@ def dumpallobjs(
|
|||
out: TextIO,
|
||||
doc: PDFDocument,
|
||||
codec: Optional[str] = None,
|
||||
show_fallback_xref: bool = False
|
||||
show_fallback_xref: bool = False,
|
||||
) -> None:
|
||||
visited = set()
|
||||
out.write('<pdf>')
|
||||
out.write("<pdf>")
|
||||
for xref in doc.xrefs:
|
||||
for objid in xref.get_objids():
|
||||
if objid in visited:
|
||||
|
@ -137,11 +133,11 @@ def dumpallobjs(
|
|||
continue
|
||||
out.write('<object id="%d">\n' % objid)
|
||||
dumpxml(out, obj, codec=codec)
|
||||
out.write('\n</object>\n\n')
|
||||
out.write("\n</object>\n\n")
|
||||
except PDFObjectNotFound as e:
|
||||
print('not found: %r' % e)
|
||||
print("not found: %r" % e)
|
||||
dumptrailers(out, doc, show_fallback_xref)
|
||||
out.write('</pdf>')
|
||||
out.write("</pdf>")
|
||||
return
|
||||
|
||||
|
||||
|
@ -150,16 +146,18 @@ def dumpoutline(
|
|||
fname: str,
|
||||
objids: Any,
|
||||
pagenos: Container[int],
|
||||
password: str = '',
|
||||
password: str = "",
|
||||
dumpall: bool = False,
|
||||
codec: Optional[str] = None,
|
||||
extractdir: Optional[str] = None
|
||||
extractdir: Optional[str] = None,
|
||||
) -> None:
|
||||
fp = open(fname, 'rb')
|
||||
fp = open(fname, "rb")
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser, password)
|
||||
pages = {page.pageid: pageno for (pageno, page)
|
||||
in enumerate(PDFPage.create_pages(doc), 1)}
|
||||
pages = {
|
||||
page.pageid: pageno
|
||||
for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
|
||||
}
|
||||
|
||||
def resolve_dest(dest: object) -> Any:
|
||||
if isinstance(dest, (str, bytes)):
|
||||
|
@ -167,14 +165,14 @@ def dumpoutline(
|
|||
elif isinstance(dest, PSLiteral):
|
||||
dest = resolve1(doc.get_dest(dest.name))
|
||||
if isinstance(dest, dict):
|
||||
dest = dest['D']
|
||||
dest = dest["D"]
|
||||
if isinstance(dest, PDFObjRef):
|
||||
dest = dest.resolve()
|
||||
return dest
|
||||
|
||||
try:
|
||||
outlines = doc.get_outlines()
|
||||
outfp.write('<outlines>\n')
|
||||
outfp.write("<outlines>\n")
|
||||
for (level, title, dest, a, se) in outlines:
|
||||
pageno = None
|
||||
if dest:
|
||||
|
@ -183,21 +181,20 @@ def dumpoutline(
|
|||
elif a:
|
||||
action = a
|
||||
if isinstance(action, dict):
|
||||
subtype = action.get('S')
|
||||
if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
|
||||
'D'):
|
||||
dest = resolve_dest(action['D'])
|
||||
subtype = action.get("S")
|
||||
if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
|
||||
dest = resolve_dest(action["D"])
|
||||
pageno = pages[dest[0].objid]
|
||||
s = escape(title)
|
||||
outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
|
||||
if dest is not None:
|
||||
outfp.write('<dest>')
|
||||
outfp.write("<dest>")
|
||||
dumpxml(outfp, dest)
|
||||
outfp.write('</dest>\n')
|
||||
outfp.write("</dest>\n")
|
||||
if pageno is not None:
|
||||
outfp.write('<pageno>%r</pageno>\n' % pageno)
|
||||
outfp.write('</outline>\n')
|
||||
outfp.write('</outlines>\n')
|
||||
outfp.write("<pageno>%r</pageno>\n" % pageno)
|
||||
outfp.write("</outline>\n")
|
||||
outfp.write("</outlines>\n")
|
||||
except PDFNoOutlines:
|
||||
pass
|
||||
parser.close()
|
||||
|
@ -205,43 +202,48 @@ def dumpoutline(
|
|||
return
|
||||
|
||||
|
||||
LITERAL_FILESPEC = LIT('Filespec')
|
||||
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
|
||||
LITERAL_FILESPEC = LIT("Filespec")
|
||||
LITERAL_EMBEDDEDFILE = LIT("EmbeddedFile")
|
||||
|
||||
|
||||
def extractembedded(fname: str, password: str, extractdir: str) -> None:
|
||||
def extract1(objid: int, obj: Dict[str, Any]) -> None:
|
||||
filename = os.path.basename(obj.get('UF') or
|
||||
cast(bytes, obj.get('F')).decode())
|
||||
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
|
||||
filename = os.path.basename(obj.get("UF") or cast(bytes, obj.get("F")).decode())
|
||||
fileref = obj["EF"].get("UF") or obj["EF"].get("F")
|
||||
fileobj = doc.getobj(fileref.objid)
|
||||
if not isinstance(fileobj, PDFStream):
|
||||
error_msg = 'unable to process PDF: reference for %r is not a ' \
|
||||
'PDFStream' % filename
|
||||
error_msg = (
|
||||
"unable to process PDF: reference for %r is not a "
|
||||
"PDFStream" % filename
|
||||
)
|
||||
raise PDFValueError(error_msg)
|
||||
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
|
||||
if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
|
||||
raise PDFValueError(
|
||||
'unable to process PDF: reference for %r '
|
||||
'is not an EmbeddedFile' % (filename))
|
||||
path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
|
||||
"unable to process PDF: reference for %r "
|
||||
"is not an EmbeddedFile" % (filename)
|
||||
)
|
||||
path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
|
||||
if os.path.exists(path):
|
||||
raise IOError('file exists: %r' % path)
|
||||
print('extracting: %r' % path)
|
||||
raise IOError("file exists: %r" % path)
|
||||
print("extracting: %r" % path)
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
out = open(path, 'wb')
|
||||
out = open(path, "wb")
|
||||
out.write(fileobj.get_data())
|
||||
out.close()
|
||||
return
|
||||
|
||||
with open(fname, 'rb') as fp:
|
||||
with open(fname, "rb") as fp:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser, password)
|
||||
extracted_objids = set()
|
||||
for xref in doc.xrefs:
|
||||
for objid in xref.get_objids():
|
||||
obj = doc.getobj(objid)
|
||||
if objid not in extracted_objids and isinstance(obj, dict) \
|
||||
and obj.get('Type') is LITERAL_FILESPEC:
|
||||
if (
|
||||
objid not in extracted_objids
|
||||
and isinstance(obj, dict)
|
||||
and obj.get("Type") is LITERAL_FILESPEC
|
||||
):
|
||||
extracted_objids.add(objid)
|
||||
extract1(objid, obj)
|
||||
return
|
||||
|
@ -252,13 +254,13 @@ def dumppdf(
|
|||
fname: str,
|
||||
objids: Iterable[int],
|
||||
pagenos: Container[int],
|
||||
password: str = '',
|
||||
password: str = "",
|
||||
dumpall: bool = False,
|
||||
codec: Optional[str] = None,
|
||||
extractdir: Optional[str] = None,
|
||||
show_fallback_xref: bool = False
|
||||
show_fallback_xref: bool = False,
|
||||
) -> None:
|
||||
fp = open(fname, 'rb')
|
||||
fp = open(fname, "rb")
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser, password)
|
||||
if objids:
|
||||
|
@ -279,71 +281,125 @@ def dumppdf(
|
|||
if (not objids) and (not pagenos) and (not dumpall):
|
||||
dumptrailers(outfp, doc, show_fallback_xref)
|
||||
fp.close()
|
||||
if codec not in ('raw', 'binary'):
|
||||
outfp.write('\n')
|
||||
if codec not in ("raw", "binary"):
|
||||
outfp.write("\n")
|
||||
return
|
||||
|
||||
|
||||
def create_parser() -> ArgumentParser:
|
||||
parser = ArgumentParser(description=__doc__, add_help=True)
|
||||
parser.add_argument('files', type=str, default=None, nargs='+',
|
||||
help='One or more paths to PDF files.')
|
||||
parser.add_argument(
|
||||
"files",
|
||||
type=str,
|
||||
default=None,
|
||||
nargs="+",
|
||||
help="One or more paths to PDF files.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--version", "-v", action="version",
|
||||
version="pdfminer.six v{}".format(pdfminer.__version__))
|
||||
"--version",
|
||||
"-v",
|
||||
action="version",
|
||||
version="pdfminer.six v{}".format(pdfminer.__version__),
|
||||
)
|
||||
parser.add_argument(
|
||||
'--debug', '-d', default=False, action='store_true',
|
||||
help='Use debug logging level.')
|
||||
"--debug",
|
||||
"-d",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Use debug logging level.",
|
||||
)
|
||||
procedure_parser = parser.add_mutually_exclusive_group()
|
||||
procedure_parser.add_argument(
|
||||
'--extract-toc', '-T', default=False, action='store_true',
|
||||
help='Extract structure of outline')
|
||||
"--extract-toc",
|
||||
"-T",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Extract structure of outline",
|
||||
)
|
||||
procedure_parser.add_argument(
|
||||
'--extract-embedded', '-E', type=str,
|
||||
help='Extract embedded files')
|
||||
"--extract-embedded", "-E", type=str, help="Extract embedded files"
|
||||
)
|
||||
|
||||
parse_params = parser.add_argument_group(
|
||||
'Parser', description='Used during PDF parsing')
|
||||
"Parser", description="Used during PDF parsing"
|
||||
)
|
||||
parse_params.add_argument(
|
||||
'--page-numbers', type=int, default=None, nargs='+',
|
||||
help='A space-seperated list of page numbers to parse.')
|
||||
"--page-numbers",
|
||||
type=int,
|
||||
default=None,
|
||||
nargs="+",
|
||||
help="A space-seperated list of page numbers to parse.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
'--pagenos', '-p', type=str,
|
||||
help='A comma-separated list of page numbers to parse. Included for '
|
||||
'legacy applications, use --page-numbers for more idiomatic '
|
||||
'argument entry.')
|
||||
"--pagenos",
|
||||
"-p",
|
||||
type=str,
|
||||
help="A comma-separated list of page numbers to parse. Included for "
|
||||
"legacy applications, use --page-numbers for more idiomatic "
|
||||
"argument entry.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
'--objects', '-i', type=str,
|
||||
help='Comma separated list of object numbers to extract')
|
||||
"--objects",
|
||||
"-i",
|
||||
type=str,
|
||||
help="Comma separated list of object numbers to extract",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
'--all', '-a', default=False, action='store_true',
|
||||
help='If the structure of all objects should be extracted')
|
||||
"--all",
|
||||
"-a",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If the structure of all objects should be extracted",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
'--show-fallback-xref', action='store_true',
|
||||
help='Additionally show the fallback xref. Use this if the PDF '
|
||||
'has zero or only invalid xref\'s. This setting is ignored if '
|
||||
'--extract-toc or --extract-embedded is used.')
|
||||
"--show-fallback-xref",
|
||||
action="store_true",
|
||||
help="Additionally show the fallback xref. Use this if the PDF "
|
||||
"has zero or only invalid xref's. This setting is ignored if "
|
||||
"--extract-toc or --extract-embedded is used.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
'--password', '-P', type=str, default='',
|
||||
help='The password to use for decrypting PDF file.')
|
||||
"--password",
|
||||
"-P",
|
||||
type=str,
|
||||
default="",
|
||||
help="The password to use for decrypting PDF file.",
|
||||
)
|
||||
|
||||
output_params = parser.add_argument_group(
|
||||
'Output', description='Used during output generation.')
|
||||
"Output", description="Used during output generation."
|
||||
)
|
||||
output_params.add_argument(
|
||||
'--outfile', '-o', type=str, default='-',
|
||||
"--outfile",
|
||||
"-o",
|
||||
type=str,
|
||||
default="-",
|
||||
help='Path to file where output is written. Or "-" (default) to '
|
||||
'write to stdout.')
|
||||
"write to stdout.",
|
||||
)
|
||||
codec_parser = output_params.add_mutually_exclusive_group()
|
||||
codec_parser.add_argument(
|
||||
'--raw-stream', '-r', default=False, action='store_true',
|
||||
help='Write stream objects without encoding')
|
||||
"--raw-stream",
|
||||
"-r",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Write stream objects without encoding",
|
||||
)
|
||||
codec_parser.add_argument(
|
||||
'--binary-stream', '-b', default=False, action='store_true',
|
||||
help='Write stream objects with binary encoding')
|
||||
"--binary-stream",
|
||||
"-b",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Write stream objects with binary encoding",
|
||||
)
|
||||
codec_parser.add_argument(
|
||||
'--text-stream', '-t', default=False, action='store_true',
|
||||
help='Write stream objects as plain text')
|
||||
"--text-stream",
|
||||
"-t",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Write stream objects as plain text",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
@ -355,53 +411,63 @@ def main(argv: Optional[List[str]] = None) -> None:
|
|||
if args.debug:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
if args.outfile == '-':
|
||||
if args.outfile == "-":
|
||||
outfp = sys.stdout
|
||||
else:
|
||||
outfp = open(args.outfile, 'w')
|
||||
outfp = open(args.outfile, "w")
|
||||
|
||||
if args.objects:
|
||||
objids = [int(x) for x in args.objects.split(',')]
|
||||
objids = [int(x) for x in args.objects.split(",")]
|
||||
else:
|
||||
objids = []
|
||||
|
||||
if args.page_numbers:
|
||||
pagenos = {x - 1 for x in args.page_numbers}
|
||||
elif args.pagenos:
|
||||
pagenos = {int(x) - 1 for x in args.pagenos.split(',')}
|
||||
pagenos = {int(x) - 1 for x in args.pagenos.split(",")}
|
||||
else:
|
||||
pagenos = set()
|
||||
|
||||
password = args.password
|
||||
|
||||
if args.raw_stream:
|
||||
codec: Optional[str] = 'raw'
|
||||
codec: Optional[str] = "raw"
|
||||
elif args.binary_stream:
|
||||
codec = 'binary'
|
||||
codec = "binary"
|
||||
elif args.text_stream:
|
||||
codec = 'text'
|
||||
codec = "text"
|
||||
else:
|
||||
codec = None
|
||||
|
||||
for fname in args.files:
|
||||
if args.extract_toc:
|
||||
dumpoutline(
|
||||
outfp, fname, objids, pagenos, password=password,
|
||||
dumpall=args.all, codec=codec, extractdir=None
|
||||
outfp,
|
||||
fname,
|
||||
objids,
|
||||
pagenos,
|
||||
password=password,
|
||||
dumpall=args.all,
|
||||
codec=codec,
|
||||
extractdir=None,
|
||||
)
|
||||
elif args.extract_embedded:
|
||||
extractembedded(
|
||||
fname, password=password, extractdir=args.extract_embedded
|
||||
)
|
||||
extractembedded(fname, password=password, extractdir=args.extract_embedded)
|
||||
else:
|
||||
dumppdf(
|
||||
outfp, fname, objids, pagenos, password=password,
|
||||
dumpall=args.all, codec=codec, extractdir=None,
|
||||
show_fallback_xref=args.show_fallback_xref
|
||||
outfp,
|
||||
fname,
|
||||
objids,
|
||||
pagenos,
|
||||
password=password,
|
||||
dumpall=args.all,
|
||||
codec=codec,
|
||||
extractdir=None,
|
||||
show_fallback_xref=args.show_fallback_xref,
|
||||
)
|
||||
|
||||
outfp.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
225
tools/pdf2txt.py
225
tools/pdf2txt.py
|
@ -12,10 +12,7 @@ from pdfminer.utils import AnyIO
|
|||
|
||||
logging.basicConfig()
|
||||
|
||||
OUTPUT_TYPES = ((".htm", "html"),
|
||||
(".html", "html"),
|
||||
(".xml", "xml"),
|
||||
(".tag", "tag"))
|
||||
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
|
||||
|
||||
|
||||
def float_or_disabled(x: str) -> Optional[float]:
|
||||
|
@ -29,17 +26,17 @@ def float_or_disabled(x: str) -> Optional[float]:
|
|||
|
||||
def extract_text(
|
||||
files: Iterable[str] = [],
|
||||
outfile: str = '-',
|
||||
outfile: str = "-",
|
||||
laparams: Optional[LAParams] = None,
|
||||
output_type: str = 'text',
|
||||
codec: str = 'utf-8',
|
||||
output_type: str = "text",
|
||||
codec: str = "utf-8",
|
||||
strip_control: bool = False,
|
||||
maxpages: int = 0,
|
||||
page_numbers: Optional[Container[int]] = None,
|
||||
password: str = "",
|
||||
scale: float = 1.0,
|
||||
rotation: int = 0,
|
||||
layoutmode: str = 'normal',
|
||||
layoutmode: str = "normal",
|
||||
output_dir: Optional[str] = None,
|
||||
debug: bool = False,
|
||||
disable_caching: bool = False,
|
||||
|
@ -56,7 +53,7 @@ def extract_text(
|
|||
if outfile == "-":
|
||||
outfp: AnyIO = sys.stdout
|
||||
if sys.stdout.encoding is not None:
|
||||
codec = 'utf-8'
|
||||
codec = "utf-8"
|
||||
else:
|
||||
outfp = open(outfile, "wb")
|
||||
|
||||
|
@ -69,73 +66,133 @@ def extract_text(
|
|||
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
||||
parser.add_argument(
|
||||
"files", type=str, default=None, nargs="+",
|
||||
help="One or more paths to PDF files.")
|
||||
"files",
|
||||
type=str,
|
||||
default=None,
|
||||
nargs="+",
|
||||
help="One or more paths to PDF files.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--version", "-v", action="version",
|
||||
version="pdfminer.six v{}".format(pdfminer.__version__))
|
||||
"--version",
|
||||
"-v",
|
||||
action="version",
|
||||
version="pdfminer.six v{}".format(pdfminer.__version__),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--debug", "-d", default=False, action="store_true",
|
||||
help="Use debug logging level.")
|
||||
"--debug",
|
||||
"-d",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Use debug logging level.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-caching", "-C", default=False, action="store_true",
|
||||
help="If caching or resources, such as fonts, should be disabled.")
|
||||
"--disable-caching",
|
||||
"-C",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If caching or resources, such as fonts, should be disabled.",
|
||||
)
|
||||
|
||||
parse_params = parser.add_argument_group(
|
||||
'Parser', description='Used during PDF parsing')
|
||||
"Parser", description="Used during PDF parsing"
|
||||
)
|
||||
parse_params.add_argument(
|
||||
"--page-numbers", type=int, default=None, nargs="+",
|
||||
help="A space-seperated list of page numbers to parse.")
|
||||
"--page-numbers",
|
||||
type=int,
|
||||
default=None,
|
||||
nargs="+",
|
||||
help="A space-seperated list of page numbers to parse.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
"--pagenos", "-p", type=str,
|
||||
"--pagenos",
|
||||
"-p",
|
||||
type=str,
|
||||
help="A comma-separated list of page numbers to parse. "
|
||||
"Included for legacy applications, use --page-numbers "
|
||||
"for more idiomatic argument entry.")
|
||||
"for more idiomatic argument entry.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
"--maxpages", "-m", type=int, default=0,
|
||||
help="The maximum number of pages to parse.")
|
||||
"--maxpages",
|
||||
"-m",
|
||||
type=int,
|
||||
default=0,
|
||||
help="The maximum number of pages to parse.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
"--password", "-P", type=str, default="",
|
||||
help="The password to use for decrypting PDF file.")
|
||||
"--password",
|
||||
"-P",
|
||||
type=str,
|
||||
default="",
|
||||
help="The password to use for decrypting PDF file.",
|
||||
)
|
||||
parse_params.add_argument(
|
||||
"--rotation", "-R", default=0, type=int,
|
||||
"--rotation",
|
||||
"-R",
|
||||
default=0,
|
||||
type=int,
|
||||
help="The number of degrees to rotate the PDF "
|
||||
"before other types of processing.")
|
||||
"before other types of processing.",
|
||||
)
|
||||
|
||||
la_params = LAParams() # will be used for defaults
|
||||
la_param_group = parser.add_argument_group(
|
||||
'Layout analysis', description='Used during layout analysis.')
|
||||
"Layout analysis", description="Used during layout analysis."
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--no-laparams", "-n", default=False, action="store_true",
|
||||
help="If layout analysis parameters should be ignored.")
|
||||
la_param_group.add_argument(
|
||||
"--detect-vertical", "-V", default=la_params.detect_vertical,
|
||||
"--no-laparams",
|
||||
"-n",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If vertical text should be considered during layout analysis")
|
||||
help="If layout analysis parameters should be ignored.",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--line-overlap", type=float, default=la_params.line_overlap,
|
||||
help='If two characters have more overlap than this they '
|
||||
'are considered to be on the same line. The overlap is specified '
|
||||
'relative to the minimum height of both characters.')
|
||||
"--detect-vertical",
|
||||
"-V",
|
||||
default=la_params.detect_vertical,
|
||||
action="store_true",
|
||||
help="If vertical text should be considered during layout analysis",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--char-margin", "-M", type=float, default=la_params.char_margin,
|
||||
"--line-overlap",
|
||||
type=float,
|
||||
default=la_params.line_overlap,
|
||||
help="If two characters have more overlap than this they "
|
||||
"are considered to be on the same line. The overlap is specified "
|
||||
"relative to the minimum height of both characters.",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--char-margin",
|
||||
"-M",
|
||||
type=float,
|
||||
default=la_params.char_margin,
|
||||
help="If two characters are closer together than this margin they "
|
||||
"are considered to be part of the same line. The margin is "
|
||||
"specified relative to the width of the character.")
|
||||
"specified relative to the width of the character.",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--word-margin", "-W", type=float, default=la_params.word_margin,
|
||||
"--word-margin",
|
||||
"-W",
|
||||
type=float,
|
||||
default=la_params.word_margin,
|
||||
help="If two characters on the same line are further apart than this "
|
||||
"margin then they are considered to be two separate words, and "
|
||||
"an intermediate space will be added for readability. The margin "
|
||||
"is specified relative to the width of the character.")
|
||||
"is specified relative to the width of the character.",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--line-margin", "-L", type=float, default=la_params.line_margin,
|
||||
"--line-margin",
|
||||
"-L",
|
||||
type=float,
|
||||
default=la_params.line_margin,
|
||||
help="If two lines are close together they are considered to "
|
||||
"be part of the same paragraph. The margin is specified "
|
||||
"relative to the height of a line.")
|
||||
"relative to the height of a line.",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--boxes-flow", "-F", type=float_or_disabled,
|
||||
"--boxes-flow",
|
||||
"-F",
|
||||
type=float_or_disabled,
|
||||
default=la_params.boxes_flow,
|
||||
help="Specifies how much a horizontal and vertical position of a "
|
||||
"text matters when determining the order of lines. The value "
|
||||
|
@ -143,44 +200,77 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
|||
"matters) to +1.0 (only vertical position matters). You can also "
|
||||
"pass `disabled` to disable advanced layout analysis, and "
|
||||
"instead return text based on the position of the bottom left "
|
||||
"corner of the text box.")
|
||||
"corner of the text box.",
|
||||
)
|
||||
la_param_group.add_argument(
|
||||
"--all-texts", "-A", default=la_params.all_texts, action="store_true",
|
||||
help="If layout analysis should be performed on text in figures.")
|
||||
"--all-texts",
|
||||
"-A",
|
||||
default=la_params.all_texts,
|
||||
action="store_true",
|
||||
help="If layout analysis should be performed on text in figures.",
|
||||
)
|
||||
|
||||
output_params = parser.add_argument_group(
|
||||
'Output', description='Used during output generation.')
|
||||
"Output", description="Used during output generation."
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--outfile", "-o", type=str, default="-",
|
||||
"--outfile",
|
||||
"-o",
|
||||
type=str,
|
||||
default="-",
|
||||
help="Path to file where output is written. "
|
||||
"Or \"-\" (default) to write to stdout.")
|
||||
'Or "-" (default) to write to stdout.',
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--output_type", "-t", type=str, default="text",
|
||||
help="Type of output to generate {text,html,xml,tag}.")
|
||||
"--output_type",
|
||||
"-t",
|
||||
type=str,
|
||||
default="text",
|
||||
help="Type of output to generate {text,html,xml,tag}.",
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--codec", "-c", type=str, default="utf-8",
|
||||
help="Text encoding to use in output file.")
|
||||
"--codec",
|
||||
"-c",
|
||||
type=str,
|
||||
default="utf-8",
|
||||
help="Text encoding to use in output file.",
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--output-dir", "-O", default=None,
|
||||
"--output-dir",
|
||||
"-O",
|
||||
default=None,
|
||||
help="The output directory to put extracted images in. If not given, "
|
||||
"images are not extracted.")
|
||||
"images are not extracted.",
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--layoutmode", "-Y", default="normal",
|
||||
type=str, help="Type of layout to use when generating html "
|
||||
"--layoutmode",
|
||||
"-Y",
|
||||
default="normal",
|
||||
type=str,
|
||||
help="Type of layout to use when generating html "
|
||||
"{normal,exact,loose}. If normal,each line is"
|
||||
" positioned separately in the html. If exact"
|
||||
", each character is positioned separately in"
|
||||
" the html. If loose, same result as normal "
|
||||
"but with an additional newline after each "
|
||||
"text line. Only used when output_type is html.")
|
||||
"text line. Only used when output_type is html.",
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--scale", "-s", type=float, default=1.0,
|
||||
"--scale",
|
||||
"-s",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="The amount of zoom to use when generating html file. "
|
||||
"Only used when output_type is html.")
|
||||
"Only used when output_type is html.",
|
||||
)
|
||||
output_params.add_argument(
|
||||
"--strip-control", "-S", default=False, action="store_true",
|
||||
"--strip-control",
|
||||
"-S",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Remove control statement from text. "
|
||||
"Only used when output_type is xml.")
|
||||
"Only used when output_type is xml.",
|
||||
)
|
||||
|
||||
parsed_args = parser.parse_args(args=args)
|
||||
|
||||
|
@ -202,10 +292,7 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
|
|||
parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers}
|
||||
|
||||
if parsed_args.pagenos:
|
||||
parsed_args.page_numbers = {
|
||||
int(x) - 1
|
||||
for x in parsed_args.pagenos.split(",")
|
||||
}
|
||||
parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
|
||||
|
||||
if parsed_args.output_type == "text" and parsed_args.outfile != "-":
|
||||
for override, alttype in OUTPUT_TYPES:
|
||||
|
@ -222,5 +309,5 @@ def main(args: Optional[List[str]] = None) -> int:
|
|||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
173
tools/pdfdiff.py
173
tools/pdfdiff.py
|
@ -21,14 +21,20 @@ def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
|
|||
# If any LAParams group arguments were passed,
|
||||
# create an LAParams object and
|
||||
# populate with given args. Otherwise, set it to None.
|
||||
if kwargs.get('laparams', None) is None:
|
||||
if kwargs.get("laparams", None) is None:
|
||||
laparams = layout.LAParams()
|
||||
for param in ("all_texts", "detect_vertical", "word_margin",
|
||||
"char_margin", "line_margin", "boxes_flow"):
|
||||
for param in (
|
||||
"all_texts",
|
||||
"detect_vertical",
|
||||
"word_margin",
|
||||
"char_margin",
|
||||
"line_margin",
|
||||
"boxes_flow",
|
||||
):
|
||||
paramv = kwargs.get(param, None)
|
||||
if paramv is not None:
|
||||
setattr(laparams, param, paramv)
|
||||
kwargs['laparams'] = laparams
|
||||
kwargs["laparams"] = laparams
|
||||
|
||||
s1 = io.StringIO()
|
||||
with open(file1, "rb") as fp:
|
||||
|
@ -39,81 +45,140 @@ def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
|
|||
high_level.extract_text_to_fp(fp, s2, **kwargs)
|
||||
|
||||
import difflib
|
||||
|
||||
s1.seek(0)
|
||||
s2.seek(0)
|
||||
s1_lines, s2_lines = s1.readlines(), s2.readlines()
|
||||
|
||||
import os.path
|
||||
|
||||
try:
|
||||
extension = os.path.splitext(kwargs['outfile'])[1][1:4]
|
||||
if extension.lower() == 'htm':
|
||||
extension = os.path.splitext(kwargs["outfile"])[1][1:4]
|
||||
if extension.lower() == "htm":
|
||||
return difflib.HtmlDiff().make_file(s1_lines, s2_lines)
|
||||
except KeyError:
|
||||
pass
|
||||
return difflib.unified_diff(s1_lines, s2_lines, n=kwargs['context_lines'])
|
||||
return difflib.unified_diff(s1_lines, s2_lines, n=kwargs["context_lines"])
|
||||
|
||||
|
||||
# main
|
||||
def main(args: Optional[List[str]] = None) -> int:
|
||||
import argparse
|
||||
|
||||
P = argparse.ArgumentParser(description=__doc__)
|
||||
P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
|
||||
P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
|
||||
P.add_argument("-o", "--outfile", type=str, default="-",
|
||||
P.add_argument(
|
||||
"-o",
|
||||
"--outfile",
|
||||
type=str,
|
||||
default="-",
|
||||
help="Output file(default/'-' is stdout) if .htm or .html,"
|
||||
" create an HTML table (or a complete HTML file "
|
||||
"containing the table) showing a side by side, "
|
||||
"line by line comparison of text with inter-line and "
|
||||
"intra-line change highlights. The table can be "
|
||||
"generated in either full or "
|
||||
"contextual difference mode.")
|
||||
P.add_argument("-N", "--context-lines", default=3, type=int,
|
||||
help="context lines shown")
|
||||
P.add_argument("-d", "--debug", default=False, action="store_true",
|
||||
help="Debug output.")
|
||||
"contextual difference mode.",
|
||||
)
|
||||
P.add_argument(
|
||||
"-N", "--context-lines", default=3, type=int, help="context lines shown"
|
||||
)
|
||||
P.add_argument(
|
||||
"-d", "--debug", default=False, action="store_true", help="Debug output."
|
||||
)
|
||||
|
||||
# params for pdf2txt
|
||||
P.add_argument("-p", "--pagenos", type=str,
|
||||
P.add_argument(
|
||||
"-p",
|
||||
"--pagenos",
|
||||
type=str,
|
||||
help="Comma-separated list of page numbers to parse. "
|
||||
"Included for legacy applications, "
|
||||
"use --page-numbers for more "
|
||||
"idiomatic argument entry.")
|
||||
P.add_argument("--page-numbers", type=int, default=None, nargs="+",
|
||||
"idiomatic argument entry.",
|
||||
)
|
||||
P.add_argument(
|
||||
"--page-numbers",
|
||||
type=int,
|
||||
default=None,
|
||||
nargs="+",
|
||||
help="Alternative to --pagenos with space-separated "
|
||||
"numbers; supercedes --pagenos where it is used.")
|
||||
P.add_argument("-m", "--maxpages", type=int, default=0,
|
||||
help="Maximum pages to parse")
|
||||
P.add_argument("-P", "--password", type=str, default="",
|
||||
help="Decryption password for both PDFs")
|
||||
P.add_argument("-t", "--output_type", type=str, default="text",
|
||||
help="pdf2txt type: text|html|xml|tag (default is text)")
|
||||
P.add_argument("-c", "--codec", type=str, default="utf-8",
|
||||
help="Text encoding")
|
||||
"numbers; supercedes --pagenos where it is used.",
|
||||
)
|
||||
P.add_argument(
|
||||
"-m", "--maxpages", type=int, default=0, help="Maximum pages to parse"
|
||||
)
|
||||
P.add_argument(
|
||||
"-P",
|
||||
"--password",
|
||||
type=str,
|
||||
default="",
|
||||
help="Decryption password for both PDFs",
|
||||
)
|
||||
P.add_argument(
|
||||
"-t",
|
||||
"--output_type",
|
||||
type=str,
|
||||
default="text",
|
||||
help="pdf2txt type: text|html|xml|tag (default is text)",
|
||||
)
|
||||
P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
|
||||
P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
|
||||
P.add_argument("-A", "--all-texts", default=None, action="store_true",
|
||||
help="LAParams all texts")
|
||||
P.add_argument("-V", "--detect-vertical", default=None,
|
||||
action="store_true", help="LAParams detect vertical")
|
||||
P.add_argument("-W", "--word-margin", type=float, default=None,
|
||||
help="LAParams word margin")
|
||||
P.add_argument("-M", "--char-margin", type=float, default=None,
|
||||
help="LAParams char margin")
|
||||
P.add_argument("-L", "--line-margin", type=float, default=None,
|
||||
help="LAParams line margin")
|
||||
P.add_argument("-F", "--boxes-flow", type=float, default=None,
|
||||
help="LAParams boxes flow")
|
||||
P.add_argument("-Y", "--layoutmode", default="normal", type=str,
|
||||
help="HTML Layout Mode")
|
||||
P.add_argument("-n", "--no-laparams", default=False,
|
||||
action="store_true", help="Pass None as LAParams")
|
||||
P.add_argument("-R", "--rotation", default=0, type=int,
|
||||
help="Rotation")
|
||||
P.add_argument("-O", "--output-dir", default=None,
|
||||
help="Output directory for images")
|
||||
P.add_argument("-C", "--disable-caching", default=False,
|
||||
action="store_true", help="Disable caching")
|
||||
P.add_argument("-S", "--strip-control", default=False,
|
||||
action="store_true", help="Strip control in XML mode")
|
||||
P.add_argument(
|
||||
"-A",
|
||||
"--all-texts",
|
||||
default=None,
|
||||
action="store_true",
|
||||
help="LAParams all texts",
|
||||
)
|
||||
P.add_argument(
|
||||
"-V",
|
||||
"--detect-vertical",
|
||||
default=None,
|
||||
action="store_true",
|
||||
help="LAParams detect vertical",
|
||||
)
|
||||
P.add_argument(
|
||||
"-W", "--word-margin", type=float, default=None, help="LAParams word margin"
|
||||
)
|
||||
P.add_argument(
|
||||
"-M", "--char-margin", type=float, default=None, help="LAParams char margin"
|
||||
)
|
||||
P.add_argument(
|
||||
"-L", "--line-margin", type=float, default=None, help="LAParams line margin"
|
||||
)
|
||||
P.add_argument(
|
||||
"-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow"
|
||||
)
|
||||
P.add_argument(
|
||||
"-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode"
|
||||
)
|
||||
P.add_argument(
|
||||
"-n",
|
||||
"--no-laparams",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Pass None as LAParams",
|
||||
)
|
||||
P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
|
||||
P.add_argument(
|
||||
"-O", "--output-dir", default=None, help="Output directory for images"
|
||||
)
|
||||
P.add_argument(
|
||||
"-C",
|
||||
"--disable-caching",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Disable caching",
|
||||
)
|
||||
P.add_argument(
|
||||
"-S",
|
||||
"--strip-control",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Strip control in XML mode",
|
||||
)
|
||||
|
||||
A = P.parse_args(args=args)
|
||||
|
||||
|
@ -126,21 +191,23 @@ def main(args: Optional[List[str]] = None) -> int:
|
|||
A.page_numbers = {int(x) - 1 for x in A.pagenos.split(",")}
|
||||
|
||||
if A.output_type == "text" and A.outfile != "-":
|
||||
for override, alttype in ((".htm", "html"),
|
||||
for override, alttype in (
|
||||
(".htm", "html"),
|
||||
(".html", "html"),
|
||||
(".xml", "xml"),
|
||||
(".tag", "tag")):
|
||||
(".tag", "tag"),
|
||||
):
|
||||
if A.outfile.endswith(override):
|
||||
A.output_type = alttype
|
||||
|
||||
if A.outfile == "-":
|
||||
outfp = sys.stdout
|
||||
else:
|
||||
outfp = open(A.outfile, "w", encoding='utf-8')
|
||||
outfp = open(A.outfile, "w", encoding="utf-8")
|
||||
outfp.writelines(compare(**vars(A)))
|
||||
outfp.close()
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
|
@ -21,7 +21,7 @@ _, SCRIPT = os.path.split(__file__)
|
|||
|
||||
|
||||
def msg(*args: object, **kwargs: Any) -> None:
|
||||
print(' '.join(map(str, args)), **kwargs) # noqa E999
|
||||
print(" ".join(map(str, args)), **kwargs) # noqa E999
|
||||
|
||||
|
||||
def flat_iter(obj: object) -> Iterator[object]:
|
||||
|
@ -35,22 +35,22 @@ def main(args: List[str]) -> int:
|
|||
msg(SCRIPT, args)
|
||||
|
||||
if len(args) != 1:
|
||||
msg('Parse a PDF file and print some pdfminer-specific stats')
|
||||
msg('Usage:', SCRIPT, '<PDF-filename>')
|
||||
msg("Parse a PDF file and print some pdfminer-specific stats")
|
||||
msg("Usage:", SCRIPT, "<PDF-filename>")
|
||||
return 1
|
||||
|
||||
infilename, = args
|
||||
(infilename,) = args
|
||||
|
||||
lt_types: Counter[str] = collections.Counter()
|
||||
|
||||
with open(infilename, 'rb') as pdf_file:
|
||||
with open(infilename, "rb") as pdf_file:
|
||||
|
||||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(pdf_file)
|
||||
|
||||
# Create a PDF document object that stores the document structure.
|
||||
# Supply the password for initialization.
|
||||
password = ''
|
||||
password = ""
|
||||
document = PDFDocument(parser, password)
|
||||
# Check if the document allows text extraction.
|
||||
if not document.is_extractable:
|
||||
|
@ -75,11 +75,11 @@ def main(args: List[str]) -> int:
|
|||
|
||||
lt_types.update(type(item).__name__ for item in flat_iter(layout))
|
||||
|
||||
msg('page_count', page_count)
|
||||
msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
|
||||
msg("page_count", page_count)
|
||||
msg("lt_types:", " ".join("{}:{}".format(*tc) for tc in lt_types.items()))
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv[1:]))
|
||||
|
|
|
@ -7,14 +7,15 @@ def prof_main(argv: List[str]) -> int:
|
|||
import hotshot.stats # type: ignore[import]
|
||||
|
||||
def usage() -> int:
|
||||
print('usage: %s module.function [args ...]' % argv[0])
|
||||
print("usage: %s module.function [args ...]" % argv[0])
|
||||
return 100
|
||||
|
||||
args = argv[1:]
|
||||
if len(args) < 1:
|
||||
return usage()
|
||||
name = args.pop(0)
|
||||
prof = name+'.prof'
|
||||
i = name.rindex('.')
|
||||
prof = name + ".prof"
|
||||
i = name.rindex(".")
|
||||
(modname, funcname) = (name[:i], name[i + 1 :])
|
||||
|
||||
# Type error: fromlist expects sequence of strings; presumably the intent
|
||||
|
@ -31,10 +32,10 @@ def prof_main(argv: List[str]) -> int:
|
|||
else:
|
||||
stats = hotshot.stats.load(prof)
|
||||
stats.strip_dirs()
|
||||
stats.sort_stats('time', 'calls')
|
||||
stats.sort_stats("time", "calls")
|
||||
stats.print_stats(1000)
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
sys.exit(prof_main(sys.argv))
|
||||
|
|
Loading…
Reference in New Issue