Check blackness in github actions (#711)

* Check blackness in github actions

* Blacken code

* Update github action names

* Add contributing guidelines on using black

* Add to checklist for PR
pull/688/head^2
Pieter Marsman 2022-02-11 22:46:51 +01:00 committed by GitHub
parent 830acff94c
commit b9a8920cdf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
60 changed files with 12836 additions and 7435 deletions

5
.flake8 Normal file
View File

@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,

View File

@ -1,22 +1,17 @@
**Pull request** **Pull request**
Thanks for improving pdfminer.six! Please include the following information to Please remove this paragraph and replace it with a description of your PR.
help us discuss and merge this PR: Also include links to the issues that it fixes.
- A description of why this PR is needed. What does it fix? What does it
improve?
- A summary of the things that this PR changes.
- Reference the issues that this PR fixes (use the fixes #(issue nr) syntax).
If this PR does not fix any issue, create the issue first and mention that
you are willing to work on it.
**How Has This Been Tested?** **How Has This Been Tested?**
Please describe the tests that you ran to verify your changes. Provide Please replace this paragraph with a description of how this PR has been
instructions so we can reproduce. Include an example pdf if you have one. tested. Include the necessary instructions and files such that others can
reproduce it.
**Checklist** **Checklist**
- [ ] I have formatted my code with [black](https://github.com/psf/black).
- [ ] I have added tests that prove my fix is effective or that my feature - [ ] I have added tests that prove my fix is effective or that my feature
works works
- [ ] I have added docstrings to newly created methods and classes - [ ] I have added docstrings to newly created methods and classes

View File

@ -15,6 +15,15 @@ env:
jobs: jobs:
check-code-formatting:
name: Check code formatting
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Check code formatting
uses: psf/black@stable
check-coding-style: check-coding-style:
name: Check coding style name: Check coding style
runs-on: ubuntu-latest runs-on: ubuntu-latest

View File

@ -31,7 +31,7 @@ Any contribution is appreciated! You might want to:
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case * Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
of features, this will show that your code works correctly. of features, this will show that your code works correctly.
* Code should work for Python 3.6+. * Code should work for Python 3.6+.
* Code should conform to PEP8 coding style. * Code should be formatted with [black](https://github.com/psf/black).
* New features should be well documented using docstrings. * New features should be well documented using docstrings.
* Check spelling and grammar. * Check spelling and grammar.
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased]) * Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased])
@ -68,3 +68,9 @@ Any contribution is appreciated! You might want to:
```sh ```sh
nox -e py36 nox -e py36
``` ```
4. After changing the code, run the black formatter.
```sh
black .
```

View File

@ -16,14 +16,13 @@ from typing import List
import pdfminer import pdfminer
sys.path.insert(0, os.path.join( sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../"))
os.path.abspath(os.path.dirname(__file__)), '../../'))
# -- Project information ----------------------------------------------------- # -- Project information -----------------------------------------------------
project = 'pdfminer.six' project = "pdfminer.six"
copyright = '2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman' copyright = "2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman' author = "Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
# The full version, including alpha/beta/rc tags # The full version, including alpha/beta/rc tags
release = pdfminer.__version__ release = pdfminer.__version__
@ -35,16 +34,16 @@ release = pdfminer.__version__
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones. # ones.
extensions = [ extensions = [
'sphinxarg.ext', "sphinxarg.ext",
'sphinx.ext.autodoc', "sphinx.ext.autodoc",
'sphinx.ext.doctest', "sphinx.ext.doctest",
] ]
# Root rst file # Root rst file
master_doc = 'index' master_doc = "index"
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and # List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files. # directories to ignore when looking for source files.
@ -57,9 +56,9 @@ exclude_patterns: List[str] = []
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
# #
html_theme = 'alabaster' html_theme = "alabaster"
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static'] html_static_path = ["_static"]

View File

@ -6,53 +6,30 @@ PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"]
@nox.session @nox.session
def lint(session): def lint(session):
session.install('flake8') session.install("flake8")
session.run( session.run("flake8", "pdfminer/", "tools/", "tests/", "--count", "--statistics")
'flake8',
'pdfminer/',
'tools/',
'tests/',
'--count',
'--statistics'
)
@nox.session @nox.session
def types(session): def types(session):
session.install('mypy') session.install("mypy")
session.run( session.run(
'mypy', "mypy", "--install-types", "--non-interactive", "--show-error-codes", "."
'--install-types',
'--non-interactive',
'--show-error-codes',
'.'
) )
@nox.session(python=PYTHON_ALL_VERSIONS) @nox.session(python=PYTHON_ALL_VERSIONS)
def tests(session): def tests(session):
session.install("-e", ".[dev]") session.install("-e", ".[dev]")
session.run('pytest') session.run("pytest")
@nox.session @nox.session
def docs(session): def docs(session):
session.install("-e", ".[docs]") session.install("-e", ".[docs]")
session.run( session.run(
'python', "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"
'-m',
'sphinx',
'-b',
'html',
'docs/source',
'docs/build/html'
) )
session.run( session.run(
'python', "python", "-m", "sphinx", "-b", "doctest", "docs/source", "docs/build/doctest"
'-m',
'sphinx',
'-b',
'doctest',
'docs/source',
'docs/build/doctest'
) )

View File

@ -1,4 +1,4 @@
__version__ = '20211012' __version__ = "20211012"
if __name__ == '__main__': if __name__ == "__main__":
print(__version__) print(__version__)

View File

@ -18,7 +18,7 @@
"""An implementation of RFC4013 SASLprep.""" """An implementation of RFC4013 SASLprep."""
__all__ = ['saslprep'] __all__ = ["saslprep"]
import stringprep import stringprep
from typing import Callable, Tuple from typing import Callable, Tuple
@ -37,7 +37,8 @@ _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
stringprep.in_table_c6, stringprep.in_table_c6,
stringprep.in_table_c7, stringprep.in_table_c7,
stringprep.in_table_c8, stringprep.in_table_c8,
stringprep.in_table_c9) stringprep.in_table_c9,
)
def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str: def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
@ -63,12 +64,12 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
in_table_c12 = stringprep.in_table_c12 in_table_c12 = stringprep.in_table_c12
in_table_b1 = stringprep.in_table_b1 in_table_b1 = stringprep.in_table_b1
data = "".join( data = "".join(
["\u0020" if in_table_c12(elt) else elt ["\u0020" if in_table_c12(elt) else elt for elt in data if not in_table_b1(elt)]
for elt in data if not in_table_b1(elt)]) )
# RFC3454 section 2, step 2 - Normalize # RFC3454 section 2, step 2 - Normalize
# RFC4013 section 2.2 normalization # RFC4013 section 2.2 normalization
data = unicodedata.ucd_3_2_0.normalize('NFKC', data) data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
in_table_d1 = stringprep.in_table_d1 in_table_d1 = stringprep.in_table_d1
if in_table_d1(data[0]): if in_table_d1(data[0]):
@ -89,7 +90,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
# RFC3454 section 2, step 3 and 4 - Prohibit and check bidi # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
for char in data: for char in data:
if any(in_table(char) for in_table in prohibited): if any(in_table(char) for in_table in prohibited):
raise ValueError( raise ValueError("SASLprep: failed prohibited character check")
"SASLprep: failed prohibited character check")
return data return data

View File

@ -9,7 +9,6 @@ from typing import Sequence
class Arcfour: class Arcfour:
def __init__(self, key: Sequence[int]) -> None: def __init__(self, key: Sequence[int]) -> None:
# because Py3 range is not indexable # because Py3 range is not indexable
s = [i for i in range(256)] s = [i for i in range(256)]
@ -24,12 +23,12 @@ class Arcfour:
def process(self, data: bytes) -> bytes: def process(self, data: bytes) -> bytes:
(i, j) = (self.i, self.j) (i, j) = (self.i, self.j)
s = self.s s = self.s
r = b'' r = b""
for c in iter(data): for c in iter(data):
i = (i+1) % 256 i = (i + 1) % 256
j = (j+s[i]) % 256 j = (j + s[i]) % 256
(s[i], s[j]) = (s[j], s[i]) (s[i], s[j]) = (s[j], s[i])
k = s[(s[i]+s[j]) % 256] k = s[(s[i] + s[j]) % 256]
r += bytes((c ^ k,)) r += bytes((c ^ k,))
(self.i, self.j) = (i, j) (self.i, self.j) = (i, j)
return r return r

View File

@ -21,30 +21,30 @@ def ascii85decode(data: bytes) -> bytes:
""" """
n = b = 0 n = b = 0
out = b'' out = b""
for i in iter(data): for i in iter(data):
c = bytes((i,)) c = bytes((i,))
if b'!' <= c and c <= b'u': if b"!" <= c and c <= b"u":
n += 1 n += 1
b = b*85+(ord(c)-33) b = b * 85 + (ord(c) - 33)
if n == 5: if n == 5:
out += struct.pack('>L', b) out += struct.pack(">L", b)
n = b = 0 n = b = 0
elif c == b'z': elif c == b"z":
assert n == 0, str(n) assert n == 0, str(n)
out += b'\0\0\0\0' out += b"\0\0\0\0"
elif c == b'~': elif c == b"~":
if n: if n:
for _ in range(5-n): for _ in range(5 - n):
b = b*85+84 b = b * 85 + 84
out += struct.pack('>L', b)[:n-1] out += struct.pack(">L", b)[: n - 1]
break break
return out return out
# asciihexdecode(data) # asciihexdecode(data)
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE) hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
def asciihexdecode(data: bytes) -> bytes: def asciihexdecode(data: bytes) -> bytes:
@ -57,15 +57,16 @@ def asciihexdecode(data: bytes) -> bytes:
the EOD marker after reading an odd number of hexadecimal digits, it the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit. will behave as if a 0 followed the last digit.
""" """
def decode(x: bytes) -> bytes: def decode(x: bytes) -> bytes:
i = int(x, 16) i = int(x, 16)
return bytes((i,)) return bytes((i,))
out = b'' out = b""
for x in hex_re.findall(data): for x in hex_re.findall(data):
out += decode(x) out += decode(x)
m = trail_re.search(data) m = trail_re.search(data)
if m: if m:
out += decode(m.group(1)+b'0') out += decode(m.group(1) + b"0")
return out return out

View File

@ -12,8 +12,18 @@
import array import array
from typing import (Any, Callable, Dict, Iterator, List, MutableSequence, from typing import (
Optional, Sequence, Union, cast) Any,
Callable,
Dict,
Iterator,
List,
MutableSequence,
Optional,
Sequence,
Union,
cast,
)
def get_bytes(data: bytes) -> Iterator[int]: def get_bytes(data: bytes) -> Iterator[int]:
@ -46,7 +56,7 @@ class BitParser:
if p[b] is None: if p[b] is None:
p[b] = [None, None] p[b] = [None, None]
p = p[b] p = p[b]
if bits[i] == '1': if bits[i] == "1":
b = 1 b = 1
else: else:
b = 0 b = 0
@ -74,252 +84,252 @@ class BitParser:
class CCITTG4Parser(BitParser): class CCITTG4Parser(BitParser):
MODE = [None, None] MODE = [None, None]
BitParser.add(MODE, 0, '1') BitParser.add(MODE, 0, "1")
BitParser.add(MODE, +1, '011') BitParser.add(MODE, +1, "011")
BitParser.add(MODE, -1, '010') BitParser.add(MODE, -1, "010")
BitParser.add(MODE, 'h', '001') BitParser.add(MODE, "h", "001")
BitParser.add(MODE, 'p', '0001') BitParser.add(MODE, "p", "0001")
BitParser.add(MODE, +2, '000011') BitParser.add(MODE, +2, "000011")
BitParser.add(MODE, -2, '000010') BitParser.add(MODE, -2, "000010")
BitParser.add(MODE, +3, '0000011') BitParser.add(MODE, +3, "0000011")
BitParser.add(MODE, -3, '0000010') BitParser.add(MODE, -3, "0000010")
BitParser.add(MODE, 'u', '0000001111') BitParser.add(MODE, "u", "0000001111")
BitParser.add(MODE, 'x1', '0000001000') BitParser.add(MODE, "x1", "0000001000")
BitParser.add(MODE, 'x2', '0000001001') BitParser.add(MODE, "x2", "0000001001")
BitParser.add(MODE, 'x3', '0000001010') BitParser.add(MODE, "x3", "0000001010")
BitParser.add(MODE, 'x4', '0000001011') BitParser.add(MODE, "x4", "0000001011")
BitParser.add(MODE, 'x5', '0000001100') BitParser.add(MODE, "x5", "0000001100")
BitParser.add(MODE, 'x6', '0000001101') BitParser.add(MODE, "x6", "0000001101")
BitParser.add(MODE, 'x7', '0000001110') BitParser.add(MODE, "x7", "0000001110")
BitParser.add(MODE, 'e', '000000000001000000000001') BitParser.add(MODE, "e", "000000000001000000000001")
WHITE = [None, None] WHITE = [None, None]
BitParser.add(WHITE, 0, '00110101') BitParser.add(WHITE, 0, "00110101")
BitParser.add(WHITE, 1, '000111') BitParser.add(WHITE, 1, "000111")
BitParser.add(WHITE, 2, '0111') BitParser.add(WHITE, 2, "0111")
BitParser.add(WHITE, 3, '1000') BitParser.add(WHITE, 3, "1000")
BitParser.add(WHITE, 4, '1011') BitParser.add(WHITE, 4, "1011")
BitParser.add(WHITE, 5, '1100') BitParser.add(WHITE, 5, "1100")
BitParser.add(WHITE, 6, '1110') BitParser.add(WHITE, 6, "1110")
BitParser.add(WHITE, 7, '1111') BitParser.add(WHITE, 7, "1111")
BitParser.add(WHITE, 8, '10011') BitParser.add(WHITE, 8, "10011")
BitParser.add(WHITE, 9, '10100') BitParser.add(WHITE, 9, "10100")
BitParser.add(WHITE, 10, '00111') BitParser.add(WHITE, 10, "00111")
BitParser.add(WHITE, 11, '01000') BitParser.add(WHITE, 11, "01000")
BitParser.add(WHITE, 12, '001000') BitParser.add(WHITE, 12, "001000")
BitParser.add(WHITE, 13, '000011') BitParser.add(WHITE, 13, "000011")
BitParser.add(WHITE, 14, '110100') BitParser.add(WHITE, 14, "110100")
BitParser.add(WHITE, 15, '110101') BitParser.add(WHITE, 15, "110101")
BitParser.add(WHITE, 16, '101010') BitParser.add(WHITE, 16, "101010")
BitParser.add(WHITE, 17, '101011') BitParser.add(WHITE, 17, "101011")
BitParser.add(WHITE, 18, '0100111') BitParser.add(WHITE, 18, "0100111")
BitParser.add(WHITE, 19, '0001100') BitParser.add(WHITE, 19, "0001100")
BitParser.add(WHITE, 20, '0001000') BitParser.add(WHITE, 20, "0001000")
BitParser.add(WHITE, 21, '0010111') BitParser.add(WHITE, 21, "0010111")
BitParser.add(WHITE, 22, '0000011') BitParser.add(WHITE, 22, "0000011")
BitParser.add(WHITE, 23, '0000100') BitParser.add(WHITE, 23, "0000100")
BitParser.add(WHITE, 24, '0101000') BitParser.add(WHITE, 24, "0101000")
BitParser.add(WHITE, 25, '0101011') BitParser.add(WHITE, 25, "0101011")
BitParser.add(WHITE, 26, '0010011') BitParser.add(WHITE, 26, "0010011")
BitParser.add(WHITE, 27, '0100100') BitParser.add(WHITE, 27, "0100100")
BitParser.add(WHITE, 28, '0011000') BitParser.add(WHITE, 28, "0011000")
BitParser.add(WHITE, 29, '00000010') BitParser.add(WHITE, 29, "00000010")
BitParser.add(WHITE, 30, '00000011') BitParser.add(WHITE, 30, "00000011")
BitParser.add(WHITE, 31, '00011010') BitParser.add(WHITE, 31, "00011010")
BitParser.add(WHITE, 32, '00011011') BitParser.add(WHITE, 32, "00011011")
BitParser.add(WHITE, 33, '00010010') BitParser.add(WHITE, 33, "00010010")
BitParser.add(WHITE, 34, '00010011') BitParser.add(WHITE, 34, "00010011")
BitParser.add(WHITE, 35, '00010100') BitParser.add(WHITE, 35, "00010100")
BitParser.add(WHITE, 36, '00010101') BitParser.add(WHITE, 36, "00010101")
BitParser.add(WHITE, 37, '00010110') BitParser.add(WHITE, 37, "00010110")
BitParser.add(WHITE, 38, '00010111') BitParser.add(WHITE, 38, "00010111")
BitParser.add(WHITE, 39, '00101000') BitParser.add(WHITE, 39, "00101000")
BitParser.add(WHITE, 40, '00101001') BitParser.add(WHITE, 40, "00101001")
BitParser.add(WHITE, 41, '00101010') BitParser.add(WHITE, 41, "00101010")
BitParser.add(WHITE, 42, '00101011') BitParser.add(WHITE, 42, "00101011")
BitParser.add(WHITE, 43, '00101100') BitParser.add(WHITE, 43, "00101100")
BitParser.add(WHITE, 44, '00101101') BitParser.add(WHITE, 44, "00101101")
BitParser.add(WHITE, 45, '00000100') BitParser.add(WHITE, 45, "00000100")
BitParser.add(WHITE, 46, '00000101') BitParser.add(WHITE, 46, "00000101")
BitParser.add(WHITE, 47, '00001010') BitParser.add(WHITE, 47, "00001010")
BitParser.add(WHITE, 48, '00001011') BitParser.add(WHITE, 48, "00001011")
BitParser.add(WHITE, 49, '01010010') BitParser.add(WHITE, 49, "01010010")
BitParser.add(WHITE, 50, '01010011') BitParser.add(WHITE, 50, "01010011")
BitParser.add(WHITE, 51, '01010100') BitParser.add(WHITE, 51, "01010100")
BitParser.add(WHITE, 52, '01010101') BitParser.add(WHITE, 52, "01010101")
BitParser.add(WHITE, 53, '00100100') BitParser.add(WHITE, 53, "00100100")
BitParser.add(WHITE, 54, '00100101') BitParser.add(WHITE, 54, "00100101")
BitParser.add(WHITE, 55, '01011000') BitParser.add(WHITE, 55, "01011000")
BitParser.add(WHITE, 56, '01011001') BitParser.add(WHITE, 56, "01011001")
BitParser.add(WHITE, 57, '01011010') BitParser.add(WHITE, 57, "01011010")
BitParser.add(WHITE, 58, '01011011') BitParser.add(WHITE, 58, "01011011")
BitParser.add(WHITE, 59, '01001010') BitParser.add(WHITE, 59, "01001010")
BitParser.add(WHITE, 60, '01001011') BitParser.add(WHITE, 60, "01001011")
BitParser.add(WHITE, 61, '00110010') BitParser.add(WHITE, 61, "00110010")
BitParser.add(WHITE, 62, '00110011') BitParser.add(WHITE, 62, "00110011")
BitParser.add(WHITE, 63, '00110100') BitParser.add(WHITE, 63, "00110100")
BitParser.add(WHITE, 64, '11011') BitParser.add(WHITE, 64, "11011")
BitParser.add(WHITE, 128, '10010') BitParser.add(WHITE, 128, "10010")
BitParser.add(WHITE, 192, '010111') BitParser.add(WHITE, 192, "010111")
BitParser.add(WHITE, 256, '0110111') BitParser.add(WHITE, 256, "0110111")
BitParser.add(WHITE, 320, '00110110') BitParser.add(WHITE, 320, "00110110")
BitParser.add(WHITE, 384, '00110111') BitParser.add(WHITE, 384, "00110111")
BitParser.add(WHITE, 448, '01100100') BitParser.add(WHITE, 448, "01100100")
BitParser.add(WHITE, 512, '01100101') BitParser.add(WHITE, 512, "01100101")
BitParser.add(WHITE, 576, '01101000') BitParser.add(WHITE, 576, "01101000")
BitParser.add(WHITE, 640, '01100111') BitParser.add(WHITE, 640, "01100111")
BitParser.add(WHITE, 704, '011001100') BitParser.add(WHITE, 704, "011001100")
BitParser.add(WHITE, 768, '011001101') BitParser.add(WHITE, 768, "011001101")
BitParser.add(WHITE, 832, '011010010') BitParser.add(WHITE, 832, "011010010")
BitParser.add(WHITE, 896, '011010011') BitParser.add(WHITE, 896, "011010011")
BitParser.add(WHITE, 960, '011010100') BitParser.add(WHITE, 960, "011010100")
BitParser.add(WHITE, 1024, '011010101') BitParser.add(WHITE, 1024, "011010101")
BitParser.add(WHITE, 1088, '011010110') BitParser.add(WHITE, 1088, "011010110")
BitParser.add(WHITE, 1152, '011010111') BitParser.add(WHITE, 1152, "011010111")
BitParser.add(WHITE, 1216, '011011000') BitParser.add(WHITE, 1216, "011011000")
BitParser.add(WHITE, 1280, '011011001') BitParser.add(WHITE, 1280, "011011001")
BitParser.add(WHITE, 1344, '011011010') BitParser.add(WHITE, 1344, "011011010")
BitParser.add(WHITE, 1408, '011011011') BitParser.add(WHITE, 1408, "011011011")
BitParser.add(WHITE, 1472, '010011000') BitParser.add(WHITE, 1472, "010011000")
BitParser.add(WHITE, 1536, '010011001') BitParser.add(WHITE, 1536, "010011001")
BitParser.add(WHITE, 1600, '010011010') BitParser.add(WHITE, 1600, "010011010")
BitParser.add(WHITE, 1664, '011000') BitParser.add(WHITE, 1664, "011000")
BitParser.add(WHITE, 1728, '010011011') BitParser.add(WHITE, 1728, "010011011")
BitParser.add(WHITE, 1792, '00000001000') BitParser.add(WHITE, 1792, "00000001000")
BitParser.add(WHITE, 1856, '00000001100') BitParser.add(WHITE, 1856, "00000001100")
BitParser.add(WHITE, 1920, '00000001101') BitParser.add(WHITE, 1920, "00000001101")
BitParser.add(WHITE, 1984, '000000010010') BitParser.add(WHITE, 1984, "000000010010")
BitParser.add(WHITE, 2048, '000000010011') BitParser.add(WHITE, 2048, "000000010011")
BitParser.add(WHITE, 2112, '000000010100') BitParser.add(WHITE, 2112, "000000010100")
BitParser.add(WHITE, 2176, '000000010101') BitParser.add(WHITE, 2176, "000000010101")
BitParser.add(WHITE, 2240, '000000010110') BitParser.add(WHITE, 2240, "000000010110")
BitParser.add(WHITE, 2304, '000000010111') BitParser.add(WHITE, 2304, "000000010111")
BitParser.add(WHITE, 2368, '000000011100') BitParser.add(WHITE, 2368, "000000011100")
BitParser.add(WHITE, 2432, '000000011101') BitParser.add(WHITE, 2432, "000000011101")
BitParser.add(WHITE, 2496, '000000011110') BitParser.add(WHITE, 2496, "000000011110")
BitParser.add(WHITE, 2560, '000000011111') BitParser.add(WHITE, 2560, "000000011111")
BLACK = [None, None] BLACK = [None, None]
BitParser.add(BLACK, 0, '0000110111') BitParser.add(BLACK, 0, "0000110111")
BitParser.add(BLACK, 1, '010') BitParser.add(BLACK, 1, "010")
BitParser.add(BLACK, 2, '11') BitParser.add(BLACK, 2, "11")
BitParser.add(BLACK, 3, '10') BitParser.add(BLACK, 3, "10")
BitParser.add(BLACK, 4, '011') BitParser.add(BLACK, 4, "011")
BitParser.add(BLACK, 5, '0011') BitParser.add(BLACK, 5, "0011")
BitParser.add(BLACK, 6, '0010') BitParser.add(BLACK, 6, "0010")
BitParser.add(BLACK, 7, '00011') BitParser.add(BLACK, 7, "00011")
BitParser.add(BLACK, 8, '000101') BitParser.add(BLACK, 8, "000101")
BitParser.add(BLACK, 9, '000100') BitParser.add(BLACK, 9, "000100")
BitParser.add(BLACK, 10, '0000100') BitParser.add(BLACK, 10, "0000100")
BitParser.add(BLACK, 11, '0000101') BitParser.add(BLACK, 11, "0000101")
BitParser.add(BLACK, 12, '0000111') BitParser.add(BLACK, 12, "0000111")
BitParser.add(BLACK, 13, '00000100') BitParser.add(BLACK, 13, "00000100")
BitParser.add(BLACK, 14, '00000111') BitParser.add(BLACK, 14, "00000111")
BitParser.add(BLACK, 15, '000011000') BitParser.add(BLACK, 15, "000011000")
BitParser.add(BLACK, 16, '0000010111') BitParser.add(BLACK, 16, "0000010111")
BitParser.add(BLACK, 17, '0000011000') BitParser.add(BLACK, 17, "0000011000")
BitParser.add(BLACK, 18, '0000001000') BitParser.add(BLACK, 18, "0000001000")
BitParser.add(BLACK, 19, '00001100111') BitParser.add(BLACK, 19, "00001100111")
BitParser.add(BLACK, 20, '00001101000') BitParser.add(BLACK, 20, "00001101000")
BitParser.add(BLACK, 21, '00001101100') BitParser.add(BLACK, 21, "00001101100")
BitParser.add(BLACK, 22, '00000110111') BitParser.add(BLACK, 22, "00000110111")
BitParser.add(BLACK, 23, '00000101000') BitParser.add(BLACK, 23, "00000101000")
BitParser.add(BLACK, 24, '00000010111') BitParser.add(BLACK, 24, "00000010111")
BitParser.add(BLACK, 25, '00000011000') BitParser.add(BLACK, 25, "00000011000")
BitParser.add(BLACK, 26, '000011001010') BitParser.add(BLACK, 26, "000011001010")
BitParser.add(BLACK, 27, '000011001011') BitParser.add(BLACK, 27, "000011001011")
BitParser.add(BLACK, 28, '000011001100') BitParser.add(BLACK, 28, "000011001100")
BitParser.add(BLACK, 29, '000011001101') BitParser.add(BLACK, 29, "000011001101")
BitParser.add(BLACK, 30, '000001101000') BitParser.add(BLACK, 30, "000001101000")
BitParser.add(BLACK, 31, '000001101001') BitParser.add(BLACK, 31, "000001101001")
BitParser.add(BLACK, 32, '000001101010') BitParser.add(BLACK, 32, "000001101010")
BitParser.add(BLACK, 33, '000001101011') BitParser.add(BLACK, 33, "000001101011")
BitParser.add(BLACK, 34, '000011010010') BitParser.add(BLACK, 34, "000011010010")
BitParser.add(BLACK, 35, '000011010011') BitParser.add(BLACK, 35, "000011010011")
BitParser.add(BLACK, 36, '000011010100') BitParser.add(BLACK, 36, "000011010100")
BitParser.add(BLACK, 37, '000011010101') BitParser.add(BLACK, 37, "000011010101")
BitParser.add(BLACK, 38, '000011010110') BitParser.add(BLACK, 38, "000011010110")
BitParser.add(BLACK, 39, '000011010111') BitParser.add(BLACK, 39, "000011010111")
BitParser.add(BLACK, 40, '000001101100') BitParser.add(BLACK, 40, "000001101100")
BitParser.add(BLACK, 41, '000001101101') BitParser.add(BLACK, 41, "000001101101")
BitParser.add(BLACK, 42, '000011011010') BitParser.add(BLACK, 42, "000011011010")
BitParser.add(BLACK, 43, '000011011011') BitParser.add(BLACK, 43, "000011011011")
BitParser.add(BLACK, 44, '000001010100') BitParser.add(BLACK, 44, "000001010100")
BitParser.add(BLACK, 45, '000001010101') BitParser.add(BLACK, 45, "000001010101")
BitParser.add(BLACK, 46, '000001010110') BitParser.add(BLACK, 46, "000001010110")
BitParser.add(BLACK, 47, '000001010111') BitParser.add(BLACK, 47, "000001010111")
BitParser.add(BLACK, 48, '000001100100') BitParser.add(BLACK, 48, "000001100100")
BitParser.add(BLACK, 49, '000001100101') BitParser.add(BLACK, 49, "000001100101")
BitParser.add(BLACK, 50, '000001010010') BitParser.add(BLACK, 50, "000001010010")
BitParser.add(BLACK, 51, '000001010011') BitParser.add(BLACK, 51, "000001010011")
BitParser.add(BLACK, 52, '000000100100') BitParser.add(BLACK, 52, "000000100100")
BitParser.add(BLACK, 53, '000000110111') BitParser.add(BLACK, 53, "000000110111")
BitParser.add(BLACK, 54, '000000111000') BitParser.add(BLACK, 54, "000000111000")
BitParser.add(BLACK, 55, '000000100111') BitParser.add(BLACK, 55, "000000100111")
BitParser.add(BLACK, 56, '000000101000') BitParser.add(BLACK, 56, "000000101000")
BitParser.add(BLACK, 57, '000001011000') BitParser.add(BLACK, 57, "000001011000")
BitParser.add(BLACK, 58, '000001011001') BitParser.add(BLACK, 58, "000001011001")
BitParser.add(BLACK, 59, '000000101011') BitParser.add(BLACK, 59, "000000101011")
BitParser.add(BLACK, 60, '000000101100') BitParser.add(BLACK, 60, "000000101100")
BitParser.add(BLACK, 61, '000001011010') BitParser.add(BLACK, 61, "000001011010")
BitParser.add(BLACK, 62, '000001100110') BitParser.add(BLACK, 62, "000001100110")
BitParser.add(BLACK, 63, '000001100111') BitParser.add(BLACK, 63, "000001100111")
BitParser.add(BLACK, 64, '0000001111') BitParser.add(BLACK, 64, "0000001111")
BitParser.add(BLACK, 128, '000011001000') BitParser.add(BLACK, 128, "000011001000")
BitParser.add(BLACK, 192, '000011001001') BitParser.add(BLACK, 192, "000011001001")
BitParser.add(BLACK, 256, '000001011011') BitParser.add(BLACK, 256, "000001011011")
BitParser.add(BLACK, 320, '000000110011') BitParser.add(BLACK, 320, "000000110011")
BitParser.add(BLACK, 384, '000000110100') BitParser.add(BLACK, 384, "000000110100")
BitParser.add(BLACK, 448, '000000110101') BitParser.add(BLACK, 448, "000000110101")
BitParser.add(BLACK, 512, '0000001101100') BitParser.add(BLACK, 512, "0000001101100")
BitParser.add(BLACK, 576, '0000001101101') BitParser.add(BLACK, 576, "0000001101101")
BitParser.add(BLACK, 640, '0000001001010') BitParser.add(BLACK, 640, "0000001001010")
BitParser.add(BLACK, 704, '0000001001011') BitParser.add(BLACK, 704, "0000001001011")
BitParser.add(BLACK, 768, '0000001001100') BitParser.add(BLACK, 768, "0000001001100")
BitParser.add(BLACK, 832, '0000001001101') BitParser.add(BLACK, 832, "0000001001101")
BitParser.add(BLACK, 896, '0000001110010') BitParser.add(BLACK, 896, "0000001110010")
BitParser.add(BLACK, 960, '0000001110011') BitParser.add(BLACK, 960, "0000001110011")
BitParser.add(BLACK, 1024, '0000001110100') BitParser.add(BLACK, 1024, "0000001110100")
BitParser.add(BLACK, 1088, '0000001110101') BitParser.add(BLACK, 1088, "0000001110101")
BitParser.add(BLACK, 1152, '0000001110110') BitParser.add(BLACK, 1152, "0000001110110")
BitParser.add(BLACK, 1216, '0000001110111') BitParser.add(BLACK, 1216, "0000001110111")
BitParser.add(BLACK, 1280, '0000001010010') BitParser.add(BLACK, 1280, "0000001010010")
BitParser.add(BLACK, 1344, '0000001010011') BitParser.add(BLACK, 1344, "0000001010011")
BitParser.add(BLACK, 1408, '0000001010100') BitParser.add(BLACK, 1408, "0000001010100")
BitParser.add(BLACK, 1472, '0000001010101') BitParser.add(BLACK, 1472, "0000001010101")
BitParser.add(BLACK, 1536, '0000001011010') BitParser.add(BLACK, 1536, "0000001011010")
BitParser.add(BLACK, 1600, '0000001011011') BitParser.add(BLACK, 1600, "0000001011011")
BitParser.add(BLACK, 1664, '0000001100100') BitParser.add(BLACK, 1664, "0000001100100")
BitParser.add(BLACK, 1728, '0000001100101') BitParser.add(BLACK, 1728, "0000001100101")
BitParser.add(BLACK, 1792, '00000001000') BitParser.add(BLACK, 1792, "00000001000")
BitParser.add(BLACK, 1856, '00000001100') BitParser.add(BLACK, 1856, "00000001100")
BitParser.add(BLACK, 1920, '00000001101') BitParser.add(BLACK, 1920, "00000001101")
BitParser.add(BLACK, 1984, '000000010010') BitParser.add(BLACK, 1984, "000000010010")
BitParser.add(BLACK, 2048, '000000010011') BitParser.add(BLACK, 2048, "000000010011")
BitParser.add(BLACK, 2112, '000000010100') BitParser.add(BLACK, 2112, "000000010100")
BitParser.add(BLACK, 2176, '000000010101') BitParser.add(BLACK, 2176, "000000010101")
BitParser.add(BLACK, 2240, '000000010110') BitParser.add(BLACK, 2240, "000000010110")
BitParser.add(BLACK, 2304, '000000010111') BitParser.add(BLACK, 2304, "000000010111")
BitParser.add(BLACK, 2368, '000000011100') BitParser.add(BLACK, 2368, "000000011100")
BitParser.add(BLACK, 2432, '000000011101') BitParser.add(BLACK, 2432, "000000011101")
BitParser.add(BLACK, 2496, '000000011110') BitParser.add(BLACK, 2496, "000000011110")
BitParser.add(BLACK, 2560, '000000011111') BitParser.add(BLACK, 2560, "000000011111")
UNCOMPRESSED = [None, None] UNCOMPRESSED = [None, None]
BitParser.add(UNCOMPRESSED, '1', '1') BitParser.add(UNCOMPRESSED, "1", "1")
BitParser.add(UNCOMPRESSED, '01', '01') BitParser.add(UNCOMPRESSED, "01", "01")
BitParser.add(UNCOMPRESSED, '001', '001') BitParser.add(UNCOMPRESSED, "001", "001")
BitParser.add(UNCOMPRESSED, '0001', '0001') BitParser.add(UNCOMPRESSED, "0001", "0001")
BitParser.add(UNCOMPRESSED, '00001', '00001') BitParser.add(UNCOMPRESSED, "00001", "00001")
BitParser.add(UNCOMPRESSED, '00000', '000001') BitParser.add(UNCOMPRESSED, "00000", "000001")
BitParser.add(UNCOMPRESSED, 'T00', '00000011') BitParser.add(UNCOMPRESSED, "T00", "00000011")
BitParser.add(UNCOMPRESSED, 'T10', '00000010') BitParser.add(UNCOMPRESSED, "T10", "00000010")
BitParser.add(UNCOMPRESSED, 'T000', '000000011') BitParser.add(UNCOMPRESSED, "T000", "000000011")
BitParser.add(UNCOMPRESSED, 'T100', '000000010') BitParser.add(UNCOMPRESSED, "T100", "000000010")
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011') BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010') BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011') BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010') BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
class EOFB(Exception):
    """Raised by _parse_mode when the decoded mode value is "e"."""
@ -352,21 +362,21 @@ class CCITTG4Parser(BitParser):
return return
def _parse_mode(self, mode: object) -> BitParserState: def _parse_mode(self, mode: object) -> BitParserState:
if mode == 'p': if mode == "p":
self._do_pass() self._do_pass()
self._flush_line() self._flush_line()
return self.MODE return self.MODE
elif mode == 'h': elif mode == "h":
self._n1 = 0 self._n1 = 0
self._accept = self._parse_horiz1 self._accept = self._parse_horiz1
if self._color: if self._color:
return self.WHITE return self.WHITE
else: else:
return self.BLACK return self.BLACK
elif mode == 'u': elif mode == "u":
self._accept = self._parse_uncompressed self._accept = self._parse_uncompressed
return self.UNCOMPRESSED return self.UNCOMPRESSED
elif mode == 'e': elif mode == "e":
raise self.EOFB raise self.EOFB
elif isinstance(mode, int): elif isinstance(mode, int):
self._do_vertical(mode) self._do_vertical(mode)
@ -381,7 +391,7 @@ class CCITTG4Parser(BitParser):
self._n1 += n self._n1 += n
if n < 64: if n < 64:
self._n2 = 0 self._n2 = 0
self._color = 1-self._color self._color = 1 - self._color
self._accept = self._parse_horiz2 self._accept = self._parse_horiz2
if self._color: if self._color:
return self.WHITE return self.WHITE
@ -393,7 +403,7 @@ class CCITTG4Parser(BitParser):
raise self.InvalidData raise self.InvalidData
self._n2 += n self._n2 += n
if n < 64: if n < 64:
self._color = 1-self._color self._color = 1 - self._color
self._accept = self._parse_mode self._accept = self._parse_mode
self._do_horizontal(self._n1, self._n2) self._do_horizontal(self._n1, self._n2)
self._flush_line() self._flush_line()
@ -406,7 +416,7 @@ class CCITTG4Parser(BitParser):
def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState: def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
if not bits: if not bits:
raise self.InvalidData raise self.InvalidData
if bits.startswith('T'): if bits.startswith("T"):
self._accept = self._parse_mode self._accept = self._parse_mode
self._color = int(bits[1]) self._color = int(bits[1])
self._do_uncompressed(bits[2:]) self._do_uncompressed(bits[2:])
@ -416,33 +426,37 @@ class CCITTG4Parser(BitParser):
return self.UNCOMPRESSED return self.UNCOMPRESSED
def _get_bits(self) -> str: def _get_bits(self) -> str:
return ''.join(str(b) for b in self._curline[:self._curpos]) return "".join(str(b) for b in self._curline[: self._curpos])
def _get_refline(self, i: int) -> str: def _get_refline(self, i: int) -> str:
if i < 0: if i < 0:
return '[]'+''.join(str(b) for b in self._refline) return "[]" + "".join(str(b) for b in self._refline)
elif len(self._refline) <= i: elif len(self._refline) <= i:
return ''.join(str(b) for b in self._refline)+'[]' return "".join(str(b) for b in self._refline) + "[]"
else: else:
return (''.join(str(b) for b in self._refline[:i]) + return (
'['+str(self._refline[i])+']' + "".join(str(b) for b in self._refline[:i])
''.join(str(b) for b in self._refline[i+1:])) + "["
+ str(self._refline[i])
+ "]"
+ "".join(str(b) for b in self._refline[i + 1 :])
)
def reset(self) -> None:
    """Return the parser to its initial state: line 0, an all-ones line, MODE table."""
    self._y = 0
    # _reset_line() moves _curline into _refline, so this all-ones line is
    # what ends up as the reference line.
    self._curline = array.array("b", [1] * self.width)
    self._reset_line()
    self._accept = self._parse_mode
    self._state = self.MODE
def output_line(self, y: int, bits: Sequence[int]) -> None:
    """Print ``y`` followed by the pixels in ``bits`` rendered as a digit string."""
    rendered = "".join(map(str, bits))
    print(y, rendered)
def _reset_line(self) -> None: def _reset_line(self) -> None:
self._refline = self._curline self._refline = self._curline
self._curline = array.array('b', [1]*self.width) self._curline = array.array("b", [1] * self.width)
self._curpos = -1 self._curpos = -1
self._color = 1 self._color = 1
return return
@ -457,15 +471,17 @@ class CCITTG4Parser(BitParser):
return return
def _do_vertical(self, dx: int) -> None: def _do_vertical(self, dx: int) -> None:
x1 = self._curpos+1 x1 = self._curpos + 1
while 1: while 1:
if x1 == 0: if x1 == 0:
if (self._color == 1 and self._refline[x1] != self._color): if self._color == 1 and self._refline[x1] != self._color:
break break
elif x1 == len(self._refline): elif x1 == len(self._refline):
break break
elif (self._refline[x1-1] == self._color and elif (
self._refline[x1] != self._color): self._refline[x1 - 1] == self._color
and self._refline[x1] != self._color
):
break break
x1 += 1 x1 += 1
x1 += dx x1 += dx
@ -478,29 +494,33 @@ class CCITTG4Parser(BitParser):
for x in range(x0, x1): for x in range(x0, x1):
self._curline[x] = self._color self._curline[x] = self._color
self._curpos = x1 self._curpos = x1
self._color = 1-self._color self._color = 1 - self._color
return return
def _do_pass(self) -> None: def _do_pass(self) -> None:
x1 = self._curpos+1 x1 = self._curpos + 1
while 1: while 1:
if x1 == 0: if x1 == 0:
if (self._color == 1 and self._refline[x1] != self._color): if self._color == 1 and self._refline[x1] != self._color:
break break
elif x1 == len(self._refline): elif x1 == len(self._refline):
break break
elif (self._refline[x1-1] == self._color and elif (
self._refline[x1] != self._color): self._refline[x1 - 1] == self._color
and self._refline[x1] != self._color
):
break break
x1 += 1 x1 += 1
while 1: while 1:
if x1 == 0: if x1 == 0:
if (self._color == 0 and self._refline[x1] == self._color): if self._color == 0 and self._refline[x1] == self._color:
break break
elif x1 == len(self._refline): elif x1 == len(self._refline):
break break
elif (self._refline[x1-1] != self._color and elif (
self._refline[x1] == self._color): self._refline[x1 - 1] != self._color
and self._refline[x1] == self._color
):
break break
x1 += 1 x1 += 1
for x in range(self._curpos, x1): for x in range(self._curpos, x1):
@ -520,7 +540,7 @@ class CCITTG4Parser(BitParser):
for _ in range(n2): for _ in range(n2):
if len(self._curline) <= x: if len(self._curline) <= x:
break break
self._curline[x] = 1-self._color self._curline[x] = 1 - self._color
x += 1 x += 1
self._curpos = x self._curpos = x
return return
@ -534,34 +554,34 @@ class CCITTG4Parser(BitParser):
class CCITTFaxDecoder(CCITTG4Parser):
    """CCITTG4Parser that packs every output line into bytes and accumulates them.

    Pixels are packed eight per byte, most significant bit first; each line
    starts on a fresh byte. With ``reversed`` set, every pixel is inverted
    before packing.
    """

    def __init__(
        self, width: int, bytealign: bool = False, reversed: bool = False
    ) -> None:
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        self._buf = b""

    def close(self) -> bytes:
        """Return all bytes produced so far."""
        return self._buf

    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Append the packed form of ``bits`` to the buffer; ``y`` is unused."""
        invert = self.reversed
        packed = bytearray((len(bits) + 7) // 8)
        for index, bit in enumerate(bits):
            pixel = 1 - bit if invert else bit
            if pixel:
                byte_no, offset = divmod(index, 8)
                packed[byte_no] += 0x80 >> offset
        self._buf += bytes(packed)
def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes: def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
K = params.get('K') K = params.get("K")
if K == -1: if K == -1:
cols = cast(int, params.get('Columns')) cols = cast(int, params.get("Columns"))
bytealign = cast(bool, params.get('EncodedByteAlign')) bytealign = cast(bool, params.get("EncodedByteAlign"))
reversed = cast(bool, params.get('BlackIs1')) reversed = cast(bool, params.get("BlackIs1"))
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed) parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
else: else:
raise ValueError(K) raise ValueError(K)
@ -573,12 +593,14 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
def main(argv: List[str]) -> None: def main(argv: List[str]) -> None:
if not argv[1:]: if not argv[1:]:
import unittest import unittest
unittest.main() unittest.main()
return return
class Parser(CCITTG4Parser): class Parser(CCITTG4Parser):
def __init__(self, width: int, bytealign: bool = False) -> None: def __init__(self, width: int, bytealign: bool = False) -> None:
import pygame # type: ignore[import] import pygame # type: ignore[import]
CCITTG4Parser.__init__(self, width, bytealign=bytealign) CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width, 1000)) self.img = pygame.Surface((self.width, 1000))
return return
@ -593,11 +615,13 @@ def main(argv: List[str]) -> None:
def close(self) -> None: def close(self) -> None:
import pygame import pygame
pygame.image.save(self.img, 'out.bmp')
pygame.image.save(self.img, "out.bmp")
return return
for path in argv[1:]: for path in argv[1:]:
fp = open(path, 'rb') fp = open(path, "rb")
(_, _, k, w, h, _) = path.split('.') (_, _, k, w, h, _) = path.split(".")
parser = Parser(int(w)) parser = Parser(int(w))
parser.feedbytes(fp.read()) parser.feedbytes(fp.read())
parser.close() parser.close()

View File

@ -16,8 +16,20 @@ import os.path
import pickle as pickle import pickle as pickle
import struct import struct
import sys import sys
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, from typing import (
MutableMapping, Optional, TextIO, Tuple, Union, cast) Any,
BinaryIO,
Dict,
Iterable,
Iterator,
List,
MutableMapping,
Optional,
TextIO,
Tuple,
Union,
cast,
)
from .encodingdb import name2unicode from .encodingdb import name2unicode
from .psparser import KWD from .psparser import KWD
@ -45,7 +57,7 @@ class CMapBase:
self.attrs: MutableMapping[str, object] = kwargs.copy() self.attrs: MutableMapping[str, object] = kwargs.copy()
def is_vertical(self) -> bool:
    """Return True when the "WMode" attribute is present and not equal to 0."""
    wmode = self.attrs.get("WMode", 0)
    return wmode != 0
def set_attr(self, k: str, v: object) -> None:
    """Store ``v`` under key ``k`` in ``self.attrs``, replacing any earlier value."""
    self.attrs[k] = v
@ -53,8 +65,7 @@ class CMapBase:
def add_code2cid(self, code: str, cid: int) -> None:
    """Accept a code -> cid mapping and ignore it; nothing is stored here."""
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
    """Accept a cid -> unicode mapping and ignore it; nothing is stored here."""
def use_cmap(self, cmap: "CMapBase") -> None: def use_cmap(self, cmap: "CMapBase") -> None:
@ -65,13 +76,12 @@ class CMapBase:
class CMap(CMapBase): class CMap(CMapBase):
def __init__(self, **kwargs: Union[str, int]) -> None:
    """Initialise the base class with ``kwargs`` and start with an empty table."""
    CMapBase.__init__(self, **kwargs)
    # dump() walks this as a nested mapping: int values are cids, any other
    # value is treated as a sub-table.
    self.code2cid: Dict[int, object] = {}
def __repr__(self) -> str: def __repr__(self) -> str:
return '<CMap: %s>' % self.attrs.get('CMapName') return "<CMap: %s>" % self.attrs.get("CMapName")
def use_cmap(self, cmap: CMapBase) -> None: def use_cmap(self, cmap: CMapBase) -> None:
assert isinstance(cmap, CMap), str(type(cmap)) assert isinstance(cmap, CMap), str(type(cmap))
@ -84,10 +94,11 @@ class CMap(CMapBase):
copy(d, v) copy(d, v)
else: else:
dst[k] = v dst[k] = v
copy(self.code2cid, cmap.code2cid) copy(self.code2cid, cmap.code2cid)
def decode(self, code: bytes) -> Iterator[int]: def decode(self, code: bytes) -> Iterator[int]:
log.debug('decode: %r, %r', self, code) log.debug("decode: %r, %r", self, code)
d = self.code2cid d = self.code2cid
for i in iter(code): for i in iter(code):
if i in d: if i in d:
@ -100,70 +111,70 @@ class CMap(CMapBase):
else: else:
d = self.code2cid d = self.code2cid
def dump(
    self,
    out: TextIO = sys.stdout,
    code2cid: Optional[Dict[int, object]] = None,
    code: Tuple[int, ...] = (),
) -> None:
    """Write every ``code -> cid`` entry to ``out``, one line each, in sorted order.

    ``code2cid`` and ``code`` carry the sub-table and the key prefix during
    recursion. When ``code2cid`` is None the walk starts at ``self.code2cid``
    with an empty prefix, whatever ``code`` was passed.
    """
    if code2cid is None:
        code2cid, code = self.code2cid, ()
    for key, value in sorted(code2cid.items()):
        path = code + (key,)
        if not isinstance(value, int):
            # Anything that is not a cid is a nested table: descend into it.
            self.dump(out=out, code2cid=cast(Dict[int, object], value), code=path)
            continue
        out.write("code %r = cid %d\n" % (path, value))
class IdentityCMap(CMapBase):
    """CMap whose character ids are the big-endian 16-bit values of the input."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Split ``code`` into unsigned big-endian 16-bit character ids.

        Returns an empty tuple when ``code`` holds fewer than two bytes. A
        trailing odd byte is ignored.
        """
        n = len(code) // 2
        if n:
            # struct.unpack requires the buffer length to match the format
            # exactly, so an odd-length input used to raise struct.error
            # even though a single stray byte was already ignored (n == 0).
            # Decode the complete units and drop the leftover byte.
            return struct.unpack(">%dH" % n, code[: 2 * n])
        return ()
class IdentityCMapByte(IdentityCMap):
    """Variant of IdentityCMap that yields one character id per input byte."""

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Return the value of every byte in ``code``, or ``()`` when it is empty."""
        count = len(code)
        if not count:
            return ()
        return struct.unpack(">%dB" % count, code)
class UnicodeMap(CMapBase):
    """CMapBase that looks up a unicode string for each character id."""

    def __init__(self, **kwargs: Union[str, int]) -> None:
        CMapBase.__init__(self, **kwargs)
        # cid -> unicode string; starts out empty.
        self.cid2unichr: Dict[int, str] = {}

    def __repr__(self) -> str:
        cmap_name = self.attrs.get("CMapName")
        return "<UnicodeMap: %s>" % cmap_name

    def get_unichr(self, cid: int) -> str:
        """Return the string stored for ``cid``.

        A missing ``cid`` propagates the lookup error of the underlying
        mapping (KeyError for a dict).
        """
        log.debug("get_unichr: %r, %r", self, cid)
        return self.cid2unichr[cid]

    def dump(self, out: TextIO = sys.stdout) -> None:
        """Write one ``cid N = unicode '...'`` line per entry, in sorted order."""
        for cid, text in sorted(self.cid2unichr.items()):
            out.write("cid %d = unicode %r\n" % (cid, text))
class IdentityUnicodeMap(UnicodeMap):
    """UnicodeMap that maps every cid to ``chr(cid)`` instead of using a table."""

    def get_unichr(self, cid: int) -> str:
        """Interpret character id as unicode codepoint"""
        log.debug("get_unichr: %r, %r", self, cid)
        unichr = chr(cid)
        return unichr
class FileCMap(CMap): class FileCMap(CMap):
def add_code2cid(self, code: str, cid: int) -> None: def add_code2cid(self, code: str, cid: int) -> None:
assert isinstance(code, str) and isinstance(cid, int),\ assert isinstance(code, str) and isinstance(cid, int), str(
str((type(code), type(cid))) (type(code), type(cid))
)
d = self.code2cid d = self.code2cid
for c in code[:-1]: for c in code[:-1]:
ci = ord(c) ci = ord(c)
@ -178,9 +189,7 @@ class FileCMap(CMap):
class FileUnicodeMap(UnicodeMap): class FileUnicodeMap(UnicodeMap):
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
) -> None:
assert isinstance(cid, int), str(type(cid)) assert isinstance(cid, int), str(type(cid))
if isinstance(code, PSLiteral): if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name. # Interpret as an Adobe glyph name.
@ -188,7 +197,7 @@ class FileUnicodeMap(UnicodeMap):
self.cid2unichr[cid] = name2unicode(code.name) self.cid2unichr[cid] = name2unicode(code.name)
elif isinstance(code, bytes): elif isinstance(code, bytes):
# Interpret as UTF-16BE. # Interpret as UTF-16BE.
self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore') self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore")
elif isinstance(code, int): elif isinstance(code, int):
self.cid2unichr[cid] = chr(code) self.cid2unichr[cid] = chr(code)
else: else:
@ -196,21 +205,19 @@ class FileUnicodeMap(UnicodeMap):
class PyCMap(CMap):
    """CMap whose code table is taken from an already-loaded data module."""

    def __init__(self, name: str, module: Any) -> None:
        """Use ``module.CODE2CID`` as the table; set WMode when the module is vertical."""
        super().__init__(CMapName=name)
        self.code2cid = module.CODE2CID
        vertical = module.IS_VERTICAL
        if vertical:
            self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
    """UnicodeMap whose table is taken from an already-loaded data module."""

    def __init__(self, name: str, module: Any, vertical: bool) -> None:
        """Pick ``CID2UNICHR_V`` (and set WMode) or ``CID2UNICHR_H`` from ``module``."""
        super().__init__(CMapName=name)
        if not vertical:
            self.cid2unichr = module.CID2UNICHR_H
            return
        self.cid2unichr = module.CID2UNICHR_V
        self.attrs["WMode"] = 1
@ -226,10 +233,12 @@ class CMapDB:
@classmethod @classmethod
def _load_data(cls, name: str) -> Any: def _load_data(cls, name: str) -> Any:
name = name.replace("\0", "") name = name.replace("\0", "")
filename = '%s.pickle.gz' % name filename = "%s.pickle.gz" % name
log.debug('loading: %r', name) log.debug("loading: %r", name)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'), cmap_paths = (
os.path.join(os.path.dirname(__file__), 'cmap'),) os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
os.path.join(os.path.dirname(__file__), "cmap"),
)
for directory in cmap_paths: for directory in cmap_paths:
path = os.path.join(directory, filename) path = os.path.join(directory, filename)
if os.path.exists(path): if os.path.exists(path):
@ -243,13 +252,13 @@ class CMapDB:
@classmethod @classmethod
def get_cmap(cls, name: str) -> CMapBase: def get_cmap(cls, name: str) -> CMapBase:
if name == 'Identity-H': if name == "Identity-H":
return IdentityCMap(WMode=0) return IdentityCMap(WMode=0)
elif name == 'Identity-V': elif name == "Identity-V":
return IdentityCMap(WMode=1) return IdentityCMap(WMode=1)
elif name == 'OneByteIdentityH': elif name == "OneByteIdentityH":
return IdentityCMapByte(WMode=0) return IdentityCMapByte(WMode=0)
elif name == 'OneByteIdentityV': elif name == "OneByteIdentityV":
return IdentityCMapByte(WMode=1) return IdentityCMapByte(WMode=1)
try: try:
return cls._cmap_cache[name] return cls._cmap_cache[name]
@ -265,14 +274,12 @@ class CMapDB:
return cls._umap_cache[name][vertical] return cls._umap_cache[name][vertical]
except KeyError: except KeyError:
pass pass
data = cls._load_data('to-unicode-%s' % name) data = cls._load_data("to-unicode-%s" % name)
cls._umap_cache[name] = [PyUnicodeMap(name, data, v) cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
for v in (False, True)]
return cls._umap_cache[name][vertical] return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]): class CMapParser(PSStackParser[PSKeyword]):
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp) PSStackParser.__init__(self, fp)
self.cmap = cmap self.cmap = cmap
@ -287,22 +294,22 @@ class CMapParser(PSStackParser[PSKeyword]):
pass pass
return return
KEYWORD_BEGINCMAP = KWD(b'begincmap') KEYWORD_BEGINCMAP = KWD(b"begincmap")
KEYWORD_ENDCMAP = KWD(b'endcmap') KEYWORD_ENDCMAP = KWD(b"endcmap")
KEYWORD_USECMAP = KWD(b'usecmap') KEYWORD_USECMAP = KWD(b"usecmap")
KEYWORD_DEF = KWD(b'def') KEYWORD_DEF = KWD(b"def")
KEYWORD_BEGINCODESPACERANGE = KWD(b'begincodespacerange') KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
KEYWORD_ENDCODESPACERANGE = KWD(b'endcodespacerange') KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
KEYWORD_BEGINCIDRANGE = KWD(b'begincidrange') KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
KEYWORD_ENDCIDRANGE = KWD(b'endcidrange') KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
KEYWORD_BEGINCIDCHAR = KWD(b'begincidchar') KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
KEYWORD_ENDCIDCHAR = KWD(b'endcidchar') KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
KEYWORD_BEGINBFRANGE = KWD(b'beginbfrange') KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
KEYWORD_ENDBFRANGE = KWD(b'endbfrange') KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
KEYWORD_BEGINBFCHAR = KWD(b'beginbfchar') KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
KEYWORD_ENDBFCHAR = KWD(b'endbfchar') KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange') KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange') KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
def do_keyword(self, pos: int, token: PSKeyword) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BEGINCMAP: if token is self.KEYWORD_BEGINCMAP:
@ -346,8 +353,12 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_ENDCIDRANGE: if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs): for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, bytes) or not isinstance(e, bytes) or if (
not isinstance(cid, int) or len(s) != len(e)): not isinstance(s, bytes)
or not isinstance(e, bytes)
or not isinstance(cid, int)
or len(s) != len(e)
):
continue continue
sprefix = s[:-4] sprefix = s[:-4]
eprefix = e[:-4] eprefix = e[:-4]
@ -358,9 +369,9 @@ class CMapParser(PSStackParser[PSKeyword]):
s1 = nunpack(svar) s1 = nunpack(svar)
e1 = nunpack(evar) e1 = nunpack(evar)
vlen = len(svar) vlen = len(svar)
for i in range(e1-s1+1): for i in range(e1 - s1 + 1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:] x = sprefix + struct.pack(">L", s1 + i)[-vlen:]
self.cmap.add_cid2unichr(cid+i, x) self.cmap.add_cid2unichr(cid + i, x)
return return
if token is self.KEYWORD_BEGINCIDCHAR: if token is self.KEYWORD_BEGINCIDCHAR:
@ -379,23 +390,26 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_ENDBFRANGE: if token is self.KEYWORD_ENDBFRANGE:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs): for (s, e, code) in choplist(3, objs):
if (not isinstance(s, bytes) or not isinstance(e, bytes) or if (
len(s) != len(e)): not isinstance(s, bytes)
or not isinstance(e, bytes)
or len(s) != len(e)
):
continue continue
s1 = nunpack(s) s1 = nunpack(s)
e1 = nunpack(e) e1 = nunpack(e)
if isinstance(code, list): if isinstance(code, list):
for i in range(e1-s1+1): for i in range(e1 - s1 + 1):
self.cmap.add_cid2unichr(s1+i, code[i]) self.cmap.add_cid2unichr(s1 + i, code[i])
else: else:
assert isinstance(code, bytes) assert isinstance(code, bytes)
var = code[-4:] var = code[-4:]
base = nunpack(var) base = nunpack(var)
prefix = code[:-4] prefix = code[:-4]
vlen = len(var) vlen = len(var)
for i in range(e1-s1+1): for i in range(e1 - s1 + 1):
x = prefix+struct.pack('>L', base+i)[-vlen:] x = prefix + struct.pack(">L", base + i)[-vlen:]
self.cmap.add_cid2unichr(s1+i, x) self.cmap.add_cid2unichr(s1 + i, x)
return return
if token is self.KEYWORD_BEGINBFCHAR: if token is self.KEYWORD_BEGINBFCHAR:
@ -422,7 +436,7 @@ class CMapParser(PSStackParser[PSKeyword]):
def main(argv: List[str]) -> None: def main(argv: List[str]) -> None:
args = argv[1:] args = argv[1:]
for fname in args: for fname in args:
fp = open(fname, 'rb') fp = open(fname, "rb")
cmap = FileUnicodeMap() cmap = FileUnicodeMap()
CMapParser(cmap, fp).run() CMapParser(cmap, fp).run()
fp.close() fp.close()
@ -430,5 +444,5 @@ def main(argv: List[str]) -> None:
return return
if __name__ == '__main__': if __name__ == "__main__":
main(sys.argv) main(sys.argv)

View File

@ -1,8 +1,19 @@
import io import io
import logging import logging
import re import re
from typing import (BinaryIO, Dict, Generic, List, Optional, Sequence, TextIO, from typing import (
Tuple, TypeVar, Union, cast) BinaryIO,
Dict,
Generic,
List,
Optional,
Sequence,
TextIO,
Tuple,
TypeVar,
Union,
cast,
)
from pdfminer.pdfcolor import PDFColorSpace from pdfminer.pdfcolor import PDFColorSpace
from . import utils from . import utils
@ -46,7 +57,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None laparams: Optional[LAParams] = None,
) -> None: ) -> None:
PDFTextDevice.__init__(self, rsrcmgr) PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno self.pageno = pageno
@ -57,7 +68,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
(x0, y0, x1, y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
(x0, y0) = apply_matrix_pt(ctm, (x0, y0)) (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(ctm, (x1, y1)) (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0-x1), abs(y0-y1)) mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
self.cur_item = LTPage(self.pageno, mediabox) self.cur_item = LTPage(self.pageno, mediabox)
def end_page(self, page: PDFPage) -> None: def end_page(self, page: PDFPage) -> None:
@ -80,9 +91,11 @@ class PDFLayoutAnalyzer(PDFTextDevice):
def render_image(self, name: str, stream: PDFStream) -> None: def render_image(self, name: str, stream: PDFStream) -> None:
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
item = LTImage(name, stream, item = LTImage(
(self.cur_item.x0, self.cur_item.y0, name,
self.cur_item.x1, self.cur_item.y1)) stream,
(self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
)
self.cur_item.add(item) self.cur_item.add(item)
def paint_path( def paint_path(
@ -91,15 +104,15 @@ class PDFLayoutAnalyzer(PDFTextDevice):
stroke: bool, stroke: bool,
fill: bool, fill: bool,
evenodd: bool, evenodd: bool,
path: Sequence[PathSegment] path: Sequence[PathSegment],
) -> None: ) -> None:
"""Paint paths described in section 4.4 of the PDF reference manual""" """Paint paths described in section 4.4 of the PDF reference manual"""
shape = ''.join(x[0] for x in path) shape = "".join(x[0] for x in path)
if shape.count('m') > 1: if shape.count("m") > 1:
# recurse if there are multiple m's in this shape # recurse if there are multiple m's in this shape
for m in re.finditer(r'm[^m]+', shape): for m in re.finditer(r"m[^m]+", shape):
subpath = path[m.start(0):m.end(0)] subpath = path[m.start(0) : m.end(0)]
self.paint_path(gstate, stroke, fill, evenodd, subpath) self.paint_path(gstate, stroke, fill, evenodd, subpath)
else: else:
@ -110,38 +123,68 @@ class PDFLayoutAnalyzer(PDFTextDevice):
# And, per Section 4.4's Table 4.9, all other path commands place # And, per Section 4.4's Table 4.9, all other path commands place
# their point-position in their final two arguments. (Any preceding # their point-position in their final two arguments. (Any preceding
# arguments represent control points on Bézier curves.) # arguments represent control points on Bézier curves.)
raw_pts = [cast(Point, p[-2:] if p[0] != 'h' else path[0][-2:]) raw_pts = [
for p in path] cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
]
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
if shape in {'mlh', 'ml'}: if shape in {"mlh", "ml"}:
# single line segment # single line segment
# #
# Note: 'ml', in conditional above, is a frequent anomaly # Note: 'ml', in conditional above, is a frequent anomaly
# that we want to support. # that we want to support.
line = LTLine(gstate.linewidth, pts[0], pts[1], stroke, line = LTLine(
fill, evenodd, gstate.scolor, gstate.ncolor) gstate.linewidth,
pts[0],
pts[1],
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(line) self.cur_item.add(line)
elif shape in {'mlllh', 'mllll'}: elif shape in {"mlllh", "mllll"}:
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
is_closed_loop = (pts[0] == pts[4]) is_closed_loop = pts[0] == pts[4]
has_square_coordinates = \ has_square_coordinates = (
(x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) \ x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
if is_closed_loop and has_square_coordinates: if is_closed_loop and has_square_coordinates:
rect = LTRect(gstate.linewidth, (*pts[0], *pts[2]), stroke, rect = LTRect(
fill, evenodd, gstate.scolor, gstate.ncolor) gstate.linewidth,
(*pts[0], *pts[2]),
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(rect) self.cur_item.add(rect)
else: else:
curve = LTCurve(gstate.linewidth, pts, stroke, fill, curve = LTCurve(
evenodd, gstate.scolor, gstate.ncolor) gstate.linewidth,
pts,
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(curve) self.cur_item.add(curve)
else: else:
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, curve = LTCurve(
gstate.scolor, gstate.ncolor) gstate.linewidth,
pts,
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(curve) self.cur_item.add(curve)
def render_char( def render_char(
@ -153,7 +196,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
rise: float, rise: float,
cid: int, cid: int,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: PDFGraphicState graphicstate: PDFGraphicState,
) -> float: ) -> float:
try: try:
text = font.to_unichr(cid) text = font.to_unichr(cid)
@ -162,14 +205,24 @@ class PDFLayoutAnalyzer(PDFTextDevice):
text = self.handle_undefined_char(font, cid) text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid) textwidth = font.char_width(cid)
textdisp = font.char_disp(cid) textdisp = font.char_disp(cid)
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, item = LTChar(
textdisp, ncs, graphicstate) matrix,
font,
fontsize,
scaling,
rise,
text,
textwidth,
textdisp,
ncs,
graphicstate,
)
self.cur_item.add(item) self.cur_item.add(item)
return item.adv return item.adv
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
    """Log the undefined character and return the placeholder text ``(cid:N)``."""
    log.debug("undefined: %r, %r", font, cid)
    placeholder = "(cid:%d)" % cid
    return placeholder
def receive_layout(self, ltpage: LTPage) -> None:
    """Accept a page layout and do nothing with it."""
@ -180,10 +233,9 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None laparams: Optional[LAParams] = None,
) -> None: ) -> None:
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
laparams=laparams)
self.result: Optional[LTPage] = None self.result: Optional[LTPage] = None
def receive_layout(self, ltpage: LTPage) -> None: def receive_layout(self, ltpage: LTPage) -> None:
@ -195,7 +247,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
# Some PDFConverter children support only binary I/O # Some PDFConverter children support only binary I/O
IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO) IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]): class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
@ -203,12 +255,11 @@ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
outfp: IOType, outfp: IOType,
codec: str = 'utf-8', codec: str = "utf-8",
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None laparams: Optional[LAParams] = None,
) -> None: ) -> None:
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
laparams=laparams)
self.outfp: IOType = outfp self.outfp: IOType = outfp
self.codec = codec self.codec = codec
self.outfp_binary = self._is_binary_stream(self.outfp) self.outfp_binary = self._is_binary_stream(self.outfp)
@ -216,9 +267,9 @@ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
@staticmethod @staticmethod
def _is_binary_stream(outfp: AnyIO) -> bool: def _is_binary_stream(outfp: AnyIO) -> bool:
"""Test if an stream is binary or not""" """Test if an stream is binary or not"""
if 'b' in getattr(outfp, 'mode', ''): if "b" in getattr(outfp, "mode", ""):
return True return True
elif hasattr(outfp, 'mode'): elif hasattr(outfp, "mode"):
# output stream has a mode, but it does not contain 'b' # output stream has a mode, but it does not contain 'b'
return False return False
elif isinstance(outfp, io.BytesIO): elif isinstance(outfp, io.BytesIO):
@ -236,19 +287,18 @@ class TextConverter(PDFConverter[AnyIO]):
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
outfp: AnyIO, outfp: AnyIO,
codec: str = 'utf-8', codec: str = "utf-8",
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
showpageno: bool = False, showpageno: bool = False,
imagewriter: Optional[ImageWriter] = None imagewriter: Optional[ImageWriter] = None,
) -> None: ) -> None:
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
self.imagewriter = imagewriter self.imagewriter = imagewriter
def write_text(self, text: str) -> None: def write_text(self, text: str) -> None:
text = utils.compatible_encode_method(text, self.codec, 'ignore') text = utils.compatible_encode_method(text, self.codec, "ignore")
if self.outfp_binary: if self.outfp_binary:
cast(BinaryIO, self.outfp).write(text.encode()) cast(BinaryIO, self.outfp).write(text.encode())
else: else:
@ -262,14 +312,15 @@ class TextConverter(PDFConverter[AnyIO]):
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.write_text(item.get_text()) self.write_text(item.get_text())
if isinstance(item, LTTextBox): if isinstance(item, LTTextBox):
self.write_text('\n') self.write_text("\n")
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
if self.imagewriter is not None: if self.imagewriter is not None:
self.imagewriter.export_image(item) self.imagewriter.export_image(item)
if self.showpageno: if self.showpageno:
self.write_text('Page %s\n' % ltpage.pageid) self.write_text("Page %s\n" % ltpage.pageid)
render(ltpage) render(ltpage)
self.write_text('\f') self.write_text("\f")
# Some dummy functions to save memory/CPU when all that is wanted # Some dummy functions to save memory/CPU when all that is wanted
# is text. This stops all the image and drawing output from being # is text. This stops all the image and drawing output from being
@ -286,54 +337,55 @@ class TextConverter(PDFConverter[AnyIO]):
stroke: bool, stroke: bool,
fill: bool, fill: bool,
evenodd: bool, evenodd: bool,
path: Sequence[PathSegment] path: Sequence[PathSegment],
) -> None: ) -> None:
return return
class HTMLConverter(PDFConverter[AnyIO]): class HTMLConverter(PDFConverter[AnyIO]):
RECT_COLORS = { RECT_COLORS = {
'figure': 'yellow', "figure": "yellow",
'textline': 'magenta', "textline": "magenta",
'textbox': 'cyan', "textbox": "cyan",
'textgroup': 'red', "textgroup": "red",
'curve': 'black', "curve": "black",
'page': 'gray', "page": "gray",
} }
TEXT_COLORS = { TEXT_COLORS = {
'textbox': 'blue', "textbox": "blue",
'char': 'black', "char": "black",
} }
def __init__( def __init__(
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
outfp: AnyIO, outfp: AnyIO,
codec: str = 'utf-8', codec: str = "utf-8",
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
scale: float = 1, scale: float = 1,
fontscale: float = 1.0, fontscale: float = 1.0,
layoutmode: str = 'normal', layoutmode: str = "normal",
showpageno: bool = True, showpageno: bool = True,
pagemargin: int = 50, pagemargin: int = 50,
imagewriter: Optional[ImageWriter] = None, imagewriter: Optional[ImageWriter] = None,
debug: int = 0, debug: int = 0,
rect_colors: Optional[Dict[str, str]] = None, rect_colors: Optional[Dict[str, str]] = None,
text_colors: Optional[Dict[str, str]] = None text_colors: Optional[Dict[str, str]] = None,
) -> None: ) -> None:
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, PDFConverter.__init__(
laparams=laparams) self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
)
# write() assumes a codec for binary I/O, or no codec for text I/O. # write() assumes a codec for binary I/O, or no codec for text I/O.
if self.outfp_binary == (not self.codec): if self.outfp_binary == (not self.codec):
raise ValueError("Codec is required for a binary I/O output") raise ValueError("Codec is required for a binary I/O output")
if text_colors is None: if text_colors is None:
text_colors = {'char': 'black'} text_colors = {"char": "black"}
if rect_colors is None: if rect_colors is None:
rect_colors = {'curve': 'black', 'page': 'gray'} rect_colors = {"curve": "black", "page": "gray"}
self.scale = scale self.scale = scale
self.fontscale = fontscale self.fontscale = fontscale
@ -360,23 +412,27 @@ class HTMLConverter(PDFConverter[AnyIO]):
return return
def write_header(self) -> None:
    """Write the opening HTML markup through ``self.write``.

    Three chunks are written in order: the ``<html><head>`` opener, a
    Content-Type ``<meta>`` tag, and the ``</head><body>`` line.  The meta
    tag only carries a ``charset`` when ``self.codec`` is truthy.
    """
    self.write("<html><head>\n")
    meta_tag = (
        (
            '<meta http-equiv="Content-Type" content="text/html; '
            'charset=%s">\n' % self.codec
        )
        if self.codec
        else '<meta http-equiv="Content-Type" content="text/html">\n'
    )
    self.write(meta_tag)
    self.write("</head><body>\n")
def write_footer(self) -> None:
    """Write the page index and the closing HTML tags.

    One ``<a href="#N">`` link is produced for every N in
    ``range(1, self.pageno)``; the links are joined with ", " inside an
    absolutely positioned ``<div>``, followed by ``</body></html>``.
    """
    anchors = []
    for number in range(1, self.pageno):
        anchors.append('<a href="#{}">{}</a>'.format(number, number))
    joined = ", ".join(anchors)
    self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % joined)
    self.write("</body></html>\n")
def write_text(self, text: str) -> None: def write_text(self, text: str) -> None:
@ -384,71 +440,67 @@ class HTMLConverter(PDFConverter[AnyIO]):
return return
def place_rect(
    self, color: str, borderwidth: int, x: float, y: float, w: float, h: float
) -> None:
    """Write an absolutely positioned, bordered ``<span>`` for a rectangle.

    :param color: key looked up in ``self.rect_colors``; when the key has no
        entry nothing is written.
    :param borderwidth: border width in pixels.
    :param x: left coordinate, multiplied by ``self.scale``.
    :param y: vertical coordinate; the CSS ``top`` is
        ``(self._yoffset - y) * self.scale``.
    :param w: width, multiplied by ``self.scale``.
    :param h: height, multiplied by ``self.scale``.
    """
    css_color = self.rect_colors.get(color)
    if css_color is None:
        # No colour configured for this kind of rectangle: draw nothing.
        return
    left = x * self.scale
    top = (self._yoffset - y) * self.scale
    width = w * self.scale
    height = h * self.scale
    template = (
        '<span style="position:absolute; border: %s %dpx solid; '
        'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
    )
    self.write(template % (css_color, borderwidth, left, top, width, height))
def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
    """Draw a rectangle around a layout item by delegating to ``place_rect``.

    The rectangle is anchored at ``(item.x0, item.y1)`` and uses the item's
    ``width`` and ``height``.

    :param color: colour key forwarded to ``place_rect``.
    :param borderwidth: border width forwarded to ``place_rect``.
    :param item: object providing ``x0``, ``y1``, ``width`` and ``height``.
    """
    left, top = item.x0, item.y1
    self.place_rect(color, borderwidth, left, top, item.width, item.height)
def place_image(
    self, item: LTImage, borderwidth: int, x: float, y: float, w: float, h: float
) -> None:
    """Export an image and write an ``<img>`` tag that references it.

    Nothing happens when ``self.imagewriter`` is ``None``.  Otherwise the
    image is exported through the writer and the returned name, escaped with
    ``enc``, becomes the ``src`` attribute.

    :param item: the image passed to ``self.imagewriter.export_image``.
    :param borderwidth: value of the ``border`` attribute.
    :param x: left coordinate, multiplied by ``self.scale``.
    :param y: vertical coordinate; the CSS ``top`` is
        ``(self._yoffset - y) * self.scale``.
    :param w: width, multiplied by ``self.scale``.
    :param h: height, multiplied by ``self.scale``.
    """
    if self.imagewriter is None:
        return
    exported_name = self.imagewriter.export_image(item)
    template = (
        '<img src="%s" border="%d" style="position:absolute; '
        'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
    )
    values = (
        enc(exported_name),
        borderwidth,
        x * self.scale,
        (self._yoffset - y) * self.scale,
        w * self.scale,
        h * self.scale,
    )
    self.write(template % values)
def place_text(
    self, color: str, text: str, x: float, y: float, size: float
) -> None:
    """Write text inside an absolutely positioned, coloured ``<span>``.

    :param color: key looked up in ``self.text_colors``; when the key has no
        entry nothing is written.
    :param text: content, emitted through ``self.write_text``.
    :param x: left coordinate, multiplied by ``self.scale``.
    :param y: vertical coordinate; the CSS ``top`` is
        ``(self._yoffset - y) * self.scale``.
    :param size: font size, multiplied by ``self.scale * self.fontscale``.
    """
    css_color = self.text_colors.get(color)
    if css_color is None:
        # No colour configured for this kind of text: emit nothing.
        return
    left = x * self.scale
    top = (self._yoffset - y) * self.scale
    font_px = size * self.scale * self.fontscale
    opening = (
        '<span style="position:absolute; color:%s; left:%dpx; '
        'top:%dpx; font-size:%dpx;">'
    ) % (css_color, left, top, font_px)
    self.write(opening)
    self.write_text(text)
    self.write("</span>\n")
def begin_div( def begin_div(
@ -459,47 +511,57 @@ class HTMLConverter(PDFConverter[AnyIO]):
y: float, y: float,
w: float, w: float,
h: float, h: float,
writing_mode: str = 'False' writing_mode: str = "False",
) -> None: ) -> None:
self._fontstack.append(self._font) self._fontstack.append(self._font)
self._font = None self._font = None
s = '<div style="position:absolute; border: %s %dpx solid; ' \ s = (
'writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; ' \ '<div style="position:absolute; border: %s %dpx solid; '
'height:%dpx;">' % \ "writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
(color, borderwidth, writing_mode, x * self.scale, 'height:%dpx;">'
(self._yoffset - y) * self.scale, w * self.scale, h * self.scale) % (
color,
borderwidth,
writing_mode,
x * self.scale,
(self._yoffset - y) * self.scale,
w * self.scale,
h * self.scale,
)
)
self.write(s) self.write(s)
return return
def end_div(self, color: str) -> None:
    """Close the current ``<div>`` and restore the font saved on the stack.

    A ``</span>`` is written first when a font is currently set
    (``self._font`` is not ``None``).

    :param color: accepted but not used by this method.
    """
    span_is_open = self._font is not None
    if span_is_open:
        self.write("</span>")
    # Restore whichever font was active before the div was opened.
    self._font = self._fontstack.pop()
    self.write("</div>")
def put_text(self, text: str, fontname: str, fontsize: float) -> None:
    """Write text, opening a new font ``<span>`` whenever the font changes.

    The current ``(fontname, fontsize)`` pair is tracked in ``self._font``.
    When it differs from the requested one, any open span is closed and a
    new one is opened with matching ``font-family`` and ``font-size``.

    :param text: content, emitted through ``self.write_text``.
    :param fontname: font name, possibly prefixed with a subset tag
        (``TAG+Name``).
    :param fontsize: font size, multiplied by
        ``self.scale * self.fontscale``.
    """
    font = (fontname, fontsize)
    if font != self._font:
        if self._font is not None:
            self.write("</span>")
        # Remove subset tag from fontname, see PDF Reference 5.5.3
        fontname_without_subset_tag = fontname.split("+")[-1]
        # The font name originates from the PDF, so escape it before
        # placing it inside the style attribute (as place_image does for
        # image names).
        self.write(
            '<span style="font-family: %s; font-size:%dpx">'
            % (
                enc(fontname_without_subset_tag),
                fontsize * self.scale * self.fontscale,
            )
        )
        self._font = font
    self.write_text(text)
def put_newline(self) -> None:
    """Write an HTML line break (``<br>``) through ``self.write``."""
    line_break = "<br>"
    self.write(line_break)
def receive_layout(self, ltpage: LTPage) -> None: def receive_layout(self, ltpage: LTPage) -> None:
def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None: def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
if isinstance(item, LTTextGroup): if isinstance(item, LTTextGroup):
self.place_border('textgroup', 1, item) self.place_border("textgroup", 1, item)
for child in item: for child in item:
show_group(child) show_group(child)
return return
@ -508,63 +570,74 @@ class HTMLConverter(PDFConverter[AnyIO]):
child: LTItem child: LTItem
if isinstance(item, LTPage): if isinstance(item, LTPage):
self._yoffset += item.y1 self._yoffset += item.y1
self.place_border('page', 1, item) self.place_border("page", 1, item)
if self.showpageno: if self.showpageno:
self.write('<div style="position:absolute; top:%dpx;">' % self.write(
((self._yoffset-item.y1)*self.scale)) '<div style="position:absolute; top:%dpx;">'
self.write('<a name="{}">Page {}</a></div>\n' % ((self._yoffset - item.y1) * self.scale)
.format(item.pageid, item.pageid)) )
self.write(
'<a name="{}">Page {}</a></div>\n'.format(
item.pageid, item.pageid
)
)
for child in item: for child in item:
render(child) render(child)
if item.groups is not None: if item.groups is not None:
for group in item.groups: for group in item.groups:
show_group(group) show_group(group)
elif isinstance(item, LTCurve): elif isinstance(item, LTCurve):
self.place_border('curve', 1, item) self.place_border("curve", 1, item)
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.begin_div('figure', 1, item.x0, item.y1, item.width, self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
item.height)
for child in item: for child in item:
render(child) render(child)
self.end_div('figure') self.end_div("figure")
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
self.place_image(item, 1, item.x0, item.y1, item.width, self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
item.height)
else: else:
if self.layoutmode == 'exact': if self.layoutmode == "exact":
if isinstance(item, LTTextLine): if isinstance(item, LTTextLine):
self.place_border('textline', 1, item) self.place_border("textline", 1, item)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.place_border('textbox', 1, item) self.place_border("textbox", 1, item)
self.place_text('textbox', str(item.index+1), item.x0, self.place_text(
item.y1, 20) "textbox", str(item.index + 1), item.x0, item.y1, 20
)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.place_border('char', 1, item) self.place_border("char", 1, item)
self.place_text('char', item.get_text(), item.x0, self.place_text(
item.y1, item.size) "char", item.get_text(), item.x0, item.y1, item.size
)
else: else:
if isinstance(item, LTTextLine): if isinstance(item, LTTextLine):
for child in item: for child in item:
render(child) render(child)
if self.layoutmode != 'loose': if self.layoutmode != "loose":
self.put_newline() self.put_newline()
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.begin_div('textbox', 1, item.x0, item.y1, self.begin_div(
item.width, item.height, "textbox",
item.get_writing_mode()) 1,
item.x0,
item.y1,
item.width,
item.height,
item.get_writing_mode(),
)
for child in item: for child in item:
render(child) render(child)
self.end_div('textbox') self.end_div("textbox")
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.put_text(item.get_text(), item.fontname, self.put_text(item.get_text(), item.fontname, item.size)
item.size)
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.write_text(item.get_text()) self.write_text(item.get_text())
return return
render(ltpage) render(ltpage)
self._yoffset += self.pagemargin self._yoffset += self.pagemargin
return return
@ -576,20 +649,21 @@ class HTMLConverter(PDFConverter[AnyIO]):
class XMLConverter(PDFConverter[AnyIO]): class XMLConverter(PDFConverter[AnyIO]):
CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]') CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
def __init__( def __init__(
self, self,
rsrcmgr: PDFResourceManager, rsrcmgr: PDFResourceManager,
outfp: AnyIO, outfp: AnyIO,
codec: str = 'utf-8', codec: str = "utf-8",
pageno: int = 1, pageno: int = 1,
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
imagewriter: Optional[ImageWriter] = None, imagewriter: Optional[ImageWriter] = None,
stripcontrol: bool = False stripcontrol: bool = False,
) -> None: ) -> None:
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, PDFConverter.__init__(
laparams=laparams) self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
)
# write() assumes a codec for binary I/O, or no codec for text I/O. # write() assumes a codec for binary I/O, or no codec for text I/O.
if self.outfp_binary == (not self.codec): if self.outfp_binary == (not self.codec):
@ -612,100 +686,125 @@ class XMLConverter(PDFConverter[AnyIO]):
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec) self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
else: else:
self.write('<?xml version="1.0" ?>\n') self.write('<?xml version="1.0" ?>\n')
self.write('<pages>\n') self.write("<pages>\n")
return return
def write_footer(self) -> None:
    """Write the closing ``</pages>`` tag through ``self.write``."""
    closing_tag = "</pages>\n"
    self.write(closing_tag)
def write_text(self, text: str) -> None:
    """Write text content after escaping it with ``enc``.

    When ``self.stripcontrol`` is set, every match of the class-level
    ``CONTROL`` pattern is removed before escaping.

    :param text: the raw text to emit.
    """
    cleaned = self.CONTROL.sub("", text) if self.stripcontrol else text
    self.write(enc(cleaned))
def receive_layout(self, ltpage: LTPage) -> None:
    """Serialize a page layout, and every item nested inside it, as XML.

    The page is walked recursively; each layout item type is written as its
    own element through ``self.write``.  After the page's children, the
    page's ``groups`` (when not ``None``) are written inside ``<layout>``.

    :param ltpage: the page layout to serialize.
    :raises AssertionError: if an item of an unhandled type is encountered.
    """

    def show_group(item: LTItem) -> None:
        # Writes the <layout> tree: text boxes become empty reference
        # elements, text groups become nested <textgroup> elements.
        if isinstance(item, LTTextBox):
            self.write(
                '<textbox id="%d" bbox="%s" />\n'
                % (item.index, bbox2str(item.bbox))
            )
        elif isinstance(item, LTTextGroup):
            self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
            for child in item:
                show_group(child)
            self.write("</textgroup>\n")

    def render(item: LTItem) -> None:
        child: LTItem
        if isinstance(item, LTPage):
            s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                item.pageid,
                bbox2str(item.bbox),
                item.rotate,
            )
            self.write(s)
            for child in item:
                render(child)
            if item.groups is not None:
                self.write("<layout>\n")
                for group in item.groups:
                    show_group(group)
                self.write("</layout>\n")
            self.write("</page>\n")
        # LTLine and LTRect are tested before LTCurve, so they get their
        # own element names rather than falling into the <curve> branch.
        elif isinstance(item, LTLine):
            s = '<line linewidth="%d" bbox="%s" />\n' % (
                item.linewidth,
                bbox2str(item.bbox),
            )
            self.write(s)
        elif isinstance(item, LTRect):
            s = '<rect linewidth="%d" bbox="%s" />\n' % (
                item.linewidth,
                bbox2str(item.bbox),
            )
            self.write(s)
        elif isinstance(item, LTCurve):
            s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                item.linewidth,
                bbox2str(item.bbox),
                item.get_pts(),
            )
            self.write(s)
        elif isinstance(item, LTFigure):
            # The figure name comes from the document, so escape it to keep
            # the attribute well-formed.
            s = '<figure name="%s" bbox="%s">\n' % (
                enc(item.name),
                bbox2str(item.bbox),
            )
            self.write(s)
            for child in item:
                render(child)
            self.write("</figure>\n")
        elif isinstance(item, LTTextLine):
            self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
            for child in item:
                render(child)
            self.write("</textline>\n")
        elif isinstance(item, LTTextBox):
            wmode = ""
            if isinstance(item, LTTextBoxVertical):
                wmode = ' wmode="vertical"'
            s = '<textbox id="%d" bbox="%s"%s>\n' % (
                item.index,
                bbox2str(item.bbox),
                wmode,
            )
            self.write(s)
            for child in item:
                render(child)
            self.write("</textbox>\n")
        elif isinstance(item, LTChar):
            s = (
                '<text font="%s" bbox="%s" colourspace="%s" '
                'ncolour="%s" size="%.3f">'
                % (
                    enc(item.fontname),
                    bbox2str(item.bbox),
                    item.ncs.name,
                    item.graphicstate.ncolor,
                    item.size,
                )
            )
            self.write(s)
            self.write_text(item.get_text())
            self.write("</text>\n")
        elif isinstance(item, LTText):
            # Route the content through write_text, like the LTChar branch,
            # so it is escaped (and control characters are stripped when
            # requested) instead of being written raw into the XML.
            self.write("<text>")
            self.write_text(item.get_text())
            self.write("</text>\n")
        elif isinstance(item, LTImage):
            if self.imagewriter is not None:
                name = self.imagewriter.export_image(item)
                self.write(
                    '<image src="%s" width="%d" height="%d" />\n'
                    % (enc(name), item.width, item.height)
                )
            else:
                self.write(
                    '<image width="%d" height="%d" />\n' % (item.width, item.height)
                )
        else:
            # Raised explicitly (same exception type as the former
            # ``assert False``) so the check survives ``python -O``.
            raise AssertionError(str(("Unhandled", item)))

    render(ltpage)

View File

@ -11,18 +11,19 @@ class NumberTree:
See Section 3.8.6 of the PDF Reference. See Section 3.8.6 of the PDF Reference.
""" """
def __init__(self, obj: Any):
    """Read the optional entries of a number-tree node.

    ``obj`` is resolved with ``dict_value`` and stored as ``self._obj``.
    Each of the ``Nums``, ``Kids`` and ``Limits`` entries is resolved with
    ``list_value`` when present; a missing entry leaves the matching
    attribute as ``None``.

    :param obj: the object holding the number-tree node.
    """
    node = dict_value(obj)
    self._obj = node

    # Declare all three attributes first so each exists even when its key
    # is missing.
    self.nums: Optional[Iterable[Any]] = None
    self.kids: Optional[Iterable[Any]] = None
    self.limits: Optional[Iterable[Any]] = None

    if "Nums" in node:
        self.nums = list_value(node["Nums"])
    if "Kids" in node:
        self.kids = list_value(node["Kids"])
    if "Limits" in node:
        self.limits = list_value(node["Limits"])
def _parse(self) -> List[Tuple[int, Any]]: def _parse(self) -> List[Tuple[int, Any]]:
items = [] items = []
@ -44,7 +45,7 @@ class NumberTree:
if settings.STRICT: if settings.STRICT:
if not all(a[0] <= b[0] for a, b in zip(values, values[1:])): if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
raise PDFSyntaxError('Number tree elements are out of order') raise PDFSyntaxError("Number tree elements are out of order")
else: else:
values.sort(key=lambda t: t[0]) values.sort(key=lambda t: t[0])

View File

@ -6,7 +6,7 @@ from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING from .latin_enc import ENCODING
from .psparser import PSLiteral from .psparser import PSLiteral
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+') HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -25,39 +25,41 @@ def name2unicode(name: str) -> str:
:returns unicode character if name resembles something, :returns unicode character if name resembles something,
otherwise a KeyError otherwise a KeyError
""" """
name = name.split('.')[0] name = name.split(".")[0]
components = name.split('_') components = name.split("_")
if len(components) > 1: if len(components) > 1:
return ''.join(map(name2unicode, components)) return "".join(map(name2unicode, components))
else: else:
if name in glyphname2unicode: if name in glyphname2unicode:
return glyphname2unicode[name] return glyphname2unicode[name]
elif name.startswith('uni'): elif name.startswith("uni"):
name_without_uni = name.strip('uni') name_without_uni = name.strip("uni")
if HEXADECIMAL.match(name_without_uni) and \ if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
len(name_without_uni) % 4 == 0: unicode_digits = [
unicode_digits = [int(name_without_uni[i:i + 4], base=16) int(name_without_uni[i : i + 4], base=16)
for i in range(0, len(name_without_uni), 4)] for i in range(0, len(name_without_uni), 4)
]
for digit in unicode_digits: for digit in unicode_digits:
raise_key_error_for_invalid_unicode(digit) raise_key_error_for_invalid_unicode(digit)
characters = map(chr, unicode_digits) characters = map(chr, unicode_digits)
return ''.join(characters) return "".join(characters)
elif name.startswith('u'): elif name.startswith("u"):
name_without_u = name.strip('u') name_without_u = name.strip("u")
if HEXADECIMAL.match(name_without_u) and \ if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
4 <= len(name_without_u) <= 6:
unicode_digit = int(name_without_u, base=16) unicode_digit = int(name_without_u, base=16)
raise_key_error_for_invalid_unicode(unicode_digit) raise_key_error_for_invalid_unicode(unicode_digit)
return chr(unicode_digit) return chr(unicode_digit)
raise KeyError('Could not convert unicode name "%s" to character because ' raise KeyError(
'it does not match specification' % name) 'Could not convert unicode name "%s" to character because '
"it does not match specification" % name
)
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None: def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
@ -67,8 +69,10 @@ def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
:raises KeyError if unicode digit is invalid :raises KeyError if unicode digit is invalid
""" """
if 55295 < unicode_digit < 57344: if 55295 < unicode_digit < 57344:
raise KeyError('Unicode digit %d is invalid because ' raise KeyError(
'it is in the range D800 through DFFF' % unicode_digit) "Unicode digit %d is invalid because "
"it is in the range D800 through DFFF" % unicode_digit
)
class EncodingDB: class EncodingDB:
@ -89,17 +93,15 @@ class EncodingDB:
pdf2unicode[pdf] = c pdf2unicode[pdf] = c
encodings = { encodings = {
'StandardEncoding': std2unicode, "StandardEncoding": std2unicode,
'MacRomanEncoding': mac2unicode, "MacRomanEncoding": mac2unicode,
'WinAnsiEncoding': win2unicode, "WinAnsiEncoding": win2unicode,
'PDFDocEncoding': pdf2unicode, "PDFDocEncoding": pdf2unicode,
} }
@classmethod @classmethod
def get_encoding( def get_encoding(
cls, cls, name: str, diff: Optional[Iterable[object]] = None
name: str,
diff: Optional[Iterable[object]] = None
) -> Dict[int, str]: ) -> Dict[int, str]:
cid2unicode = cls.encodings.get(name, cls.std2unicode) cid2unicode = cls.encodings.get(name, cls.std2unicode)
if diff: if diff:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,8 +5,7 @@ import sys
from io import StringIO from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast from typing import Any, BinaryIO, Container, Iterator, Optional, cast
from .converter import XMLConverter, HTMLConverter, TextConverter, \ from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
PDFPageAggregator
from .image import ImageWriter from .image import ImageWriter
from .layout import LAParams, LTPage from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor from .pdfdevice import PDFDevice, TagExtractor
@ -18,20 +17,20 @@ from .utils import open_filename, FileOrName, AnyIO
def extract_text_to_fp( def extract_text_to_fp(
inf: BinaryIO, inf: BinaryIO,
outfp: AnyIO, outfp: AnyIO,
output_type: str = 'text', output_type: str = "text",
codec: str = 'utf-8', codec: str = "utf-8",
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
maxpages: int = 0, maxpages: int = 0,
page_numbers: Optional[Container[int]] = None, page_numbers: Optional[Container[int]] = None,
password: str = "", password: str = "",
scale: float = 1.0, scale: float = 1.0,
rotation: int = 0, rotation: int = 0,
layoutmode: str = 'normal', layoutmode: str = "normal",
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
strip_control: bool = False, strip_control: bool = False,
debug: bool = False, debug: bool = False,
disable_caching: bool = False, disable_caching: bool = False,
**kwargs: Any **kwargs: Any,
) -> None: ) -> None:
"""Parses text from inf-file and writes to outfp file-like object. """Parses text from inf-file and writes to outfp file-like object.
@ -72,39 +71,52 @@ def extract_text_to_fp(
rsrcmgr = PDFResourceManager(caching=not disable_caching) rsrcmgr = PDFResourceManager(caching=not disable_caching)
device: Optional[PDFDevice] = None device: Optional[PDFDevice] = None
if output_type != 'text' and outfp == sys.stdout: if output_type != "text" and outfp == sys.stdout:
outfp = sys.stdout.buffer outfp = sys.stdout.buffer
if output_type == 'text': if output_type == "text":
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = TextConverter(
imagewriter=imagewriter) rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter
)
elif output_type == 'xml': elif output_type == "xml":
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = XMLConverter(
imagewriter=imagewriter, rsrcmgr,
stripcontrol=strip_control) outfp,
codec=codec,
laparams=laparams,
imagewriter=imagewriter,
stripcontrol=strip_control,
)
elif output_type == 'html': elif output_type == "html":
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, device = HTMLConverter(
layoutmode=layoutmode, laparams=laparams, rsrcmgr,
imagewriter=imagewriter) outfp,
codec=codec,
scale=scale,
layoutmode=layoutmode,
laparams=laparams,
imagewriter=imagewriter,
)
elif output_type == 'tag': elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here. # Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
else: else:
msg = f"Output type can be text, html, xml or tag but is " \ msg = f"Output type can be text, html, xml or tag but is " f"{output_type}"
f"{output_type}"
raise ValueError(msg) raise ValueError(msg)
assert device is not None assert device is not None
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(inf, for page in PDFPage.get_pages(
page_numbers, inf,
maxpages=maxpages, page_numbers,
password=password, maxpages=maxpages,
caching=not disable_caching): password=password,
caching=not disable_caching,
):
page.rotate = (page.rotate + rotation) % 360 page.rotate = (page.rotate + rotation) % 360
interpreter.process_page(page) interpreter.process_page(page)
@ -113,12 +125,12 @@ def extract_text_to_fp(
def extract_text( def extract_text(
pdf_file: FileOrName, pdf_file: FileOrName,
password: str = '', password: str = "",
page_numbers: Optional[Container[int]] = None, page_numbers: Optional[Container[int]] = None,
maxpages: int = 0, maxpages: int = 0,
caching: bool = True, caching: bool = True,
codec: str = 'utf-8', codec: str = "utf-8",
laparams: Optional[LAParams] = None laparams: Optional[LAParams] = None,
) -> str: ) -> str:
"""Parse and return the text contained in a PDF file. """Parse and return the text contained in a PDF file.
@ -139,16 +151,15 @@ def extract_text(
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string: with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
fp = cast(BinaryIO, fp) # we opened in binary mode fp = cast(BinaryIO, fp) # we opened in binary mode
rsrcmgr = PDFResourceManager(caching=caching) rsrcmgr = PDFResourceManager(caching=caching)
device = TextConverter(rsrcmgr, output_string, codec=codec, device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages( for page in PDFPage.get_pages(
fp, fp,
page_numbers, page_numbers,
maxpages=maxpages, maxpages=maxpages,
password=password, password=password,
caching=caching, caching=caching,
): ):
interpreter.process_page(page) interpreter.process_page(page)
@ -157,11 +168,11 @@ def extract_text(
def extract_pages( def extract_pages(
pdf_file: FileOrName, pdf_file: FileOrName,
password: str = '', password: str = "",
page_numbers: Optional[Container[int]] = None, page_numbers: Optional[Container[int]] = None,
maxpages: int = 0, maxpages: int = 0,
caching: bool = True, caching: bool = True,
laparams: Optional[LAParams] = None laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]: ) -> Iterator[LTPage]:
"""Extract and yield LTPage objects """Extract and yield LTPage objects
@ -183,8 +194,9 @@ def extract_pages(
resource_manager = PDFResourceManager(caching=caching) resource_manager = PDFResourceManager(caching=caching)
device = PDFPageAggregator(resource_manager, laparams=laparams) device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device) interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages, for page in PDFPage.get_pages(
password=password, caching=caching): fp, page_numbers, maxpages=maxpages, password=password, caching=caching
):
interpreter.process_page(page) interpreter.process_page(page)
layout = device.get_result() layout = device.get_result()
yield layout yield layout

View File

@ -9,22 +9,15 @@ from .layout import LTImage
from .pdfcolor import LITERAL_DEVICE_CMYK from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdfcolor import LITERAL_DEVICE_GRAY from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB from .pdfcolor import LITERAL_DEVICE_RGB
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, \ from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_JPX_DECODE
LITERALS_JPX_DECODE
def align32(x: int) -> int: def align32(x: int) -> int:
return ((x+3)//4)*4 return ((x + 3) // 4) * 4
class BMPWriter: class BMPWriter:
def __init__( def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
self,
fp: BinaryIO,
bits: int,
width: int,
height: int
) -> None:
self.fp = fp self.fp = fp
self.bits = bits self.bits = bits
self.width = width self.width = width
@ -37,30 +30,43 @@ class BMPWriter:
ncols = 0 ncols = 0
else: else:
raise ValueError(bits) raise ValueError(bits)
self.linesize = align32((self.width*self.bits+7)//8) self.linesize = align32((self.width * self.bits + 7) // 8)
self.datasize = self.linesize * self.height self.datasize = self.linesize * self.height
headersize = 14+40+ncols*4 headersize = 14 + 40 + ncols * 4
info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height, info = struct.pack(
1, self.bits, 0, self.datasize, 0, 0, ncols, 0) "<IiiHHIIIIII",
40,
self.width,
self.height,
1,
self.bits,
0,
self.datasize,
0,
0,
ncols,
0,
)
assert len(info) == 40, str(len(info)) assert len(info) == 40, str(len(info))
header = struct.pack('<ccIHHI', b'B', b'M', header = struct.pack(
headersize+self.datasize, 0, 0, headersize) "<ccIHHI", b"B", b"M", headersize + self.datasize, 0, 0, headersize
)
assert len(header) == 14, str(len(header)) assert len(header) == 14, str(len(header))
self.fp.write(header) self.fp.write(header)
self.fp.write(info) self.fp.write(info)
if ncols == 2: if ncols == 2:
# B&W color table # B&W color table
for i in (0, 255): for i in (0, 255):
self.fp.write(struct.pack('BBBx', i, i, i)) self.fp.write(struct.pack("BBBx", i, i, i))
elif ncols == 256: elif ncols == 256:
# grayscale color table # grayscale color table
for i in range(256): for i in range(256):
self.fp.write(struct.pack('BBBx', i, i, i)) self.fp.write(struct.pack("BBBx", i, i, i))
self.pos0 = self.fp.tell() self.pos0 = self.fp.tell()
self.pos1 = self.pos0 + self.datasize self.pos1 = self.pos0 + self.datasize
def write_line(self, y: int, data: bytes) -> None: def write_line(self, y: int, data: bytes) -> None:
self.fp.seek(self.pos1 - (y+1)*self.linesize) self.fp.seek(self.pos1 - (y + 1) * self.linesize)
self.fp.write(data) self.fp.write(data)
@ -80,43 +86,46 @@ class ImageWriter:
is_jbig2 = self.is_jbig2_image(image) is_jbig2 = self.is_jbig2_image(image)
ext = self._get_image_extension(image, width, height, is_jbig2) ext = self._get_image_extension(image, width, height, is_jbig2)
name, path = self._create_unique_image_name(self.outdir, name, path = self._create_unique_image_name(self.outdir, image.name, ext)
image.name, ext)
fp = open(path, 'wb') fp = open(path, "wb")
if ext == '.jpg': if ext == ".jpg":
raw_data = image.stream.get_rawdata() raw_data = image.stream.get_rawdata()
assert raw_data is not None assert raw_data is not None
if LITERAL_DEVICE_CMYK in image.colorspace: if LITERAL_DEVICE_CMYK in image.colorspace:
from PIL import Image # type: ignore[import] from PIL import Image # type: ignore[import]
from PIL import ImageChops from PIL import ImageChops
ifp = BytesIO(raw_data) ifp = BytesIO(raw_data)
i = Image.open(ifp) i = Image.open(ifp)
i = ImageChops.invert(i) i = ImageChops.invert(i)
i = i.convert('RGB') i = i.convert("RGB")
i.save(fp, 'JPEG') i.save(fp, "JPEG")
else: else:
fp.write(raw_data) fp.write(raw_data)
elif ext == '.jp2': elif ext == ".jp2":
# if we just write the raw data, most image programs # if we just write the raw data, most image programs
# that I have tried cannot open the file. However, # that I have tried cannot open the file. However,
# open and saving with PIL produces a file that # open and saving with PIL produces a file that
# seems to be easily opened by other programs # seems to be easily opened by other programs
from PIL import Image from PIL import Image
raw_data = image.stream.get_rawdata() raw_data = image.stream.get_rawdata()
assert raw_data is not None assert raw_data is not None
ifp = BytesIO(raw_data) ifp = BytesIO(raw_data)
i = Image.open(ifp) i = Image.open(ifp)
i.save(fp, 'JPEG2000') i.save(fp, "JPEG2000")
elif is_jbig2: elif is_jbig2:
input_stream = BytesIO() input_stream = BytesIO()
global_streams = self.jbig2_global(image) global_streams = self.jbig2_global(image)
if len(global_streams) > 1: if len(global_streams) > 1:
msg = 'There should never be more than one JBIG2Globals ' \ msg = (
'associated with a JBIG2 embedded image' "There should never be more than one JBIG2Globals "
"associated with a JBIG2 embedded image"
)
raise ValueError(msg) raise ValueError(msg)
if len(global_streams) == 1: if len(global_streams) == 1:
input_stream.write(global_streams[0].get_data().rstrip(b'\n')) input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
input_stream.write(image.stream.get_data()) input_stream.write(image.stream.get_data())
input_stream.seek(0) input_stream.seek(0)
reader = JBIG2StreamReader(input_stream) reader = JBIG2StreamReader(input_stream)
@ -128,24 +137,24 @@ class ImageWriter:
bmp = BMPWriter(fp, 1, width, height) bmp = BMPWriter(fp, 1, width, height)
data = image.stream.get_data() data = image.stream.get_data()
i = 0 i = 0
width = (width+7)//8 width = (width + 7) // 8
for y in range(height): for y in range(height):
bmp.write_line(y, data[i:i+width]) bmp.write_line(y, data[i : i + width])
i += width i += width
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace: elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
bmp = BMPWriter(fp, 24, width, height) bmp = BMPWriter(fp, 24, width, height)
data = image.stream.get_data() data = image.stream.get_data()
i = 0 i = 0
width = width*3 width = width * 3
for y in range(height): for y in range(height):
bmp.write_line(y, data[i:i+width]) bmp.write_line(y, data[i : i + width])
i += width i += width
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace: elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
bmp = BMPWriter(fp, 8, width, height) bmp = BMPWriter(fp, 8, width, height)
data = image.stream.get_data() data = image.stream.get_data()
i = 0 i = 0
for y in range(height): for y in range(height):
bmp.write_line(y, data[i:i+width]) bmp.write_line(y, data[i : i + width])
i += width i += width
else: else:
fp.write(image.stream.get_data()) fp.write(image.stream.get_data())
@ -168,43 +177,42 @@ class ImageWriter:
filters = image.stream.get_filters() filters = image.stream.get_filters()
for filter_name, params in filters: for filter_name, params in filters:
if filter_name in LITERALS_JBIG2_DECODE: if filter_name in LITERALS_JBIG2_DECODE:
global_streams.append(params['JBIG2Globals'].resolve()) global_streams.append(params["JBIG2Globals"].resolve())
return global_streams return global_streams
@staticmethod @staticmethod
def _get_image_extension( def _get_image_extension(
image: LTImage, image: LTImage, width: int, height: int, is_jbig2: bool
width: int,
height: int,
is_jbig2: bool
) -> str: ) -> str:
filters = image.stream.get_filters() filters = image.stream.get_filters()
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE: if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
ext = '.jpg' ext = ".jpg"
elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE: elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
ext = '.jp2' ext = ".jp2"
elif is_jbig2: elif is_jbig2:
ext = '.jb2' ext = ".jb2"
elif (image.bits == 1 or elif (
image.bits == 8 and image.bits == 1
(LITERAL_DEVICE_RGB in image.colorspace or or image.bits == 8
LITERAL_DEVICE_GRAY in image.colorspace)): and (
ext = '.%dx%d.bmp' % (width, height) LITERAL_DEVICE_RGB in image.colorspace
or LITERAL_DEVICE_GRAY in image.colorspace
)
):
ext = ".%dx%d.bmp" % (width, height)
else: else:
ext = '.%d.%dx%d.img' % (image.bits, width, height) ext = ".%d.%dx%d.img" % (image.bits, width, height)
return ext return ext
@staticmethod @staticmethod
def _create_unique_image_name( def _create_unique_image_name(
dirname: str, dirname: str, image_name: str, ext: str
image_name: str,
ext: str
) -> Tuple[str, str]: ) -> Tuple[str, str]:
name = image_name + ext name = image_name + ext
path = os.path.join(dirname, name) path = os.path.join(dirname, name)
img_index = 0 img_index = 0
while os.path.exists(path): while os.path.exists(path):
name = '%s.%d%s' % (image_name, img_index, ext) name = "%s.%d%s" % (image_name, img_index, ext)
path = os.path.join(dirname, name) path = os.path.join(dirname, name)
img_index += 1 img_index += 1
return name, path return name, path

View File

@ -19,10 +19,10 @@ HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000
SEG_TYPE_MASK = 0b00111111 SEG_TYPE_MASK = 0b00111111
REF_COUNT_SHORT_MASK = 0b11100000 REF_COUNT_SHORT_MASK = 0b11100000
REF_COUNT_LONG_MASK = 0x1fffffff REF_COUNT_LONG_MASK = 0x1FFFFFFF
REF_COUNT_LONG = 7 REF_COUNT_LONG = 7
DATA_LEN_UNKNOWN = 0xffffffff DATA_LEN_UNKNOWN = 0xFFFFFFFF
# segment types # segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38 SEG_TYPE_IMMEDIATE_GEN_REGION = 38
@ -30,7 +30,7 @@ SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 51 SEG_TYPE_END_OF_FILE = 51
# file literals # file literals
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A' FILE_HEADER_ID = b"\x97\x4A\x42\x32\x0D\x0A\x1A\x0A"
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001 FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
@ -66,12 +66,14 @@ def unpack_int(format: str, buffer: bytes) -> int:
JBIG2SegmentFlags = Dict[str, Union[int, bool]] JBIG2SegmentFlags = Dict[str, Union[int, bool]]
JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]] JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
JBIG2Segment = Dict[str, Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2Segment = Dict[
JBIG2RetentionFlags]] str, Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags]
]
class JBIG2StreamReader: class JBIG2StreamReader:
"""Read segments from a JBIG2 byte stream""" """Read segments from a JBIG2 byte stream"""
def __init__(self, stream: BinaryIO) -> None: def __init__(self, stream: BinaryIO) -> None:
self.stream = stream self.stream = stream
@ -96,29 +98,23 @@ class JBIG2StreamReader:
return segments return segments
def is_eof(self) -> bool: def is_eof(self) -> bool:
if self.stream.read(1) == b'': if self.stream.read(1) == b"":
return True return True
else: else:
self.stream.seek(-1, os.SEEK_CUR) self.stream.seek(-1, os.SEEK_CUR)
return False return False
def parse_flags( def parse_flags(
self, self, segment: JBIG2Segment, flags: int, field: bytes
segment: JBIG2Segment,
flags: int,
field: bytes
) -> JBIG2SegmentFlags: ) -> JBIG2SegmentFlags:
return { return {
"deferred": check_flag(HEADER_FLAG_DEFERRED, flags), "deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
"page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags), "page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
"type": masked_value(SEG_TYPE_MASK, flags) "type": masked_value(SEG_TYPE_MASK, flags),
} }
def parse_retention_flags( def parse_retention_flags(
self, self, segment: JBIG2Segment, flags: int, field: bytes
segment: JBIG2Segment,
flags: int,
field: bytes
) -> JBIG2RetentionFlags: ) -> JBIG2RetentionFlags:
ref_count = masked_value(REF_COUNT_SHORT_MASK, flags) ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
retain_segments = [] retain_segments = []
@ -159,31 +155,23 @@ class JBIG2StreamReader:
"ref_segments": ref_segments, "ref_segments": ref_segments,
} }
def parse_page_assoc( def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
self,
segment: JBIG2Segment,
page: int,
field: bytes
) -> int:
if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]: if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
field += self.stream.read(3) field += self.stream.read(3)
page = unpack_int(">L", field) page = unpack_int(">L", field)
return page return page
def parse_data_length( def parse_data_length(
self, self, segment: JBIG2Segment, length: int, field: bytes
segment: JBIG2Segment,
length: int,
field: bytes
) -> int: ) -> int:
if length: if length:
if (cast(JBIG2SegmentFlags, segment["flags"])["type"] == if (
SEG_TYPE_IMMEDIATE_GEN_REGION) \ cast(JBIG2SegmentFlags, segment["flags"])["type"]
and (length == DATA_LEN_UNKNOWN): == SEG_TYPE_IMMEDIATE_GEN_REGION
) and (length == DATA_LEN_UNKNOWN):
raise NotImplementedError( raise NotImplementedError(
"Working with unknown segment length " "Working with unknown segment length " "is not implemented yet"
"is not implemented yet"
) )
else: else:
segment["raw_data"] = self.stream.read(length) segment["raw_data"] = self.stream.read(length)
@ -195,18 +183,16 @@ class JBIG2StreamWriter:
"""Write JBIG2 segments to a file in JBIG2 format""" """Write JBIG2 segments to a file in JBIG2 format"""
EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = { EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
'ref_count': 0, "ref_count": 0,
'ref_segments': cast(List[int], []), "ref_segments": cast(List[int], []),
'retain_segments': cast(List[bool], []) "retain_segments": cast(List[bool], []),
} }
def __init__(self, stream: BinaryIO) -> None: def __init__(self, stream: BinaryIO) -> None:
self.stream = stream self.stream = stream
def write_segments( def write_segments(
self, self, segments: Iterable[JBIG2Segment], fix_last_page: bool = True
segments: Iterable[JBIG2Segment],
fix_last_page: bool = True
) -> int: ) -> int:
data_len = 0 data_len = 0
current_page: Optional[int] = None current_page: Optional[int] = None
@ -222,8 +208,10 @@ class JBIG2StreamWriter:
if fix_last_page: if fix_last_page:
seg_page = cast(int, segment.get("page_assoc")) seg_page = cast(int, segment.get("page_assoc"))
if cast(JBIG2SegmentFlags, segment["flags"])["type"] == \ if (
SEG_TYPE_END_OF_PAGE: cast(JBIG2SegmentFlags, segment["flags"])["type"]
== SEG_TYPE_END_OF_PAGE
):
current_page = None current_page = None
elif seg_page: elif seg_page:
current_page = seg_page current_page = seg_page
@ -237,9 +225,7 @@ class JBIG2StreamWriter:
return data_len return data_len
def write_file( def write_file(
self, self, segments: Iterable[JBIG2Segment], fix_last_page: bool = True
segments: Iterable[JBIG2Segment],
fix_last_page: bool = True
) -> int: ) -> int:
header = FILE_HEADER_ID header = FILE_HEADER_ID
header_flags = FILE_HEAD_FLAG_SEQUENTIAL header_flags = FILE_HEAD_FLAG_SEQUENTIAL
@ -270,7 +256,7 @@ class JBIG2StreamWriter:
return data_len return data_len
def encode_segment(self, segment: JBIG2Segment) -> bytes: def encode_segment(self, segment: JBIG2Segment) -> bytes:
data = b'' data = b""
for field_format, name in SEG_STRUCT: for field_format, name in SEG_STRUCT:
value = segment.get(name) value = segment.get(name)
encoder = getattr(self, "encode_%s" % name, None) encoder = getattr(self, "encode_%s" % name, None)
@ -281,27 +267,26 @@ class JBIG2StreamWriter:
data += field data += field
return data return data
def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
) -> bytes:
flags = 0 flags = 0
if value.get("deferred"): if value.get("deferred"):
flags |= HEADER_FLAG_DEFERRED flags |= HEADER_FLAG_DEFERRED
if "page_assoc_long" in value: if "page_assoc_long" in value:
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \ flags |= HEADER_FLAG_PAGE_ASSOC_LONG if value["page_assoc_long"] else flags
if value["page_assoc_long"] else flags
else: else:
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \ flags |= (
if cast(int, segment.get("page", 0)) > 255 else flags HEADER_FLAG_PAGE_ASSOC_LONG
if cast(int, segment.get("page", 0)) > 255
else flags
)
flags |= mask_value(SEG_TYPE_MASK, value["type"]) flags |= mask_value(SEG_TYPE_MASK, value["type"])
return pack(">B", flags) return pack(">B", flags)
def encode_retention_flags( def encode_retention_flags(
self, self, value: JBIG2RetentionFlags, segment: JBIG2Segment
value: JBIG2RetentionFlags,
segment: JBIG2Segment
) -> bytes: ) -> bytes:
flags = [] flags = []
flags_format = ">B" flags_format = ">B"
@ -318,15 +303,12 @@ class JBIG2StreamWriter:
else: else:
bytes_count = math.ceil((ref_count + 1) / 8) bytes_count = math.ceil((ref_count + 1) / 8)
flags_format = ">L" + ("B" * bytes_count) flags_format = ">L" + ("B" * bytes_count)
flags_dword = mask_value( flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
REF_COUNT_SHORT_MASK,
REF_COUNT_LONG
) << 24
flags.append(flags_dword) flags.append(flags_dword)
for byte_index in range(bytes_count): for byte_index in range(bytes_count):
ret_byte = 0 ret_byte = 0
ret_part = retain_segments[byte_index * 8:byte_index * 8 + 8] ret_part = retain_segments[byte_index * 8 : byte_index * 8 + 8]
for bit_pos, ret_seg in enumerate(ret_part): for bit_pos, ret_seg in enumerate(ret_part):
ret_byte |= 1 << bit_pos if ret_seg else ret_byte ret_byte |= 1 << bit_pos if ret_seg else ret_byte
@ -353,26 +335,22 @@ class JBIG2StreamWriter:
data += cast(bytes, segment["raw_data"]) data += cast(bytes, segment["raw_data"])
return data return data
def get_eop_segment( def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
self,
seg_number: int,
page_number: int
) -> JBIG2Segment:
return { return {
'data_length': 0, "data_length": 0,
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE}, "flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
'number': seg_number, "number": seg_number,
'page_assoc': page_number, "page_assoc": page_number,
'raw_data': b'', "raw_data": b"",
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
} }
def get_eof_segment(self, seg_number: int) -> JBIG2Segment: def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
return { return {
'data_length': 0, "data_length": 0,
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE}, "flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
'number': seg_number, "number": seg_number,
'page_assoc': 0, "page_assoc": 0,
'raw_data': b'', "raw_data": b"",
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS "retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
} }

View File

@ -7,241 +7,240 @@ This table is extracted from PDF Reference Manual 1.6, pp.925
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
EncodingRow = \ EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
ENCODING: List[EncodingRow] = [ ENCODING: List[EncodingRow] = [
# (name, std, mac, win, pdf) # (name, std, mac, win, pdf)
('A', 65, 65, 65, 65), ("A", 65, 65, 65, 65),
('AE', 225, 174, 198, 198), ("AE", 225, 174, 198, 198),
('Aacute', None, 231, 193, 193), ("Aacute", None, 231, 193, 193),
('Acircumflex', None, 229, 194, 194), ("Acircumflex", None, 229, 194, 194),
('Adieresis', None, 128, 196, 196), ("Adieresis", None, 128, 196, 196),
('Agrave', None, 203, 192, 192), ("Agrave", None, 203, 192, 192),
('Aring', None, 129, 197, 197), ("Aring", None, 129, 197, 197),
('Atilde', None, 204, 195, 195), ("Atilde", None, 204, 195, 195),
('B', 66, 66, 66, 66), ("B", 66, 66, 66, 66),
('C', 67, 67, 67, 67), ("C", 67, 67, 67, 67),
('Ccedilla', None, 130, 199, 199), ("Ccedilla", None, 130, 199, 199),
('D', 68, 68, 68, 68), ("D", 68, 68, 68, 68),
('E', 69, 69, 69, 69), ("E", 69, 69, 69, 69),
('Eacute', None, 131, 201, 201), ("Eacute", None, 131, 201, 201),
('Ecircumflex', None, 230, 202, 202), ("Ecircumflex", None, 230, 202, 202),
('Edieresis', None, 232, 203, 203), ("Edieresis", None, 232, 203, 203),
('Egrave', None, 233, 200, 200), ("Egrave", None, 233, 200, 200),
('Eth', None, None, 208, 208), ("Eth", None, None, 208, 208),
('Euro', None, None, 128, 160), ("Euro", None, None, 128, 160),
('F', 70, 70, 70, 70), ("F", 70, 70, 70, 70),
('G', 71, 71, 71, 71), ("G", 71, 71, 71, 71),
('H', 72, 72, 72, 72), ("H", 72, 72, 72, 72),
('I', 73, 73, 73, 73), ("I", 73, 73, 73, 73),
('Iacute', None, 234, 205, 205), ("Iacute", None, 234, 205, 205),
('Icircumflex', None, 235, 206, 206), ("Icircumflex", None, 235, 206, 206),
('Idieresis', None, 236, 207, 207), ("Idieresis", None, 236, 207, 207),
('Igrave', None, 237, 204, 204), ("Igrave", None, 237, 204, 204),
('J', 74, 74, 74, 74), ("J", 74, 74, 74, 74),
('K', 75, 75, 75, 75), ("K", 75, 75, 75, 75),
('L', 76, 76, 76, 76), ("L", 76, 76, 76, 76),
('Lslash', 232, None, None, 149), ("Lslash", 232, None, None, 149),
('M', 77, 77, 77, 77), ("M", 77, 77, 77, 77),
('N', 78, 78, 78, 78), ("N", 78, 78, 78, 78),
('Ntilde', None, 132, 209, 209), ("Ntilde", None, 132, 209, 209),
('O', 79, 79, 79, 79), ("O", 79, 79, 79, 79),
('OE', 234, 206, 140, 150), ("OE", 234, 206, 140, 150),
('Oacute', None, 238, 211, 211), ("Oacute", None, 238, 211, 211),
('Ocircumflex', None, 239, 212, 212), ("Ocircumflex", None, 239, 212, 212),
('Odieresis', None, 133, 214, 214), ("Odieresis", None, 133, 214, 214),
('Ograve', None, 241, 210, 210), ("Ograve", None, 241, 210, 210),
('Oslash', 233, 175, 216, 216), ("Oslash", 233, 175, 216, 216),
('Otilde', None, 205, 213, 213), ("Otilde", None, 205, 213, 213),
('P', 80, 80, 80, 80), ("P", 80, 80, 80, 80),
('Q', 81, 81, 81, 81), ("Q", 81, 81, 81, 81),
('R', 82, 82, 82, 82), ("R", 82, 82, 82, 82),
('S', 83, 83, 83, 83), ("S", 83, 83, 83, 83),
('Scaron', None, None, 138, 151), ("Scaron", None, None, 138, 151),
('T', 84, 84, 84, 84), ("T", 84, 84, 84, 84),
('Thorn', None, None, 222, 222), ("Thorn", None, None, 222, 222),
('U', 85, 85, 85, 85), ("U", 85, 85, 85, 85),
('Uacute', None, 242, 218, 218), ("Uacute", None, 242, 218, 218),
('Ucircumflex', None, 243, 219, 219), ("Ucircumflex", None, 243, 219, 219),
('Udieresis', None, 134, 220, 220), ("Udieresis", None, 134, 220, 220),
('Ugrave', None, 244, 217, 217), ("Ugrave", None, 244, 217, 217),
('V', 86, 86, 86, 86), ("V", 86, 86, 86, 86),
('W', 87, 87, 87, 87), ("W", 87, 87, 87, 87),
('X', 88, 88, 88, 88), ("X", 88, 88, 88, 88),
('Y', 89, 89, 89, 89), ("Y", 89, 89, 89, 89),
('Yacute', None, None, 221, 221), ("Yacute", None, None, 221, 221),
('Ydieresis', None, 217, 159, 152), ("Ydieresis", None, 217, 159, 152),
('Z', 90, 90, 90, 90), ("Z", 90, 90, 90, 90),
('Zcaron', None, None, 142, 153), ("Zcaron", None, None, 142, 153),
('a', 97, 97, 97, 97), ("a", 97, 97, 97, 97),
('aacute', None, 135, 225, 225), ("aacute", None, 135, 225, 225),
('acircumflex', None, 137, 226, 226), ("acircumflex", None, 137, 226, 226),
('acute', 194, 171, 180, 180), ("acute", 194, 171, 180, 180),
('adieresis', None, 138, 228, 228), ("adieresis", None, 138, 228, 228),
('ae', 241, 190, 230, 230), ("ae", 241, 190, 230, 230),
('agrave', None, 136, 224, 224), ("agrave", None, 136, 224, 224),
('ampersand', 38, 38, 38, 38), ("ampersand", 38, 38, 38, 38),
('aring', None, 140, 229, 229), ("aring", None, 140, 229, 229),
('asciicircum', 94, 94, 94, 94), ("asciicircum", 94, 94, 94, 94),
('asciitilde', 126, 126, 126, 126), ("asciitilde", 126, 126, 126, 126),
('asterisk', 42, 42, 42, 42), ("asterisk", 42, 42, 42, 42),
('at', 64, 64, 64, 64), ("at", 64, 64, 64, 64),
('atilde', None, 139, 227, 227), ("atilde", None, 139, 227, 227),
('b', 98, 98, 98, 98), ("b", 98, 98, 98, 98),
('backslash', 92, 92, 92, 92), ("backslash", 92, 92, 92, 92),
('bar', 124, 124, 124, 124), ("bar", 124, 124, 124, 124),
('braceleft', 123, 123, 123, 123), ("braceleft", 123, 123, 123, 123),
('braceright', 125, 125, 125, 125), ("braceright", 125, 125, 125, 125),
('bracketleft', 91, 91, 91, 91), ("bracketleft", 91, 91, 91, 91),
('bracketright', 93, 93, 93, 93), ("bracketright", 93, 93, 93, 93),
('breve', 198, 249, None, 24), ("breve", 198, 249, None, 24),
('brokenbar', None, None, 166, 166), ("brokenbar", None, None, 166, 166),
('bullet', 183, 165, 149, 128), ("bullet", 183, 165, 149, 128),
('c', 99, 99, 99, 99), ("c", 99, 99, 99, 99),
('caron', 207, 255, None, 25), ("caron", 207, 255, None, 25),
('ccedilla', None, 141, 231, 231), ("ccedilla", None, 141, 231, 231),
('cedilla', 203, 252, 184, 184), ("cedilla", 203, 252, 184, 184),
('cent', 162, 162, 162, 162), ("cent", 162, 162, 162, 162),
('circumflex', 195, 246, 136, 26), ("circumflex", 195, 246, 136, 26),
('colon', 58, 58, 58, 58), ("colon", 58, 58, 58, 58),
('comma', 44, 44, 44, 44), ("comma", 44, 44, 44, 44),
('copyright', None, 169, 169, 169), ("copyright", None, 169, 169, 169),
('currency', 168, 219, 164, 164), ("currency", 168, 219, 164, 164),
('d', 100, 100, 100, 100), ("d", 100, 100, 100, 100),
('dagger', 178, 160, 134, 129), ("dagger", 178, 160, 134, 129),
('daggerdbl', 179, 224, 135, 130), ("daggerdbl", 179, 224, 135, 130),
('degree', None, 161, 176, 176), ("degree", None, 161, 176, 176),
('dieresis', 200, 172, 168, 168), ("dieresis", 200, 172, 168, 168),
('divide', None, 214, 247, 247), ("divide", None, 214, 247, 247),
('dollar', 36, 36, 36, 36), ("dollar", 36, 36, 36, 36),
('dotaccent', 199, 250, None, 27), ("dotaccent", 199, 250, None, 27),
('dotlessi', 245, 245, None, 154), ("dotlessi", 245, 245, None, 154),
('e', 101, 101, 101, 101), ("e", 101, 101, 101, 101),
('eacute', None, 142, 233, 233), ("eacute", None, 142, 233, 233),
('ecircumflex', None, 144, 234, 234), ("ecircumflex", None, 144, 234, 234),
('edieresis', None, 145, 235, 235), ("edieresis", None, 145, 235, 235),
('egrave', None, 143, 232, 232), ("egrave", None, 143, 232, 232),
('eight', 56, 56, 56, 56), ("eight", 56, 56, 56, 56),
('ellipsis', 188, 201, 133, 131), ("ellipsis", 188, 201, 133, 131),
('emdash', 208, 209, 151, 132), ("emdash", 208, 209, 151, 132),
('endash', 177, 208, 150, 133), ("endash", 177, 208, 150, 133),
('equal', 61, 61, 61, 61), ("equal", 61, 61, 61, 61),
('eth', None, None, 240, 240), ("eth", None, None, 240, 240),
('exclam', 33, 33, 33, 33), ("exclam", 33, 33, 33, 33),
('exclamdown', 161, 193, 161, 161), ("exclamdown", 161, 193, 161, 161),
('f', 102, 102, 102, 102), ("f", 102, 102, 102, 102),
('fi', 174, 222, None, 147), ("fi", 174, 222, None, 147),
('five', 53, 53, 53, 53), ("five", 53, 53, 53, 53),
('fl', 175, 223, None, 148), ("fl", 175, 223, None, 148),
('florin', 166, 196, 131, 134), ("florin", 166, 196, 131, 134),
('four', 52, 52, 52, 52), ("four", 52, 52, 52, 52),
('fraction', 164, 218, None, 135), ("fraction", 164, 218, None, 135),
('g', 103, 103, 103, 103), ("g", 103, 103, 103, 103),
('germandbls', 251, 167, 223, 223), ("germandbls", 251, 167, 223, 223),
('grave', 193, 96, 96, 96), ("grave", 193, 96, 96, 96),
('greater', 62, 62, 62, 62), ("greater", 62, 62, 62, 62),
('guillemotleft', 171, 199, 171, 171), ("guillemotleft", 171, 199, 171, 171),
('guillemotright', 187, 200, 187, 187), ("guillemotright", 187, 200, 187, 187),
('guilsinglleft', 172, 220, 139, 136), ("guilsinglleft", 172, 220, 139, 136),
('guilsinglright', 173, 221, 155, 137), ("guilsinglright", 173, 221, 155, 137),
('h', 104, 104, 104, 104), ("h", 104, 104, 104, 104),
('hungarumlaut', 205, 253, None, 28), ("hungarumlaut", 205, 253, None, 28),
('hyphen', 45, 45, 45, 45), ("hyphen", 45, 45, 45, 45),
('i', 105, 105, 105, 105), ("i", 105, 105, 105, 105),
('iacute', None, 146, 237, 237), ("iacute", None, 146, 237, 237),
('icircumflex', None, 148, 238, 238), ("icircumflex", None, 148, 238, 238),
('idieresis', None, 149, 239, 239), ("idieresis", None, 149, 239, 239),
('igrave', None, 147, 236, 236), ("igrave", None, 147, 236, 236),
('j', 106, 106, 106, 106), ("j", 106, 106, 106, 106),
('k', 107, 107, 107, 107), ("k", 107, 107, 107, 107),
('l', 108, 108, 108, 108), ("l", 108, 108, 108, 108),
('less', 60, 60, 60, 60), ("less", 60, 60, 60, 60),
('logicalnot', None, 194, 172, 172), ("logicalnot", None, 194, 172, 172),
('lslash', 248, None, None, 155), ("lslash", 248, None, None, 155),
('m', 109, 109, 109, 109), ("m", 109, 109, 109, 109),
('macron', 197, 248, 175, 175), ("macron", 197, 248, 175, 175),
('minus', None, None, None, 138), ("minus", None, None, None, 138),
('mu', None, 181, 181, 181), ("mu", None, 181, 181, 181),
('multiply', None, None, 215, 215), ("multiply", None, None, 215, 215),
('n', 110, 110, 110, 110), ("n", 110, 110, 110, 110),
('nbspace', None, 202, 160, None), ("nbspace", None, 202, 160, None),
('nine', 57, 57, 57, 57), ("nine", 57, 57, 57, 57),
('ntilde', None, 150, 241, 241), ("ntilde", None, 150, 241, 241),
('numbersign', 35, 35, 35, 35), ("numbersign", 35, 35, 35, 35),
('o', 111, 111, 111, 111), ("o", 111, 111, 111, 111),
('oacute', None, 151, 243, 243), ("oacute", None, 151, 243, 243),
('ocircumflex', None, 153, 244, 244), ("ocircumflex", None, 153, 244, 244),
('odieresis', None, 154, 246, 246), ("odieresis", None, 154, 246, 246),
('oe', 250, 207, 156, 156), ("oe", 250, 207, 156, 156),
('ogonek', 206, 254, None, 29), ("ogonek", 206, 254, None, 29),
('ograve', None, 152, 242, 242), ("ograve", None, 152, 242, 242),
('one', 49, 49, 49, 49), ("one", 49, 49, 49, 49),
('onehalf', None, None, 189, 189), ("onehalf", None, None, 189, 189),
('onequarter', None, None, 188, 188), ("onequarter", None, None, 188, 188),
('onesuperior', None, None, 185, 185), ("onesuperior", None, None, 185, 185),
('ordfeminine', 227, 187, 170, 170), ("ordfeminine", 227, 187, 170, 170),
('ordmasculine', 235, 188, 186, 186), ("ordmasculine", 235, 188, 186, 186),
('oslash', 249, 191, 248, 248), ("oslash", 249, 191, 248, 248),
('otilde', None, 155, 245, 245), ("otilde", None, 155, 245, 245),
('p', 112, 112, 112, 112), ("p", 112, 112, 112, 112),
('paragraph', 182, 166, 182, 182), ("paragraph", 182, 166, 182, 182),
('parenleft', 40, 40, 40, 40), ("parenleft", 40, 40, 40, 40),
('parenright', 41, 41, 41, 41), ("parenright", 41, 41, 41, 41),
('percent', 37, 37, 37, 37), ("percent", 37, 37, 37, 37),
('period', 46, 46, 46, 46), ("period", 46, 46, 46, 46),
('periodcentered', 180, 225, 183, 183), ("periodcentered", 180, 225, 183, 183),
('perthousand', 189, 228, 137, 139), ("perthousand", 189, 228, 137, 139),
('plus', 43, 43, 43, 43), ("plus", 43, 43, 43, 43),
('plusminus', None, 177, 177, 177), ("plusminus", None, 177, 177, 177),
('q', 113, 113, 113, 113), ("q", 113, 113, 113, 113),
('question', 63, 63, 63, 63), ("question", 63, 63, 63, 63),
('questiondown', 191, 192, 191, 191), ("questiondown", 191, 192, 191, 191),
('quotedbl', 34, 34, 34, 34), ("quotedbl", 34, 34, 34, 34),
('quotedblbase', 185, 227, 132, 140), ("quotedblbase", 185, 227, 132, 140),
('quotedblleft', 170, 210, 147, 141), ("quotedblleft", 170, 210, 147, 141),
('quotedblright', 186, 211, 148, 142), ("quotedblright", 186, 211, 148, 142),
('quoteleft', 96, 212, 145, 143), ("quoteleft", 96, 212, 145, 143),
('quoteright', 39, 213, 146, 144), ("quoteright", 39, 213, 146, 144),
('quotesinglbase', 184, 226, 130, 145), ("quotesinglbase", 184, 226, 130, 145),
('quotesingle', 169, 39, 39, 39), ("quotesingle", 169, 39, 39, 39),
('r', 114, 114, 114, 114), ("r", 114, 114, 114, 114),
('registered', None, 168, 174, 174), ("registered", None, 168, 174, 174),
('ring', 202, 251, None, 30), ("ring", 202, 251, None, 30),
('s', 115, 115, 115, 115), ("s", 115, 115, 115, 115),
('scaron', None, None, 154, 157), ("scaron", None, None, 154, 157),
('section', 167, 164, 167, 167), ("section", 167, 164, 167, 167),
('semicolon', 59, 59, 59, 59), ("semicolon", 59, 59, 59, 59),
('seven', 55, 55, 55, 55), ("seven", 55, 55, 55, 55),
('six', 54, 54, 54, 54), ("six", 54, 54, 54, 54),
('slash', 47, 47, 47, 47), ("slash", 47, 47, 47, 47),
('space', 32, 32, 32, 32), ("space", 32, 32, 32, 32),
('space', None, 202, 160, None), ("space", None, 202, 160, None),
('space', None, 202, 173, None), ("space", None, 202, 173, None),
('sterling', 163, 163, 163, 163), ("sterling", 163, 163, 163, 163),
('t', 116, 116, 116, 116), ("t", 116, 116, 116, 116),
('thorn', None, None, 254, 254), ("thorn", None, None, 254, 254),
('three', 51, 51, 51, 51), ("three", 51, 51, 51, 51),
('threequarters', None, None, 190, 190), ("threequarters", None, None, 190, 190),
('threesuperior', None, None, 179, 179), ("threesuperior", None, None, 179, 179),
('tilde', 196, 247, 152, 31), ("tilde", 196, 247, 152, 31),
('trademark', None, 170, 153, 146), ("trademark", None, 170, 153, 146),
('two', 50, 50, 50, 50), ("two", 50, 50, 50, 50),
('twosuperior', None, None, 178, 178), ("twosuperior", None, None, 178, 178),
('u', 117, 117, 117, 117), ("u", 117, 117, 117, 117),
('uacute', None, 156, 250, 250), ("uacute", None, 156, 250, 250),
('ucircumflex', None, 158, 251, 251), ("ucircumflex", None, 158, 251, 251),
('udieresis', None, 159, 252, 252), ("udieresis", None, 159, 252, 252),
('ugrave', None, 157, 249, 249), ("ugrave", None, 157, 249, 249),
('underscore', 95, 95, 95, 95), ("underscore", 95, 95, 95, 95),
('v', 118, 118, 118, 118), ("v", 118, 118, 118, 118),
('w', 119, 119, 119, 119), ("w", 119, 119, 119, 119),
('x', 120, 120, 120, 120), ("x", 120, 120, 120, 120),
('y', 121, 121, 121, 121), ("y", 121, 121, 121, 121),
('yacute', None, None, 253, 253), ("yacute", None, None, 253, 253),
('ydieresis', None, 216, 255, 255), ("ydieresis", None, 216, 255, 255),
('yen', 165, 180, 165, 165), ("yen", 165, 180, 165, 165),
('z', 122, 122, 122, 122), ("z", 122, 122, 122, 122),
('zcaron', None, None, 158, 158), ("zcaron", None, None, 158, 158),
('zero', 48, 48, 48, 48), ("zero", 48, 48, 48, 48),
] ]

View File

@ -1,7 +1,19 @@
import heapq import heapq
import logging import logging
from typing import (Dict, Generic, Iterable, Iterator, List, Optional, from typing import (
Sequence, Set, Tuple, TypeVar, Union, cast) Dict,
Generic,
Iterable,
Iterator,
List,
Optional,
Sequence,
Set,
Tuple,
TypeVar,
Union,
cast,
)
from .pdfcolor import PDFColorSpace from .pdfcolor import PDFColorSpace
from .pdffont import PDFFont from .pdffont import PDFFont
@ -25,7 +37,6 @@ logger = logging.getLogger(__name__)
class IndexAssigner: class IndexAssigner:
def __init__(self, index: int = 0) -> None: def __init__(self, index: int = 0) -> None:
self.index = index self.index = index
@ -74,7 +85,7 @@ class LAParams:
word_margin: float = 0.1, word_margin: float = 0.1,
boxes_flow: Optional[float] = 0.5, boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = False, detect_vertical: bool = False,
all_texts: bool = False all_texts: bool = False,
) -> None: ) -> None:
self.line_overlap = line_overlap self.line_overlap = line_overlap
self.char_margin = char_margin self.char_margin = char_margin
@ -88,19 +99,22 @@ class LAParams:
def _validate(self) -> None: def _validate(self) -> None:
if self.boxes_flow is not None: if self.boxes_flow is not None:
boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a " boxes_flow_err_msg = (
"number between -1 and +1") "LAParam boxes_flow should be None, or a " "number between -1 and +1"
if not (isinstance(self.boxes_flow, int) or )
isinstance(self.boxes_flow, float)): if not (
isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float)
):
raise TypeError(boxes_flow_err_msg) raise TypeError(boxes_flow_err_msg)
if not -1 <= self.boxes_flow <= 1: if not -1 <= self.boxes_flow <= 1:
raise ValueError(boxes_flow_err_msg) raise ValueError(boxes_flow_err_msg)
def __repr__(self) -> str: def __repr__(self) -> str:
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \ return (
'word_margin=%.1f all_texts=%r>' % \ "<LAParams: char_margin=%.1f, line_margin=%.1f, "
(self.char_margin, self.line_margin, self.word_margin, "word_margin=%.1f all_texts=%r>"
self.all_texts) % (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
)
class LTItem: class LTItem:
@ -115,8 +129,7 @@ class LTText:
"""Interface for things that have text""" """Interface for things that have text"""
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s %r>' % return "<%s %r>" % (self.__class__.__name__, self.get_text())
(self.__class__.__name__, self.get_text()))
def get_text(self) -> str: def get_text(self) -> str:
"""Text contained in this object""" """Text contained in this object"""
@ -131,8 +144,7 @@ class LTComponent(LTItem):
self.set_bbox(bbox) self.set_bbox(bbox)
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s %s>' % return "<%s %s>" % (self.__class__.__name__, bbox2str(self.bbox))
(self.__class__.__name__, bbox2str(self.bbox)))
# Disable comparison. # Disable comparison.
def __lt__(self, _: object) -> bool: def __lt__(self, _: object) -> bool:
@ -153,8 +165,8 @@ class LTComponent(LTItem):
self.y0 = y0 self.y0 = y0
self.x1 = x1 self.x1 = x1
self.y1 = y1 self.y1 = y1
self.width = x1-x0 self.width = x1 - x0
self.height = y1-y0 self.height = y1 - y0
self.bbox = bbox self.bbox = bbox
def is_empty(self) -> bool: def is_empty(self) -> bool:
@ -169,12 +181,12 @@ class LTComponent(LTItem):
if self.is_hoverlap(obj): if self.is_hoverlap(obj):
return 0 return 0
else: else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
def hoverlap(self, obj: "LTComponent") -> float: def hoverlap(self, obj: "LTComponent") -> float:
assert isinstance(obj, LTComponent), str(type(obj)) assert isinstance(obj, LTComponent), str(type(obj))
if self.is_hoverlap(obj): if self.is_hoverlap(obj):
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
else: else:
return 0 return 0
@ -187,12 +199,12 @@ class LTComponent(LTItem):
if self.is_voverlap(obj): if self.is_voverlap(obj):
return 0 return 0
else: else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
def voverlap(self, obj: "LTComponent") -> float: def voverlap(self, obj: "LTComponent") -> float:
assert isinstance(obj, LTComponent), str(type(obj)) assert isinstance(obj, LTComponent), str(type(obj))
if self.is_voverlap(obj): if self.is_voverlap(obj):
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
else: else:
return 0 return 0
@ -208,7 +220,7 @@ class LTCurve(LTComponent):
fill: bool = False, fill: bool = False,
evenodd: bool = False, evenodd: bool = False,
stroking_color: Optional[Color] = None, stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None non_stroking_color: Optional[Color] = None,
) -> None: ) -> None:
LTComponent.__init__(self, get_bound(pts)) LTComponent.__init__(self, get_bound(pts))
self.pts = pts self.pts = pts
@ -220,7 +232,7 @@ class LTCurve(LTComponent):
self.non_stroking_color = non_stroking_color self.non_stroking_color = non_stroking_color
def get_pts(self) -> str: def get_pts(self) -> str:
return ','.join('%.3f,%.3f' % p for p in self.pts) return ",".join("%.3f,%.3f" % p for p in self.pts)
class LTLine(LTCurve): class LTLine(LTCurve):
@ -238,10 +250,18 @@ class LTLine(LTCurve):
fill: bool = False, fill: bool = False,
evenodd: bool = False, evenodd: bool = False,
stroking_color: Optional[Color] = None, stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None non_stroking_color: Optional[Color] = None,
) -> None: ) -> None:
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd, LTCurve.__init__(
stroking_color, non_stroking_color) self,
linewidth,
[p0, p1],
stroke,
fill,
evenodd,
stroking_color,
non_stroking_color,
)
class LTRect(LTCurve): class LTRect(LTCurve):
@ -258,12 +278,19 @@ class LTRect(LTCurve):
fill: bool = False, fill: bool = False,
evenodd: bool = False, evenodd: bool = False,
stroking_color: Optional[Color] = None, stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None non_stroking_color: Optional[Color] = None,
) -> None: ) -> None:
(x0, y0, x1, y1) = bbox (x0, y0, x1, y1) = bbox
LTCurve.__init__(self, linewidth, LTCurve.__init__(
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke, self,
fill, evenodd, stroking_color, non_stroking_color) linewidth,
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)],
stroke,
fill,
evenodd,
stroking_color,
non_stroking_color,
)
class LTImage(LTComponent): class LTImage(LTComponent):
@ -276,18 +303,20 @@ class LTImage(LTComponent):
LTComponent.__init__(self, bbox) LTComponent.__init__(self, bbox)
self.name = name self.name = name
self.stream = stream self.stream = stream
self.srcsize = (stream.get_any(('W', 'Width')), self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height")))
stream.get_any(('H', 'Height'))) self.imagemask = stream.get_any(("IM", "ImageMask"))
self.imagemask = stream.get_any(('IM', 'ImageMask')) self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1) self.colorspace = stream.get_any(("CS", "ColorSpace"))
self.colorspace = stream.get_any(('CS', 'ColorSpace'))
if not isinstance(self.colorspace, list): if not isinstance(self.colorspace, list):
self.colorspace = [self.colorspace] self.colorspace = [self.colorspace]
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s(%s) %s %r>' % return "<%s(%s) %s %r>" % (
(self.__class__.__name__, self.name, self.__class__.__name__,
bbox2str(self.bbox), self.srcsize)) self.name,
bbox2str(self.bbox),
self.srcsize,
)
class LTAnno(LTItem, LTText): class LTAnno(LTItem, LTText):
@ -320,7 +349,7 @@ class LTChar(LTComponent, LTText):
textwidth: float, textwidth: float,
textdisp: Union[float, Tuple[Optional[float], float]], textdisp: Union[float, Tuple[Optional[float], float]],
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: PDFGraphicState graphicstate: PDFGraphicState,
) -> None: ) -> None:
LTText.__init__(self) LTText.__init__(self)
self._text = text self._text = text
@ -337,8 +366,8 @@ class LTChar(LTComponent, LTText):
if vx is None: if vx is None:
vx = fontsize * 0.5 vx = fontsize * 0.5
else: else:
vx = vx * fontsize * .001 vx = vx * fontsize * 0.001
vy = (1000 - vy) * fontsize * .001 vy = (1000 - vy) * fontsize * 0.001
bbox_lower_left = (-vx, vy + rise + self.adv) bbox_lower_left = (-vx, vy + rise + self.adv)
bbox_upper_right = (-vx + fontsize, vy + rise) bbox_upper_right = (-vx + fontsize, vy + rise)
else: else:
@ -347,7 +376,7 @@ class LTChar(LTComponent, LTText):
bbox_lower_left = (0, descent + rise) bbox_lower_left = (0, descent + rise)
bbox_upper_right = (self.adv, descent + rise + fontsize) bbox_upper_right = (self.adv, descent + rise + fontsize)
(a, b, c, d, e, f) = self.matrix (a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0) self.upright = 0 < a * d * scaling and b * c <= 0
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left) (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right) (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
if x1 < x0: if x1 < x0:
@ -362,10 +391,14 @@ class LTChar(LTComponent, LTText):
return return
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' % return "<%s %s matrix=%s font=%r adv=%s text=%r>" % (
(self.__class__.__name__, bbox2str(self.bbox), self.__class__.__name__,
matrix2str(self.matrix), self.fontname, self.adv, bbox2str(self.bbox),
self.get_text())) matrix2str(self.matrix),
self.fontname,
self.adv,
self.get_text(),
)
def get_text(self) -> str: def get_text(self) -> str:
return self._text return self._text
@ -375,7 +408,7 @@ class LTChar(LTComponent, LTText):
return True return True
LTItemT = TypeVar('LTItemT', bound=LTItem) LTItemT = TypeVar("LTItemT", bound=LTItem)
class LTContainer(LTComponent, Generic[LTItemT]): class LTContainer(LTComponent, Generic[LTItemT]):
@ -416,8 +449,14 @@ class LTExpandableContainer(LTContainer[LTItemT]):
# super() LTContainer only considers LTItem (no bounding box). # super() LTContainer only considers LTItem (no bounding box).
def add(self, obj: LTComponent) -> None: # type: ignore[override] def add(self, obj: LTComponent) -> None: # type: ignore[override]
LTContainer.add(self, cast(LTItemT, obj)) LTContainer.add(self, cast(LTItemT, obj))
self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0), self.set_bbox(
max(self.x1, obj.x1), max(self.y1, obj.y1))) (
min(self.x0, obj.x0),
min(self.y0, obj.y0),
max(self.x1, obj.x1),
max(self.y1, obj.y1),
)
)
return return
@ -428,8 +467,9 @@ class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
return return
def get_text(self) -> str: def get_text(self) -> str:
return ''.join(cast(LTText, obj).get_text() for obj in self return "".join(
if isinstance(obj, LTText)) cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText)
)
TextLineElement = Union[LTChar, LTAnno] TextLineElement = Union[LTChar, LTAnno]
@ -448,17 +488,20 @@ class LTTextLine(LTTextContainer[TextLineElement]):
return return
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s %s %r>' % return "<%s %s %r>" % (
(self.__class__.__name__, bbox2str(self.bbox), self.__class__.__name__,
self.get_text())) bbox2str(self.bbox),
self.get_text(),
)
def analyze(self, laparams: LAParams) -> None: def analyze(self, laparams: LAParams) -> None:
LTTextContainer.analyze(self, laparams) LTTextContainer.analyze(self, laparams)
LTContainer.add(self, LTAnno('\n')) LTContainer.add(self, LTAnno("\n"))
return return
def find_neighbors(self, plane: Plane[LTComponentT], ratio: float def find_neighbors(
) -> List["LTTextLine"]: self, plane: Plane[LTComponentT], ratio: float
) -> List["LTTextLine"]:
raise NotImplementedError raise NotImplementedError
@ -474,15 +517,13 @@ class LTTextLineHorizontal(LTTextLine):
if isinstance(obj, LTChar) and self.word_margin: if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * max(obj.width, obj.height) margin = self.word_margin * max(obj.width, obj.height)
if self._x1 < obj.x0 - margin: if self._x1 < obj.x0 - margin:
LTContainer.add(self, LTAnno(' ')) LTContainer.add(self, LTAnno(" "))
self._x1 = obj.x1 self._x1 = obj.x1
super().add(obj) super().add(obj)
return return
def find_neighbors( def find_neighbors(
self, self, plane: Plane[LTComponentT], ratio: float
plane: Plane[LTComponentT],
ratio: float
) -> List[LTTextLine]: ) -> List[LTTextLine]:
""" """
Finds neighboring LTTextLineHorizontals in the plane. Finds neighboring LTTextLineHorizontals in the plane.
@ -494,49 +535,41 @@ class LTTextLineHorizontal(LTTextLine):
""" """
d = ratio * self.height d = ratio * self.height
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d)) objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
return [obj for obj in objs return [
if (isinstance(obj, LTTextLineHorizontal) and obj
self._is_same_height_as(obj, tolerance=d) and for obj in objs
(self._is_left_aligned_with(obj, tolerance=d) or if (
self._is_right_aligned_with(obj, tolerance=d) or isinstance(obj, LTTextLineHorizontal)
self._is_centrally_aligned_with(obj, tolerance=d)))] and self._is_same_height_as(obj, tolerance=d)
and (
self._is_left_aligned_with(obj, tolerance=d)
or self._is_right_aligned_with(obj, tolerance=d)
or self._is_centrally_aligned_with(obj, tolerance=d)
)
)
]
def _is_left_aligned_with( def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
""" """
Whether the left-hand edge of `other` is within `tolerance`. Whether the left-hand edge of `other` is within `tolerance`.
""" """
return abs(other.x0 - self.x0) <= tolerance return abs(other.x0 - self.x0) <= tolerance
def _is_right_aligned_with( def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
""" """
Whether the right-hand edge of `other` is within `tolerance`. Whether the right-hand edge of `other` is within `tolerance`.
""" """
return abs(other.x1 - self.x1) <= tolerance return abs(other.x1 - self.x1) <= tolerance
def _is_centrally_aligned_with( def _is_centrally_aligned_with(
self, self, other: LTComponent, tolerance: float = 0
other: LTComponent,
tolerance: float = 0
) -> bool: ) -> bool:
""" """
Whether the horizontal center of `other` is within `tolerance`. Whether the horizontal center of `other` is within `tolerance`.
""" """
return abs( return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
def _is_same_height_as( def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
return abs(other.height - self.height) <= tolerance return abs(other.height - self.height) <= tolerance
@ -552,15 +585,13 @@ class LTTextLineVertical(LTTextLine):
if isinstance(obj, LTChar) and self.word_margin: if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * max(obj.width, obj.height) margin = self.word_margin * max(obj.width, obj.height)
if obj.y1 + margin < self._y0: if obj.y1 + margin < self._y0:
LTContainer.add(self, LTAnno(' ')) LTContainer.add(self, LTAnno(" "))
self._y0 = obj.y0 self._y0 = obj.y0
super().add(obj) super().add(obj)
return return
def find_neighbors( def find_neighbors(
self, self, plane: Plane[LTComponentT], ratio: float
plane: Plane[LTComponentT],
ratio: float
) -> List[LTTextLine]: ) -> List[LTTextLine]:
""" """
Finds neighboring LTTextLineVerticals in the plane. Finds neighboring LTTextLineVerticals in the plane.
@ -572,43 +603,39 @@ class LTTextLineVertical(LTTextLine):
""" """
d = ratio * self.width d = ratio * self.width
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1)) objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
return [obj for obj in objs return [
if (isinstance(obj, LTTextLineVertical) and obj
self._is_same_width_as(obj, tolerance=d) and for obj in objs
(self._is_lower_aligned_with(obj, tolerance=d) or if (
self._is_upper_aligned_with(obj, tolerance=d) or isinstance(obj, LTTextLineVertical)
self._is_centrally_aligned_with(obj, tolerance=d)))] and self._is_same_width_as(obj, tolerance=d)
and (
self._is_lower_aligned_with(obj, tolerance=d)
or self._is_upper_aligned_with(obj, tolerance=d)
or self._is_centrally_aligned_with(obj, tolerance=d)
)
)
]
def _is_lower_aligned_with( def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
""" """
Whether the lower edge of `other` is within `tolerance`. Whether the lower edge of `other` is within `tolerance`.
""" """
return abs(other.y0 - self.y0) <= tolerance return abs(other.y0 - self.y0) <= tolerance
def _is_upper_aligned_with( def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
""" """
Whether the upper edge of `other` is within `tolerance`. Whether the upper edge of `other` is within `tolerance`.
""" """
return abs(other.y1 - self.y1) <= tolerance return abs(other.y1 - self.y1) <= tolerance
def _is_centrally_aligned_with( def _is_centrally_aligned_with(
self, self, other: LTComponent, tolerance: float = 0
other: LTComponent,
tolerance: float = 0
) -> bool: ) -> bool:
""" """
Whether the vertical center of `other` is within `tolerance`. Whether the vertical center of `other` is within `tolerance`.
""" """
return abs( return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool: def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
return abs(other.width - self.width) <= tolerance return abs(other.width - self.width) <= tolerance
@ -628,9 +655,12 @@ class LTTextBox(LTTextContainer[LTTextLine]):
return return
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s(%s) %s %r>' % return "<%s(%s) %s %r>" % (
(self.__class__.__name__, self.__class__.__name__,
self.index, bbox2str(self.bbox), self.get_text())) self.index,
bbox2str(self.bbox),
self.get_text(),
)
def get_writing_mode(self) -> str: def get_writing_mode(self) -> str:
raise NotImplementedError raise NotImplementedError
@ -643,7 +673,7 @@ class LTTextBoxHorizontal(LTTextBox):
return return
def get_writing_mode(self) -> str: def get_writing_mode(self) -> str:
return 'lr-tb' return "lr-tb"
class LTTextBoxVertical(LTTextBox): class LTTextBoxVertical(LTTextBox):
@ -653,7 +683,7 @@ class LTTextBoxVertical(LTTextBox):
return return
def get_writing_mode(self) -> str: def get_writing_mode(self) -> str:
return 'tb-rl' return "tb-rl"
TextGroupElement = Union[LTTextBox, "LTTextGroup"] TextGroupElement = Union[LTTextBox, "LTTextGroup"]
@ -674,7 +704,8 @@ class LTTextGroupLRTB(LTTextGroup):
# reorder the objects from top-left to bottom-right. # reorder the objects from top-left to bottom-right.
self._objs.sort( self._objs.sort(
key=lambda obj: (1 - boxes_flow) * obj.x0 key=lambda obj: (1 - boxes_flow) * obj.x0
- (1 + boxes_flow) * (obj.y0 + obj.y1)) - (1 + boxes_flow) * (obj.y0 + obj.y1)
)
return return
@ -685,8 +716,9 @@ class LTTextGroupTBRL(LTTextGroup):
boxes_flow = laparams.boxes_flow boxes_flow = laparams.boxes_flow
# reorder the objects from top-right to bottom-left. # reorder the objects from top-right to bottom-left.
self._objs.sort( self._objs.sort(
key=lambda obj: - (1 + boxes_flow) * (obj.x0 + obj.x1) key=lambda obj: -(1 + boxes_flow) * (obj.x0 + obj.x1)
- (1 - boxes_flow) * obj.y1) - (1 - boxes_flow) * obj.y1
)
return return
@ -698,9 +730,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
# group_objects: group text object to textlines. # group_objects: group text object to textlines.
def group_objects( def group_objects(
self, self, laparams: LAParams, objs: Iterable[LTComponent]
laparams: LAParams,
objs: Iterable[LTComponent]
) -> Iterator[LTTextLine]: ) -> Iterator[LTTextLine]:
obj0 = None obj0 = None
line = None line = None
@ -716,13 +746,14 @@ class LTLayoutContainer(LTContainer[LTComponent]):
# #
# |<--->| # |<--->|
# (char_margin) # (char_margin)
halign = \ halign = (
obj0.is_compatible(obj1) \ obj0.is_compatible(obj1)
and obj0.is_voverlap(obj1) \ and obj0.is_voverlap(obj1)
and min(obj0.height, obj1.height) * laparams.line_overlap \ and min(obj0.height, obj1.height) * laparams.line_overlap
< obj0.voverlap(obj1) \ < obj0.voverlap(obj1)
and obj0.hdistance(obj1) \ and obj0.hdistance(obj1)
< max(obj0.width, obj1.width) * laparams.char_margin < max(obj0.width, obj1.width) * laparams.char_margin
)
# valign: obj0 and obj1 is vertically aligned. # valign: obj0 and obj1 is vertically aligned.
# #
@ -738,17 +769,19 @@ class LTLayoutContainer(LTContainer[LTComponent]):
# #
# |<-->| # |<-->|
# (line_overlap) # (line_overlap)
valign = \ valign = (
laparams.detect_vertical \ laparams.detect_vertical
and obj0.is_compatible(obj1) \ and obj0.is_compatible(obj1)
and obj0.is_hoverlap(obj1) \ and obj0.is_hoverlap(obj1)
and min(obj0.width, obj1.width) * laparams.line_overlap \ and min(obj0.width, obj1.width) * laparams.line_overlap
< obj0.hoverlap(obj1) \ < obj0.hoverlap(obj1)
and obj0.vdistance(obj1) \ and obj0.vdistance(obj1)
< max(obj0.height, obj1.height) * laparams.char_margin < max(obj0.height, obj1.height) * laparams.char_margin
)
if ((halign and isinstance(line, LTTextLineHorizontal)) or if (halign and isinstance(line, LTTextLineHorizontal)) or (
(valign and isinstance(line, LTTextLineVertical))): valign and isinstance(line, LTTextLineVertical)
):
line.add(obj1) line.add(obj1)
elif line is not None: elif line is not None:
@ -777,9 +810,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
return return
def group_textlines( def group_textlines(
self, self, laparams: LAParams, lines: Iterable[LTTextLine]
laparams: LAParams,
lines: Iterable[LTTextLine]
) -> Iterator[LTTextBox]: ) -> Iterator[LTTextBox]:
"""Group neighboring lines to textboxes""" """Group neighboring lines to textboxes"""
plane: Plane[LTTextLine] = Plane(self.bbox) plane: Plane[LTTextLine] = Plane(self.bbox)
@ -812,9 +843,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
return return
def group_textboxes( def group_textboxes(
self, self, laparams: LAParams, boxes: Sequence[LTTextBox]
laparams: LAParams,
boxes: Sequence[LTTextBox]
) -> List[LTTextGroup]: ) -> List[LTTextGroup]:
"""Group textboxes hierarchically. """Group textboxes hierarchically.
@ -853,8 +882,11 @@ class LTLayoutContainer(LTContainer[LTComponent]):
y0 = min(obj1.y0, obj2.y0) y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1) x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1) y1 = max(obj1.y1, obj2.y1)
return (x1 - x0) * (y1 - y0) \ return (
- obj1.width*obj1.height - obj2.width*obj2.height (x1 - x0) * (y1 - y0)
- obj1.width * obj1.height
- obj2.width * obj2.height
)
def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]: def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
"""Check if there's any other object between obj1 and obj2.""" """Check if there's any other object between obj1 and obj2."""
@ -868,10 +900,9 @@ class LTLayoutContainer(LTContainer[LTComponent]):
dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = [] dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
for i in range(len(boxes)): for i in range(len(boxes)):
box1 = boxes[i] box1 = boxes[i]
for j in range(i+1, len(boxes)): for j in range(i + 1, len(boxes)):
box2 = boxes[j] box2 = boxes[j]
dists.append((False, dist(box1, box2), id(box1), id(box2), dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
box1, box2))
heapq.heapify(dists) heapq.heapify(dists)
plane.extend(boxes) plane.extend(boxes)
@ -883,8 +914,9 @@ class LTLayoutContainer(LTContainer[LTComponent]):
if not skip_isany and isany(obj1, obj2): if not skip_isany and isany(obj1, obj2):
heapq.heappush(dists, (True, d, id1, id2, obj1, obj2)) heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
continue continue
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \ if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)): obj2, (LTTextBoxVertical, LTTextGroupTBRL)
):
group: LTTextGroup = LTTextGroupTBRL([obj1, obj2]) group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
else: else:
group = LTTextGroupLRTB([obj1, obj2]) group = LTTextGroupLRTB([obj1, obj2])
@ -893,8 +925,10 @@ class LTLayoutContainer(LTContainer[LTComponent]):
done.update([id1, id2]) done.update([id1, id2])
for other in plane: for other in plane:
heapq.heappush(dists, (False, dist(group, other), heapq.heappush(
id(group), id(other), group, other)) dists,
(False, dist(group, other), id(group), id(other), group, other),
)
plane.add(group) plane.add(group)
# By now only groups are in the plane # By now only groups are in the plane
return list(cast(LTTextGroup, g) for g in plane) return list(cast(LTTextGroup, g) for g in plane)
@ -902,8 +936,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
def analyze(self, laparams: LAParams) -> None: def analyze(self, laparams: LAParams) -> None:
# textobjs is a list of LTChar objects, i.e. # textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page. # it has all the individual characters in the page.
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
self)
for obj in otherobjs: for obj in otherobjs:
obj.analyze(laparams) obj.analyze(laparams)
if not textobjs: if not textobjs:
@ -922,6 +955,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
return (0, -box.x1, -box.y0) return (0, -box.x1, -box.y0)
else: else:
return (1, -box.y0, box.x0) return (1, -box.y0, box.x0)
textboxes.sort(key=getkey) textboxes.sort(key=getkey)
else: else:
self.groups = self.group_textboxes(laparams, textboxes) self.groups = self.group_textboxes(laparams, textboxes)
@ -930,8 +964,11 @@ class LTLayoutContainer(LTContainer[LTComponent]):
group.analyze(laparams) group.analyze(laparams)
assigner.run(group) assigner.run(group)
textboxes.sort(key=lambda box: box.index) textboxes.sort(key=lambda box: box.index)
self._objs = (cast(List[LTComponent], textboxes) + otherobjs self._objs = (
+ cast(List[LTComponent], empties)) cast(List[LTComponent], textboxes)
+ otherobjs
+ cast(List[LTComponent], empties)
)
return return
@ -953,9 +990,12 @@ class LTFigure(LTLayoutContainer):
return return
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s(%s) %s matrix=%s>' % return "<%s(%s) %s matrix=%s>" % (
(self.__class__.__name__, self.name, self.__class__.__name__,
bbox2str(self.bbox), matrix2str(self.matrix))) self.name,
bbox2str(self.bbox),
matrix2str(self.matrix),
)
def analyze(self, laparams: LAParams) -> None: def analyze(self, laparams: LAParams) -> None:
if not laparams.all_texts: if not laparams.all_texts:
@ -978,6 +1018,9 @@ class LTPage(LTLayoutContainer):
return return
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<%s(%r) %s rotate=%r>' % return "<%s(%r) %s rotate=%r>" % (
(self.__class__.__name__, self.pageid, self.__class__.__name__,
bbox2str(self.bbox), self.rotate)) self.pageid,
bbox2str(self.bbox),
self.rotate,
)

View File

@ -10,7 +10,6 @@ class CorruptDataError(Exception):
class LZWDecoder: class LZWDecoder:
def __init__(self, fp: BinaryIO) -> None: def __init__(self, fp: BinaryIO) -> None:
self.fp = fp self.fp = fp
self.buff = 0 self.buff = 0
@ -24,19 +23,19 @@ class LZWDecoder:
v = 0 v = 0
while 1: while 1:
# the number of remaining bits we can get from the current buffer. # the number of remaining bits we can get from the current buffer.
r = 8-self.bpos r = 8 - self.bpos
if bits <= r: if bits <= r:
# |-----8-bits-----| # |-----8-bits-----|
# |-bpos-|-bits-| | # |-bpos-|-bits-| |
# | |----r----| # | |----r----|
v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1)) v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
self.bpos += bits self.bpos += bits
break break
else: else:
# |-----8-bits-----| # |-----8-bits-----|
# |-bpos-|---bits----... # |-bpos-|---bits----...
# | |----r----| # | |----r----|
v = (v << r) | (self.buff & ((1 << r)-1)) v = (v << r) | (self.buff & ((1 << r) - 1))
bits -= r bits -= r
x = self.fp.read(1) x = self.fp.read(1)
if not x: if not x:
@ -46,12 +45,12 @@ class LZWDecoder:
return v return v
def feed(self, code: int) -> bytes: def feed(self, code: int) -> bytes:
x = b'' x = b""
if code == 256: if code == 256:
self.table = [bytes((c,)) for c in range(256)] # 0-255 self.table = [bytes((c,)) for c in range(256)] # 0-255
self.table.append(None) # 256 self.table.append(None) # 256
self.table.append(None) # 257 self.table.append(None) # 257
self.prevbuf = b'' self.prevbuf = b""
self.nbits = 9 self.nbits = 9
elif code == 257: elif code == 257:
pass pass
@ -62,9 +61,9 @@ class LZWDecoder:
assert self.table is not None assert self.table is not None
if code < len(self.table): if code < len(self.table):
x = cast(bytes, self.table[code]) # assume not None x = cast(bytes, self.table[code]) # assume not None
self.table.append(self.prevbuf+x[:1]) self.table.append(self.prevbuf + x[:1])
elif code == len(self.table): elif code == len(self.table):
self.table.append(self.prevbuf+self.prevbuf[:1]) self.table.append(self.prevbuf + self.prevbuf[:1])
x = cast(bytes, self.table[code]) x = cast(bytes, self.table[code])
else: else:
raise CorruptDataError raise CorruptDataError
@ -91,11 +90,13 @@ class LZWDecoder:
break break
yield x yield x
assert self.table is not None assert self.table is not None
logger.debug('nbits=%d, code=%d, output=%r, table=%r' logger.debug(
% (self.nbits, code, x, self.table[258:])) "nbits=%d, code=%d, output=%r, table=%r"
% (self.nbits, code, x, self.table[258:])
)
def lzwdecode(data: bytes) -> bytes: def lzwdecode(data: bytes) -> bytes:
fp = BytesIO(data) fp = BytesIO(data)
s = LZWDecoder(fp).run() s = LZWDecoder(fp).run()
return b''.join(s) return b"".join(s)

View File

@ -3,33 +3,31 @@ from typing import Dict
from .psparser import LIT from .psparser import LIT
LITERAL_DEVICE_GRAY = LIT('DeviceGray') LITERAL_DEVICE_GRAY = LIT("DeviceGray")
LITERAL_DEVICE_RGB = LIT('DeviceRGB') LITERAL_DEVICE_RGB = LIT("DeviceRGB")
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK') LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
class PDFColorSpace: class PDFColorSpace:
def __init__(self, name: str, ncomponents: int) -> None: def __init__(self, name: str, ncomponents: int) -> None:
self.name = name self.name = name
self.ncomponents = ncomponents self.ncomponents = ncomponents
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFColorSpace: %s, ncomponents=%d>' % \ return "<PDFColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)
(self.name, self.ncomponents)
PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict() PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
for (name, n) in [ for (name, n) in [
('DeviceGray', 1), # default value first ("DeviceGray", 1), # default value first
('CalRGB', 3), ("CalRGB", 3),
('CalGray', 1), ("CalGray", 1),
('Lab', 3), ("Lab", 3),
('DeviceRGB', 3), ("DeviceRGB", 3),
('DeviceCMYK', 4), ("DeviceCMYK", 4),
('Separation', 1), ("Separation", 1),
('Indexed', 1), ("Indexed", 1),
('Pattern', 1), ("Pattern", 1),
]: ]:
PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n) PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)

View File

@ -1,5 +1,13 @@
from typing import (BinaryIO, Iterable, List, Optional, Sequence, from typing import (
TYPE_CHECKING, Union, cast) BinaryIO,
Iterable,
List,
Optional,
Sequence,
TYPE_CHECKING,
Union,
cast,
)
from pdfminer.psparser import PSLiteral from pdfminer.psparser import PSLiteral
from . import utils from . import utils
@ -21,25 +29,19 @@ PDFTextSeq = Iterable[Union[int, float, bytes]]
class PDFDevice: class PDFDevice:
"""Translate the output of PDFPageInterpreter to the output that is needed """Translate the output of PDFPageInterpreter to the output that is needed"""
"""
def __init__(self, rsrcmgr: "PDFResourceManager") -> None: def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
self.rsrcmgr = rsrcmgr self.rsrcmgr = rsrcmgr
self.ctm: Optional[Matrix] = None self.ctm: Optional[Matrix] = None
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFDevice>' return "<PDFDevice>"
def __enter__(self) -> "PDFDevice": def __enter__(self) -> "PDFDevice":
return self return self
def __exit__( def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
self,
exc_type: object,
exc_val: object,
exc_tb: object
) -> None:
self.close() self.close()
def close(self) -> None: def close(self) -> None:
@ -48,21 +50,13 @@ class PDFDevice:
def set_ctm(self, ctm: Matrix) -> None: def set_ctm(self, ctm: Matrix) -> None:
self.ctm = ctm self.ctm = ctm
def begin_tag( def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
self,
tag: PSLiteral,
props: Optional["PDFStackT"] = None
) -> None:
pass pass
def end_tag(self) -> None: def end_tag(self) -> None:
pass pass
def do_tag( def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
self,
tag: PSLiteral,
props: Optional["PDFStackT"] = None
) -> None:
pass pass
def begin_page(self, page: PDFPage, ctm: Matrix) -> None: def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
@ -83,7 +77,7 @@ class PDFDevice:
stroke: bool, stroke: bool,
fill: bool, fill: bool,
evenodd: bool, evenodd: bool,
path: Sequence[PathSegment] path: Sequence[PathSegment],
) -> None: ) -> None:
pass pass
@ -95,42 +89,61 @@ class PDFDevice:
textstate: "PDFTextState", textstate: "PDFTextState",
seq: PDFTextSeq, seq: PDFTextSeq,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: "PDFGraphicState" graphicstate: "PDFGraphicState",
) -> None: ) -> None:
pass pass
class PDFTextDevice(PDFDevice): class PDFTextDevice(PDFDevice):
def render_string( def render_string(
self, self,
textstate: "PDFTextState", textstate: "PDFTextState",
seq: PDFTextSeq, seq: PDFTextSeq,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: "PDFGraphicState" graphicstate: "PDFGraphicState",
) -> None: ) -> None:
assert self.ctm is not None assert self.ctm is not None
matrix = utils.mult_matrix(textstate.matrix, self.ctm) matrix = utils.mult_matrix(textstate.matrix, self.ctm)
font = textstate.font font = textstate.font
fontsize = textstate.fontsize fontsize = textstate.fontsize
scaling = textstate.scaling * .01 scaling = textstate.scaling * 0.01
charspace = textstate.charspace * scaling charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling wordspace = textstate.wordspace * scaling
rise = textstate.rise rise = textstate.rise
assert font is not None assert font is not None
if font.is_multibyte(): if font.is_multibyte():
wordspace = 0 wordspace = 0
dxscale = .001 * fontsize * scaling dxscale = 0.001 * fontsize * scaling
if font.is_vertical(): if font.is_vertical():
textstate.linematrix = self.render_string_vertical( textstate.linematrix = self.render_string_vertical(
seq, matrix, textstate.linematrix, font, fontsize, seq,
scaling, charspace, wordspace, rise, dxscale, ncs, matrix,
graphicstate) textstate.linematrix,
font,
fontsize,
scaling,
charspace,
wordspace,
rise,
dxscale,
ncs,
graphicstate,
)
else: else:
textstate.linematrix = self.render_string_horizontal( textstate.linematrix = self.render_string_horizontal(
seq, matrix, textstate.linematrix, font, fontsize, seq,
scaling, charspace, wordspace, rise, dxscale, ncs, matrix,
graphicstate) textstate.linematrix,
font,
fontsize,
scaling,
charspace,
wordspace,
rise,
dxscale,
ncs,
graphicstate,
)
def render_string_horizontal( def render_string_horizontal(
self, self,
@ -145,21 +158,28 @@ class PDFTextDevice(PDFDevice):
rise: float, rise: float,
dxscale: float, dxscale: float,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: "PDFGraphicState" graphicstate: "PDFGraphicState",
) -> Point: ) -> Point:
(x, y) = pos (x, y) = pos
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
if isinstance(obj, (int, float)): if isinstance(obj, (int, float)):
x -= obj*dxscale x -= obj * dxscale
needcharspace = True needcharspace = True
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
if needcharspace: if needcharspace:
x += charspace x += charspace
x += self.render_char( x += self.render_char(
utils.translate_matrix(matrix, (x, y)), font, utils.translate_matrix(matrix, (x, y)),
fontsize, scaling, rise, cid, ncs, graphicstate) font,
fontsize,
scaling,
rise,
cid,
ncs,
graphicstate,
)
if cid == 32 and wordspace: if cid == 32 and wordspace:
x += wordspace x += wordspace
needcharspace = True needcharspace = True
@ -178,21 +198,28 @@ class PDFTextDevice(PDFDevice):
rise: float, rise: float,
dxscale: float, dxscale: float,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: "PDFGraphicState" graphicstate: "PDFGraphicState",
) -> Point: ) -> Point:
(x, y) = pos (x, y) = pos
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
if isinstance(obj, (int, float)): if isinstance(obj, (int, float)):
y -= obj*dxscale y -= obj * dxscale
needcharspace = True needcharspace = True
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
if needcharspace: if needcharspace:
y += charspace y += charspace
y += self.render_char( y += self.render_char(
utils.translate_matrix(matrix, (x, y)), font, fontsize, utils.translate_matrix(matrix, (x, y)),
scaling, rise, cid, ncs, graphicstate) font,
fontsize,
scaling,
rise,
cid,
ncs,
graphicstate,
)
if cid == 32 and wordspace: if cid == 32 and wordspace:
y += wordspace y += wordspace
needcharspace = True needcharspace = True
@ -207,18 +234,14 @@ class PDFTextDevice(PDFDevice):
rise: float, rise: float,
cid: int, cid: int,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: "PDFGraphicState" graphicstate: "PDFGraphicState",
) -> float: ) -> float:
return 0 return 0
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
def __init__( def __init__(
self, self, rsrcmgr: "PDFResourceManager", outfp: BinaryIO, codec: str = "utf-8"
rsrcmgr: "PDFResourceManager",
outfp: BinaryIO,
codec: str = 'utf-8'
) -> None: ) -> None:
PDFDevice.__init__(self, rsrcmgr) PDFDevice.__init__(self, rsrcmgr)
self.outfp = outfp self.outfp = outfp
@ -231,11 +254,11 @@ class TagExtractor(PDFDevice):
textstate: "PDFTextState", textstate: "PDFTextState",
seq: PDFTextSeq, seq: PDFTextSeq,
ncs: PDFColorSpace, ncs: PDFColorSpace,
graphicstate: "PDFGraphicState" graphicstate: "PDFGraphicState",
) -> None: ) -> None:
font = textstate.font font = textstate.font
assert font is not None assert font is not None
text = '' text = ""
for obj in seq: for obj in seq:
if isinstance(obj, str): if isinstance(obj, str):
obj = utils.make_compat_bytes(obj) obj = utils.make_compat_bytes(obj)
@ -251,25 +274,29 @@ class TagExtractor(PDFDevice):
self._write(utils.enc(text)) self._write(utils.enc(text))
def begin_page(self, page: PDFPage, ctm: Matrix) -> None: def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
output = '<page id="%s" bbox="%s" rotate="%d">' %\ output = '<page id="%s" bbox="%s" rotate="%d">' % (
(self.pageno, utils.bbox2str(page.mediabox), page.rotate) self.pageno,
utils.bbox2str(page.mediabox),
page.rotate,
)
self._write(output) self._write(output)
return return
def end_page(self, page: PDFPage) -> None: def end_page(self, page: PDFPage) -> None:
self._write('</page>\n') self._write("</page>\n")
self.pageno += 1 self.pageno += 1
return return
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
) -> None: s = ""
s = ''
if isinstance(props, dict): if isinstance(props, dict):
s = ''.join([ s = "".join(
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v)) [
for (k, v) in sorted(props.items()) ' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
]) for (k, v) in sorted(props.items())
out_s = '<{}{}>'.format(utils.enc(cast(str, tag.name)), s) ]
)
out_s = "<{}{}>".format(utils.enc(cast(str, tag.name)), s)
self._write(out_s) self._write(out_s)
self._stack.append(tag) self._stack.append(tag)
return return
@ -277,12 +304,11 @@ class TagExtractor(PDFDevice):
def end_tag(self) -> None: def end_tag(self) -> None:
assert self._stack, str(self.pageno) assert self._stack, str(self.pageno)
tag = self._stack.pop(-1) tag = self._stack.pop(-1)
out_s = '</%s>' % utils.enc(cast(str, tag.name)) out_s = "</%s>" % utils.enc(cast(str, tag.name))
self._write(out_s) self._write(out_s)
return return
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
) -> None:
self.begin_tag(tag, props) self.begin_tag(tag, props)
self._stack.pop(-1) self._stack.pop(-1)
return return

View File

@ -3,8 +3,21 @@ import logging
import re import re
import struct import struct
from hashlib import sha256, md5, sha384, sha512 from hashlib import sha256, md5, sha384, sha512
from typing import (Any, Callable, Dict, Iterable, Iterator, KeysView, List, from typing import (
Optional, Sequence, Tuple, Type, Union, cast) Any,
Callable,
Dict,
Iterable,
Iterator,
KeysView,
List,
Optional,
Sequence,
Tuple,
Type,
Union,
cast,
)
from cryptography.hazmat.backends import default_backend from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
@ -13,12 +26,22 @@ from . import settings
from .arcfour import Arcfour from .arcfour import Arcfour
from .data_structures import NumberTree from .data_structures import NumberTree
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, \ from .pdftypes import (
PDFStream, PDFObjectNotFound, decipher_all, int_value, str_value, \ DecipherCallable,
list_value, uint_value, dict_value, stream_value PDFException,
PDFTypeError,
PDFStream,
PDFObjectNotFound,
decipher_all,
int_value,
str_value,
list_value,
uint_value,
dict_value,
stream_value,
)
from .psparser import PSEOF, literal_name, LIT, KWD from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, decode_text, nunpack, format_int_roman, \ from .utils import choplist, decode_text, nunpack, format_int_roman, format_int_alpha
format_int_alpha
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -32,6 +55,7 @@ class PDFNoValidXRefWarning(SyntaxWarning):
Not used anymore because warnings.warn is replaced by logger.Logger.warn. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
""" """
pass pass
@ -60,6 +84,7 @@ class PDFEncryptionWarning(UserWarning):
Not used anymore because warnings.warn is replaced by logger.Logger.warn. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
""" """
pass pass
@ -68,6 +93,7 @@ class PDFTextExtractionNotAllowedWarning(UserWarning):
Not used anymore because warnings.warn is replaced by logger.Logger.warn. Not used anymore because warnings.warn is replaced by logger.Logger.warn.
""" """
pass pass
@ -78,15 +104,19 @@ class PDFTextExtractionNotAllowed(PDFEncryptionError):
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed): class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
def __init__(self, *args: object) -> None: def __init__(self, *args: object) -> None:
from warnings import warn from warnings import warn
warn('PDFTextExtractionNotAllowedError will be removed in the future. '
'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning) warn(
"PDFTextExtractionNotAllowedError will be removed in the future. "
"Use PDFTextExtractionNotAllowed instead.",
DeprecationWarning,
)
super().__init__(*args) super().__init__(*args)
# some predefined literals and keywords. # some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm') LITERAL_OBJSTM = LIT("ObjStm")
LITERAL_XREF = LIT('XRef') LITERAL_XREF = LIT("XRef")
LITERAL_CATALOG = LIT('Catalog') LITERAL_CATALOG = LIT("Catalog")
class PDFBaseXRef: class PDFBaseXRef:
@ -107,13 +137,12 @@ class PDFBaseXRef:
class PDFXRef(PDFBaseXRef): class PDFXRef(PDFBaseXRef):
def __init__(self) -> None: def __init__(self) -> None:
self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {} self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
self.trailer: Dict[str, Any] = {} self.trailer: Dict[str, Any] = {}
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFXRef: offsets=%r>' % (self.offsets.keys()) return "<PDFXRef: offsets=%r>" % (self.offsets.keys())
def load(self, parser: PDFParser) -> None: def load(self, parser: PDFParser) -> None:
while True: while True:
@ -123,51 +152,50 @@ class PDFXRef(PDFBaseXRef):
if not line: if not line:
continue continue
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
if line.startswith(b'trailer'): if line.startswith(b"trailer"):
parser.seek(pos) parser.seek(pos)
break break
f = line.split(b' ') f = line.split(b" ")
if len(f) != 2: if len(f) != 2:
error_msg = 'Trailer not found: {!r}: line={!r}'\ error_msg = "Trailer not found: {!r}: line={!r}".format(parser, line)
.format(parser, line)
raise PDFNoValidXRef(error_msg) raise PDFNoValidXRef(error_msg)
try: try:
(start, nobjs) = map(int, f) (start, nobjs) = map(int, f)
except ValueError: except ValueError:
error_msg = 'Invalid line: {!r}: line={!r}'\ error_msg = "Invalid line: {!r}: line={!r}".format(parser, line)
.format(parser, line)
raise PDFNoValidXRef(error_msg) raise PDFNoValidXRef(error_msg)
for objid in range(start, start+nobjs): for objid in range(start, start + nobjs):
try: try:
(_, line) = parser.nextline() (_, line) = parser.nextline()
line = line.strip() line = line.strip()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
f = line.split(b' ') f = line.split(b" ")
if len(f) != 3: if len(f) != 3:
error_msg = 'Invalid XRef format: {!r}, line={!r}'\ error_msg = "Invalid XRef format: {!r}, line={!r}".format(
.format(parser, line) parser, line
)
raise PDFNoValidXRef(error_msg) raise PDFNoValidXRef(error_msg)
(pos_b, genno_b, use_b) = f (pos_b, genno_b, use_b) = f
if use_b != b'n': if use_b != b"n":
continue continue
self.offsets[objid] = (None, int(pos_b), int(genno_b)) self.offsets[objid] = (None, int(pos_b), int(genno_b))
log.debug('xref objects: %r', self.offsets) log.debug("xref objects: %r", self.offsets)
self.load_trailer(parser) self.load_trailer(parser)
def load_trailer(self, parser: PDFParser) -> None: def load_trailer(self, parser: PDFParser) -> None:
try: try:
(_, kwd) = parser.nexttoken() (_, kwd) = parser.nexttoken()
assert kwd is KWD(b'trailer'), str(kwd) assert kwd is KWD(b"trailer"), str(kwd)
(_, dic) = parser.nextobject() (_, dic) = parser.nextobject()
except PSEOF: except PSEOF:
x = parser.pop(1) x = parser.pop(1)
if not x: if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted') raise PDFNoValidXRef("Unexpected EOF - file corrupted")
(_, dic) = x[0] (_, dic) = x[0]
self.trailer.update(dict_value(dic)) self.trailer.update(dict_value(dic))
log.debug('trailer=%r', self.trailer) log.debug("trailer=%r", self.trailer)
def get_trailer(self) -> Dict[str, Any]: def get_trailer(self) -> Dict[str, Any]:
return self.trailer return self.trailer
@ -183,11 +211,10 @@ class PDFXRef(PDFBaseXRef):
class PDFXRefFallback(PDFXRef): class PDFXRefFallback(PDFXRef):
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys()) return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
def load(self, parser: PDFParser) -> None: def load(self, parser: PDFParser) -> None:
parser.seek(0) parser.seek(0)
@ -196,12 +223,12 @@ class PDFXRefFallback(PDFXRef):
(pos, line_bytes) = parser.nextline() (pos, line_bytes) = parser.nextline()
except PSEOF: except PSEOF:
break break
if line_bytes.startswith(b'trailer'): if line_bytes.startswith(b"trailer"):
parser.seek(pos) parser.seek(pos)
self.load_trailer(parser) self.load_trailer(parser)
log.debug('trailer: %r', self.trailer) log.debug("trailer: %r", self.trailer)
break break
line = line_bytes.decode('latin-1') # default pdf encoding line = line_bytes.decode("latin-1") # default pdf encoding
m = self.PDFOBJ_CUE.match(line) m = self.PDFOBJ_CUE.match(line)
if not m: if not m:
continue continue
@ -212,14 +239,13 @@ class PDFXRefFallback(PDFXRef):
# expand ObjStm. # expand ObjStm.
parser.seek(pos) parser.seek(pos)
(_, obj) = parser.nextobject() (_, obj) = parser.nextobject()
if isinstance(obj, PDFStream) \ if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
and obj.get('Type') is LITERAL_OBJSTM:
stream = stream_value(obj) stream = stream_value(obj)
try: try:
n = stream['N'] n = stream["N"]
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream) raise PDFSyntaxError("N is not defined: %r" % stream)
n = 0 n = 0
parser1 = PDFStreamParser(stream.get_data()) parser1 = PDFStreamParser(stream.get_data())
objs: List[int] = [] objs: List[int] = []
@ -229,14 +255,13 @@ class PDFXRefFallback(PDFXRef):
objs.append(cast(int, obj)) objs.append(cast(int, obj))
except PSEOF: except PSEOF:
pass pass
n = min(n, len(objs)//2) n = min(n, len(objs) // 2)
for index in range(n): for index in range(n):
objid1 = objs[index*2] objid1 = objs[index * 2]
self.offsets[objid1] = (objid, index, 0) self.offsets[objid1] = (objid, index, 0)
class PDFXRefStream(PDFBaseXRef): class PDFXRefStream(PDFBaseXRef):
def __init__(self) -> None: def __init__(self) -> None:
self.data: Optional[bytes] = None self.data: Optional[bytes] = None
self.entlen: Optional[int] = None self.entlen: Optional[int] = None
@ -246,31 +271,32 @@ class PDFXRefStream(PDFBaseXRef):
self.ranges: List[Tuple[int, int]] = [] self.ranges: List[Tuple[int, int]] = []
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFXRefStream: ranges=%r>' % (self.ranges) return "<PDFXRefStream: ranges=%r>" % (self.ranges)
def load(self, parser: PDFParser) -> None: def load(self, parser: PDFParser) -> None:
(_, objid) = parser.nexttoken() # ignored (_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken() (_, kwd) = parser.nexttoken()
(_, stream) = parser.nextobject() (_, stream) = parser.nextobject()
if not isinstance(stream, PDFStream) \ if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
or stream.get('Type') is not LITERAL_XREF: raise PDFNoValidXRef("Invalid PDF stream spec.")
raise PDFNoValidXRef('Invalid PDF stream spec.') size = stream["Size"]
size = stream['Size'] index_array = stream.get("Index", (0, size))
index_array = stream.get('Index', (0, size))
if len(index_array) % 2 != 0: if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number') raise PDFSyntaxError("Invalid index number")
self.ranges.extend(cast(Iterator[Tuple[int, int]], self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
choplist(2, index_array))) (self.fl1, self.fl2, self.fl3) = stream["W"]
(self.fl1, self.fl2, self.fl3) = stream['W'] assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
assert (self.fl1 is not None and self.fl2 is not None
and self.fl3 is not None)
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1 + self.fl2 + self.fl3
self.trailer = stream.attrs self.trailer = stream.attrs
log.debug('xref stream: objid=%s, fields=%d,%d,%d', log.debug(
', '.join(map(repr, self.ranges)), "xref stream: objid=%s, fields=%d,%d,%d",
self.fl1, self.fl2, self.fl3) ", ".join(map(repr, self.ranges)),
self.fl1,
self.fl2,
self.fl3,
)
return return
def get_trailer(self) -> Dict[str, Any]: def get_trailer(self) -> Dict[str, Any]:
@ -282,16 +308,16 @@ class PDFXRefStream(PDFBaseXRef):
assert self.entlen is not None assert self.entlen is not None
assert self.data is not None assert self.data is not None
offset = self.entlen * i offset = self.entlen * i
ent = self.data[offset:offset+self.entlen] ent = self.data[offset : offset + self.entlen]
f1 = nunpack(ent[:self.fl1], 1) f1 = nunpack(ent[: self.fl1], 1)
if f1 == 1 or f1 == 2: if f1 == 1 or f1 == 2:
yield start+i yield start + i
return return
def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]: def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
index = 0 index = 0
for (start, nobjs) in self.ranges: for (start, nobjs) in self.ranges:
if start <= objid and objid < start+nobjs: if start <= objid and objid < start + nobjs:
index += objid - start index += objid - start
break break
else: else:
@ -300,13 +326,12 @@ class PDFXRefStream(PDFBaseXRef):
raise KeyError(objid) raise KeyError(objid)
assert self.entlen is not None assert self.entlen is not None
assert self.data is not None assert self.data is not None
assert (self.fl1 is not None and self.fl2 is not None assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
and self.fl3 is not None)
offset = self.entlen * index offset = self.entlen * index
ent = self.data[offset:offset+self.entlen] ent = self.data[offset : offset + self.entlen]
f1 = nunpack(ent[:self.fl1], 1) f1 = nunpack(ent[: self.fl1], 1)
f2 = nunpack(ent[self.fl1:self.fl1+self.fl2]) f2 = nunpack(ent[self.fl1 : self.fl1 + self.fl2])
f3 = nunpack(ent[self.fl1+self.fl2:]) f3 = nunpack(ent[self.fl1 + self.fl2 :])
if f1 == 1: if f1 == 1:
return (None, f2, f3) return (None, f2, f3)
elif f1 == 2: elif f1 == 2:
@ -318,15 +343,14 @@ class PDFXRefStream(PDFBaseXRef):
class PDFStandardSecurityHandler: class PDFStandardSecurityHandler:
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08' PASSWORD_PADDING = (
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz') b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
)
supported_revisions: Tuple[int, ...] = (2, 3) supported_revisions: Tuple[int, ...] = (2, 3)
def __init__( def __init__(
self, self, docid: Sequence[bytes], param: Dict[str, Any], password: str = ""
docid: Sequence[bytes],
param: Dict[str, Any],
password: str = ''
) -> None: ) -> None:
self.docid = docid self.docid = docid
self.param = param self.param = param
@ -337,18 +361,18 @@ class PDFStandardSecurityHandler:
def init(self) -> None: def init(self) -> None:
self.init_params() self.init_params()
if self.r not in self.supported_revisions: if self.r not in self.supported_revisions:
error_msg = 'Unsupported revision: param=%r' % self.param error_msg = "Unsupported revision: param=%r" % self.param
raise PDFEncryptionError(error_msg) raise PDFEncryptionError(error_msg)
self.init_key() self.init_key()
return return
def init_params(self) -> None: def init_params(self) -> None:
self.v = int_value(self.param.get('V', 0)) self.v = int_value(self.param.get("V", 0))
self.r = int_value(self.param['R']) self.r = int_value(self.param["R"])
self.p = uint_value(self.param['P'], 32) self.p = uint_value(self.param["P"], 32)
self.o = str_value(self.param['O']) self.o = str_value(self.param["O"])
self.u = str_value(self.param['U']) self.u = str_value(self.param["U"])
self.length = int_value(self.param.get('Length', 40)) self.length = int_value(self.param.get("Length", 40))
return return
def init_key(self) -> None: def init_key(self) -> None:
@ -376,7 +400,7 @@ class PDFStandardSecurityHandler:
hash.update(self.docid[0]) # 3 hash.update(self.docid[0]) # 3
result = Arcfour(key).encrypt(hash.digest()) # 4 result = Arcfour(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5 for i in range(1, 20): # 5
k = b''.join(bytes((c ^ i,)) for c in iter(key)) k = b"".join(bytes((c ^ i,)) for c in iter(key))
result = Arcfour(k).encrypt(result) result = Arcfour(k).encrypt(result)
result += result # 6 result += result # 6
return result return result
@ -387,11 +411,11 @@ class PDFStandardSecurityHandler:
hash = md5(password) # 2 hash = md5(password) # 2
hash.update(self.o) # 3 hash.update(self.o) # 3
# See https://github.com/pdfminer/pdfminer.six/issues/186 # See https://github.com/pdfminer/pdfminer.six/issues/186
hash.update(struct.pack('<L', self.p)) # 4 hash.update(struct.pack("<L", self.p)) # 4
hash.update(self.docid[0]) # 5 hash.update(self.docid[0]) # 5
if self.r >= 4: if self.r >= 4:
if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata: if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
hash.update(b'\xff\xff\xff\xff') hash.update(b"\xff\xff\xff\xff")
result = hash.digest() result = hash.digest()
n = 5 n = 5
if self.r >= 3: if self.r >= 3:
@ -437,7 +461,7 @@ class PDFStandardSecurityHandler:
else: else:
user_password = self.o user_password = self.o
for i in range(19, -1, -1): for i in range(19, -1, -1):
k = b''.join(bytes((c ^ i,)) for c in iter(key)) k = b"".join(bytes((c ^ i,)) for c in iter(key))
user_password = Arcfour(k).decrypt(user_password) user_password = Arcfour(k).decrypt(user_password)
return self.authenticate_user_password(user_password) return self.authenticate_user_password(user_password)
@ -446,16 +470,15 @@ class PDFStandardSecurityHandler:
objid: int, objid: int,
genno: int, genno: int,
data: bytes, data: bytes,
attrs: Optional[Dict[str, Any]] = None attrs: Optional[Dict[str, Any]] = None,
) -> bytes: ) -> bytes:
return self.decrypt_rc4(objid, genno, data) return self.decrypt_rc4(objid, genno, data)
def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes: def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
assert self.key is not None assert self.key is not None
key = self.key + struct.pack('<L', objid)[:3] \ key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
+ struct.pack('<L', genno)[:2]
hash = md5(key) hash = md5(key)
key = hash.digest()[:min(len(key), 16)] key = hash.digest()[: min(len(key), 16)]
return Arcfour(key).decrypt(data) return Arcfour(key).decrypt(data)
@ -466,34 +489,30 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
def init_params(self) -> None: def init_params(self) -> None:
super().init_params() super().init_params()
self.length = 128 self.length = 128
self.cf = dict_value(self.param.get('CF')) self.cf = dict_value(self.param.get("CF"))
self.stmf = literal_name(self.param['StmF']) self.stmf = literal_name(self.param["StmF"])
self.strf = literal_name(self.param['StrF']) self.strf = literal_name(self.param["StrF"])
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True)) self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
if self.stmf != self.strf: if self.stmf != self.strf:
error_msg = 'Unsupported crypt filter: param=%r' % self.param error_msg = "Unsupported crypt filter: param=%r" % self.param
raise PDFEncryptionError(error_msg) raise PDFEncryptionError(error_msg)
self.cfm = {} self.cfm = {}
for k, v in self.cf.items(): for k, v in self.cf.items():
f = self.get_cfm(literal_name(v['CFM'])) f = self.get_cfm(literal_name(v["CFM"]))
if f is None: if f is None:
error_msg = 'Unknown crypt filter method: param=%r' \ error_msg = "Unknown crypt filter method: param=%r" % self.param
% self.param
raise PDFEncryptionError(error_msg) raise PDFEncryptionError(error_msg)
self.cfm[k] = f self.cfm[k] = f
self.cfm['Identity'] = self.decrypt_identity self.cfm["Identity"] = self.decrypt_identity
if self.strf not in self.cfm: if self.strf not in self.cfm:
error_msg = 'Undefined crypt filter: param=%r' % self.param error_msg = "Undefined crypt filter: param=%r" % self.param
raise PDFEncryptionError(error_msg) raise PDFEncryptionError(error_msg)
return return
def get_cfm( def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
self, if name == "V2":
name: str
) -> Optional[Callable[[int, int, bytes], bytes]]:
if name == 'V2':
return self.decrypt_rc4 return self.decrypt_rc4
elif name == 'AESV2': elif name == "AESV2":
return self.decrypt_aes128 return self.decrypt_aes128
else: else:
return None return None
@ -504,11 +523,11 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
genno: int, genno: int,
data: bytes, data: bytes,
attrs: Optional[Dict[str, Any]] = None, attrs: Optional[Dict[str, Any]] = None,
name: Optional[str] = None name: Optional[str] = None,
) -> bytes: ) -> bytes:
if not self.encrypt_metadata and attrs is not None: if not self.encrypt_metadata and attrs is not None:
t = attrs.get('Type') t = attrs.get("Type")
if t is not None and literal_name(t) == 'Metadata': if t is not None and literal_name(t) == "Metadata":
return data return data
if name is None: if name is None:
name = self.strf name = self.strf
@ -519,15 +538,21 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes: def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
assert self.key is not None assert self.key is not None
key = self.key + struct.pack('<L', objid)[:3] \ key = (
+ struct.pack('<L', genno)[:2] + b'sAlT' self.key
+ struct.pack("<L", objid)[:3]
+ struct.pack("<L", genno)[:2]
+ b"sAlT"
)
hash = md5(key) hash = md5(key)
key = hash.digest()[:min(len(key), 16)] key = hash.digest()[: min(len(key), 16)]
initialization_vector = data[:16] initialization_vector = data[:16]
ciphertext = data[16:] ciphertext = data[16:]
cipher = Cipher(algorithms.AES(key), cipher = Cipher(
modes.CBC(initialization_vector), algorithms.AES(key),
backend=default_backend()) # type: ignore modes.CBC(initialization_vector),
backend=default_backend(),
) # type: ignore
return cipher.decryptor().update(ciphertext) # type: ignore return cipher.decryptor().update(ciphertext) # type: ignore
@ -538,8 +563,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
def init_params(self) -> None: def init_params(self) -> None:
super().init_params() super().init_params()
self.length = 256 self.length = 256
self.oe = str_value(self.param['OE']) self.oe = str_value(self.param["OE"])
self.ue = str_value(self.param['UE']) self.ue = str_value(self.param["UE"])
self.o_hash = self.o[:32] self.o_hash = self.o[:32]
self.o_validation_salt = self.o[32:40] self.o_validation_salt = self.o[32:40]
self.o_key_salt = self.o[40:] self.o_key_salt = self.o[40:]
@ -548,11 +573,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
self.u_key_salt = self.u[40:] self.u_key_salt = self.u[40:]
return return
def get_cfm( def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
self, if name == "AESV3":
name: str
) -> Optional[Callable[[int, int, bytes], bytes]]:
if name == 'AESV3':
return self.decrypt_aes256 return self.decrypt_aes256
else: else:
return None return None
@ -562,16 +584,16 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
hash = self._password_hash(password_b, self.o_validation_salt, self.u) hash = self._password_hash(password_b, self.o_validation_salt, self.u)
if hash == self.o_hash: if hash == self.o_hash:
hash = self._password_hash(password_b, self.o_key_salt, self.u) hash = self._password_hash(password_b, self.o_key_salt, self.u)
cipher = Cipher(algorithms.AES(hash), cipher = Cipher(
modes.CBC(b'\0' * 16), algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend()
backend=default_backend()) # type: ignore ) # type: ignore
return cipher.decryptor().update(self.oe) # type: ignore return cipher.decryptor().update(self.oe) # type: ignore
hash = self._password_hash(password_b, self.u_validation_salt) hash = self._password_hash(password_b, self.u_validation_salt)
if hash == self.u_hash: if hash == self.u_hash:
hash = self._password_hash(password_b, self.u_key_salt) hash = self._password_hash(password_b, self.u_key_salt)
cipher = Cipher(algorithms.AES(hash), cipher = Cipher(
modes.CBC(b'\0' * 16), algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend()
backend=default_backend()) # type: ignore ) # type: ignore
return cipher.decryptor().update(self.ue) # type: ignore return cipher.decryptor().update(self.ue) # type: ignore
return None return None
@ -579,16 +601,14 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
if self.r == 6: if self.r == 6:
# saslprep expects non-empty strings, apparently # saslprep expects non-empty strings, apparently
if not password: if not password:
return b'' return b""
from ._saslprep import saslprep from ._saslprep import saslprep
password = saslprep(password) password = saslprep(password)
return password.encode('utf-8')[:127] return password.encode("utf-8")[:127]
def _password_hash( def _password_hash(
self, self, password: bytes, salt: bytes, vector: Optional[bytes] = None
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
) -> bytes: ) -> bytes:
""" """
Compute password hash depending on revision number Compute password hash depending on revision number
@ -598,10 +618,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
return self._r6_password(password, salt[0:8], vector) return self._r6_password(password, salt[0:8], vector)
def _r5_password( def _r5_password(
self, self, password: bytes, salt: bytes, vector: Optional[bytes] = None
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
) -> bytes: ) -> bytes:
""" """
Compute the password for revision 5 Compute the password for revision 5
@ -613,10 +630,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
return hash.digest() return hash.digest()
def _r6_password( def _r6_password(
self, self, password: bytes, salt: bytes, vector: Optional[bytes] = None
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
) -> bytes: ) -> bytes:
""" """
Compute the password for revision 6 Compute the password for revision 6
@ -629,10 +643,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
hashes = (sha256, sha384, sha512) hashes = (sha256, sha384, sha512)
round_no = last_byte_val = 0 round_no = last_byte_val = 0
while round_no < 64 or last_byte_val > round_no - 32: while round_no < 64 or last_byte_val > round_no - 32:
k1 = (password + k + (vector or b'')) * 64 k1 = (password + k + (vector or b"")) * 64
e = self._aes_cbc_encrypt( e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
key=k[:16], iv=k[16:32], data=k1
)
# compute the first 16 bytes of e, # compute the first 16 bytes of e,
# interpreted as an unsigned integer mod 3 # interpreted as an unsigned integer mod 3
next_hash = hashes[self._bytes_mod_3(e[:16])] next_hash = hashes[self._bytes_mod_3(e[:16])]
@ -646,12 +658,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
# 256 is 1 mod 3, so we can just sum 'em # 256 is 1 mod 3, so we can just sum 'em
return sum(b % 3 for b in input_bytes) % 3 return sum(b % 3 for b in input_bytes) % 3
def _aes_cbc_encrypt( def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
self,
key: bytes,
iv: bytes,
data: bytes
) -> bytes:
cipher = Cipher(algorithms.AES(key), modes.CBC(iv)) cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
encryptor = cipher.encryptor() # type: ignore encryptor = cipher.encryptor() # type: ignore
return encryptor.update(data) + encryptor.finalize() # type: ignore return encryptor.update(data) + encryptor.finalize() # type: ignore
@ -660,9 +667,11 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
initialization_vector = data[:16] initialization_vector = data[:16]
ciphertext = data[16:] ciphertext = data[16:]
assert self.key is not None assert self.key is not None
cipher = Cipher(algorithms.AES(self.key), cipher = Cipher(
modes.CBC(initialization_vector), algorithms.AES(self.key),
backend=default_backend()) # type: ignore modes.CBC(initialization_vector),
backend=default_backend(),
) # type: ignore
return cipher.decryptor().update(ciphertext) # type: ignore return cipher.decryptor().update(ciphertext) # type: ignore
@ -689,9 +698,9 @@ class PDFDocument:
def __init__( def __init__(
self, self,
parser: PDFParser, parser: PDFParser,
password: str = '', password: str = "",
caching: bool = True, caching: bool = True,
fallback: bool = True fallback: bool = True,
) -> None: ) -> None:
"Set the document to use a given PDFParser object." "Set the document to use a given PDFParser object."
self.caching = caching self.caching = caching
@ -723,43 +732,42 @@ class PDFDocument:
if not trailer: if not trailer:
continue continue
# If there's an encryption info, remember it. # If there's an encryption info, remember it.
if 'Encrypt' in trailer: if "Encrypt" in trailer:
if 'ID' in trailer: if "ID" in trailer:
id_value = list_value(trailer['ID']) id_value = list_value(trailer["ID"])
else: else:
# Some documents may not have a /ID, use two empty # Some documents may not have a /ID, use two empty
# byte strings instead. Solves # byte strings instead. Solves
# https://github.com/pdfminer/pdfminer.six/issues/594 # https://github.com/pdfminer/pdfminer.six/issues/594
id_value = (b'', b'') id_value = (b"", b"")
self.encryption = (id_value, self.encryption = (id_value, dict_value(trailer["Encrypt"]))
dict_value(trailer['Encrypt']))
self._initialize_password(password) self._initialize_password(password)
if 'Info' in trailer: if "Info" in trailer:
self.info.append(dict_value(trailer['Info'])) self.info.append(dict_value(trailer["Info"]))
if 'Root' in trailer: if "Root" in trailer:
# Every PDF file must have exactly one /Root dictionary. # Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root']) self.catalog = dict_value(trailer["Root"])
break break
else: else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?') raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
if self.catalog.get('Type') is not LITERAL_CATALOG: if self.catalog.get("Type") is not LITERAL_CATALOG:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('Catalog not found!') raise PDFSyntaxError("Catalog not found!")
return return
KEYWORD_OBJ = KWD(b'obj') KEYWORD_OBJ = KWD(b"obj")
# _initialize_password(password=b'') # _initialize_password(password=b'')
# Perform the initialization with a given password. # Perform the initialization with a given password.
def _initialize_password(self, password: str = '') -> None: def _initialize_password(self, password: str = "") -> None:
assert self.encryption is not None assert self.encryption is not None
(docid, param) = self.encryption (docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard': if literal_name(param.get("Filter")) != "Standard":
raise PDFEncryptionError('Unknown filter: param=%r' % param) raise PDFEncryptionError("Unknown filter: param=%r" % param)
v = int_value(param.get('V', 0)) v = int_value(param.get("V", 0))
factory = self.security_handler_registry.get(v) factory = self.security_handler_registry.get(v)
if factory is None: if factory is None:
raise PDFEncryptionError('Unknown algorithm: param=%r' % param) raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
handler = factory(docid, param, password) handler = factory(docid, param, password)
self.decipher = handler.decrypt self.decipher = handler.decrypt
self.is_printable = handler.is_printable() self.is_printable = handler.is_printable()
@ -769,12 +777,7 @@ class PDFDocument:
self._parser.fallback = False # need to read streams with exact length self._parser.fallback = False # need to read streams with exact length
return return
def _getobj_objstm( def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
self,
stream: PDFStream,
index: int,
objid: int
) -> object:
if stream.objid in self._parsed_objs: if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid] (objs, n) = self._parsed_objs[stream.objid]
else: else:
@ -782,22 +785,22 @@ class PDFDocument:
if self.caching: if self.caching:
assert stream.objid is not None assert stream.objid is not None
self._parsed_objs[stream.objid] = (objs, n) self._parsed_objs[stream.objid] = (objs, n)
i = n*2+index i = n * 2 + index
try: try:
obj = objs[i] obj = objs[i]
except IndexError: except IndexError:
raise PDFSyntaxError('index too big: %r' % index) raise PDFSyntaxError("index too big: %r" % index)
return obj return obj
def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
if stream.get('Type') is not LITERAL_OBJSTM: if stream.get("Type") is not LITERAL_OBJSTM:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream) raise PDFSyntaxError("Not a stream object: %r" % stream)
try: try:
n = cast(int, stream['N']) n = cast(int, stream["N"])
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream) raise PDFSyntaxError("N is not defined: %r" % stream)
n = 0 n = 0
parser = PDFStreamParser(stream.get_data()) parser = PDFStreamParser(stream.get_data())
parser.set_document(self) parser.set_document(self)
@ -830,11 +833,10 @@ class PDFDocument:
objid1 = x[-2] objid1 = x[-2]
# #### end hack around malformed pdf files # #### end hack around malformed pdf files
if objid1 != objid: if objid1 != objid:
raise PDFSyntaxError('objid mismatch: {!r}={!r}' raise PDFSyntaxError("objid mismatch: {!r}={!r}".format(objid1, objid))
.format(objid1, objid))
if kwd != KWD(b'obj'): if kwd != KWD(b"obj"):
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos) raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
(_, obj) = self._parser.nextobject() (_, obj) = self._parser.nextobject()
return obj return obj
@ -846,8 +848,8 @@ class PDFDocument:
:raises PDFObjectNotFound if objid does not exist in PDF :raises PDFObjectNotFound if objid does not exist in PDF
""" """
if not self.xrefs: if not self.xrefs:
raise PDFException('PDFDocument is not initialized') raise PDFException("PDFDocument is not initialized")
log.debug('getobj: objid=%r', objid) log.debug("getobj: objid=%r", objid)
if objid in self._cached_objs: if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid] (obj, genno) = self._cached_objs[objid]
else: else:
@ -863,8 +865,7 @@ class PDFDocument:
else: else:
obj = self._getobj_parse(index, objid) obj = self._getobj_parse(index, objid)
if self.decipher: if self.decipher:
obj = decipher_all(self.decipher, objid, genno, obj = decipher_all(self.decipher, objid, genno, obj)
obj)
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
obj.set_objid(objid, genno) obj.set_objid(objid, genno)
@ -873,7 +874,7 @@ class PDFDocument:
continue continue
else: else:
raise PDFObjectNotFound(objid) raise PDFObjectNotFound(objid)
log.debug('register: objid=%r: %r', objid, obj) log.debug("register: objid=%r: %r", objid, obj)
if self.caching: if self.caching:
self._cached_objs[objid] = (obj, genno) self._cached_objs[objid] = (obj, genno)
return obj return obj
@ -881,25 +882,25 @@ class PDFDocument:
OutlineType = Tuple[Any, Any, Any, Any, Any] OutlineType = Tuple[Any, Any, Any, Any, Any]
def get_outlines(self) -> Iterator[OutlineType]: def get_outlines(self) -> Iterator[OutlineType]:
if 'Outlines' not in self.catalog: if "Outlines" not in self.catalog:
raise PDFNoOutlines raise PDFNoOutlines
def search(entry: object, level: int def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
) -> Iterator[PDFDocument.OutlineType]:
entry = dict_value(entry) entry = dict_value(entry)
if 'Title' in entry: if "Title" in entry:
if 'A' in entry or 'Dest' in entry: if "A" in entry or "Dest" in entry:
title = decode_text(str_value(entry['Title'])) title = decode_text(str_value(entry["Title"]))
dest = entry.get('Dest') dest = entry.get("Dest")
action = entry.get('A') action = entry.get("A")
se = entry.get('SE') se = entry.get("SE")
yield (level, title, dest, action, se) yield (level, title, dest, action, se)
if 'First' in entry and 'Last' in entry: if "First" in entry and "Last" in entry:
yield from search(entry['First'], level+1) yield from search(entry["First"], level + 1)
if 'Next' in entry: if "Next" in entry:
yield from search(entry['Next'], level) yield from search(entry["Next"], level)
return return
return search(self.catalog['Outlines'], 0)
return search(self.catalog["Outlines"], 0)
def get_page_labels(self) -> Iterator[str]: def get_page_labels(self) -> Iterator[str]:
""" """
@ -913,51 +914,49 @@ class PDFDocument:
assert self.catalog is not None assert self.catalog is not None
try: try:
page_labels = PageLabels(self.catalog['PageLabels']) page_labels = PageLabels(self.catalog["PageLabels"])
except (PDFTypeError, KeyError): except (PDFTypeError, KeyError):
raise PDFNoPageLabels raise PDFNoPageLabels
return page_labels.labels return page_labels.labels
def lookup_name( def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
self,
cat: str,
key: Union[str, bytes]
) -> Any:
try: try:
names = dict_value(self.catalog['Names']) names = dict_value(self.catalog["Names"])
except (PDFTypeError, KeyError): except (PDFTypeError, KeyError):
raise KeyError((cat, key)) raise KeyError((cat, key))
# may raise KeyError # may raise KeyError
d0 = dict_value(names[cat]) d0 = dict_value(names[cat])
def lookup(d: Dict[str, Any]) -> Any: def lookup(d: Dict[str, Any]) -> Any:
if 'Limits' in d: if "Limits" in d:
(k1, k2) = list_value(d['Limits']) (k1, k2) = list_value(d["Limits"])
if key < k1 or k2 < key: if key < k1 or k2 < key:
return None return None
if 'Names' in d: if "Names" in d:
objs = list_value(d['Names']) objs = list_value(d["Names"])
names = dict(cast(Iterator[Tuple[Union[str, bytes], Any]], names = dict(
choplist(2, objs))) cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs))
)
return names[key] return names[key]
if 'Kids' in d: if "Kids" in d:
for c in list_value(d['Kids']): for c in list_value(d["Kids"]):
v = lookup(dict_value(c)) v = lookup(dict_value(c))
if v: if v:
return v return v
raise KeyError((cat, key)) raise KeyError((cat, key))
return lookup(d0) return lookup(d0)
def get_dest(self, name: Union[str, bytes]) -> Any: def get_dest(self, name: Union[str, bytes]) -> Any:
try: try:
# PDF-1.2 or later # PDF-1.2 or later
obj = self.lookup_name('Dests', name) obj = self.lookup_name("Dests", name)
except KeyError: except KeyError:
# PDF-1.1 or prior # PDF-1.1 or prior
if 'Dests' not in self.catalog: if "Dests" not in self.catalog:
raise PDFDestinationNotFound(name) raise PDFDestinationNotFound(name)
d0 = dict_value(self.catalog['Dests']) d0 = dict_value(self.catalog["Dests"])
if name not in d0: if name not in d0:
raise PDFDestinationNotFound(name) raise PDFDestinationNotFound(name)
obj = d0[name] obj = d0[name]
@ -970,23 +969,20 @@ class PDFDocument:
prev = None prev = None
for line in parser.revreadlines(): for line in parser.revreadlines():
line = line.strip() line = line.strip()
log.debug('find_xref: %r', line) log.debug("find_xref: %r", line)
if line == b'startxref': if line == b"startxref":
break break
if line: if line:
prev = line prev = line
else: else:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef("Unexpected EOF")
log.debug('xref found: pos=%r', prev) log.debug("xref found: pos=%r", prev)
assert prev is not None assert prev is not None
return int(prev) return int(prev)
# read xref table # read xref table
def read_xref_from( def read_xref_from(
self, self, parser: PDFParser, start: int, xrefs: List[PDFBaseXRef]
parser: PDFParser,
start: int,
xrefs: List[PDFBaseXRef]
) -> None: ) -> None:
"""Reads XRefs from the given location.""" """Reads XRefs from the given location."""
parser.seek(start) parser.seek(start)
@ -994,8 +990,8 @@ class PDFDocument:
try: try:
(pos, token) = parser.nexttoken() (pos, token) = parser.nexttoken()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef("Unexpected EOF")
log.debug('read_xref_from: start=%d, token=%r', start, token) log.debug("read_xref_from: start=%d, token=%r", start, token)
if isinstance(token, int): if isinstance(token, int):
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
parser.seek(pos) parser.seek(pos)
@ -1009,13 +1005,13 @@ class PDFDocument:
xref.load(parser) xref.load(parser)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.get_trailer() trailer = xref.get_trailer()
log.debug('trailer: %r', trailer) log.debug("trailer: %r", trailer)
if 'XRefStm' in trailer: if "XRefStm" in trailer:
pos = int_value(trailer['XRefStm']) pos = int_value(trailer["XRefStm"])
self.read_xref_from(parser, pos, xrefs) self.read_xref_from(parser, pos, xrefs)
if 'Prev' in trailer: if "Prev" in trailer:
# find previous xref # find previous xref
pos = int_value(trailer['Prev']) pos = int_value(trailer["Prev"])
self.read_xref_from(parser, pos, xrefs) self.read_xref_from(parser, pos, xrefs)
return return
@ -1033,16 +1029,16 @@ class PageLabels(NumberTree):
# The tree must begin with page index 0 # The tree must begin with page index 0
if len(ranges) == 0 or ranges[0][0] != 0: if len(ranges) == 0 or ranges[0][0] != 0:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('PageLabels is missing page index 0') raise PDFSyntaxError("PageLabels is missing page index 0")
else: else:
# Try to cope, by assuming empty labels for the initial pages # Try to cope, by assuming empty labels for the initial pages
ranges.insert(0, (0, {})) ranges.insert(0, (0, {}))
for (next, (start, label_dict_unchecked)) in enumerate(ranges, 1): for (next, (start, label_dict_unchecked)) in enumerate(ranges, 1):
label_dict = dict_value(label_dict_unchecked) label_dict = dict_value(label_dict_unchecked)
style = label_dict.get('S') style = label_dict.get("S")
prefix = decode_text(str_value(label_dict.get('P', b''))) prefix = decode_text(str_value(label_dict.get("P", b"")))
first_value = int_value(label_dict.get('St', 1)) first_value = int_value(label_dict.get("St", 1))
if next == len(ranges): if next == len(ranges):
# This is the last specified range. It continues until the end # This is the last specified range. It continues until the end
@ -1061,18 +1057,18 @@ class PageLabels(NumberTree):
def _format_page_label(value: int, style: Any) -> str: def _format_page_label(value: int, style: Any) -> str:
"""Format page label value in a specific style""" """Format page label value in a specific style"""
if style is None: if style is None:
label = '' label = ""
elif style is LIT('D'): # Decimal arabic numerals elif style is LIT("D"): # Decimal arabic numerals
label = str(value) label = str(value)
elif style is LIT('R'): # Uppercase roman numerals elif style is LIT("R"): # Uppercase roman numerals
label = format_int_roman(value).upper() label = format_int_roman(value).upper()
elif style is LIT('r'): # Lowercase roman numerals elif style is LIT("r"): # Lowercase roman numerals
label = format_int_roman(value) label = format_int_roman(value)
elif style is LIT('A'): # Uppercase letters A-Z, AA-ZZ... elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ...
label = format_int_alpha(value).upper() label = format_int_alpha(value).upper()
elif style is LIT('a'): # Lowercase letters a-z, aa-zz... elif style is LIT("a"): # Lowercase letters a-z, aa-zz...
label = format_int_alpha(value) label = format_int_alpha(value)
else: else:
log.warning('Unknown page label style: %r', style) log.warning("Unknown page label style: %r", style)
label = '' label = ""
return label return label

File diff suppressed because it is too large Load Diff

View File

@ -50,11 +50,11 @@ class PDFInterpreterError(PDFException):
pass pass
LITERAL_PDF = LIT('PDF') LITERAL_PDF = LIT("PDF")
LITERAL_TEXT = LIT('Text') LITERAL_TEXT = LIT("Text")
LITERAL_FONT = LIT('Font') LITERAL_FONT = LIT("Font")
LITERAL_FORM = LIT('Form') LITERAL_FORM = LIT("Form")
LITERAL_IMAGE = LIT('Image') LITERAL_IMAGE = LIT("Image")
class PDFTextState: class PDFTextState:
@ -75,12 +75,23 @@ class PDFTextState:
# self.linematrix is set # self.linematrix is set
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \ return (
'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \ "<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
'matrix=%r, linematrix=%r>' \ "wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
% (self.font, self.fontsize, self.charspace, self.wordspace, "matrix=%r, linematrix=%r>"
self.scaling, self.leading, self.render, self.rise, % (
self.matrix, self.linematrix) self.font,
self.fontsize,
self.charspace,
self.wordspace,
self.scaling,
self.leading,
self.render,
self.rise,
self.matrix,
self.linematrix,
)
)
def copy(self) -> "PDFTextState": def copy(self) -> "PDFTextState":
obj = PDFTextState() obj = PDFTextState()
@ -102,13 +113,13 @@ class PDFTextState:
Color = Union[ Color = Union[
float, # Greyscale float, # Greyscale
Tuple[float, float, float], # R, G, B Tuple[float, float, float], # R, G, B
Tuple[float, float, float, float]] # C, M, Y, K Tuple[float, float, float, float],
] # C, M, Y, K
class PDFGraphicState: class PDFGraphicState:
def __init__(self) -> None: def __init__(self) -> None:
self.linewidth: float = 0 self.linewidth: float = 0
self.linecap: Optional[object] = None self.linecap: Optional[object] = None
@ -138,12 +149,22 @@ class PDFGraphicState:
return obj return obj
def __repr__(self) -> str: def __repr__(self) -> str:
return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, ' return (
' miterlimit=%r, dash=%r, intent=%r, flatness=%r, ' "<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
' stroking color=%r, non stroking color=%r>' % " miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
(self.linewidth, self.linecap, self.linejoin, " stroking color=%r, non stroking color=%r>"
self.miterlimit, self.dash, self.intent, self.flatness, % (
self.scolor, self.ncolor)) self.linewidth,
self.linecap,
self.linejoin,
self.miterlimit,
self.dash,
self.intent,
self.flatness,
self.scolor,
self.ncolor,
)
)
class PDFResourceManager: class PDFResourceManager:
@ -179,41 +200,41 @@ class PDFResourceManager:
if objid and objid in self._cached_fonts: if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid] font = self._cached_fonts[objid]
else: else:
log.debug('get_font: create: objid=%r, spec=%r', objid, spec) log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
if settings.STRICT: if settings.STRICT:
if spec['Type'] is not LITERAL_FONT: if spec["Type"] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font') raise PDFFontError("Type is not /Font")
# Create a Font object. # Create a Font object.
if 'Subtype' in spec: if "Subtype" in spec:
subtype = literal_name(spec['Subtype']) subtype = literal_name(spec["Subtype"])
else: else:
if settings.STRICT: if settings.STRICT:
raise PDFFontError('Font Subtype is not specified.') raise PDFFontError("Font Subtype is not specified.")
subtype = 'Type1' subtype = "Type1"
if subtype in ('Type1', 'MMType1'): if subtype in ("Type1", "MMType1"):
# Type1 Font # Type1 Font
font = PDFType1Font(self, spec) font = PDFType1Font(self, spec)
elif subtype == 'TrueType': elif subtype == "TrueType":
# TrueType Font # TrueType Font
font = PDFTrueTypeFont(self, spec) font = PDFTrueTypeFont(self, spec)
elif subtype == 'Type3': elif subtype == "Type3":
# Type3 Font # Type3 Font
font = PDFType3Font(self, spec) font = PDFType3Font(self, spec)
elif subtype in ('CIDFontType0', 'CIDFontType2'): elif subtype in ("CIDFontType0", "CIDFontType2"):
# CID Font # CID Font
font = PDFCIDFont(self, spec) font = PDFCIDFont(self, spec)
elif subtype == 'Type0': elif subtype == "Type0":
# Type0 Font # Type0 Font
dfonts = list_value(spec['DescendantFonts']) dfonts = list_value(spec["DescendantFonts"])
assert dfonts assert dfonts
subspec = dict_value(dfonts[0]).copy() subspec = dict_value(dfonts[0]).copy()
for k in ('Encoding', 'ToUnicode'): for k in ("Encoding", "ToUnicode"):
if k in spec: if k in spec:
subspec[k] = resolve1(spec[k]) subspec[k] = resolve1(spec[k])
font = self.get_font(None, subspec) font = self.get_font(None, subspec)
else: else:
if settings.STRICT: if settings.STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec) raise PDFFontError("Invalid Font spec: %r" % spec)
font = PDFType1Font(self, spec) # this is so wrong! font = PDFType1Font(self, spec) # this is so wrong!
if objid and self.caching: if objid and self.caching:
self._cached_fonts[objid] = font self._cached_fonts[objid] = font
@ -221,7 +242,6 @@ class PDFResourceManager:
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
def __init__(self, streams: Sequence[object]) -> None: def __init__(self, streams: Sequence[object]) -> None:
self.streams = streams self.streams = streams
self.istream = 0 self.istream = 0
@ -236,7 +256,7 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
strm = stream_value(self.streams[self.istream]) strm = stream_value(self.streams[self.istream])
self.istream += 1 self.istream += 1
else: else:
raise PSEOF('Unexpected EOF, file truncated?') raise PSEOF("Unexpected EOF, file truncated?")
self.fp = BytesIO(strm.get_data()) self.fp = BytesIO(strm.get_data())
def seek(self, pos: int) -> None: def seek(self, pos: int) -> None:
@ -255,14 +275,10 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
self.fp = None # type: ignore[assignment] self.fp = None # type: ignore[assignment]
self.charpos = 0 self.charpos = 0
def get_inline_data( def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
self,
pos: int,
target: bytes = b'EI'
) -> Tuple[int, bytes]:
self.seek(pos) self.seek(pos)
i = 0 i = 0
data = b'' data = b""
while i <= len(target): while i <= len(target):
self.fillbuf() self.fillbuf()
if i: if i:
@ -279,36 +295,35 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
else: else:
try: try:
j = self.buf.index(target[0], self.charpos) j = self.buf.index(target[0], self.charpos)
data += self.buf[self.charpos:j+1] data += self.buf[self.charpos : j + 1]
self.charpos = j+1 self.charpos = j + 1
i = 1 i = 1
except ValueError: except ValueError:
data += self.buf[self.charpos:] data += self.buf[self.charpos :]
self.charpos = len(self.buf) self.charpos = len(self.buf)
data = data[:-(len(target)+1)] # strip the last part data = data[: -(len(target) + 1)] # strip the last part
data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data) data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
return (pos, data) return (pos, data)
def flush(self) -> None: def flush(self) -> None:
self.add_results(*self.popall()) self.add_results(*self.popall())
KEYWORD_BI = KWD(b'BI') KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b'ID') KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b'EI') KEYWORD_EI = KWD(b"EI")
def do_keyword(self, pos: int, token: PSKeyword) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BI: if token is self.KEYWORD_BI:
# inline image within a content stream # inline image within a content stream
self.start_type(pos, 'inline') self.start_type(pos, "inline")
elif token is self.KEYWORD_ID: elif token is self.KEYWORD_ID:
try: try:
(_, objs) = self.end_type('inline') (_, objs) = self.end_type("inline")
if len(objs) % 2 != 0: if len(objs) % 2 != 0:
error_msg = 'Invalid dictionary construct: {!r}' \ error_msg = "Invalid dictionary construct: {!r}".format(objs)
.format(objs)
raise PSTypeError(error_msg) raise PSTypeError(error_msg)
d = {literal_name(k): v for (k, v) in choplist(2, objs)} d = {literal_name(k): v for (k, v) in choplist(2, objs)}
(pos, data) = self.get_inline_data(pos+len(b'ID ')) (pos, data) = self.get_inline_data(pos + len(b"ID "))
obj = PDFStream(d, data) obj = PDFStream(d, data)
self.push((pos, obj)) self.push((pos, obj))
self.push((pos, self.KEYWORD_EI)) self.push((pos, self.KEYWORD_EI))
@ -351,32 +366,30 @@ class PDFPageInterpreter:
name = literal_name(spec[0]) name = literal_name(spec[0])
else: else:
name = literal_name(spec) name = literal_name(spec)
if name == 'ICCBased' and isinstance(spec, list) \ if name == "ICCBased" and isinstance(spec, list) and 2 <= len(spec):
and 2 <= len(spec): return PDFColorSpace(name, stream_value(spec[1])["N"])
return PDFColorSpace(name, stream_value(spec[1])['N']) elif name == "DeviceN" and isinstance(spec, list) and 2 <= len(spec):
elif name == 'DeviceN' and isinstance(spec, list) \
and 2 <= len(spec):
return PDFColorSpace(name, len(list_value(spec[1]))) return PDFColorSpace(name, len(list_value(spec[1])))
else: else:
return PREDEFINED_COLORSPACE.get(name) return PREDEFINED_COLORSPACE.get(name)
for (k, v) in dict_value(resources).items(): for (k, v) in dict_value(resources).items():
log.debug('Resource: %r: %r', k, v) log.debug("Resource: %r: %r", k, v)
if k == 'Font': if k == "Font":
for (fontid, spec) in dict_value(v).items(): for (fontid, spec) in dict_value(v).items():
objid = None objid = None
if isinstance(spec, PDFObjRef): if isinstance(spec, PDFObjRef):
objid = spec.objid objid = spec.objid
spec = dict_value(spec) spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace': elif k == "ColorSpace":
for (csid, spec) in dict_value(v).items(): for (csid, spec) in dict_value(v).items():
colorspace = get_colorspace(resolve1(spec)) colorspace = get_colorspace(resolve1(spec))
if colorspace is not None: if colorspace is not None:
self.csmap[csid] = colorspace self.csmap[csid] = colorspace
elif k == 'ProcSet': elif k == "ProcSet":
self.rsrcmgr.get_procset(list_value(v)) self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject': elif k == "XObject":
for (xobjid, xobjstrm) in dict_value(v).items(): for (xobjid, xobjstrm) in dict_value(v).items():
self.xobjmap[xobjid] = xobjstrm self.xobjmap[xobjid] = xobjstrm
return return
@ -410,14 +423,11 @@ class PDFPageInterpreter:
self.argstack = self.argstack[:-n] self.argstack = self.argstack[:-n]
return x return x
def get_current_state( def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
self
) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
def set_current_state( def set_current_state(
self, self, state: Tuple[Matrix, PDFTextState, PDFGraphicState]
state: Tuple[Matrix, PDFTextState, PDFGraphicState]
) -> None: ) -> None:
(self.ctm, self.textstate, self.graphicstate) = state (self.ctm, self.textstate, self.graphicstate) = state
self.device.set_ctm(self.ctm) self.device.set_ctm(self.ctm)
@ -441,11 +451,10 @@ class PDFPageInterpreter:
c1: PDFStackT, c1: PDFStackT,
d1: PDFStackT, d1: PDFStackT,
e1: PDFStackT, e1: PDFStackT,
f1: PDFStackT f1: PDFStackT,
) -> None: ) -> None:
"""Concatenate matrix to current transformation matrix""" """Concatenate matrix to current transformation matrix"""
self.ctm = \ self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
self.device.set_ctm(self.ctm) self.device.set_ctm(self.ctm)
return return
@ -491,12 +500,12 @@ class PDFPageInterpreter:
def do_m(self, x: PDFStackT, y: PDFStackT) -> None: def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
"""Begin new subpath""" """Begin new subpath"""
self.curpath.append(('m', cast(float, x), cast(float, y))) self.curpath.append(("m", cast(float, x), cast(float, y)))
return return
def do_l(self, x: PDFStackT, y: PDFStackT) -> None: def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
"""Append straight line segment to path""" """Append straight line segment to path"""
self.curpath.append(('l', cast(float, x), cast(float, y))) self.curpath.append(("l", cast(float, x), cast(float, y)))
return return
def do_c( def do_c(
@ -506,66 +515,57 @@ class PDFPageInterpreter:
x2: PDFStackT, x2: PDFStackT,
y2: PDFStackT, y2: PDFStackT,
x3: PDFStackT, x3: PDFStackT,
y3: PDFStackT y3: PDFStackT,
) -> None: ) -> None:
"""Append curved segment to path (three control points)""" """Append curved segment to path (three control points)"""
self.curpath.append(('c', cast(float, x1), cast(float, y1), self.curpath.append(
cast(float, x2), cast(float, y2), (
cast(float, x3), cast(float, y3))) "c",
cast(float, x1),
cast(float, y1),
cast(float, x2),
cast(float, y2),
cast(float, x3),
cast(float, y3),
)
)
return return
def do_v( def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
self,
x2: PDFStackT,
y2: PDFStackT,
x3: PDFStackT,
y3: PDFStackT
) -> None:
"""Append curved segment to path (initial point replicated)""" """Append curved segment to path (initial point replicated)"""
self.curpath.append(('v', cast(float, x2), cast(float, y2), self.curpath.append(
cast(float, x3), cast(float, y3))) ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3))
)
return return
def do_y( def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
self,
x1: PDFStackT,
y1: PDFStackT,
x3: PDFStackT,
y3: PDFStackT
) -> None:
"""Append curved segment to path (final point replicated)""" """Append curved segment to path (final point replicated)"""
self.curpath.append(('y', cast(float, x1), cast(float, y1), self.curpath.append(
cast(float, x3), cast(float, y3))) ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3))
)
return return
def do_h(self) -> None: def do_h(self) -> None:
"""Close subpath""" """Close subpath"""
self.curpath.append(('h',)) self.curpath.append(("h",))
return return
def do_re( def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
self,
x: PDFStackT,
y: PDFStackT,
w: PDFStackT,
h: PDFStackT
) -> None:
"""Append rectangle to path""" """Append rectangle to path"""
x = cast(float, x) x = cast(float, x)
y = cast(float, y) y = cast(float, y)
w = cast(float, w) w = cast(float, w)
h = cast(float, h) h = cast(float, h)
self.curpath.append(('m', x, y)) self.curpath.append(("m", x, y))
self.curpath.append(('l', x+w, y)) self.curpath.append(("l", x + w, y))
self.curpath.append(('l', x+w, y+h)) self.curpath.append(("l", x + w, y + h))
self.curpath.append(('l', x, y+h)) self.curpath.append(("l", x, y + h))
self.curpath.append(('h',)) self.curpath.append(("h",))
return return
def do_S(self) -> None: def do_S(self) -> None:
"""Stroke path""" """Stroke path"""
self.device.paint_path(self.graphicstate, True, False, False, self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath)
self.curpath = [] self.curpath = []
return return
@ -577,8 +577,7 @@ class PDFPageInterpreter:
def do_f(self) -> None: def do_f(self) -> None:
"""Fill path using nonzero winding number rule""" """Fill path using nonzero winding number rule"""
self.device.paint_path(self.graphicstate, False, True, False, self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
self.curpath)
self.curpath = [] self.curpath = []
return return
@ -588,22 +587,19 @@ class PDFPageInterpreter:
def do_f_a(self) -> None: def do_f_a(self) -> None:
"""Fill path using even-odd rule""" """Fill path using even-odd rule"""
self.device.paint_path(self.graphicstate, False, True, True, self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath)
self.curpath = [] self.curpath = []
return return
def do_B(self) -> None: def do_B(self) -> None:
"""Fill and stroke path using nonzero winding number rule""" """Fill and stroke path using nonzero winding number rule"""
self.device.paint_path(self.graphicstate, True, True, False, self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath)
self.curpath = [] self.curpath = []
return return
def do_B_a(self) -> None: def do_B_a(self) -> None:
"""Fill and stroke path using even-odd rule""" """Fill and stroke path using even-odd rule"""
self.device.paint_path(self.graphicstate, True, True, True, self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath)
self.curpath = [] self.curpath = []
return return
@ -641,7 +637,7 @@ class PDFPageInterpreter:
self.scs = self.csmap[literal_name(name)] self.scs = self.csmap[literal_name(name)]
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined ColorSpace: %r' % name) raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
return return
def do_cs(self, name: PDFStackT) -> None: def do_cs(self, name: PDFStackT) -> None:
@ -650,7 +646,7 @@ class PDFPageInterpreter:
self.ncs = self.csmap[literal_name(name)] self.ncs = self.csmap[literal_name(name)]
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined ColorSpace: %r' % name) raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
return return
def do_G(self, gray: PDFStackT) -> None: def do_G(self, gray: PDFStackT) -> None:
@ -665,38 +661,32 @@ class PDFPageInterpreter:
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None: def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
"""Set RGB color for stroking operations""" """Set RGB color for stroking operations"""
self.graphicstate.scolor = \ self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
(cast(float, r), cast(float, g), cast(float, b))
return return
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None: def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
"""Set RGB color for nonstroking operations""" """Set RGB color for nonstroking operations"""
self.graphicstate.ncolor = \ self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
(cast(float, r), cast(float, g), cast(float, b))
return return
def do_K( def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
self,
c: PDFStackT,
m: PDFStackT,
y: PDFStackT,
k: PDFStackT
) -> None:
"""Set CMYK color for stroking operations""" """Set CMYK color for stroking operations"""
self.graphicstate.scolor = \ self.graphicstate.scolor = (
(cast(float, c), cast(float, m), cast(float, y), cast(float, k)) cast(float, c),
cast(float, m),
cast(float, y),
cast(float, k),
)
return return
def do_k( def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
self,
c: PDFStackT,
m: PDFStackT,
y: PDFStackT,
k: PDFStackT
) -> None:
"""Set CMYK color for nonstroking operations""" """Set CMYK color for nonstroking operations"""
self.graphicstate.ncolor = \ self.graphicstate.ncolor = (
(cast(float, c), cast(float, m), cast(float, y), cast(float, k)) cast(float, c),
cast(float, m),
cast(float, y),
cast(float, k),
)
return return
def do_SCN(self) -> None: def do_SCN(self) -> None:
@ -705,7 +695,7 @@ class PDFPageInterpreter:
n = self.scs.ncomponents n = self.scs.ncomponents
else: else:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('No colorspace specified!') raise PDFInterpreterError("No colorspace specified!")
n = 1 n = 1
self.graphicstate.scolor = cast(Color, self.pop(n)) self.graphicstate.scolor = cast(Color, self.pop(n))
return return
@ -716,7 +706,7 @@ class PDFPageInterpreter:
n = self.ncs.ncomponents n = self.ncs.ncomponents
else: else:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('No colorspace specified!') raise PDFInterpreterError("No colorspace specified!")
n = 1 n = 1
self.graphicstate.ncolor = cast(Color, self.pop(n)) self.graphicstate.ncolor = cast(Color, self.pop(n))
return return
@ -831,7 +821,7 @@ class PDFPageInterpreter:
self.textstate.font = self.fontmap[literal_name(fontid)] self.textstate.font = self.fontmap[literal_name(fontid)]
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined Font id: %r' % fontid) raise PDFInterpreterError("Undefined Font id: %r" % fontid)
self.textstate.font = self.rsrcmgr.get_font(None, {}) self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = cast(float, fontsize) self.textstate.fontsize = cast(float, fontsize)
return return
@ -854,7 +844,7 @@ class PDFPageInterpreter:
tx = cast(float, tx) tx = cast(float, tx)
ty = cast(float, ty) ty = cast(float, ty)
(a, b, c, d, e, f) = self.textstate.matrix (a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.matrix = (a, b, c, d, tx * a + ty * c + e, tx * b + ty * d + f)
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
return return
@ -863,7 +853,7 @@ class PDFPageInterpreter:
tx = cast(float, tx) tx = cast(float, tx)
ty = cast(float, ty) ty = cast(float, ty)
(a, b, c, d, e, f) = self.textstate.matrix (a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f) self.textstate.matrix = (a, b, c, d, tx * a + ty * c + e, tx * b + ty * d + f)
self.textstate.leading = ty self.textstate.leading = ty
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
return return
@ -875,7 +865,7 @@ class PDFPageInterpreter:
c: PDFStackT, c: PDFStackT,
d: PDFStackT, d: PDFStackT,
e: PDFStackT, e: PDFStackT,
f: PDFStackT f: PDFStackT,
) -> None: ) -> None:
"""Set text matrix and text line matrix""" """Set text matrix and text line matrix"""
self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f)) self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
@ -885,8 +875,14 @@ class PDFPageInterpreter:
def do_T_a(self) -> None: def do_T_a(self) -> None:
"""Move to start of next text line""" """Move to start of next text line"""
(a, b, c, d, e, f) = self.textstate.matrix (a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.matrix = (
self.textstate.leading*d+f) a,
b,
c,
d,
self.textstate.leading * c + e,
self.textstate.leading * d + f,
)
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
return return
@ -894,11 +890,12 @@ class PDFPageInterpreter:
"""Show text, allowing individual glyph positioning""" """Show text, allowing individual glyph positioning"""
if self.textstate.font is None: if self.textstate.font is None:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('No font specified!') raise PDFInterpreterError("No font specified!")
return return
assert self.ncs is not None assert self.ncs is not None
self.device.render_string(self.textstate, cast(PDFTextSeq, seq), self.device.render_string(
self.ncs, self.graphicstate.copy()) self.textstate, cast(PDFTextSeq, seq), self.ncs, self.graphicstate.copy()
)
return return
def do_Tj(self, s: PDFStackT) -> None: def do_Tj(self, s: PDFStackT) -> None:
@ -935,7 +932,7 @@ class PDFPageInterpreter:
def do_EI(self, obj: PDFStackT) -> None: def do_EI(self, obj: PDFStackT) -> None:
"""End inline image object""" """End inline image object"""
if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj: if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
iobjid = str(id(obj)) iobjid = str(id(obj))
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY) self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(iobjid, obj) self.device.render_image(iobjid, obj)
@ -949,28 +946,28 @@ class PDFPageInterpreter:
xobj = stream_value(self.xobjmap[xobjid]) xobj = stream_value(self.xobjmap[xobjid])
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
return return
log.debug('Processing xobj: %r', xobj) log.debug("Processing xobj: %r", xobj)
subtype = xobj.get('Subtype') subtype = xobj.get("Subtype")
if subtype is LITERAL_FORM and 'BBox' in xobj: if subtype is LITERAL_FORM and "BBox" in xobj:
interpreter = self.dup() interpreter = self.dup()
bbox = cast(Rect, list_value(xobj['BBox'])) bbox = cast(Rect, list_value(xobj["BBox"]))
matrix = cast(Matrix, list_value( matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
xobj.get('Matrix', MATRIX_IDENTITY)))
# According to PDF reference 1.7 section 4.9.1, XObjects in # According to PDF reference 1.7 section 4.9.1, XObjects in
# earlier PDFs (prior to v1.2) use the page's Resources entry # earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry. # instead of having their own Resources entry.
xobjres = xobj.get('Resources') xobjres = xobj.get("Resources")
if xobjres: if xobjres:
resources = dict_value(xobjres) resources = dict_value(xobjres)
else: else:
resources = self.resources.copy() resources = self.resources.copy()
self.device.begin_figure(xobjid, bbox, matrix) self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(resources, [xobj], interpreter.render_contents(
ctm=mult_matrix(matrix, self.ctm)) resources, [xobj], ctm=mult_matrix(matrix, self.ctm)
)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY) self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(xobjid, xobj) self.device.render_image(xobjid, xobj)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
@ -980,7 +977,7 @@ class PDFPageInterpreter:
return return
def process_page(self, page: PDFPage) -> None: def process_page(self, page: PDFPage) -> None:
log.debug('Processing page: %r', page) log.debug("Processing page: %r", page)
(x0, y0, x1, y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
if page.rotate == 90: if page.rotate == 90:
ctm = (0, -1, 1, 0, -y0, x1) ctm = (0, -1, 1, 0, -y0, x1)
@ -999,14 +996,15 @@ class PDFPageInterpreter:
self, self,
resources: Dict[object, object], resources: Dict[object, object],
streams: Sequence[object], streams: Sequence[object],
ctm: Matrix = MATRIX_IDENTITY ctm: Matrix = MATRIX_IDENTITY,
) -> None: ) -> None:
"""Render the content streams. """Render the content streams.
This method may be called recursively. This method may be called recursively.
""" """
log.debug('render_contents: resources=%r, streams=%r, ctm=%r', log.debug(
resources, streams, ctm) "render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
)
self.init_resources(resources) self.init_resources(resources)
self.init_state(ctm) self.init_state(ctm)
self.execute(list_value(streams)) self.execute(list_value(streams))
@ -1025,22 +1023,23 @@ class PDFPageInterpreter:
break break
if isinstance(obj, PSKeyword): if isinstance(obj, PSKeyword):
name = keyword_name(obj) name = keyword_name(obj)
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w')\ method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
.replace("'", '_q') "'", "_q"
)
if hasattr(self, method): if hasattr(self, method):
func = getattr(self, method) func = getattr(self, method)
nargs = func.__code__.co_argcount-1 nargs = func.__code__.co_argcount - 1
if nargs: if nargs:
args = self.pop(nargs) args = self.pop(nargs)
log.debug('exec: %s %r', name, args) log.debug("exec: %s %r", name, args)
if len(args) == nargs: if len(args) == nargs:
func(*args) func(*args)
else: else:
log.debug('exec: %s', name) log.debug("exec: %s", name)
func() func()
else: else:
if settings.STRICT: if settings.STRICT:
error_msg = 'Unknown operator: %r' % name error_msg = "Unknown operator: %r" % name
raise PDFInterpreterError(error_msg) raise PDFInterpreterError(error_msg)
else: else:
self.push(obj) self.push(obj)

View File

@ -4,8 +4,7 @@ from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
from pdfminer.utils import Rect from pdfminer.utils import Rect
from . import settings from . import settings
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \ from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, PDFNoPageLabels
PDFNoPageLabels
from .pdfparser import PDFParser from .pdfparser import PDFParser
from .pdftypes import PDFObjectNotFound from .pdftypes import PDFObjectNotFound
from .pdftypes import dict_value from .pdftypes import dict_value
@ -17,8 +16,8 @@ from .psparser import LIT
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# some predefined literals and keywords. # some predefined literals and keywords.
LITERAL_PAGE = LIT('Page') LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT('Pages') LITERAL_PAGES = LIT("Pages")
class PDFPage: class PDFPage:
@ -44,11 +43,7 @@ class PDFPage:
""" """
def __init__( def __init__(
self, self, doc: PDFDocument, pageid: object, attrs: object, label: Optional[str]
doc: PDFDocument,
pageid: object,
attrs: object,
label: Optional[str]
) -> None: ) -> None:
"""Initialize a page object. """Initialize a page object.
@ -61,19 +56,20 @@ class PDFPage:
self.pageid = pageid self.pageid = pageid
self.attrs = dict_value(attrs) self.attrs = dict_value(attrs)
self.label = label self.label = label
self.lastmod = resolve1(self.attrs.get('LastModified')) self.lastmod = resolve1(self.attrs.get("LastModified"))
self.resources: Dict[object, object] = \ self.resources: Dict[object, object] = resolve1(
resolve1(self.attrs.get('Resources', dict())) self.attrs.get("Resources", dict())
self.mediabox: Rect = resolve1(self.attrs['MediaBox']) )
if 'CropBox' in self.attrs: self.mediabox: Rect = resolve1(self.attrs["MediaBox"])
self.cropbox: Rect = resolve1(self.attrs['CropBox']) if "CropBox" in self.attrs:
self.cropbox: Rect = resolve1(self.attrs["CropBox"])
else: else:
self.cropbox = self.mediabox self.cropbox = self.mediabox
self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360 self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
self.annots = self.attrs.get('Annots') self.annots = self.attrs.get("Annots")
self.beads = self.attrs.get('B') self.beads = self.attrs.get("B")
if 'Contents' in self.attrs: if "Contents" in self.attrs:
contents = resolve1(self.attrs['Contents']) contents = resolve1(self.attrs["Contents"])
else: else:
contents = [] contents = []
if not isinstance(contents, list): if not isinstance(contents, list):
@ -81,16 +77,16 @@ class PDFPage:
self.contents: List[object] = contents self.contents: List[object] = contents
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFPage: Resources={!r}, MediaBox={!r}>'\ return "<PDFPage: Resources={!r}, MediaBox={!r}>".format(
.format(self.resources, self.mediabox) self.resources, self.mediabox
)
INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'} INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
@classmethod @classmethod
def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]: def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
def search( def search(
obj: object, obj: object, parent: Dict[str, object]
parent: Dict[str, object]
) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]: ) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]:
if isinstance(obj, int): if isinstance(obj, int):
objid = obj objid = obj
@ -104,16 +100,16 @@ class PDFPage:
if k in cls.INHERITABLE_ATTRS and k not in tree: if k in cls.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
tree_type = tree.get('Type') tree_type = tree.get("Type")
if tree_type is None and not settings.STRICT: # See #64 if tree_type is None and not settings.STRICT: # See #64
tree_type = tree.get('type') tree_type = tree.get("type")
if tree_type is LITERAL_PAGES and 'Kids' in tree: if tree_type is LITERAL_PAGES and "Kids" in tree:
log.debug('Pages: Kids=%r', tree['Kids']) log.debug("Pages: Kids=%r", tree["Kids"])
for c in list_value(tree['Kids']): for c in list_value(tree["Kids"]):
yield from search(c, tree) yield from search(c, tree)
elif tree_type is LITERAL_PAGE: elif tree_type is LITERAL_PAGE:
log.debug('Page: %r', tree) log.debug("Page: %r", tree)
yield (objid, tree) yield (objid, tree)
try: try:
@ -122,8 +118,8 @@ class PDFPage:
page_labels = itertools.repeat(None) page_labels = itertools.repeat(None)
pages = False pages = False
if 'Pages' in document.catalog: if "Pages" in document.catalog:
objects = search(document.catalog['Pages'], document.catalog) objects = search(document.catalog["Pages"], document.catalog)
for (objid, tree) in objects: for (objid, tree) in objects:
yield cls(document, objid, tree, next(page_labels)) yield cls(document, objid, tree, next(page_labels))
pages = True pages = True
@ -133,8 +129,7 @@ class PDFPage:
for objid in xref.get_objids(): for objid in xref.get_objids():
try: try:
obj = document.getobj(objid) obj = document.getobj(objid)
if isinstance(obj, dict) \ if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
and obj.get('Type') is LITERAL_PAGE:
yield cls(document, objid, obj, next(page_labels)) yield cls(document, objid, obj, next(page_labels))
except PDFObjectNotFound: except PDFObjectNotFound:
pass pass
@ -146,9 +141,9 @@ class PDFPage:
fp: BinaryIO, fp: BinaryIO,
pagenos: Optional[Container[int]] = None, pagenos: Optional[Container[int]] = None,
maxpages: int = 0, maxpages: int = 0,
password: str = '', password: str = "",
caching: bool = True, caching: bool = True,
check_extractable: bool = False check_extractable: bool = False,
) -> Iterator["PDFPage"]: ) -> Iterator["PDFPage"]:
# Create a PDF parser object associated with the file object. # Create a PDF parser object associated with the file object.
parser = PDFParser(fp) parser = PDFParser(fp)
@ -158,20 +153,22 @@ class PDFPage:
# If not, warn the user and proceed. # If not, warn the user and proceed.
if not doc.is_extractable: if not doc.is_extractable:
if check_extractable: if check_extractable:
error_msg = 'Text extraction is not allowed: %r' % fp error_msg = "Text extraction is not allowed: %r" % fp
raise PDFTextExtractionNotAllowed(error_msg) raise PDFTextExtractionNotAllowed(error_msg)
else: else:
warning_msg = 'The PDF %r contains a metadata field '\ warning_msg = (
'indicating that it should not allow ' \ "The PDF %r contains a metadata field "
'text extraction. Ignoring this field ' \ "indicating that it should not allow "
'and proceeding. Use the check_extractable ' \ "text extraction. Ignoring this field "
'if you want to raise an error in this case' % fp "and proceeding. Use the check_extractable "
"if you want to raise an error in this case" % fp
)
log.warning(warning_msg) log.warning(warning_msg)
# Process each page contained in the document. # Process each page contained in the document.
for (pageno, page) in enumerate(cls.create_pages(doc)): for (pageno, page) in enumerate(cls.create_pages(doc)):
if pagenos and (pageno not in pagenos): if pagenos and (pageno not in pagenos):
continue continue
yield page yield page
if maxpages and maxpages <= pageno+1: if maxpages and maxpages <= pageno + 1:
break break
return return

View File

@ -51,12 +51,12 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
"""Associates the parser with a PDFDocument object.""" """Associates the parser with a PDFDocument object."""
self.doc = doc self.doc = doc
KEYWORD_R = KWD(b'R') KEYWORD_R = KWD(b"R")
KEYWORD_NULL = KWD(b'null') KEYWORD_NULL = KWD(b"null")
KEYWORD_ENDOBJ = KWD(b'endobj') KEYWORD_ENDOBJ = KWD(b"endobj")
KEYWORD_STREAM = KWD(b'stream') KEYWORD_STREAM = KWD(b"stream")
KEYWORD_XREF = KWD(b'xref') KEYWORD_XREF = KWD(b"xref")
KEYWORD_STARTXREF = KWD(b'startxref') KEYWORD_STARTXREF = KWD(b"startxref")
def do_keyword(self, pos: int, token: PSKeyword) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None:
"""Handles PDF-related keywords.""" """Handles PDF-related keywords."""
@ -76,8 +76,7 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
if len(self.curstack) >= 2: if len(self.curstack) >= 2:
try: try:
((_, objid), (_, genno)) = self.pop(2) ((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = ( (objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
int(objid), int(genno)) # type: ignore[arg-type]
assert self.doc is not None assert self.doc is not None
obj = PDFObjRef(self.doc, objid, genno) obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj)) self.push((pos, obj))
@ -90,30 +89,30 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
objlen = 0 objlen = 0
if not self.fallback: if not self.fallback:
try: try:
objlen = int_value(dic['Length']) objlen = int_value(dic["Length"])
except KeyError: except KeyError:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('/Length is undefined: %r' % dic) raise PDFSyntaxError("/Length is undefined: %r" % dic)
self.seek(pos) self.seek(pos)
try: try:
(_, line) = self.nextline() # 'stream' (_, line) = self.nextline() # 'stream'
except PSEOF: except PSEOF:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF') raise PDFSyntaxError("Unexpected EOF")
return return
pos += len(line) pos += len(line)
self.fp.seek(pos) self.fp.seek(pos)
data = bytearray(self.fp.read(objlen)) data = bytearray(self.fp.read(objlen))
self.seek(pos+objlen) self.seek(pos + objlen)
while 1: while 1:
try: try:
(linepos, line) = self.nextline() (linepos, line) = self.nextline()
except PSEOF: except PSEOF:
if settings.STRICT: if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF') raise PDFSyntaxError("Unexpected EOF")
break break
if b'endstream' in line: if b"endstream" in line:
i = line.index(b'endstream') i = line.index(b"endstream")
objlen += i objlen += i
if self.fallback: if self.fallback:
data += line[:i] data += line[:i]
@ -121,10 +120,15 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
objlen += len(line) objlen += len(line)
if self.fallback: if self.fallback:
data += line data += line
self.seek(pos+objlen) self.seek(pos + objlen)
# XXX limit objlen not to exceed object boundary # XXX limit objlen not to exceed object boundary
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos, log.debug(
objlen, dic, data[:10]) "Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
pos,
objlen,
dic,
data[:10],
)
assert self.doc is not None assert self.doc is not None
stream = PDFStream(dic, bytes(data), self.doc.decipher) stream = PDFStream(dic, bytes(data), self.doc.decipher)
self.push((pos, stream)) self.push((pos, stream))
@ -149,15 +153,14 @@ class PDFStreamParser(PDFParser):
def flush(self) -> None: def flush(self) -> None:
self.add_results(*self.popall()) self.add_results(*self.popall())
KEYWORD_OBJ = KWD(b'obj') KEYWORD_OBJ = KWD(b"obj")
def do_keyword(self, pos: int, token: PSKeyword) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_R: if token is self.KEYWORD_R:
# reference to indirect object # reference to indirect object
try: try:
((_, objid), (_, genno)) = self.pop(2) ((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = ( (objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
int(objid), int(genno)) # type: ignore[arg-type]
obj = PDFObjRef(self.doc, objid, genno) obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj)) self.push((pos, obj))
except PSSyntaxError: except PSSyntaxError:
@ -167,7 +170,7 @@ class PDFStreamParser(PDFParser):
if settings.STRICT: if settings.STRICT:
# See PDF Spec 3.4.6: Only the object values are stored in the # See PDF Spec 3.4.6: Only the object values are stored in the
# stream; the obj and endobj keywords are not used. # stream; the obj and endobj keywords are not used.
raise PDFSyntaxError('Keyword endobj found in stream') raise PDFSyntaxError("Keyword endobj found in stream")
return return
# others # others
self.push((pos, token)) self.push((pos, token))

View File

@ -2,8 +2,17 @@ import io
import logging import logging
import sys import sys
import zlib import zlib
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List, from typing import (
Tuple, cast) TYPE_CHECKING,
Any,
Dict,
Iterable,
Optional,
Union,
List,
Tuple,
cast,
)
from . import settings from . import settings
from .ascii85 import ascii85decode from .ascii85 import ascii85decode
@ -21,18 +30,18 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
LITERAL_CRYPT = LIT('Crypt') LITERAL_CRYPT = LIT("Crypt")
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images" # Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl')) LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85')) LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF')) LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),) LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
LITERALS_JPX_DECODE = (LIT('JPXDecode'),) LITERALS_JPX_DECODE = (LIT("JPXDecode"),)
if sys.version_info >= (3, 8): if sys.version_info >= (3, 8):
@ -40,8 +49,14 @@ if sys.version_info >= (3, 8):
class DecipherCallable(Protocol): class DecipherCallable(Protocol):
"""Fully typed a decipher callback, with optional parameter.""" """Fully typed a decipher callback, with optional parameter."""
def __call__(self, objid: int, genno: int, data: bytes,
attrs: Optional[Dict[str, Any]] = None) -> bytes: def __call__(
self,
objid: int,
genno: int,
data: bytes,
attrs: Optional[Dict[str, Any]] = None,
) -> bytes:
raise NotImplementedError raise NotImplementedError
else: # Fallback for older Python else: # Fallback for older Python
@ -75,21 +90,15 @@ class PDFNotImplementedError(PDFException):
class PDFObjRef(PDFObject): class PDFObjRef(PDFObject):
def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object) -> None:
def __init__(
self,
doc: Optional["PDFDocument"],
objid: int,
_: object
) -> None:
if objid == 0: if objid == 0:
if settings.STRICT: if settings.STRICT:
raise PDFValueError('PDF object id cannot be 0.') raise PDFValueError("PDF object id cannot be 0.")
self.doc = doc self.doc = doc
self.objid = objid self.objid = objid
def __repr__(self) -> str: def __repr__(self) -> str:
return '<PDFObjRef:%d>' % (self.objid) return "<PDFObjRef:%d>" % (self.objid)
def resolve(self, default: object = None) -> Any: def resolve(self, default: object = None) -> Any:
assert self.doc is not None assert self.doc is not None
@ -126,14 +135,8 @@ def resolve_all(x: object, default: object = None) -> Any:
return x return x
def decipher_all( def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any:
decipher: DecipherCallable, """Recursively deciphers the given object."""
objid: int,
genno: int,
x: object
) -> Any:
"""Recursively deciphers the given object.
"""
if isinstance(x, bytes): if isinstance(x, bytes):
return decipher(objid, genno, x) return decipher(objid, genno, x)
if isinstance(x, list): if isinstance(x, list):
@ -148,7 +151,7 @@ def int_value(x: object) -> int:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, int): if not isinstance(x, int):
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('Integer required: %r' % x) raise PDFTypeError("Integer required: %r" % x)
return 0 return 0
return x return x
@ -157,7 +160,7 @@ def float_value(x: object) -> float:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, float): if not isinstance(x, float):
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('Float required: %r' % x) raise PDFTypeError("Float required: %r" % x)
return 0.0 return 0.0
return x return x
@ -166,7 +169,7 @@ def num_value(x: object) -> float:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, (int, float)): # == utils.isnumber(x) if not isinstance(x, (int, float)): # == utils.isnumber(x)
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('Int or Float required: %r' % x) raise PDFTypeError("Int or Float required: %r" % x)
return 0 return 0
return x return x
@ -184,8 +187,8 @@ def str_value(x: object) -> bytes:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, bytes): if not isinstance(x, bytes):
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('String required: %r' % x) raise PDFTypeError("String required: %r" % x)
return b'' return b""
return x return x
@ -193,7 +196,7 @@ def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, (list, tuple)): if not isinstance(x, (list, tuple)):
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('List required: %r' % x) raise PDFTypeError("List required: %r" % x)
return [] return []
return x return x
@ -202,8 +205,8 @@ def dict_value(x: object) -> Dict[Any, Any]:
x = resolve1(x) x = resolve1(x)
if not isinstance(x, dict): if not isinstance(x, dict):
if settings.STRICT: if settings.STRICT:
logger.error('PDFTypeError : Dict required: %r', x) logger.error("PDFTypeError : Dict required: %r", x)
raise PDFTypeError('Dict required: %r' % x) raise PDFTypeError("Dict required: %r" % x)
return {} return {}
return x return x
@ -212,8 +215,8 @@ def stream_value(x: object) -> "PDFStream":
x = resolve1(x) x = resolve1(x)
if not isinstance(x, PDFStream): if not isinstance(x, PDFStream):
if settings.STRICT: if settings.STRICT:
raise PDFTypeError('PDFStream required: %r' % x) raise PDFTypeError("PDFStream required: %r" % x)
return PDFStream({}, b'') return PDFStream({}, b"")
return x return x
@ -223,7 +226,7 @@ def decompress_corrupted(data: bytes) -> bytes:
""" """
d = zlib.decompressobj() d = zlib.decompressobj()
f = io.BytesIO(data) f = io.BytesIO(data)
result_str = b'' result_str = b""
buffer = f.read(1) buffer = f.read(1)
i = 0 i = 0
try: try:
@ -239,12 +242,11 @@ def decompress_corrupted(data: bytes) -> bytes:
class PDFStream(PDFObject): class PDFStream(PDFObject):
def __init__( def __init__(
self, self,
attrs: Dict[str, Any], attrs: Dict[str, Any],
rawdata: bytes, rawdata: bytes,
decipher: Optional[DecipherCallable] = None decipher: Optional[DecipherCallable] = None,
) -> None: ) -> None:
assert isinstance(attrs, dict), str(type(attrs)) assert isinstance(attrs, dict), str(type(attrs))
self.attrs = attrs self.attrs = attrs
@ -261,12 +263,18 @@ class PDFStream(PDFObject):
def __repr__(self) -> str: def __repr__(self) -> str:
if self.data is None: if self.data is None:
assert self.rawdata is not None assert self.rawdata is not None
return '<PDFStream(%r): raw=%d, %r>' % \ return "<PDFStream(%r): raw=%d, %r>" % (
(self.objid, len(self.rawdata), self.attrs) self.objid,
len(self.rawdata),
self.attrs,
)
else: else:
assert self.data is not None assert self.data is not None
return '<PDFStream(%r): len=%d, %r>' % \ return "<PDFStream(%r): len=%d, %r>" % (
(self.objid, len(self.data), self.attrs) self.objid,
len(self.data),
self.attrs,
)
def __contains__(self, name: object) -> bool: def __contains__(self, name: object) -> bool:
return name in self.attrs return name in self.attrs
@ -284,8 +292,8 @@ class PDFStream(PDFObject):
return default return default
def get_filters(self) -> List[Tuple[Any, Any]]: def get_filters(self) -> List[Tuple[Any, Any]]:
filters = self.get_any(('F', 'Filter')) filters = self.get_any(("F", "Filter"))
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}) params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
if not filters: if not filters:
return [] return []
if not isinstance(filters, list): if not isinstance(filters, list):
@ -298,15 +306,16 @@ class PDFStream(PDFObject):
# resolve filter if possible # resolve filter if possible
_filters = [] _filters = []
for fltr in filters: for fltr in filters:
if hasattr(fltr, 'resolve'): if hasattr(fltr, "resolve"):
fltr = fltr.resolve()[0] fltr = fltr.resolve()[0]
_filters.append(fltr) _filters.append(fltr)
# return list solves https://github.com/pdfminer/pdfminer.six/issues/15 # return list solves https://github.com/pdfminer/pdfminer.six/issues/15
return list(zip(_filters, params)) return list(zip(_filters, params))
def decode(self) -> None: def decode(self) -> None:
assert self.data is None \ assert self.data is None and self.rawdata is not None, str(
and self.rawdata is not None, str((self.data, self.rawdata)) (self.data, self.rawdata)
)
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher:
# Handle encryption # Handle encryption
@ -326,14 +335,13 @@ class PDFStream(PDFObject):
except zlib.error as e: except zlib.error as e:
if settings.STRICT: if settings.STRICT:
error_msg = 'Invalid zlib bytes: {!r}, {!r}'\ error_msg = "Invalid zlib bytes: {!r}, {!r}".format(e, data)
.format(e, data)
raise PDFException(error_msg) raise PDFException(error_msg)
try: try:
data = decompress_corrupted(data) data = decompress_corrupted(data)
except zlib.error: except zlib.error:
data = b'' data = b""
elif f in LITERALS_LZW_DECODE: elif f in LITERALS_LZW_DECODE:
data = lzwdecode(data) data = lzwdecode(data)
@ -356,25 +364,26 @@ class PDFStream(PDFObject):
pass pass
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
# not yet.. # not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported') raise PDFNotImplementedError("/Crypt filter is unsupported")
else: else:
raise PDFNotImplementedError('Unsupported filter: %r' % f) raise PDFNotImplementedError("Unsupported filter: %r" % f)
# apply predictors # apply predictors
if params and 'Predictor' in params: if params and "Predictor" in params:
pred = int_value(params['Predictor']) pred = int_value(params["Predictor"])
if pred == 1: if pred == 1:
# no predictor # no predictor
pass pass
elif 10 <= pred: elif 10 <= pred:
# PNG predictor # PNG predictor
colors = int_value(params.get('Colors', 1)) colors = int_value(params.get("Colors", 1))
columns = int_value(params.get('Columns', 1)) columns = int_value(params.get("Columns", 1))
raw_bits_per_component = params.get('BitsPerComponent', 8) raw_bits_per_component = params.get("BitsPerComponent", 8)
bitspercomponent = int_value(raw_bits_per_component) bitspercomponent = int_value(raw_bits_per_component)
data = apply_png_predictor(pred, colors, columns, data = apply_png_predictor(
bitspercomponent, data) pred, colors, columns, bitspercomponent, data
)
else: else:
error_msg = 'Unsupported predictor: %r' % pred error_msg = "Unsupported predictor: %r" % pred
raise PDFNotImplementedError(error_msg) raise PDFNotImplementedError(error_msg)
self.data = data self.data = data
self.rawdata = None self.rawdata = None

View File

@ -4,8 +4,19 @@
import logging import logging
import re import re
from typing import (Any, BinaryIO, Dict, Generic, Iterator, List, from typing import (
Optional, Tuple, Type, TypeVar, Union) Any,
BinaryIO,
Dict,
Generic,
Iterator,
List,
Optional,
Tuple,
Type,
TypeVar,
Union,
)
from . import settings from . import settings
from .utils import choplist from .utils import choplist
@ -59,7 +70,7 @@ class PSLiteral(PSObject):
def __repr__(self) -> str: def __repr__(self) -> str:
name = self.name name = self.name
return '/%r' % name return "/%r" % name
class PSKeyword(PSObject): class PSKeyword(PSObject):
@ -79,10 +90,10 @@ class PSKeyword(PSObject):
def __repr__(self) -> str: def __repr__(self) -> str:
name = self.name name = self.name
return '/%r' % name return "/%r" % name
_SymbolT = TypeVar('_SymbolT', PSLiteral, PSKeyword) _SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)
class PSSymbolTable(Generic[_SymbolT]): class PSSymbolTable(Generic[_SymbolT]):
@ -110,25 +121,25 @@ PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword) PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern KWD = PSKeywordTable.intern
KEYWORD_PROC_BEGIN = KWD(b'{') KEYWORD_PROC_BEGIN = KWD(b"{")
KEYWORD_PROC_END = KWD(b'}') KEYWORD_PROC_END = KWD(b"}")
KEYWORD_ARRAY_BEGIN = KWD(b'[') KEYWORD_ARRAY_BEGIN = KWD(b"[")
KEYWORD_ARRAY_END = KWD(b']') KEYWORD_ARRAY_END = KWD(b"]")
KEYWORD_DICT_BEGIN = KWD(b'<<') KEYWORD_DICT_BEGIN = KWD(b"<<")
KEYWORD_DICT_END = KWD(b'>>') KEYWORD_DICT_END = KWD(b">>")
def literal_name(x: object) -> Any: def literal_name(x: object) -> Any:
if not isinstance(x, PSLiteral): if not isinstance(x, PSLiteral):
if settings.STRICT: if settings.STRICT:
raise PSTypeError('Literal required: {!r}'.format(x)) raise PSTypeError("Literal required: {!r}".format(x))
else: else:
name = x name = x
else: else:
name = x.name name = x.name
if not isinstance(name, str): if not isinstance(name, str):
try: try:
name = str(name, 'utf-8') name = str(name, "utf-8")
except Exception: except Exception:
pass pass
return name return name
@ -137,34 +148,34 @@ def literal_name(x: object) -> Any:
def keyword_name(x: object) -> Any: def keyword_name(x: object) -> Any:
if not isinstance(x, PSKeyword): if not isinstance(x, PSKeyword):
if settings.STRICT: if settings.STRICT:
raise PSTypeError('Keyword required: %r' % x) raise PSTypeError("Keyword required: %r" % x)
else: else:
name = x name = x
else: else:
name = str(x.name, 'utf-8', 'ignore') name = str(x.name, "utf-8", "ignore")
return name return name
EOL = re.compile(br'[\r\n]') EOL = re.compile(rb"[\r\n]")
SPC = re.compile(br'\s') SPC = re.compile(rb"\s")
NONSPC = re.compile(br'\S') NONSPC = re.compile(rb"\S")
HEX = re.compile(br'[0-9a-fA-F]') HEX = re.compile(rb"[0-9a-fA-F]")
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]') END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]') END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.') HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
END_NUMBER = re.compile(br'[^0-9]') END_NUMBER = re.compile(rb"[^0-9]")
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]') END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")
END_STRING = re.compile(br'[()\134]') END_STRING = re.compile(rb"[()\134]")
OCT_STRING = re.compile(br'[0-7]') OCT_STRING = re.compile(rb"[0-7]")
ESC_STRING = { ESC_STRING = {
b'b': 8, b"b": 8,
b't': 9, b"t": 9,
b'n': 10, b"n": 10,
b'f': 12, b"f": 12,
b'r': 13, b"r": 13,
b'(': 40, b"(": 40,
b')': 41, b")": 41,
b'\\': 92 b"\\": 92,
} }
@ -173,8 +184,8 @@ PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
class PSBaseParser: class PSBaseParser:
"""Most basic PostScript parser that performs only tokenization. """Most basic PostScript parser that performs only tokenization."""
"""
BUFSIZ = 4096 BUFSIZ = 4096
def __init__(self, fp: BinaryIO) -> None: def __init__(self, fp: BinaryIO) -> None:
@ -182,8 +193,7 @@ class PSBaseParser:
self.seek(0) self.seek(0)
def __repr__(self) -> str: def __repr__(self) -> str:
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)
self.bufpos)
def flush(self) -> None: def flush(self) -> None:
return return
@ -193,29 +203,28 @@ class PSBaseParser:
return return
def tell(self) -> int: def tell(self) -> int:
return self.bufpos+self.charpos return self.bufpos + self.charpos
def poll(self, pos: Optional[int] = None, n: int = 80) -> None: def poll(self, pos: Optional[int] = None, n: int = 80) -> None:
pos0 = self.fp.tell() pos0 = self.fp.tell()
if not pos: if not pos:
pos = self.bufpos+self.charpos pos = self.bufpos + self.charpos
self.fp.seek(pos) self.fp.seek(pos)
log.debug('poll(%d): %r', pos, self.fp.read(n)) log.debug("poll(%d): %r", pos, self.fp.read(n))
self.fp.seek(pos0) self.fp.seek(pos0)
return return
def seek(self, pos: int) -> None: def seek(self, pos: int) -> None:
"""Seeks the parser to the given position. """Seeks the parser to the given position."""
""" log.debug("seek: %r", pos)
log.debug('seek: %r', pos)
self.fp.seek(pos) self.fp.seek(pos)
# reset the status for nextline() # reset the status for nextline()
self.bufpos = pos self.bufpos = pos
self.buf = b'' self.buf = b""
self.charpos = 0 self.charpos = 0
# reset the status for nexttoken() # reset the status for nexttoken()
self._parse1 = self._parse_main self._parse1 = self._parse_main
self._curtoken = b'' self._curtoken = b""
self._curtokenpos = 0 self._curtokenpos = 0
self._tokens: List[Tuple[int, PSBaseParserToken]] = [] self._tokens: List[Tuple[int, PSBaseParserToken]] = []
return return
@ -227,37 +236,36 @@ class PSBaseParser:
self.bufpos = self.fp.tell() self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ) self.buf = self.fp.read(self.BUFSIZ)
if not self.buf: if not self.buf:
raise PSEOF('Unexpected EOF') raise PSEOF("Unexpected EOF")
self.charpos = 0 self.charpos = 0
return return
def nextline(self) -> Tuple[int, bytes]: def nextline(self) -> Tuple[int, bytes]:
"""Fetches a next line that ends either with \\r or \\n. """Fetches a next line that ends either with \\r or \\n."""
""" linebuf = b""
linebuf = b''
linepos = self.bufpos + self.charpos linepos = self.bufpos + self.charpos
eol = False eol = False
while 1: while 1:
self.fillbuf() self.fillbuf()
if eol: if eol:
c = self.buf[self.charpos:self.charpos+1] c = self.buf[self.charpos : self.charpos + 1]
# handle b'\r\n' # handle b'\r\n'
if c == b'\n': if c == b"\n":
linebuf += c linebuf += c
self.charpos += 1 self.charpos += 1
break break
m = EOL.search(self.buf, self.charpos) m = EOL.search(self.buf, self.charpos)
if m: if m:
linebuf += self.buf[self.charpos:m.end(0)] linebuf += self.buf[self.charpos : m.end(0)]
self.charpos = m.end(0) self.charpos = m.end(0)
if linebuf[-1:] == b'\r': if linebuf[-1:] == b"\r":
eol = True eol = True
else: else:
break break
else: else:
linebuf += self.buf[self.charpos:] linebuf += self.buf[self.charpos :]
self.charpos = len(self.buf) self.charpos = len(self.buf)
log.debug('nextline: %r, %r', linepos, linebuf) log.debug("nextline: %r, %r", linepos, linebuf)
return (linepos, linebuf) return (linepos, linebuf)
@ -268,22 +276,22 @@ class PSBaseParser:
""" """
self.fp.seek(0, 2) self.fp.seek(0, 2)
pos = self.fp.tell() pos = self.fp.tell()
buf = b'' buf = b""
while 0 < pos: while 0 < pos:
prevpos = pos prevpos = pos
pos = max(0, pos-self.BUFSIZ) pos = max(0, pos - self.BUFSIZ)
self.fp.seek(pos) self.fp.seek(pos)
s = self.fp.read(prevpos-pos) s = self.fp.read(prevpos - pos)
if not s: if not s:
break break
while 1: while 1:
n = max(s.rfind(b'\r'), s.rfind(b'\n')) n = max(s.rfind(b"\r"), s.rfind(b"\n"))
if n == -1: if n == -1:
buf = s + buf buf = s + buf
break break
yield s[n:] + buf yield s[n:] + buf
s = s[:n] s = s[:n]
buf = b'' buf = b""
return return
def _parse_main(self, s: bytes, i: int) -> int: def _parse_main(self, s: bytes, i: int) -> int:
@ -291,44 +299,44 @@ class PSBaseParser:
if not m: if not m:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
c = s[j:j+1] c = s[j : j + 1]
self._curtokenpos = self.bufpos+j self._curtokenpos = self.bufpos + j
if c == b'%': if c == b"%":
self._curtoken = b'%' self._curtoken = b"%"
self._parse1 = self._parse_comment self._parse1 = self._parse_comment
return j+1 return j + 1
elif c == b'/': elif c == b"/":
self._curtoken = b'' self._curtoken = b""
self._parse1 = self._parse_literal self._parse1 = self._parse_literal
return j+1 return j + 1
elif c in b'-+' or c.isdigit(): elif c in b"-+" or c.isdigit():
self._curtoken = c self._curtoken = c
self._parse1 = self._parse_number self._parse1 = self._parse_number
return j+1 return j + 1
elif c == b'.': elif c == b".":
self._curtoken = c self._curtoken = c
self._parse1 = self._parse_float self._parse1 = self._parse_float
return j+1 return j + 1
elif c.isalpha(): elif c.isalpha():
self._curtoken = c self._curtoken = c
self._parse1 = self._parse_keyword self._parse1 = self._parse_keyword
return j+1 return j + 1
elif c == b'(': elif c == b"(":
self._curtoken = b'' self._curtoken = b""
self.paren = 1 self.paren = 1
self._parse1 = self._parse_string self._parse1 = self._parse_string
return j+1 return j + 1
elif c == b'<': elif c == b"<":
self._curtoken = b'' self._curtoken = b""
self._parse1 = self._parse_wopen self._parse1 = self._parse_wopen
return j+1 return j + 1
elif c == b'>': elif c == b">":
self._curtoken = b'' self._curtoken = b""
self._parse1 = self._parse_wclose self._parse1 = self._parse_wclose
return j+1 return j + 1
else: else:
self._add_token(KWD(c)) self._add_token(KWD(c))
return j+1 return j + 1
def _add_token(self, obj: PSBaseParserToken) -> None: def _add_token(self, obj: PSBaseParserToken) -> None:
self._tokens.append((self._curtokenpos, obj)) self._tokens.append((self._curtokenpos, obj))
@ -353,13 +361,13 @@ class PSBaseParser:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
c = s[j:j+1] c = s[j : j + 1]
if c == b'#': if c == b"#":
self.hex = b'' self.hex = b""
self._parse1 = self._parse_literal_hex self._parse1 = self._parse_literal_hex
return j+1 return j + 1
try: try:
name: Union[str, bytes] = str(self._curtoken, 'utf-8') name: Union[str, bytes] = str(self._curtoken, "utf-8")
except Exception: except Exception:
name = self._curtoken name = self._curtoken
self._add_token(LIT(name)) self._add_token(LIT(name))
@ -367,10 +375,10 @@ class PSBaseParser:
return j return j
def _parse_literal_hex(self, s: bytes, i: int) -> int: def _parse_literal_hex(self, s: bytes, i: int) -> int:
c = s[i:i+1] c = s[i : i + 1]
if HEX.match(c) and len(self.hex) < 2: if HEX.match(c) and len(self.hex) < 2:
self.hex += c self.hex += c
return i+1 return i + 1
if self.hex: if self.hex:
self._curtoken += bytes((int(self.hex, 16),)) self._curtoken += bytes((int(self.hex, 16),))
self._parse1 = self._parse_literal self._parse1 = self._parse_literal
@ -383,11 +391,11 @@ class PSBaseParser:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
c = s[j:j+1] c = s[j : j + 1]
if c == b'.': if c == b".":
self._curtoken += c self._curtoken += c
self._parse1 = self._parse_float self._parse1 = self._parse_float
return j+1 return j + 1
try: try:
self._add_token(int(self._curtoken)) self._add_token(int(self._curtoken))
except ValueError: except ValueError:
@ -416,9 +424,9 @@ class PSBaseParser:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
if self._curtoken == b'true': if self._curtoken == b"true":
token: Union[bool, PSKeyword] = True token: Union[bool, PSKeyword] = True
elif self._curtoken == b'false': elif self._curtoken == b"false":
token = False token = False
else: else:
token = KWD(self._curtoken) token = KWD(self._curtoken)
@ -433,34 +441,34 @@ class PSBaseParser:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
c = s[j:j+1] c = s[j : j + 1]
if c == b'\\': if c == b"\\":
self.oct = b'' self.oct = b""
self._parse1 = self._parse_string_1 self._parse1 = self._parse_string_1
return j+1 return j + 1
if c == b'(': if c == b"(":
self.paren += 1 self.paren += 1
self._curtoken += c self._curtoken += c
return j+1 return j + 1
if c == b')': if c == b")":
self.paren -= 1 self.paren -= 1
if self.paren: if self.paren:
# WTF, they said balanced parens need no special treatment. # WTF, they said balanced parens need no special treatment.
self._curtoken += c self._curtoken += c
return j+1 return j + 1
self._add_token(self._curtoken) self._add_token(self._curtoken)
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j+1 return j + 1
def _parse_string_1(self, s: bytes, i: int) -> int: def _parse_string_1(self, s: bytes, i: int) -> int:
"""Parse literal strings """Parse literal strings
PDF Reference 3.2.3 PDF Reference 3.2.3
""" """
c = s[i:i+1] c = s[i : i + 1]
if OCT_STRING.match(c) and len(self.oct) < 3: if OCT_STRING.match(c) and len(self.oct) < 3:
self.oct += c self.oct += c
return i+1 return i + 1
elif self.oct: elif self.oct:
self._curtoken += bytes((int(self.oct, 8),)) self._curtoken += bytes((int(self.oct, 8),))
@ -470,18 +478,18 @@ class PSBaseParser:
elif c in ESC_STRING: elif c in ESC_STRING:
self._curtoken += bytes((ESC_STRING[c],)) self._curtoken += bytes((ESC_STRING[c],))
elif c == b'\r' and len(s) > i+1 and s[i+1:i+2] == b'\n': elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
# If current and next character is \r\n skip both because enters # If current and next character is \r\n skip both because enters
# after a \ are ignored # after a \ are ignored
i += 1 i += 1
# default action # default action
self._parse1 = self._parse_string self._parse1 = self._parse_string
return i+1 return i + 1
def _parse_wopen(self, s: bytes, i: int) -> int: def _parse_wopen(self, s: bytes, i: int) -> int:
c = s[i:i+1] c = s[i : i + 1]
if c == b'<': if c == b"<":
self._add_token(KEYWORD_DICT_BEGIN) self._add_token(KEYWORD_DICT_BEGIN)
self._parse1 = self._parse_main self._parse1 = self._parse_main
i += 1 i += 1
@ -490,8 +498,8 @@ class PSBaseParser:
return i return i
def _parse_wclose(self, s: bytes, i: int) -> int: def _parse_wclose(self, s: bytes, i: int) -> int:
c = s[i:i+1] c = s[i : i + 1]
if c == b'>': if c == b">":
self._add_token(KEYWORD_DICT_END) self._add_token(KEYWORD_DICT_END)
i += 1 i += 1
self._parse1 = self._parse_main self._parse1 = self._parse_main
@ -504,8 +512,9 @@ class PSBaseParser:
return len(s) return len(s)
j = m.start(0) j = m.start(0)
self._curtoken += s[i:j] self._curtoken += s[i:j]
token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)), token = HEX_PAIR.sub(
SPC.sub(b'', self._curtoken)) lambda m: bytes((int(m.group(0), 16),)), SPC.sub(b"", self._curtoken)
)
self._add_token(token) self._add_token(token)
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j return j
@ -515,7 +524,7 @@ class PSBaseParser:
self.fillbuf() self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos) self.charpos = self._parse1(self.buf, self.charpos)
token = self._tokens.pop(0) token = self._tokens.pop(0)
log.debug('nexttoken: %r', token) log.debug("nexttoken: %r", token)
return token return token
@ -530,15 +539,13 @@ PSStackEntry = Tuple[int, PSStackType[ExtraT]]
class PSStackParser(PSBaseParser, Generic[ExtraT]): class PSStackParser(PSBaseParser, Generic[ExtraT]):
def __init__(self, fp: BinaryIO) -> None: def __init__(self, fp: BinaryIO) -> None:
PSBaseParser.__init__(self, fp) PSBaseParser.__init__(self, fp)
self.reset() self.reset()
return return
def reset(self) -> None: def reset(self) -> None:
self.context: List[Tuple[int, Optional[str], self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = []
List[PSStackEntry[ExtraT]]]] = []
self.curtype: Optional[str] = None self.curtype: Optional[str] = None
self.curstack: List[PSStackEntry[ExtraT]] = [] self.curstack: List[PSStackEntry[ExtraT]] = []
self.results: List[PSStackEntry[ExtraT]] = [] self.results: List[PSStackEntry[ExtraT]] = []
@ -565,25 +572,24 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
try: try:
log.debug('add_results: %r', objs) log.debug("add_results: %r", objs)
except Exception: except Exception:
log.debug('add_results: (unprintable object)') log.debug("add_results: (unprintable object)")
self.results.extend(objs) self.results.extend(objs)
return return
def start_type(self, pos: int, type: str) -> None: def start_type(self, pos: int, type: str) -> None:
self.context.append((pos, self.curtype, self.curstack)) self.context.append((pos, self.curtype, self.curstack))
(self.curtype, self.curstack) = (type, []) (self.curtype, self.curstack) = (type, [])
log.debug('start_type: pos=%r, type=%r', pos, type) log.debug("start_type: pos=%r, type=%r", pos, type)
return return
def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
if self.curtype != type: if self.curtype != type:
raise PSTypeError('Type mismatch: {!r} != {!r}' raise PSTypeError("Type mismatch: {!r} != {!r}".format(self.curtype, type))
.format(self.curtype, type))
objs = [obj for (_, obj) in self.curstack] objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop() (pos, self.curtype, self.curstack) = self.context.pop()
log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs) log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
return (pos, objs) return (pos, objs)
def do_keyword(self, pos: int, token: PSKeyword) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None:
@ -604,47 +610,55 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
self.push((pos, token)) self.push((pos, token))
elif token == KEYWORD_ARRAY_BEGIN: elif token == KEYWORD_ARRAY_BEGIN:
# begin array # begin array
self.start_type(pos, 'a') self.start_type(pos, "a")
elif token == KEYWORD_ARRAY_END: elif token == KEYWORD_ARRAY_END:
# end array # end array
try: try:
self.push(self.end_type('a')) self.push(self.end_type("a"))
except PSTypeError: except PSTypeError:
if settings.STRICT: if settings.STRICT:
raise raise
elif token == KEYWORD_DICT_BEGIN: elif token == KEYWORD_DICT_BEGIN:
# begin dictionary # begin dictionary
self.start_type(pos, 'd') self.start_type(pos, "d")
elif token == KEYWORD_DICT_END: elif token == KEYWORD_DICT_END:
# end dictionary # end dictionary
try: try:
(pos, objs) = self.end_type('d') (pos, objs) = self.end_type("d")
if len(objs) % 2 != 0: if len(objs) % 2 != 0:
error_msg = 'Invalid dictionary construct: %r' % objs error_msg = "Invalid dictionary construct: %r" % objs
raise PSSyntaxError(error_msg) raise PSSyntaxError(error_msg)
d = {literal_name(k): v d = {
for (k, v) in choplist(2, objs) if v is not None} literal_name(k): v
for (k, v) in choplist(2, objs)
if v is not None
}
self.push((pos, d)) self.push((pos, d))
except PSTypeError: except PSTypeError:
if settings.STRICT: if settings.STRICT:
raise raise
elif token == KEYWORD_PROC_BEGIN: elif token == KEYWORD_PROC_BEGIN:
# begin proc # begin proc
self.start_type(pos, 'p') self.start_type(pos, "p")
elif token == KEYWORD_PROC_END: elif token == KEYWORD_PROC_END:
# end proc # end proc
try: try:
self.push(self.end_type('p')) self.push(self.end_type("p"))
except PSTypeError: except PSTypeError:
if settings.STRICT: if settings.STRICT:
raise raise
elif isinstance(token, PSKeyword): elif isinstance(token, PSKeyword):
log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, log.debug(
token, self.curstack) "do_keyword: pos=%r, token=%r, stack=%r", pos, token, self.curstack
)
self.do_keyword(pos, token) self.do_keyword(pos, token)
else: else:
log.error('unknown token: pos=%r, token=%r, stack=%r', pos, log.error(
token, self.curstack) "unknown token: pos=%r, token=%r, stack=%r",
pos,
token,
self.curstack,
)
self.do_keyword(pos, token) self.do_keyword(pos, token)
raise raise
if self.context: if self.context:
@ -653,7 +667,7 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
self.flush() self.flush()
obj = self.results.pop(0) obj = self.results.pop(0)
try: try:
log.debug('nextobject: %r', obj) log.debug("nextobject: %r", obj)
except Exception: except Exception:
log.debug('nextobject: (unprintable object)') log.debug("nextobject: (unprintable object)")
return obj return obj

View File

@ -20,7 +20,7 @@ def rldecode(data: bytes) -> bytes:
(2 to 128) times during decompression. A length value of 128 (2 to 128) times during decompression. A length value of 128
denotes EOD. denotes EOD.
""" """
decoded = b'' decoded = b""
i = 0 i = 0
while i < len(data): while i < len(data):
length = data[i] length = data[i]
@ -28,13 +28,13 @@ def rldecode(data: bytes) -> bytes:
break break
if length >= 0 and length < 128: if length >= 0 and length < 128:
for j in range(i+1, (i+1)+(length+1)): for j in range(i + 1, (i + 1) + (length + 1)):
decoded += bytes((data[j],)) decoded += bytes((data[j],))
i = (i+1) + (length+1) i = (i + 1) + (length + 1)
if length > 128: if length > 128:
run = bytes((data[i+1],))*(257-length) run = bytes((data[i + 1],)) * (257 - length)
decoded += run decoded += run
i = (i+1) + 1 i = (i + 1) + 1
return decoded return decoded

View File

@ -6,9 +6,24 @@ import pathlib
import string import string
import struct import struct
from html import escape from html import escape
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator, from typing import (
List, Optional, Set, TextIO, Tuple, TypeVar, Union, Any,
TYPE_CHECKING, cast) BinaryIO,
Callable,
Dict,
Generic,
Iterable,
Iterator,
List,
Optional,
Set,
TextIO,
Tuple,
TypeVar,
Union,
TYPE_CHECKING,
cast,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from .layout import LTComponent from .layout import LTComponent
@ -30,12 +45,8 @@ class open_filename(object):
(str or pathlib.PurePath type is supported) and closes it on exit, (str or pathlib.PurePath type is supported) and closes it on exit,
(just like `open`), but does nothing for file-like objects. (just like `open`), but does nothing for file-like objects.
""" """
def __init__(
self, def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
filename: FileOrName,
*args: Any,
**kwargs: Any
) -> None:
if isinstance(filename, pathlib.PurePath): if isinstance(filename, pathlib.PurePath):
filename = str(filename) filename = str(filename)
if isinstance(filename, str): if isinstance(filename, str):
@ -45,17 +56,12 @@ class open_filename(object):
self.file_handler = cast(AnyIO, filename) self.file_handler = cast(AnyIO, filename)
self.closing = False self.closing = False
else: else:
raise TypeError('Unsupported input type: %s' % type(filename)) raise TypeError("Unsupported input type: %s" % type(filename))
def __enter__(self) -> AnyIO: def __enter__(self) -> AnyIO:
return self.file_handler return self.file_handler
def __exit__( def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
self,
exc_type: object,
exc_val: object,
exc_tb: object
) -> None:
if self.closing: if self.closing:
self.file_handler.close() self.file_handler.close()
@ -70,7 +76,7 @@ def make_compat_str(o: object) -> str:
"""Converts everything to string, if bytes guessing the encoding.""" """Converts everything to string, if bytes guessing the encoding."""
if isinstance(o, bytes): if isinstance(o, bytes):
enc = chardet.detect(o) enc = chardet.detect(o)
return o.decode(enc['encoding']) return o.decode(enc["encoding"])
else: else:
return str(o) return str(o)
@ -80,20 +86,18 @@ def shorten_str(s: str, size: int) -> str:
return s[:size] return s[:size]
if len(s) > size: if len(s) > size:
length = (size - 5) // 2 length = (size - 5) // 2
return '{} ... {}'.format(s[:length], s[-length:]) return "{} ... {}".format(s[:length], s[-length:])
else: else:
return s return s
def compatible_encode_method( def compatible_encode_method(
bytesorstring: Union[bytes, str], bytesorstring: Union[bytes, str], encoding: str = "utf-8", erraction: str = "ignore"
encoding: str = 'utf-8',
erraction: str = 'ignore'
) -> str: ) -> str:
"""When Py2 str.encode is called, it often means bytes.encode in Py3. """When Py2 str.encode is called, it often means bytes.encode in Py3.
This does either. This does either.
""" """
if isinstance(bytesorstring, str): if isinstance(bytesorstring, str):
return bytesorstring return bytesorstring
assert isinstance(bytesorstring, bytes), str(type(bytesorstring)) assert isinstance(bytesorstring, bytes), str(type(bytesorstring))
@ -119,11 +123,7 @@ def paeth_predictor(left: int, above: int, upper_left: int) -> int:
def apply_png_predictor( def apply_png_predictor(
pred: int, pred: int, colors: int, columns: int, bitspercomponent: int, data: bytes
colors: int,
columns: int,
bitspercomponent: int,
data: bytes
) -> bytes: ) -> bytes:
"""Reverse the effect of the PNG predictor """Reverse the effect of the PNG predictor
@ -135,12 +135,12 @@ def apply_png_predictor(
nbytes = colors * columns * bitspercomponent // 8 nbytes = colors * columns * bitspercomponent // 8
bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel
buf = b'' buf = b""
line_above = b'\x00' * columns line_above = b"\x00" * columns
for scanline_i in range(0, len(data), nbytes + 1): for scanline_i in range(0, len(data), nbytes + 1):
filter_type = data[scanline_i] filter_type = data[scanline_i]
line_encoded = data[scanline_i + 1:scanline_i + 1 + nbytes] line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
raw = b'' raw = b""
if filter_type == 0: if filter_type == 0:
# Filter type 0: None # Filter type 0: None
@ -223,10 +223,11 @@ Point = Tuple[float, float]
Rect = Tuple[float, float, float, float] Rect = Tuple[float, float, float, float]
Matrix = Tuple[float, float, float, float, float, float] Matrix = Tuple[float, float, float, float, float, float]
PathSegment = Union[ PathSegment = Union[
Tuple[str], # Literal['h'] Tuple[str], # Literal['h']
Tuple[str, float, float], # Literal['m', 'l'] Tuple[str, float, float], # Literal['m', 'l']
Tuple[str, float, float, float, float], # Literal['v', 'y'] Tuple[str, float, float, float, float], # Literal['v', 'y']
Tuple[str, float, float, float, float, float, float]] # Literal['c'] Tuple[str, float, float, float, float, float, float],
] # Literal['c']
# Matrix operations # Matrix operations
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0) MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
@ -236,9 +237,14 @@ def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
(a1, b1, c1, d1, e1, f1) = m1 (a1, b1, c1, d1, e1, f1) = m1
(a0, b0, c0, d0, e0, f0) = m0 (a0, b0, c0, d0, e0, f0) = m0
"""Returns the multiplication of two matrices.""" """Returns the multiplication of two matrices."""
return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1, return (
a0 * c1 + c0 * d1, b0 * c1 + d0 * d1, a0 * a1 + c0 * b1,
a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0) b0 * a1 + d0 * b1,
a0 * c1 + c0 * d1,
b0 * c1 + d0 * d1,
a0 * e1 + c0 * f1 + e0,
b0 * e1 + d0 * f1 + f0,
)
def translate_matrix(m: Matrix, v: Point) -> Matrix: def translate_matrix(m: Matrix, v: Point) -> Matrix:
@ -264,11 +270,12 @@ def apply_matrix_norm(m: Matrix, v: Point) -> Point:
# Utility functions # Utility functions
def isnumber(x: object) -> bool: def isnumber(x: object) -> bool:
return isinstance(x, (int, float)) return isinstance(x, (int, float))
_T = TypeVar('_T') _T = TypeVar("_T")
def uniq(objs: Iterable[_T]) -> Iterator[_T]: def uniq(objs: Iterable[_T]) -> Iterator[_T]:
@ -282,10 +289,7 @@ def uniq(objs: Iterable[_T]) -> Iterator[_T]:
return return
def fsplit( def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
pred: Callable[[_T], bool],
objs: Iterable[_T]
) -> Tuple[List[_T], List[_T]]:
"""Split a list into two classes according to the predicate.""" """Split a list into two classes according to the predicate."""
t = [] t = []
f = [] f = []
@ -315,9 +319,7 @@ def get_bound(pts: Iterable[Point]) -> Rect:
def pick( def pick(
seq: Iterable[_T], seq: Iterable[_T], func: Callable[[_T], float], maxobj: Optional[_T] = None
func: Callable[[_T], float],
maxobj: Optional[_T] = None
) -> Optional[_T]: ) -> Optional[_T]:
"""Picks the object obj where func(obj) has the highest value.""" """Picks the object obj where func(obj) has the highest value."""
maxscore = None maxscore = None
@ -347,77 +349,303 @@ def nunpack(s: bytes, default: int = 0) -> int:
elif length == 1: elif length == 1:
return ord(s) return ord(s)
elif length == 2: elif length == 2:
return cast(int, struct.unpack('>H', s)[0]) return cast(int, struct.unpack(">H", s)[0])
elif length == 3: elif length == 3:
return cast(int, struct.unpack('>L', b'\x00' + s)[0]) return cast(int, struct.unpack(">L", b"\x00" + s)[0])
elif length == 4: elif length == 4:
return cast(int, struct.unpack('>L', s)[0]) return cast(int, struct.unpack(">L", s)[0])
elif length == 8: elif length == 8:
return cast(int, struct.unpack('>Q', s)[0]) return cast(int, struct.unpack(">Q", s)[0])
else: else:
raise TypeError('invalid length: %d' % length) raise TypeError("invalid length: %d" % length)
PDFDocEncoding = ''.join(chr(x) for x in ( PDFDocEncoding = "".join(
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, chr(x)
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, for x in (
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017, 0x0000,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x0001,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0002,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0003,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0004,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0005,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0006,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0007,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0008,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0009,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x000A,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x000B,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x000C,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000, 0x000D,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, 0x000E,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, 0x000F,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x0010,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, 0x0011,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x0012,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x0013,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x0014,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x0015,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x0017,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x0017,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x02D8,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x02C7,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x02C6,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x02D9,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x02DD,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x02DB,
)) 0x02DA,
0x02DC,
0x0020,
0x0021,
0x0022,
0x0023,
0x0024,
0x0025,
0x0026,
0x0027,
0x0028,
0x0029,
0x002A,
0x002B,
0x002C,
0x002D,
0x002E,
0x002F,
0x0030,
0x0031,
0x0032,
0x0033,
0x0034,
0x0035,
0x0036,
0x0037,
0x0038,
0x0039,
0x003A,
0x003B,
0x003C,
0x003D,
0x003E,
0x003F,
0x0040,
0x0041,
0x0042,
0x0043,
0x0044,
0x0045,
0x0046,
0x0047,
0x0048,
0x0049,
0x004A,
0x004B,
0x004C,
0x004D,
0x004E,
0x004F,
0x0050,
0x0051,
0x0052,
0x0053,
0x0054,
0x0055,
0x0056,
0x0057,
0x0058,
0x0059,
0x005A,
0x005B,
0x005C,
0x005D,
0x005E,
0x005F,
0x0060,
0x0061,
0x0062,
0x0063,
0x0064,
0x0065,
0x0066,
0x0067,
0x0068,
0x0069,
0x006A,
0x006B,
0x006C,
0x006D,
0x006E,
0x006F,
0x0070,
0x0071,
0x0072,
0x0073,
0x0074,
0x0075,
0x0076,
0x0077,
0x0078,
0x0079,
0x007A,
0x007B,
0x007C,
0x007D,
0x007E,
0x0000,
0x2022,
0x2020,
0x2021,
0x2026,
0x2014,
0x2013,
0x0192,
0x2044,
0x2039,
0x203A,
0x2212,
0x2030,
0x201E,
0x201C,
0x201D,
0x2018,
0x2019,
0x201A,
0x2122,
0xFB01,
0xFB02,
0x0141,
0x0152,
0x0160,
0x0178,
0x017D,
0x0131,
0x0142,
0x0153,
0x0161,
0x017E,
0x0000,
0x20AC,
0x00A1,
0x00A2,
0x00A3,
0x00A4,
0x00A5,
0x00A6,
0x00A7,
0x00A8,
0x00A9,
0x00AA,
0x00AB,
0x00AC,
0x0000,
0x00AE,
0x00AF,
0x00B0,
0x00B1,
0x00B2,
0x00B3,
0x00B4,
0x00B5,
0x00B6,
0x00B7,
0x00B8,
0x00B9,
0x00BA,
0x00BB,
0x00BC,
0x00BD,
0x00BE,
0x00BF,
0x00C0,
0x00C1,
0x00C2,
0x00C3,
0x00C4,
0x00C5,
0x00C6,
0x00C7,
0x00C8,
0x00C9,
0x00CA,
0x00CB,
0x00CC,
0x00CD,
0x00CE,
0x00CF,
0x00D0,
0x00D1,
0x00D2,
0x00D3,
0x00D4,
0x00D5,
0x00D6,
0x00D7,
0x00D8,
0x00D9,
0x00DA,
0x00DB,
0x00DC,
0x00DD,
0x00DE,
0x00DF,
0x00E0,
0x00E1,
0x00E2,
0x00E3,
0x00E4,
0x00E5,
0x00E6,
0x00E7,
0x00E8,
0x00E9,
0x00EA,
0x00EB,
0x00EC,
0x00ED,
0x00EE,
0x00EF,
0x00F0,
0x00F1,
0x00F2,
0x00F3,
0x00F4,
0x00F5,
0x00F6,
0x00F7,
0x00F8,
0x00F9,
0x00FA,
0x00FB,
0x00FC,
0x00FD,
0x00FE,
0x00FF,
)
)
def decode_text(s: bytes) -> str: def decode_text(s: bytes) -> str:
"""Decodes a PDFDocEncoding string to Unicode.""" """Decodes a PDFDocEncoding string to Unicode."""
if s.startswith(b'\xfe\xff'): if s.startswith(b"\xfe\xff"):
return str(s[2:], 'utf-16be', 'ignore') return str(s[2:], "utf-16be", "ignore")
else: else:
return ''.join(PDFDocEncoding[c] for c in s) return "".join(PDFDocEncoding[c] for c in s)
def enc(x: str) -> str: def enc(x: str) -> str:
"""Encodes a string for SGML/XML/HTML""" """Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes): if isinstance(x, bytes):
return '' return ""
return escape(x) return escape(x)
def bbox2str(bbox: Rect) -> str: def bbox2str(bbox: Rect) -> str:
(x0, y0, x1, y1) = bbox (x0, y0, x1, y1) = bbox
return '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x0, y0, x1, y1) return "{:.3f},{:.3f},{:.3f},{:.3f}".format(x0, y0, x1, y1)
def matrix2str(m: Matrix) -> str: def matrix2str(m: Matrix) -> str:
(a, b, c, d, e, f) = m (a, b, c, d, e, f) = m
return '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'\ return "[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]".format(a, b, c, d, e, f)
.format(a, b, c, d, e, f)
def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point: def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
@ -446,7 +674,7 @@ def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
return max(0, iw), max(0, ih) return max(0, iw), max(0, ih)
LTComponentT = TypeVar('LTComponentT', bound='LTComponent') LTComponentT = TypeVar("LTComponentT", bound="LTComponent")
class Plane(Generic[LTComponentT]): class Plane(Generic[LTComponentT]):
@ -465,7 +693,7 @@ class Plane(Generic[LTComponentT]):
(self.x0, self.y0, self.x1, self.y1) = bbox (self.x0, self.y0, self.x1, self.y1) = bbox
def __repr__(self) -> str: def __repr__(self) -> str:
return '<Plane objs=%r>' % list(self) return "<Plane objs=%r>" % list(self)
def __iter__(self) -> Iterator[LTComponentT]: def __iter__(self) -> Iterator[LTComponentT]:
return (obj for obj in self._seq if obj in self._objs) return (obj for obj in self._seq if obj in self._objs)
@ -524,14 +752,13 @@ class Plane(Generic[LTComponentT]):
if obj in done: if obj in done:
continue continue
done.add(obj) done.add(obj)
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 \ if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
or y1 <= obj.y0:
continue continue
yield obj yield obj
ROMAN_ONES = ['i', 'x', 'c', 'm'] ROMAN_ONES = ["i", "x", "c", "m"]
ROMAN_FIVES = ['v', 'l', 'd'] ROMAN_FIVES = ["v", "l", "d"]
def format_int_roman(value: int) -> str: def format_int_roman(value: int) -> str:
@ -557,7 +784,7 @@ def format_int_roman(value: int) -> str:
result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder) result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder)
index += 1 index += 1
return ''.join(result) return "".join(result)
def format_int_alpha(value: int) -> str: def format_int_alpha(value: int) -> str:
@ -571,4 +798,4 @@ def format_int_alpha(value: int) -> str:
result.append(string.ascii_lowercase[remainder]) result.append(string.ascii_lowercase[remainder])
result.reverse() result.reverse()
return ''.join(result) return "".join(result)

View File

@ -8,52 +8,52 @@ sys.path.append(str(Path(__file__).parent))
import pdfminer as package import pdfminer as package
with open(path.join(path.abspath(path.dirname(__file__)), 'README.md')) as f: with open(path.join(path.abspath(path.dirname(__file__)), "README.md")) as f:
readme = f.read() readme = f.read()
setup( setup(
name='pdfminer.six', name="pdfminer.six",
version=package.__version__, version=package.__version__,
packages=['pdfminer'], packages=["pdfminer"],
package_data={'pdfminer': ['cmap/*.pickle.gz', 'py.typed']}, package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]},
install_requires=[ install_requires=[
'chardet ; python_version > "3.0"', 'chardet ; python_version > "3.0"',
'cryptography', "cryptography",
], ],
extras_require={ extras_require={
"dev": ["pytest", "nox", "mypy == 0.931"], "dev": ["pytest", "nox", "black", "mypy == 0.931"],
"docs": ["sphinx", "sphinx-argparse"], "docs": ["sphinx", "sphinx-argparse"],
}, },
description='PDF parser and analyzer', description="PDF parser and analyzer",
long_description=readme, long_description=readme,
long_description_content_type='text/markdown', long_description_content_type="text/markdown",
license='MIT/X', license="MIT/X",
author='Yusuke Shinyama + Philippe Guglielmetti', author="Yusuke Shinyama + Philippe Guglielmetti",
author_email='pdfminer@goulu.net', author_email="pdfminer@goulu.net",
url='https://github.com/pdfminer/pdfminer.six', url="https://github.com/pdfminer/pdfminer.six",
scripts=[ scripts=[
'tools/pdf2txt.py', "tools/pdf2txt.py",
'tools/dumppdf.py', "tools/dumppdf.py",
], ],
keywords=[ keywords=[
'pdf parser', "pdf parser",
'pdf converter', "pdf converter",
'layout analysis', "layout analysis",
'text mining', "text mining",
], ],
python_requires='>=3.6', python_requires=">=3.6",
classifiers=[ classifiers=[
'Programming Language :: Python', "Programming Language :: Python",
'Programming Language :: Python :: 3.6', "Programming Language :: Python :: 3.6",
'Programming Language :: Python :: 3.7', "Programming Language :: Python :: 3.7",
'Programming Language :: Python :: 3.8', "Programming Language :: Python :: 3.8",
'Programming Language :: Python :: 3.9', "Programming Language :: Python :: 3.9",
'Programming Language :: Python :: 3 :: Only', "Programming Language :: Python :: 3 :: Only",
'Development Status :: 5 - Production/Stable', "Development Status :: 5 - Production/Stable",
'Environment :: Console', "Environment :: Console",
'Intended Audience :: Developers', "Intended Audience :: Developers",
'Intended Audience :: Science/Research', "Intended Audience :: Science/Research",
'License :: OSI Approved :: MIT License', "License :: OSI Approved :: MIT License",
'Topic :: Text Processing', "Topic :: Text Processing",
], ],
) )

View File

@ -2,7 +2,6 @@ import os
def absolute_sample_path(relative_sample_path): def absolute_sample_path(relative_sample_path):
sample_dir = os.path.abspath( sample_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../samples"))
os.path.join(os.path.dirname(__file__), '../samples'))
sample_file = os.path.join(sample_dir, relative_sample_path) sample_file = os.path.join(sample_dir, relative_sample_path)
return sample_file return sample_file

View File

@ -4,7 +4,7 @@ import tempfile
import os import os
class TemporaryFilePath(): class TemporaryFilePath:
"""Context manager class, which generates temporary file name """Context manager class, which generates temporary file name
Coonroraly to standard tempfile.NamedTemporaryFile(), it does not Coonroraly to standard tempfile.NamedTemporaryFile(), it does not
@ -40,9 +40,9 @@ class TemporaryFilePath():
`tempfile.NamedTemporaryFile` will create and delete a file, and `tempfile.NamedTemporaryFile` will create and delete a file, and
this method only returns the filepath of the non-existing file. this method only returns the filepath of the non-existing file.
""" """
with tempfile.NamedTemporaryFile(suffix=self.suffix, with tempfile.NamedTemporaryFile(
prefix=self.prefix, suffix=self.suffix, prefix=self.prefix, dir=self.dir
dir=self.dir) as file: ) as file:
self.temp_file_name = file.name self.temp_file_name = file.name
return self.temp_file_name return self.temp_file_name

View File

@ -9,14 +9,14 @@ from pdfminer.pdfinterp import PDFGraphicState
class TestPaintPath: class TestPaintPath:
def test_paint_path(self): def test_paint_path(self):
path = [('m', 6, 7), ('l', 7, 7)] path = [("m", 6, 7), ("l", 7, 7)]
analyzer = self._get_analyzer() analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 100, 0, 100]) analyzer.cur_item = LTContainer([0, 100, 0, 100])
analyzer.paint_path(PDFGraphicState(), False, False, False, path) analyzer.paint_path(PDFGraphicState(), False, False, False, path)
assert len(analyzer.cur_item._objs) == 1 assert len(analyzer.cur_item._objs) == 1
def test_paint_path_mlllh(self): def test_paint_path_mlllh(self):
path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)] path = [("m", 6, 7), ("l", 7, 7), ("l", 7, 91), ("l", 6, 91), ("h",)]
analyzer = self._get_analyzer() analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 100, 0, 100]) analyzer.cur_item = LTContainer([0, 100, 0, 100])
analyzer.paint_path(PDFGraphicState(), False, False, False, path) analyzer.paint_path(PDFGraphicState(), False, False, False, path)
@ -25,9 +25,21 @@ class TestPaintPath:
def test_paint_path_multiple_mlllh(self): def test_paint_path_multiple_mlllh(self):
"""Path from samples/contrib/issue-00369-excel.pdf""" """Path from samples/contrib/issue-00369-excel.pdf"""
path = [ path = [
('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',), ("m", 6, 7),
('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',), ("l", 7, 7),
('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',) ("l", 7, 91),
("l", 6, 91),
("h",),
("m", 4, 7),
("l", 6, 7),
("l", 6, 91),
("l", 4, 91),
("h",),
("m", 67, 2),
("l", 68, 2),
("l", 68, 3),
("l", 67, 3),
("h",),
] ]
analyzer = self._get_analyzer() analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 100, 0, 100]) analyzer.cur_item = LTContainer([0, 100, 0, 100])
@ -177,34 +189,34 @@ class TestPaintPath:
return analyzer.cur_item._objs return analyzer.cur_item._objs
# "c" operator # "c" operator
assert parse([ assert parse(
("m", 72.41, 433.89), [
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89), ("m", 72.41, 433.89),
])[0].pts == [ ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
]
)[0].pts == [
(72.41, 433.89), (72.41, 433.89),
(71.41, 434.89), (71.41, 434.89),
] ]
# "v" operator # "v" operator
assert parse([ assert parse([("m", 72.41, 433.89), ("v", 71.96, 434.89, 71.41, 434.89)])[
("m", 72.41, 433.89), 0
("v", 71.96, 434.89, 71.41, 434.89), ].pts == [
])[0].pts == [
(72.41, 433.89), (72.41, 433.89),
(71.41, 434.89), (71.41, 434.89),
] ]
# "y" operator # "y" operator
assert parse([ assert parse([("m", 72.41, 433.89), ("y", 72.41, 434.45, 71.41, 434.89)])[
("m", 72.41, 433.89), 0
("y", 72.41, 434.45, 71.41, 434.89), ].pts == [
])[0].pts == [
(72.41, 433.89), (72.41, 433.89),
(71.41, 434.89), (71.41, 434.89),
] ]
class TestBinaryDetector(): class TestBinaryDetector:
def test_stringio(self): def test_stringio(self):
assert not PDFConverter._is_binary_stream(io.StringIO()) assert not PDFConverter._is_binary_stream(io.StringIO())
@ -212,11 +224,11 @@ class TestBinaryDetector():
assert PDFConverter._is_binary_stream(io.BytesIO()) assert PDFConverter._is_binary_stream(io.BytesIO())
def test_tmpfile(self): def test_tmpfile(self):
with TemporaryFile(mode='w') as f: with TemporaryFile(mode="w") as f:
assert not PDFConverter._is_binary_stream(f) assert not PDFConverter._is_binary_stream(f)
def test_binary_tmpfile(self): def test_binary_tmpfile(self):
with TemporaryFile(mode='wb') as f: with TemporaryFile(mode="wb") as f:
assert PDFConverter._is_binary_stream(f) assert PDFConverter._is_binary_stream(f)
def test_non_file_like_object_defaults_to_binary(self): def test_non_file_like_object_defaults_to_binary(self):

View File

@ -13,31 +13,31 @@ from pdfminer.psparser import PSLiteral
def test_name2unicode_name_in_agl(): def test_name2unicode_name_in_agl():
"""The name "Lcommaaccent" has a single component, """The name "Lcommaaccent" has a single component,
which is mapped to the string U+013B by AGL""" which is mapped to the string U+013B by AGL"""
assert '\u013B' == name2unicode('Lcommaaccent') assert "\u013B" == name2unicode("Lcommaaccent")
def test_name2unicode_uni(): def test_name2unicode_uni():
"""The components "Lcommaaccent," "uni013B," and "u013B" """The components "Lcommaaccent," "uni013B," and "u013B"
all map to the string U+013B""" all map to the string U+013B"""
assert '\u013B' == name2unicode('uni013B') assert "\u013B" == name2unicode("uni013B")
def test_name2unicode_uni_lowercase(): def test_name2unicode_uni_lowercase():
"""The components "Lcommaaccent," "uni013B," and "u013B" """The components "Lcommaaccent," "uni013B," and "u013B"
all map to the string U+013B""" all map to the string U+013B"""
assert '\u013B' == name2unicode('uni013b') assert "\u013B" == name2unicode("uni013b")
def test_name2unicode_uni_with_sequence_of_digits(): def test_name2unicode_uni_with_sequence_of_digits():
"""The name "uni20AC0308" has a single component, """The name "uni20AC0308" has a single component,
which is mapped to the string U+20AC U+0308""" which is mapped to the string U+20AC U+0308"""
assert '\u20AC\u0308' == name2unicode('uni20AC0308') assert "\u20AC\u0308" == name2unicode("uni20AC0308")
def test_name2unicode_uni_with_sequence_of_digits_lowercase(): def test_name2unicode_uni_with_sequence_of_digits_lowercase():
"""The name "uni20AC0308" has a single component, """The name "uni20AC0308" has a single component,
which is mapped to the string U+20AC U+0308""" which is mapped to the string U+20AC U+0308"""
assert '\u20AC\u0308' == name2unicode('uni20ac0308') assert "\u20AC\u0308" == name2unicode("uni20ac0308")
def test_name2unicode_uni_empty_string(): def test_name2unicode_uni_empty_string():
@ -46,7 +46,7 @@ def test_name2unicode_uni_empty_string():
According to the specification this should be mapped to an empty string, According to the specification this should be mapped to an empty string,
but we also want to support lowercase hexadecimals""" but we also want to support lowercase hexadecimals"""
assert '\u20ac' == name2unicode('uni20ac') assert "\u20ac" == name2unicode("uni20ac")
def test_name2unicode_uni_empty_string_long(): def test_name2unicode_uni_empty_string_long():
@ -60,7 +60,7 @@ def test_name2unicode_uni_empty_string_long():
glyph name "u1040C. glyph name "u1040C.
""" """
with pytest.raises(KeyError): with pytest.raises(KeyError):
name2unicode('uniD801DC0C') name2unicode("uniD801DC0C")
def test_name2unicode_uni_empty_string_long_lowercase(): def test_name2unicode_uni_empty_string_long_lowercase():
@ -73,57 +73,59 @@ def test_name2unicode_uni_empty_string_long_lowercase():
This character can be correctly mapped by using the This character can be correctly mapped by using the
glyph name "u1040C.""" glyph name "u1040C."""
with pytest.raises(KeyError): with pytest.raises(KeyError):
name2unicode('uniD801DC0C') name2unicode("uniD801DC0C")
def test_name2unicode_uni_pua(): def test_name2unicode_uni_pua():
""""Ogoneksmall" and "uniF6FB" both map to the string that corresponds to """ "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to
U+F6FB.""" U+F6FB."""
assert '\uF6FB' == name2unicode('uniF6FB') assert "\uF6FB" == name2unicode("uniF6FB")
def test_name2unicode_uni_pua_lowercase(): def test_name2unicode_uni_pua_lowercase():
""""Ogoneksmall" and "uniF6FB" both map to the string that corresponds to """ "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to
U+F6FB.""" U+F6FB."""
assert '\uF6FB' == name2unicode('unif6fb') assert "\uF6FB" == name2unicode("unif6fb")
def test_name2unicode_u_with_4_digits(): def test_name2unicode_u_with_4_digits():
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the """The components "Lcommaaccent," "uni013B," and "u013B" all map to the
string U+013B""" string U+013B"""
assert '\u013B' == name2unicode('u013B') assert "\u013B" == name2unicode("u013B")
def test_name2unicode_u_with_4_digits_lowercase(): def test_name2unicode_u_with_4_digits_lowercase():
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the """The components "Lcommaaccent," "uni013B," and "u013B" all map to the
string U+013B""" string U+013B"""
assert '\u013B' == name2unicode('u013b') assert "\u013B" == name2unicode("u013b")
def test_name2unicode_u_with_5_digits(): def test_name2unicode_u_with_5_digits():
"""The name "u1040C" has a single component, which is mapped to the string """The name "u1040C" has a single component, which is mapped to the string
U+1040C""" U+1040C"""
assert '\U0001040C' == name2unicode('u1040C') assert "\U0001040C" == name2unicode("u1040C")
def test_name2unicode_u_with_5_digits_lowercase(): def test_name2unicode_u_with_5_digits_lowercase():
"""The name "u1040C" has a single component, which is mapped to the string """The name "u1040C" has a single component, which is mapped to the string
U+1040C""" U+1040C"""
assert '\U0001040C' == name2unicode('u1040c') assert "\U0001040C" == name2unicode("u1040c")
def test_name2unicode_multiple_components(): def test_name2unicode_multiple_components():
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
string U+013B U+20AC U+0308 U+1040C""" string U+013B U+20AC U+0308 U+1040C"""
assert '\u013B\u20AC\u0308\U0001040C' == \ assert "\u013B\u20AC\u0308\U0001040C" == name2unicode(
name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate') "Lcommaaccent_uni20AC0308_u1040C.alternate"
)
def test_name2unicode_multiple_components_lowercase(): def test_name2unicode_multiple_components_lowercase():
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the """The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
string U+013B U+20AC U+0308 U+1040C""" string U+013B U+20AC U+0308 U+1040C"""
assert '\u013B\u20AC\u0308\U0001040C' == \ assert "\u013B\u20AC\u0308\U0001040C" == name2unicode(
name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate') "Lcommaaccent_uni20ac0308_u1040c.alternate"
)
def test_name2unicode_foo(): def test_name2unicode_foo():
@ -131,26 +133,26 @@ def test_name2unicode_foo():
because 'foo' is not in AGL, because 'foo' is not in AGL,
and because it does not start with a 'u.'""" and because it does not start with a 'u.'"""
with pytest.raises(KeyError): with pytest.raises(KeyError):
name2unicode('foo') name2unicode("foo")
def test_name2unicode_notdef(): def test_name2unicode_notdef():
"""The name ".notdef" is reduced to an empty string (step 1) """The name ".notdef" is reduced to an empty string (step 1)
and mapped to an empty string (step 3)""" and mapped to an empty string (step 3)"""
with pytest.raises(KeyError): with pytest.raises(KeyError):
name2unicode('.notdef') name2unicode(".notdef")
def test_name2unicode_pua_ogoneksmall(): def test_name2unicode_pua_ogoneksmall():
"""" """ "
Ogoneksmall" and "uniF6FB" both map to the string Ogoneksmall" and "uniF6FB" both map to the string
that corresponds to U+F6FB.""" that corresponds to U+F6FB."""
assert '\uF6FB' == name2unicode('Ogoneksmall') assert "\uF6FB" == name2unicode("Ogoneksmall")
def test_name2unicode_overflow_error(): def test_name2unicode_overflow_error():
with pytest.raises(KeyError): with pytest.raises(KeyError):
name2unicode('226215240241240240240240') name2unicode("226215240241240240240240")
def test_get_encoding_with_invalid_differences(): def test_get_encoding_with_invalid_differences():
@ -158,5 +160,5 @@ def test_get_encoding_with_invalid_differences():
Regression test for https://github.com/pdfminer/pdfminer.six/issues/385 Regression test for https://github.com/pdfminer/pdfminer.six/issues/385
""" """
invalid_differences = [PSLiteral('ubuntu'), PSLiteral('1234')] invalid_differences = [PSLiteral("ubuntu"), PSLiteral("1234")]
EncodingDB.get_encoding('StandardEncoding', invalid_differences) EncodingDB.get_encoding("StandardEncoding", invalid_differences)

View File

@ -4,7 +4,7 @@ from pdfminer.layout import LTChar, LTTextBox
def test_font_size(): def test_font_size():
path = absolute_sample_path('font-size-test.pdf') path = absolute_sample_path("font-size-test.pdf")
for page in extract_pages(path): for page in extract_pages(path):
for text_box in page: for text_box in page:
if isinstance(text_box, LTTextBox): if isinstance(text_box, LTTextBox):

View File

@ -22,19 +22,19 @@ def run_with_file(sample_path):
test_strings = { test_strings = {
"simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n" "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
"H e l l o \n\nW o r l d\n\n" "H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f", "H e l l o \n\nW o r l d\n\n\f",
"simple1.pdf_no_boxes_flow": "Hello \n\nWorld\n\nHello \n\nWorld\n\n" "simple1.pdf_no_boxes_flow": "Hello \n\nWorld\n\nHello \n\nWorld\n\n"
"H e l l o \n\nW o r l d\n\n" "H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f", "H e l l o \n\nW o r l d\n\n\f",
"simple2.pdf": "\f", "simple2.pdf": "\f",
"simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n" "simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
"World\n\nWorld\n\n\f", "World\n\nWorld\n\n\f",
"simple4.pdf": "Text1\nText2\nText3\n\n\f", "simple4.pdf": "Text1\nText2\nText3\n\n\f",
"simple5.pdf": "Heading\n\n" "simple5.pdf": "Heading\n\n"
"Link to heading that is working with vim-pandoc.\n\n" "Link to heading that is working with vim-pandoc.\n\n"
"Link to heading “that is” not working with vim-pandoc.\n\n" "Link to heading “that is” not working with vim-pandoc.\n\n"
"Subheading\n\nSome “more text”\n\n1\n\n\f", "Subheading\n\nSome “more text”\n\n1\n\n\f",
"zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt", "zen_of_python_corrupted.pdf": "Mai 30, 18 13:27\n\nzen_of_python.txt",
"contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣", "contrib/issue_566_test_1.pdf": "ISSUE Date2019-4-25 Buyer黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
@ -102,7 +102,7 @@ class TestExtractText(unittest.TestCase):
test_file = "zen_of_python_corrupted.pdf" test_file = "zen_of_python_corrupted.pdf"
s = run_with_file(test_file) s = run_with_file(test_file)
expected = test_strings[test_file] expected = test_strings[test_file]
self.assertEqual(s[:len(expected)], expected) self.assertEqual(s[: len(expected)], expected)
def test_issue_566_cmap_bytes(self): def test_issue_566_cmap_bytes(self):
test_file = "contrib/issue_566_test_1.pdf" test_file = "contrib/issue_566_test_1.pdf"
@ -129,37 +129,43 @@ class TestExtractPages(unittest.TestCase):
def test_line_margin(self): def test_line_margin(self):
# The lines have margin 0.2 relative to the height. # The lines have margin 0.2 relative to the height.
# Extract with line_margin 0.19 should break into 3 separate textboxes. # Extract with line_margin 0.19 should break into 3 separate textboxes.
pages = list(extract_pages( pages = list(
self._get_test_file_path(), laparams=LAParams(line_margin=0.19))) extract_pages(
self._get_test_file_path(), laparams=LAParams(line_margin=0.19)
)
)
self.assertEqual(len(pages), 1) self.assertEqual(len(pages), 1)
page = pages[0] page = pages[0]
elements = [element for element in page elements = [element for element in page if isinstance(element, LTTextContainer)]
if isinstance(element, LTTextContainer)]
self.assertEqual(len(elements), 3) self.assertEqual(len(elements), 3)
self.assertEqual(elements[0].get_text(), "Text1\n") self.assertEqual(elements[0].get_text(), "Text1\n")
self.assertEqual(elements[1].get_text(), "Text2\n") self.assertEqual(elements[1].get_text(), "Text2\n")
self.assertEqual(elements[2].get_text(), "Text3\n") self.assertEqual(elements[2].get_text(), "Text3\n")
# Extract with line_margin 0.21 should merge into one textbox. # Extract with line_margin 0.21 should merge into one textbox.
pages = list(extract_pages( pages = list(
self._get_test_file_path(), laparams=LAParams(line_margin=0.21))) extract_pages(
self._get_test_file_path(), laparams=LAParams(line_margin=0.21)
)
)
self.assertEqual(len(pages), 1) self.assertEqual(len(pages), 1)
page = pages[0] page = pages[0]
elements = [element for element in page elements = [element for element in page if isinstance(element, LTTextContainer)]
if isinstance(element, LTTextContainer)]
self.assertEqual(len(elements), 1) self.assertEqual(len(elements), 1)
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n") self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
def test_no_boxes_flow(self): def test_no_boxes_flow(self):
pages = list(extract_pages( pages = list(
self._get_test_file_path(), laparams=LAParams(boxes_flow=None))) extract_pages(
self._get_test_file_path(), laparams=LAParams(boxes_flow=None)
)
)
self.assertEqual(len(pages), 1) self.assertEqual(len(pages), 1)
page = pages[0] page = pages[0]
elements = [element for element in page elements = [element for element in page if isinstance(element, LTTextContainer)]
if isinstance(element, LTTextContainer)]
self.assertEqual(len(elements), 1) self.assertEqual(len(elements), 1)
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n") self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")

View File

@ -46,8 +46,7 @@ class TestFindNeigbors(unittest.TestCase):
right_aligned_below.set_bbox((15, 2, 20, 4)) right_aligned_below.set_bbox((15, 2, 20, 4))
plane.add(right_aligned_below) plane.add(right_aligned_below)
centrally_aligned_overlapping = LTTextLineHorizontal( centrally_aligned_overlapping = LTTextLineHorizontal(laparams.word_margin)
laparams.word_margin)
centrally_aligned_overlapping.set_bbox((13, 5, 17, 7)) centrally_aligned_overlapping.set_bbox((13, 5, 17, 7))
plane.add(centrally_aligned_overlapping) plane.add(centrally_aligned_overlapping)
@ -86,8 +85,7 @@ class TestFindNeigbors(unittest.TestCase):
top_aligned_left.set_bbox((2, 15, 4, 20)) top_aligned_left.set_bbox((2, 15, 4, 20))
plane.add(top_aligned_left) plane.add(top_aligned_left)
centrally_aligned_overlapping = LTTextLineVertical( centrally_aligned_overlapping = LTTextLineVertical(laparams.word_margin)
laparams.word_margin)
centrally_aligned_overlapping.set_bbox((5, 13, 7, 17)) centrally_aligned_overlapping.set_bbox((5, 13, 7, 17))
plane.add(centrally_aligned_overlapping) plane.add(centrally_aligned_overlapping)

View File

@ -9,9 +9,8 @@ from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
class TestPdfDocument(object): class TestPdfDocument(object):
def test_get_zero_objid_raises_pdfobjectnotfound(self): def test_get_zero_objid_raises_pdfobjectnotfound(self):
with open(absolute_sample_path('simple1.pdf'), 'rb') as in_file: with open(absolute_sample_path("simple1.pdf"), "rb") as in_file:
parser = PDFParser(in_file) parser = PDFParser(in_file)
doc = PDFDocument(parser) doc = PDFDocument(parser)
with pytest.raises(PDFObjectNotFound): with pytest.raises(PDFObjectNotFound):
@ -21,24 +20,29 @@ class TestPdfDocument(object):
# Some documents may be encrypted but not have an /ID key in # Some documents may be encrypted but not have an /ID key in
# their trailer. Tests # their trailer. Tests
# https://github.com/pdfminer/pdfminer.six/issues/594 # https://github.com/pdfminer/pdfminer.six/issues/594
path = absolute_sample_path('encryption/encrypted_doc_no_id.pdf') path = absolute_sample_path("encryption/encrypted_doc_no_id.pdf")
with open(path, 'rb') as fp: with open(path, "rb") as fp:
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
assert doc.info == [{'Producer': b'European Patent Office'}] assert doc.info == [{"Producer": b"European Patent Office"}]
def test_page_labels(self): def test_page_labels(self):
path = absolute_sample_path('contrib/pagelabels.pdf') path = absolute_sample_path("contrib/pagelabels.pdf")
with open(path, 'rb') as fp: with open(path, "rb") as fp:
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
total_pages = int_value(dict_value(doc.catalog['Pages'])['Count']) total_pages = int_value(dict_value(doc.catalog["Pages"])["Count"])
assert list(itertools.islice(doc.get_page_labels(), total_pages)) \ assert list(itertools.islice(doc.get_page_labels(), total_pages)) == [
== ['iii', 'iv', '1', '2', '1'] "iii",
"iv",
"1",
"2",
"1",
]
def test_no_page_labels(self): def test_no_page_labels(self):
path = absolute_sample_path('simple1.pdf') path = absolute_sample_path("simple1.pdf")
with open(path, 'rb') as fp: with open(path, "rb") as fp:
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)

View File

@ -9,96 +9,95 @@ from pdfminer.psparser import PSLiteral
class TestPDFEncoding: class TestPDFEncoding:
def test_cmapname_onebyteidentityV(self): def test_cmapname_onebyteidentityV(self):
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '') stream = PDFStream({"CMapName": PSLiteral("OneByteIdentityV")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMapByte) assert isinstance(font.cmap, IdentityCMapByte)
def test_cmapname_onebyteidentityH(self): def test_cmapname_onebyteidentityH(self):
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '') stream = PDFStream({"CMapName": PSLiteral("OneByteIdentityH")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMapByte) assert isinstance(font.cmap, IdentityCMapByte)
def test_cmapname_V(self): def test_cmapname_V(self):
stream = PDFStream({'CMapName': PSLiteral('V')}, '') stream = PDFStream({"CMapName": PSLiteral("V")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, CMap) assert isinstance(font.cmap, CMap)
def test_cmapname_H(self): def test_cmapname_H(self):
stream = PDFStream({'CMapName': PSLiteral('H')}, '') stream = PDFStream({"CMapName": PSLiteral("H")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, CMap) assert isinstance(font.cmap, CMap)
def test_encoding_identityH(self): def test_encoding_identityH(self):
spec = {'Encoding': PSLiteral('Identity-H')} spec = {"Encoding": PSLiteral("Identity-H")}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityV(self): def test_encoding_identityV(self):
spec = {'Encoding': PSLiteral('Identity-V')} spec = {"Encoding": PSLiteral("Identity-V")}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityH_as_PSLiteral_stream(self): def test_encoding_identityH_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('Identity-H')}, '') stream = PDFStream({"CMapName": PSLiteral("Identity-H")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityV_as_PSLiteral_stream(self): def test_encoding_identityV_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('Identity-V')}, '') stream = PDFStream({"CMapName": PSLiteral("Identity-V")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityH_as_stream(self): def test_encoding_identityH_as_stream(self):
stream = PDFStream({'CMapName': 'Identity-H'}, '') stream = PDFStream({"CMapName": "Identity-H"}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityV_as_stream(self): def test_encoding_identityV_as_stream(self):
stream = PDFStream({'CMapName': 'Identity-V'}, '') stream = PDFStream({"CMapName": "Identity-V"}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentH(self): def test_encoding_DLIdentH(self):
spec = {'Encoding': PSLiteral('DLIdent-H')} spec = {"Encoding": PSLiteral("DLIdent-H")}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentV(self): def test_encoding_DLIdentV(self):
spec = {'Encoding': PSLiteral('DLIdent-V')} spec = {"Encoding": PSLiteral("DLIdent-V")}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentH_as_PSLiteral_stream(self): def test_encoding_DLIdentH_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '') stream = PDFStream({"CMapName": PSLiteral("DLIdent-H")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentV_as_PSLiteral_stream(self): def test_encoding_DLIdentV_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '') stream = PDFStream({"CMapName": PSLiteral("DLIdent-V")}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentH_as_stream(self): def test_encoding_DLIdentH_as_stream(self):
stream = PDFStream({'CMapName': 'DLIdent-H'}, '') stream = PDFStream({"CMapName": "DLIdent-H"}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentV_as_stream(self): def test_encoding_DLIdentV_as_stream(self):
stream = PDFStream({'CMapName': 'DLIdent-V'}, '') stream = PDFStream({"CMapName": "DLIdent-V"}, "")
spec = {'Encoding': stream} spec = {"Encoding": stream}
font = PDFCIDFont(None, spec) font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap) assert isinstance(font.cmap, IdentityCMap)

View File

@ -8,12 +8,12 @@ def test_get_cmap_from_pickle():
Regression test for https://github.com/pdfminer/pdfminer.six/issues/391 Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
""" """
cmap_name = 'UniGB-UCS2-H' cmap_name = "UniGB-UCS2-H"
spec = {'Encoding': PSLiteral(cmap_name)} spec = {"Encoding": PSLiteral(cmap_name)}
resource_manager = PDFResourceManager() resource_manager = PDFResourceManager()
font = PDFCIDFont(resource_manager, spec) font = PDFCIDFont(resource_manager, spec)
cmap = font.get_cmap_from_spec(spec, False) cmap = font.get_cmap_from_spec(spec, False)
assert cmap.attrs.get('CMapName') == cmap_name assert cmap.attrs.get("CMapName") == cmap_name
assert len(cmap.code2cid) > 0 assert len(cmap.code2cid) > 0

View File

@ -1,7 +1,7 @@
from pdfminer.ccitt import CCITTG4Parser, CCITTFaxDecoder from pdfminer.ccitt import CCITTG4Parser, CCITTFaxDecoder
class TestCCITTG4Parser(): class TestCCITTG4Parser:
def get_parser(self, bits): def get_parser(self, bits):
parser = CCITTG4Parser(len(bits)) parser = CCITTG4Parser(len(bits))
parser._curline = [int(c) for c in bits] parser._curline = [int(c) for c in bits]
@ -9,60 +9,60 @@ class TestCCITTG4Parser():
return parser return parser
def test_b1(self): def test_b1(self):
parser = self.get_parser('00000') parser = self.get_parser("00000")
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 0 assert parser._curpos == 0
return return
def test_b2(self): def test_b2(self):
parser = self.get_parser('10000') parser = self.get_parser("10000")
parser._do_vertical(-1) parser._do_vertical(-1)
assert parser._curpos == 0 assert parser._curpos == 0
return return
def test_b3(self): def test_b3(self):
parser = self.get_parser('000111') parser = self.get_parser("000111")
parser._do_pass() parser._do_pass()
assert parser._curpos == 3 assert parser._curpos == 3
assert parser._get_bits() == '111' assert parser._get_bits() == "111"
return return
def test_b4(self): def test_b4(self):
parser = self.get_parser('00000') parser = self.get_parser("00000")
parser._do_vertical(+2) parser._do_vertical(+2)
assert parser._curpos == 2 assert parser._curpos == 2
assert parser._get_bits() == '11' assert parser._get_bits() == "11"
return return
def test_b5(self): def test_b5(self):
parser = self.get_parser('11111111100') parser = self.get_parser("11111111100")
parser._do_horizontal(0, 3) parser._do_horizontal(0, 3)
assert parser._curpos == 3 assert parser._curpos == 3
parser._do_vertical(1) parser._do_vertical(1)
assert parser._curpos == 10 assert parser._curpos == 10
assert parser._get_bits() == '0001111111' assert parser._get_bits() == "0001111111"
return return
def test_e1(self): def test_e1(self):
parser = self.get_parser('10000') parser = self.get_parser("10000")
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 1 assert parser._curpos == 1
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 5 assert parser._curpos == 5
assert parser._get_bits() == '10000' assert parser._get_bits() == "10000"
return return
def test_e2(self): def test_e2(self):
parser = self.get_parser('10011') parser = self.get_parser("10011")
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 1 assert parser._curpos == 1
parser._do_vertical(2) parser._do_vertical(2)
assert parser._curpos == 5 assert parser._curpos == 5
assert parser._get_bits() == '10000' assert parser._get_bits() == "10000"
return return
def test_e3(self): def test_e3(self):
parser = self.get_parser('011111') parser = self.get_parser("011111")
parser._color = 0 parser._color = 0
parser._do_vertical(0) parser._do_vertical(0)
assert parser._color == 1 assert parser._color == 1
@ -72,90 +72,90 @@ class TestCCITTG4Parser():
assert parser._curpos == 4 assert parser._curpos == 4
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 6 assert parser._curpos == 6
assert parser._get_bits() == '011100' assert parser._get_bits() == "011100"
return return
def test_e4(self): def test_e4(self):
parser = self.get_parser('10000') parser = self.get_parser("10000")
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 1 assert parser._curpos == 1
parser._do_vertical(-2) parser._do_vertical(-2)
assert parser._curpos == 3 assert parser._curpos == 3
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 5 assert parser._curpos == 5
assert parser._get_bits() == '10011' assert parser._get_bits() == "10011"
return return
def test_e5(self): def test_e5(self):
parser = self.get_parser('011000') parser = self.get_parser("011000")
parser._color = 0 parser._color = 0
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 1 assert parser._curpos == 1
parser._do_vertical(3) parser._do_vertical(3)
assert parser._curpos == 6 assert parser._curpos == 6
assert parser._get_bits() == '011111' assert parser._get_bits() == "011111"
return return
def test_e6(self): def test_e6(self):
parser = self.get_parser('11001') parser = self.get_parser("11001")
parser._do_pass() parser._do_pass()
assert parser._curpos == 4 assert parser._curpos == 4
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 5 assert parser._curpos == 5
assert parser._get_bits() == '11111' assert parser._get_bits() == "11111"
return return
def test_e7(self): def test_e7(self):
parser = self.get_parser('0000000000') parser = self.get_parser("0000000000")
parser._curpos = 2 parser._curpos = 2
parser._color = 1 parser._color = 1
parser._do_horizontal(2, 6) parser._do_horizontal(2, 6)
assert parser._curpos == 10 assert parser._curpos == 10
assert parser._get_bits() == '1111000000' assert parser._get_bits() == "1111000000"
return return
def test_e8(self): def test_e8(self):
parser = self.get_parser('001100000') parser = self.get_parser("001100000")
parser._curpos = 1 parser._curpos = 1
parser._color = 0 parser._color = 0
parser._do_vertical(0) parser._do_vertical(0)
assert parser._curpos == 2 assert parser._curpos == 2
parser._do_horizontal(7, 0) parser._do_horizontal(7, 0)
assert parser._curpos == 9 assert parser._curpos == 9
assert parser._get_bits() == '101111111' assert parser._get_bits() == "101111111"
return return
def test_m1(self): def test_m1(self):
parser = self.get_parser('10101') parser = self.get_parser("10101")
parser._do_pass() parser._do_pass()
assert parser._curpos == 2 assert parser._curpos == 2
parser._do_pass() parser._do_pass()
assert parser._curpos == 4 assert parser._curpos == 4
assert parser._get_bits() == '1111' assert parser._get_bits() == "1111"
return return
def test_m2(self): def test_m2(self):
parser = self.get_parser('101011') parser = self.get_parser("101011")
parser._do_vertical(-1) parser._do_vertical(-1)
parser._do_vertical(-1) parser._do_vertical(-1)
parser._do_vertical(1) parser._do_vertical(1)
parser._do_horizontal(1, 1) parser._do_horizontal(1, 1)
assert parser._get_bits() == '011101' assert parser._get_bits() == "011101"
return return
def test_m3(self): def test_m3(self):
parser = self.get_parser('10111011') parser = self.get_parser("10111011")
parser._do_vertical(-1) parser._do_vertical(-1)
parser._do_pass() parser._do_pass()
parser._do_vertical(1) parser._do_vertical(1)
parser._do_vertical(1) parser._do_vertical(1)
assert parser._get_bits() == '00000001' assert parser._get_bits() == "00000001"
return return
class TestCCITTFaxDecoder: class TestCCITTFaxDecoder:
def test_b1(self): def test_b1(self):
decoder = CCITTFaxDecoder(5) decoder = CCITTFaxDecoder(5)
decoder.output_line(0, b'0') decoder.output_line(0, b"0")
assert decoder.close() == b'\x80' assert decoder.close() == b"\x80"
return return

View File

@ -18,36 +18,37 @@ def dehex(b):
return binascii.unhexlify(b) return binascii.unhexlify(b)
class TestAscii85(): class TestAscii85:
def test_ascii85decode(self): def test_ascii85decode(self):
"""The sample string is taken from: """The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85""" http://en.wikipedia.org/w/index.php?title=Ascii85"""
assert ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') \ assert ascii85decode(b"9jqo^BlbD-BleB1DJ+*+F(f,q") == b"Man is distinguished"
== b'Man is distinguished' assert ascii85decode(b"E,9)oF*2M7/c~>") == b"pleasure."
assert ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
def test_asciihexdecode(self): def test_asciihexdecode(self):
assert asciihexdecode(b'61 62 2e6364 65') == b'ab.cde' assert asciihexdecode(b"61 62 2e6364 65") == b"ab.cde"
assert asciihexdecode(b'61 62 2e6364 657>') == b'ab.cdep' assert asciihexdecode(b"61 62 2e6364 657>") == b"ab.cdep"
assert asciihexdecode(b'7>') == b'p' assert asciihexdecode(b"7>") == b"p"
class TestArcfour(): class TestArcfour:
def test(self): def test(self):
assert hex(Arcfour(b'Key').process(b'Plaintext')) \ assert hex(Arcfour(b"Key").process(b"Plaintext")) == b"bbf316e8d940af0ad3"
== b'bbf316e8d940af0ad3' assert hex(Arcfour(b"Wiki").process(b"pedia")) == b"1021bf0420"
assert hex(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420' assert (
assert hex(Arcfour(b'Secret').process(b'Attack at dawn')) \ hex(Arcfour(b"Secret").process(b"Attack at dawn"))
== b'45a01f645fc35b383552544b9bf5' == b"45a01f645fc35b383552544b9bf5"
)
class TestLzw(): class TestLzw:
def test_lzwdecode(self): def test_lzwdecode(self):
assert lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') \ assert (
== b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' lzwdecode(b"\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01")
== b"\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42"
)
class TestRunlength(): class TestRunlength:
def test_rldecode(self): def test_rldecode(self):
assert rldecode(b'\x05123456\xfa7\x04abcde\x80junk') \ assert rldecode(b"\x05123456\xfa7\x04abcde\x80junk") == b"1234567777777abcde"
== b'1234567777777abcde'

View File

@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
class TestPSBaseParser: class TestPSBaseParser:
"""Simplistic Test cases""" """Simplistic Test cases"""
TESTDATA = br'''%!PS TESTDATA = rb"""%!PS
begin end begin end
" @ # " @ #
/a/BCD /Some_Name /foo#5f#xbaa /a/BCD /Some_Name /foo#5f#xbaa
@ -26,33 +26,83 @@ baa)
func/a/b{(c)do*}def func/a/b{(c)do*}def
[ 1 (z) ! ] [ 1 (z) ! ]
<< /foo (bar) >> << /foo (bar) >>
''' """
TOKENS = [ TOKENS = [
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (5, KWD(b"begin")),
(19, KWD(b'@')), (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (11, KWD(b"end")),
(30, LIT('Some_Name')), (41, LIT('foo_xbaa')), (54, 0), (56, 1), (16, KWD(b'"')),
(59, -2), (62, 0.5), (65, 1.234), (71, b'abc'), (77, b''), (19, KWD(b"@")),
(80, b'abc ( def ) ghi'), (98, b'def \x00 4ghi'), (21, KWD(b"#")),
(118, b'bach\\slask'), (132, b'foo\nbaa'), (23, LIT("a")),
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (25, LIT("BCD")),
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '), (30, LIT("Some_Name")),
(211, b'\xab\xcd\x00\x124\x05'), (226, KWD(b'func')), (230, LIT('a')), (41, LIT("foo_xbaa")),
(232, LIT('b')), (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (54, 0),
(241, KWD(b'}')), (242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (56, 1),
(250, b'z'), (254, KWD(b'!')), (256, KWD(b']')), (258, KWD(b'<<')), (59, -2),
(261, LIT('foo')), (266, b'bar'), (272, KWD(b'>>')) (62, 0.5),
(65, 1.234),
(71, b"abc"),
(77, b""),
(80, b"abc ( def ) ghi"),
(98, b"def \x00 4ghi"),
(118, b"bach\\slask"),
(132, b"foo\nbaa"),
(143, b"this % is not a comment."),
(170, b"foo\nbaa"),
(180, b"foobaa"),
(191, b""),
(194, b" "),
(199, b"@@ "),
(211, b"\xab\xcd\x00\x124\x05"),
(226, KWD(b"func")),
(230, LIT("a")),
(232, LIT("b")),
(234, KWD(b"{")),
(235, b"c"),
(238, KWD(b"do*")),
(241, KWD(b"}")),
(242, KWD(b"def")),
(246, KWD(b"[")),
(248, 1),
(250, b"z"),
(254, KWD(b"!")),
(256, KWD(b"]")),
(258, KWD(b"<<")),
(261, LIT("foo")),
(266, b"bar"),
(272, KWD(b">>")),
] ]
OBJS = [ OBJS = [
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), (23, LIT("a")),
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5), (25, LIT("BCD")),
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'), (30, LIT("Some_Name")),
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'), (41, LIT("foo_xbaa")),
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (54, 0),
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '), (56, 1),
(211, b'\xab\xcd\x00\x124\x05'), (230, LIT('a')), (232, LIT('b')), (59, -2),
(234, [b'c']), (246, [1, b'z']), (258, {'foo': b'bar'}), (62, 0.5),
(65, 1.234),
(71, b"abc"),
(77, b""),
(80, b"abc ( def ) ghi"),
(98, b"def \x00 4ghi"),
(118, b"bach\\slask"),
(132, b"foo\nbaa"),
(143, b"this % is not a comment."),
(170, b"foo\nbaa"),
(180, b"foobaa"),
(191, b""),
(194, b" "),
(199, b"@@ "),
(211, b"\xab\xcd\x00\x124\x05"),
(230, LIT("a")),
(232, LIT("b")),
(234, [b"c"]),
(246, [1, b"z"]),
(258, {"foo": b"bar"}),
] ]
def get_tokens(self, s): def get_tokens(self, s):

View File

@ -6,10 +6,10 @@ from pdfminer.pdfparser import PDFParser
class TestPdfPage(object): class TestPdfPage(object):
def test_page_labels(self): def test_page_labels(self):
path = absolute_sample_path('contrib/pagelabels.pdf') path = absolute_sample_path("contrib/pagelabels.pdf")
expected_labels = ['iii', 'iv', '1', '2', '1'] expected_labels = ["iii", "iv", "1", "2", "1"]
with open(path, 'rb') as fp: with open(path, "rb") as fp:
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)
for (i, page) in enumerate(PDFPage.create_pages(doc)): for (i, page) in enumerate(PDFPage.create_pages(doc)):

View File

@ -11,48 +11,47 @@ def run(filename, options=None):
absolute_path = absolute_sample_path(filename) absolute_path = absolute_sample_path(filename)
with TemporaryFilePath() as output_file_name: with TemporaryFilePath() as output_file_name:
if options: if options:
s = 'dumppdf -o %s %s %s' % (output_file_name, s = "dumppdf -o %s %s %s" % (output_file_name, options, absolute_path)
options, absolute_path)
else: else:
s = 'dumppdf -o %s %s' % (output_file_name, absolute_path) s = "dumppdf -o %s %s" % (output_file_name, absolute_path)
dumppdf.main(s.split(' ')[1:]) dumppdf.main(s.split(" ")[1:])
class TestDumpPDF(unittest.TestCase): class TestDumpPDF(unittest.TestCase):
def test_simple1(self): def test_simple1(self):
run('simple1.pdf', '-t -a') run("simple1.pdf", "-t -a")
def test_simple2(self): def test_simple2(self):
run('simple2.pdf', '-t -a') run("simple2.pdf", "-t -a")
def test_jo(self): def test_jo(self):
run('jo.pdf', '-t -a') run("jo.pdf", "-t -a")
def test_simple3(self): def test_simple3(self):
run('simple3.pdf', '-t -a') run("simple3.pdf", "-t -a")
def test_2(self): def test_2(self):
run('nonfree/dmca.pdf', '-t -a') run("nonfree/dmca.pdf", "-t -a")
def test_3(self): def test_3(self):
run('nonfree/f1040nr.pdf') run("nonfree/f1040nr.pdf")
def test_4(self): def test_4(self):
run('nonfree/i1040nr.pdf') run("nonfree/i1040nr.pdf")
def test_5(self): def test_5(self):
run('nonfree/kampo.pdf', '-t -a') run("nonfree/kampo.pdf", "-t -a")
def test_6(self): def test_6(self):
run('nonfree/naacl06-shinyama.pdf', '-t -a') run("nonfree/naacl06-shinyama.pdf", "-t -a")
def test_simple1_raw(self): def test_simple1_raw(self):
"""Known issue: crash in dumpxml writing binary to text stream.""" """Known issue: crash in dumpxml writing binary to text stream."""
with pytest.raises(TypeError): with pytest.raises(TypeError):
run('simple1.pdf', '-r -a') run("simple1.pdf", "-r -a")
def test_simple1_binary(self): def test_simple1_binary(self):
"""Known issue: crash in dumpxml writing binary to text stream.""" """Known issue: crash in dumpxml writing binary to text stream."""
with pytest.raises(TypeError): with pytest.raises(TypeError):
run('simple1.pdf', '-b -a') run("simple1.pdf", "-b -a")

View File

@ -12,115 +12,119 @@ def run(sample_path, options=None):
absolute_path = absolute_sample_path(sample_path) absolute_path = absolute_sample_path(sample_path)
with TemporaryFilePath() as output_file_name: with TemporaryFilePath() as output_file_name:
if options: if options:
s = 'pdf2txt -o{} {} {}' \ s = "pdf2txt -o{} {} {}".format(output_file_name, options, absolute_path)
.format(output_file_name, options, absolute_path)
else: else:
s = 'pdf2txt -o{} {}'.format(output_file_name, absolute_path) s = "pdf2txt -o{} {}".format(output_file_name, absolute_path)
pdf2txt.main(s.split(' ')[1:]) pdf2txt.main(s.split(" ")[1:])
class TestPdf2Txt(): class TestPdf2Txt:
def test_jo(self): def test_jo(self):
run('jo.pdf') run("jo.pdf")
def test_simple1(self): def test_simple1(self):
run('simple1.pdf') run("simple1.pdf")
def test_simple2(self): def test_simple2(self):
run('simple2.pdf') run("simple2.pdf")
def test_simple3(self): def test_simple3(self):
run('simple3.pdf') run("simple3.pdf")
def test_sample_one_byte_identity_encode(self): def test_sample_one_byte_identity_encode(self):
run('sampleOneByteIdentityEncode.pdf') run("sampleOneByteIdentityEncode.pdf")
def test_nonfree_175(self): def test_nonfree_175(self):
"""Regression test for: """Regression test for:
https://github.com/pdfminer/pdfminer.six/issues/65 https://github.com/pdfminer/pdfminer.six/issues/65
""" """
run('nonfree/175.pdf') run("nonfree/175.pdf")
def test_nonfree_dmca(self): def test_nonfree_dmca(self):
run('nonfree/dmca.pdf') run("nonfree/dmca.pdf")
def test_nonfree_f1040nr(self): def test_nonfree_f1040nr(self):
run('nonfree/f1040nr.pdf', '-p 1') run("nonfree/f1040nr.pdf", "-p 1")
def test_nonfree_i1040nr(self): def test_nonfree_i1040nr(self):
run('nonfree/i1040nr.pdf', '-p 1') run("nonfree/i1040nr.pdf", "-p 1")
def test_nonfree_kampo(self): def test_nonfree_kampo(self):
run('nonfree/kampo.pdf') run("nonfree/kampo.pdf")
def test_nonfree_naacl06_shinyama(self): def test_nonfree_naacl06_shinyama(self):
run('nonfree/naacl06-shinyama.pdf') run("nonfree/naacl06-shinyama.pdf")
def test_nlp2004slides(self): def test_nlp2004slides(self):
run('nonfree/nlp2004slides.pdf', '-p 1') run("nonfree/nlp2004slides.pdf", "-p 1")
def test_contrib_2b(self): def test_contrib_2b(self):
run('contrib/2b.pdf', '-A -t xml') run("contrib/2b.pdf", "-A -t xml")
def test_contrib_issue_350(self): def test_contrib_issue_350(self):
"""Regression test for """Regression test for
https://github.com/pdfminer/pdfminer.six/issues/350""" https://github.com/pdfminer/pdfminer.six/issues/350"""
run('contrib/issue-00352-asw-oct96-p41.pdf') run("contrib/issue-00352-asw-oct96-p41.pdf")
def test_scancode_patchelf(self): def test_scancode_patchelf(self):
"""Regression test for https://github.com/euske/pdfminer/issues/96""" """Regression test for https://github.com/euske/pdfminer/issues/96"""
run('scancode/patchelf.pdf') run("scancode/patchelf.pdf")
def test_contrib_hash_two_complement(self): def test_contrib_hash_two_complement(self):
"""Check that unsigned integer is added correctly to encryption hash.et """Check that unsigned integer is added correctly to encryption hash.et
See https://github.com/pdfminer/pdfminer.six/issues/186 See https://github.com/pdfminer/pdfminer.six/issues/186
""" """
run('contrib/issue-00352-hash-twos-complement.pdf') run("contrib/issue-00352-hash-twos-complement.pdf")
def test_contrib_excel(self): def test_contrib_excel(self):
"""Regression test for """Regression test for
https://github.com/pdfminer/pdfminer.six/issues/369 https://github.com/pdfminer/pdfminer.six/issues/369
""" """
run('contrib/issue-00369-excel.pdf', '-t html') run("contrib/issue-00369-excel.pdf", "-t html")
def test_encryption_aes128(self): def test_encryption_aes128(self):
run('encryption/aes-128.pdf', '-P foo') run("encryption/aes-128.pdf", "-P foo")
def test_encryption_aes128m(self): def test_encryption_aes128m(self):
run('encryption/aes-128-m.pdf', '-P foo') run("encryption/aes-128-m.pdf", "-P foo")
def test_encryption_aes256(self): def test_encryption_aes256(self):
run('encryption/aes-256.pdf', '-P foo') run("encryption/aes-256.pdf", "-P foo")
def test_encryption_aes256m(self): def test_encryption_aes256m(self):
run('encryption/aes-256-m.pdf', '-P foo') run("encryption/aes-256-m.pdf", "-P foo")
def test_encryption_aes256_r6_user(self): def test_encryption_aes256_r6_user(self):
run('encryption/aes-256-r6.pdf', '-P usersecret') run("encryption/aes-256-r6.pdf", "-P usersecret")
def test_encryption_aes256_r6_owner(self): def test_encryption_aes256_r6_owner(self):
run('encryption/aes-256-r6.pdf', '-P ownersecret') run("encryption/aes-256-r6.pdf", "-P ownersecret")
def test_encryption_base(self): def test_encryption_base(self):
run('encryption/base.pdf', '-P foo') run("encryption/base.pdf", "-P foo")
def test_encryption_rc4_40(self): def test_encryption_rc4_40(self):
run('encryption/rc4-40.pdf', '-P foo') run("encryption/rc4-40.pdf", "-P foo")
def test_encryption_rc4_128(self): def test_encryption_rc4_128(self):
run('encryption/rc4-128.pdf', '-P foo') run("encryption/rc4-128.pdf", "-P foo")
class TestDumpImages: class TestDumpImages:
@staticmethod @staticmethod
def extract_images(input_file, *args): def extract_images(input_file, *args):
output_dir = mkdtemp() output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name: with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir', commands = [
output_dir, input_file, *args] "-o",
output_file_name,
"--output-dir",
output_dir,
input_file,
*args,
]
pdf2txt.main(commands) pdf2txt.main(commands)
image_files = os.listdir(output_dir) image_files = os.listdir(output_dir)
rmtree(output_dir) rmtree(output_dir)
@ -132,39 +136,38 @@ class TestDumpImages:
Regression test for: Regression test for:
https://github.com/pdfminer/pdfminer.six/issues/131 https://github.com/pdfminer/pdfminer.six/issues/131
""" """
filepath = absolute_sample_path('../samples/nonfree/dmca.pdf') filepath = absolute_sample_path("../samples/nonfree/dmca.pdf")
image_files = self.extract_images(filepath, '-p', '1') image_files = self.extract_images(filepath, "-p", "1")
assert image_files[0].endswith('bmp') assert image_files[0].endswith("bmp")
def test_nonfree_175(self): def test_nonfree_175(self):
"""Extract images of pdf containing jpg images""" """Extract images of pdf containing jpg images"""
self.extract_images(absolute_sample_path('../samples/nonfree/175.pdf')) self.extract_images(absolute_sample_path("../samples/nonfree/175.pdf"))
def test_jbig2_image_export(self): def test_jbig2_image_export(self):
"""Extract images of pdf containing jbig2 images """Extract images of pdf containing jbig2 images
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46 Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
""" """
input_file = absolute_sample_path( input_file = absolute_sample_path("../samples/contrib/pdf-with-jbig2.pdf")
'../samples/contrib/pdf-with-jbig2.pdf')
output_dir = mkdtemp() output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name: with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir', commands = ["-o", output_file_name, "--output-dir", output_dir, input_file]
output_dir, input_file]
pdf2txt.main(commands) pdf2txt.main(commands)
image_files = os.listdir(output_dir) image_files = os.listdir(output_dir)
try: try:
assert image_files[0].endswith('.jb2') assert image_files[0].endswith(".jb2")
assert filecmp.cmp(output_dir + '/' + image_files[0], assert filecmp.cmp(
absolute_sample_path( output_dir + "/" + image_files[0],
'../samples/contrib/XIPLAYER0.jb2')) absolute_sample_path("../samples/contrib/XIPLAYER0.jb2"),
)
finally: finally:
rmtree(output_dir) rmtree(output_dir)
def test_contrib_matplotlib(self): def test_contrib_matplotlib(self):
"""Test a pdf with Type3 font""" """Test a pdf with Type3 font"""
run('contrib/matplotlib.pdf') run("contrib/matplotlib.pdf")
def test_nonfree_cmp_itext_logo(self): def test_nonfree_cmp_itext_logo(self):
"""Test a pdf with Type3 font""" """Test a pdf with Type3 font"""
run('nonfree/cmp_itext_logo.pdf') run("nonfree/cmp_itext_logo.pdf")

View File

@ -4,8 +4,13 @@ import pytest
from helpers import absolute_sample_path from helpers import absolute_sample_path
from pdfminer.layout import LTComponent from pdfminer.layout import LTComponent
from pdfminer.utils import open_filename, Plane, shorten_str, \ from pdfminer.utils import (
format_int_roman, format_int_alpha open_filename,
Plane,
shorten_str,
format_int_roman,
format_int_alpha,
)
class TestOpenFilename: class TestOpenFilename:
@ -48,14 +53,12 @@ class TestPlane:
assert result == [obj] assert result == [obj]
def test_find_if_object_is_smaller_than_gridsize(self): def test_find_if_object_is_smaller_than_gridsize(self):
plane, obj = self.given_plane_with_one_object(object_size=1, plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100)
gridsize=100)
result = list(plane.find((0, 0, 100, 100))) result = list(plane.find((0, 0, 100, 100)))
assert result == [obj] assert result == [obj]
def test_find_object_if_much_larger_than_gridsize(self): def test_find_object_if_much_larger_than_gridsize(self):
plane, obj = self.given_plane_with_one_object(object_size=100, plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10)
gridsize=10)
result = list(plane.find((0, 0, 100, 100))) result = list(plane.find((0, 0, 100, 100)))
assert result == [obj] assert result == [obj]
@ -70,43 +73,43 @@ class TestPlane:
class TestFunctions(object): class TestFunctions(object):
def test_shorten_str(self): def test_shorten_str(self):
s = shorten_str('Hello there World', 15) s = shorten_str("Hello there World", 15)
assert s == 'Hello ... World' assert s == "Hello ... World"
def test_shorten_short_str_is_same(self): def test_shorten_short_str_is_same(self):
s = 'Hello World' s = "Hello World"
assert shorten_str(s, 50) == s assert shorten_str(s, 50) == s
def test_shorten_to_really_short(self): def test_shorten_to_really_short(self):
assert shorten_str('Hello World', 5) == 'Hello' assert shorten_str("Hello World", 5) == "Hello"
def test_format_int_alpha(self): def test_format_int_alpha(self):
assert format_int_alpha(1) == 'a' assert format_int_alpha(1) == "a"
assert format_int_alpha(2) == 'b' assert format_int_alpha(2) == "b"
assert format_int_alpha(26) == 'z' assert format_int_alpha(26) == "z"
assert format_int_alpha(27) == 'aa' assert format_int_alpha(27) == "aa"
assert format_int_alpha(28) == 'ab' assert format_int_alpha(28) == "ab"
assert format_int_alpha(26 * 2) == 'az' assert format_int_alpha(26 * 2) == "az"
assert format_int_alpha(26 * 2 + 1) == 'ba' assert format_int_alpha(26 * 2 + 1) == "ba"
assert format_int_alpha(26 * 27) == 'zz' assert format_int_alpha(26 * 27) == "zz"
assert format_int_alpha(26 * 27 + 1) == 'aaa' assert format_int_alpha(26 * 27 + 1) == "aaa"
def test_format_int_roman(self): def test_format_int_roman(self):
assert format_int_roman(1) == 'i' assert format_int_roman(1) == "i"
assert format_int_roman(2) == 'ii' assert format_int_roman(2) == "ii"
assert format_int_roman(3) == 'iii' assert format_int_roman(3) == "iii"
assert format_int_roman(4) == 'iv' assert format_int_roman(4) == "iv"
assert format_int_roman(5) == 'v' assert format_int_roman(5) == "v"
assert format_int_roman(6) == 'vi' assert format_int_roman(6) == "vi"
assert format_int_roman(7) == 'vii' assert format_int_roman(7) == "vii"
assert format_int_roman(8) == 'viii' assert format_int_roman(8) == "viii"
assert format_int_roman(9) == 'ix' assert format_int_roman(9) == "ix"
assert format_int_roman(10) == 'x' assert format_int_roman(10) == "x"
assert format_int_roman(11) == 'xi' assert format_int_roman(11) == "xi"
assert format_int_roman(20) == 'xx' assert format_int_roman(20) == "xx"
assert format_int_roman(40) == 'xl' assert format_int_roman(40) == "xl"
assert format_int_roman(45) == 'xlv' assert format_int_roman(45) == "xlv"
assert format_int_roman(50) == 'l' assert format_int_roman(50) == "l"
assert format_int_roman(90) == 'xc' assert format_int_roman(90) == "xc"
assert format_int_roman(91) == 'xci' assert format_int_roman(91) == "xci"
assert format_int_roman(100) == 'c' assert format_int_roman(100) == "c"

View File

@ -7,39 +7,38 @@ import fileinput
def main(argv): def main(argv):
fonts = {} fonts = {}
for line in fileinput.input(): for line in fileinput.input():
f = line.strip().split(' ') f = line.strip().split(" ")
if not f: if not f:
continue continue
k = f[0] k = f[0]
if k == 'FontName': if k == "FontName":
fontname = f[1] fontname = f[1]
props = {'FontName': fontname, 'Flags': 0} props = {"FontName": fontname, "Flags": 0}
chars = {} chars = {}
fonts[fontname] = (props, chars) fonts[fontname] = (props, chars)
elif k == 'C': elif k == "C":
cid = int(f[1]) cid = int(f[1])
if 0 <= cid and cid <= 255: if 0 <= cid and cid <= 255:
width = int(f[4]) width = int(f[4])
chars[cid] = width chars[cid] = width
elif k in ('CapHeight', 'XHeight', 'ItalicAngle', elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"):
'Ascender', 'Descender'): k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k)
k = {'Ascender': 'Ascent', 'Descender': 'Descent'}.get(k, k)
props[k] = float(f[1]) props[k] = float(f[1])
elif k in ('FontName', 'FamilyName', 'Weight'): elif k in ("FontName", "FamilyName", "Weight"):
k = {'FamilyName': 'FontFamily', 'Weight': 'FontWeight'}.get(k, k) k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k)
props[k] = f[1] props[k] = f[1]
elif k == 'IsFixedPitch': elif k == "IsFixedPitch":
if f[1].lower() == 'true': if f[1].lower() == "true":
props['Flags'] = 64 props["Flags"] = 64
elif k == 'FontBBox': elif k == "FontBBox":
props[k] = tuple(map(float, f[1:5])) props[k] = tuple(map(float, f[1:5]))
print('# -*- python -*-') print("# -*- python -*-")
print('FONT_METRICS = {') print("FONT_METRICS = {")
for (fontname, (props, chars)) in fonts.items(): for (fontname, (props, chars)) in fonts.items():
print(' {!r}: {!r},'.format(fontname, (props, chars))) print(" {!r}: {!r},".format(fontname, (props, chars)))
print('}') print("}")
return 0 return 0
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]

View File

@ -6,7 +6,6 @@ import codecs
class CMapConverter: class CMapConverter:
def __init__(self, enc2codec={}): def __init__(self, enc2codec={}):
self.enc2codec = enc2codec self.enc2codec = enc2codec
self.code2cid = {} # {'cmapname': ...} self.code2cid = {} # {'cmapname': ...}
@ -19,12 +18,12 @@ class CMapConverter:
return self.code2cid.keys() return self.code2cid.keys()
def get_maps(self, enc): def get_maps(self, enc):
if enc.endswith('-H'): if enc.endswith("-H"):
(hmapenc, vmapenc) = (enc, None) (hmapenc, vmapenc) = (enc, None)
elif enc == 'H': elif enc == "H":
(hmapenc, vmapenc) = ('H', 'V') (hmapenc, vmapenc) = ("H", "V")
else: else:
(hmapenc, vmapenc) = (enc+'-H', enc+'-V') (hmapenc, vmapenc) = (enc + "-H", enc + "-V")
if hmapenc in self.code2cid: if hmapenc in self.code2cid:
hmap = self.code2cid[hmapenc] hmap = self.code2cid[hmapenc]
else: else:
@ -43,12 +42,12 @@ class CMapConverter:
def load(self, fp): def load(self, fp):
encs = None encs = None
for line in fp: for line in fp:
(line, _, _) = line.strip().partition('#') (line, _, _) = line.strip().partition("#")
if not line: if not line:
continue continue
values = line.split('\t') values = line.split("\t")
if encs is None: if encs is None:
assert values[0] == 'CID', str(values) assert values[0] == "CID", str(values)
encs = values encs = values
continue continue
@ -68,7 +67,7 @@ class CMapConverter:
def add(unimap, enc, code): def add(unimap, enc, code):
try: try:
codec = self.enc2codec[enc] codec = self.enc2codec[enc]
c = code.decode(codec, 'strict') c = code.decode(codec, "strict")
if len(c) == 1: if len(c) == 1:
if c not in unimap: if c not in unimap:
unimap[c] = 0 unimap[c] = 0
@ -89,20 +88,20 @@ class CMapConverter:
unimap_h = {} unimap_h = {}
unimap_v = {} unimap_v = {}
for (enc, value) in zip(encs, values): for (enc, value) in zip(encs, values):
if enc == 'CID': if enc == "CID":
continue continue
if value == '*': if value == "*":
continue continue
# hcodes, vcodes: encoded bytes for each writing mode. # hcodes, vcodes: encoded bytes for each writing mode.
hcodes = [] hcodes = []
vcodes = [] vcodes = []
for code in value.split(','): for code in value.split(","):
vertical = code.endswith('v') vertical = code.endswith("v")
if vertical: if vertical:
code = code[:-1] code = code[:-1]
try: try:
code = codecs.decode(code, 'hex_codec') code = codecs.decode(code, "hex_codec")
except Exception: except Exception:
code = chr(int(code, 16)) code = chr(int(code, 16))
if vertical: if vertical:
@ -155,17 +154,19 @@ def main(argv):
import os.path import os.path
def usage(): def usage():
print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' print(
% argv[0]) "usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]" % argv[0]
)
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'c:') (opts, args) = getopt.getopt(argv[1:], "c:")
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
enc2codec = {} enc2codec = {}
for (k, v) in opts: for (k, v) in opts:
if k == '-c': if k == "-c":
(enc, _, codec) = v.partition('=') (enc, _, codec) = v.partition("=")
enc2codec[enc] = codec enc2codec[enc] = codec
if not args: if not args:
return usage() return usage()
@ -176,27 +177,27 @@ def main(argv):
converter = CMapConverter(enc2codec) converter = CMapConverter(enc2codec)
for path in args: for path in args:
print('reading: %r...' % path) print("reading: %r..." % path)
fp = open(path) fp = open(path)
converter.load(fp) converter.load(fp)
fp.close() fp.close()
for enc in converter.get_encs(): for enc in converter.get_encs():
fname = '%s.pickle.gz' % enc fname = "%s.pickle.gz" % enc
path = os.path.join(outdir, fname) path = os.path.join(outdir, fname)
print('writing: %r...' % path) print("writing: %r..." % path)
fp = gzip.open(path, 'wb') fp = gzip.open(path, "wb")
converter.dump_cmap(fp, enc) converter.dump_cmap(fp, enc)
fp.close() fp.close()
fname = 'to-unicode-%s.pickle.gz' % regname fname = "to-unicode-%s.pickle.gz" % regname
path = os.path.join(outdir, fname) path = os.path.join(outdir, fname)
print('writing: %r...' % path) print("writing: %r..." % path)
fp = gzip.open(path, 'wb') fp = gzip.open(path, "wb")
converter.dump_unicodemap(fp) converter.dump_unicodemap(fp)
fp.close() fp.close()
return return
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]

View File

@ -8,20 +8,19 @@ def main(argv):
state = 0 state = 0
for line in fileinput.input(): for line in fileinput.input():
line = line.strip() line = line.strip()
if not line or line.startswith('#'): if not line or line.startswith("#"):
if state == 1: if state == 1:
state = 2 state = 2
print('}\n') print("}\n")
print(line) print(line)
continue continue
if state == 0: if state == 0:
print('\nglyphname2unicode = {') print("\nglyphname2unicode = {")
state = 1 state = 1
(name, x) = line.split(';') (name, x) = line.split(";")
codes = x.split(' ') codes = x.split(" ")
print(' {!r}: u\'{}\',' print(" {!r}: u'{}',".format(name, "".join("\\u%s" % code for code in codes)))
.format(name, ''.join('\\u%s' % code for code in codes)))
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]

View File

@ -4,8 +4,7 @@ import logging
import os.path import os.path
import re import re
import sys import sys
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \ from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, Union, cast
Union, cast
from argparse import ArgumentParser from argparse import ArgumentParser
import pdfminer import pdfminer
@ -25,33 +24,33 @@ ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
def escape(s: Union[str, bytes]) -> str: def escape(s: Union[str, bytes]) -> str:
if isinstance(s, bytes): if isinstance(s, bytes):
us = str(s, 'latin-1') us = str(s, "latin-1")
else: else:
us = s us = s
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), us) return ESC_PAT.sub(lambda m: "&#%d;" % ord(m.group(0)), us)
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None: def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
if obj is None: if obj is None:
out.write('<null />') out.write("<null />")
return return
if isinstance(obj, dict): if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj)) out.write('<dict size="%d">\n' % len(obj))
for (k, v) in obj.items(): for (k, v) in obj.items():
out.write('<key>%s</key>\n' % k) out.write("<key>%s</key>\n" % k)
out.write('<value>') out.write("<value>")
dumpxml(out, v) dumpxml(out, v)
out.write('</value>\n') out.write("</value>\n")
out.write('</dict>') out.write("</dict>")
return return
if isinstance(obj, list): if isinstance(obj, list):
out.write('<list size="%d">\n' % len(obj)) out.write('<list size="%d">\n' % len(obj))
for v in obj: for v in obj:
dumpxml(out, v) dumpxml(out, v)
out.write('\n') out.write("\n")
out.write('</list>') out.write("</list>")
return return
if isinstance(obj, (str, bytes)): if isinstance(obj, (str, bytes)):
@ -59,21 +58,20 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
return return
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
if codec == 'raw': if codec == "raw":
# Bug: writing bytes to text I/O. This will raise TypeError. # Bug: writing bytes to text I/O. This will raise TypeError.
out.write(obj.get_rawdata()) # type: ignore [arg-type] out.write(obj.get_rawdata()) # type: ignore [arg-type]
elif codec == 'binary': elif codec == "binary":
# Bug: writing bytes to text I/O. This will raise TypeError. # Bug: writing bytes to text I/O. This will raise TypeError.
out.write(obj.get_data()) # type: ignore [arg-type] out.write(obj.get_data()) # type: ignore [arg-type]
else: else:
out.write('<stream>\n<props>\n') out.write("<stream>\n<props>\n")
dumpxml(out, obj.attrs) dumpxml(out, obj.attrs)
out.write('\n</props>\n') out.write("\n</props>\n")
if codec == 'text': if codec == "text":
data = obj.get_data() data = obj.get_data()
out.write('<data size="%d">%s</data>\n' out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
% (len(data), escape(data))) out.write("</stream>")
out.write('</stream>')
return return
if isinstance(obj, PDFObjRef): if isinstance(obj, PDFObjRef):
@ -82,38 +80,36 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
if isinstance(obj, PSKeyword): if isinstance(obj, PSKeyword):
# Likely bug: obj.name is bytes, not str # Likely bug: obj.name is bytes, not str
out.write('<keyword>%s</keyword>' out.write("<keyword>%s</keyword>" % obj.name) # type: ignore [str-bytes-safe]
% obj.name) # type: ignore [str-bytes-safe]
return return
if isinstance(obj, PSLiteral): if isinstance(obj, PSLiteral):
# Likely bug: obj.name may be bytes, not str # Likely bug: obj.name may be bytes, not str
out.write('<literal>%s</literal>' out.write("<literal>%s</literal>" % obj.name) # type: ignore [str-bytes-safe]
% obj.name) # type: ignore [str-bytes-safe]
return return
if isnumber(obj): if isnumber(obj):
out.write('<number>%s</number>' % obj) out.write("<number>%s</number>" % obj)
return return
raise TypeError(obj) raise TypeError(obj)
def dumptrailers( def dumptrailers(
out: TextIO, out: TextIO, doc: PDFDocument, show_fallback_xref: bool = False
doc: PDFDocument,
show_fallback_xref: bool = False
) -> None: ) -> None:
for xref in doc.xrefs: for xref in doc.xrefs:
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref: if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
out.write('<trailer>\n') out.write("<trailer>\n")
dumpxml(out, xref.get_trailer()) dumpxml(out, xref.get_trailer())
out.write('\n</trailer>\n\n') out.write("\n</trailer>\n\n")
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs) no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
if no_xrefs and not show_fallback_xref: if no_xrefs and not show_fallback_xref:
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \ msg = (
'you want to display the content of a fallback xref that ' \ "This PDF does not have an xref. Use --show-fallback-xref if "
'contains all objects.' "you want to display the content of a fallback xref that "
"contains all objects."
)
logger.warning(msg) logger.warning(msg)
return return
@ -122,10 +118,10 @@ def dumpallobjs(
out: TextIO, out: TextIO,
doc: PDFDocument, doc: PDFDocument,
codec: Optional[str] = None, codec: Optional[str] = None,
show_fallback_xref: bool = False show_fallback_xref: bool = False,
) -> None: ) -> None:
visited = set() visited = set()
out.write('<pdf>') out.write("<pdf>")
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xref.get_objids(): for objid in xref.get_objids():
if objid in visited: if objid in visited:
@ -137,11 +133,11 @@ def dumpallobjs(
continue continue
out.write('<object id="%d">\n' % objid) out.write('<object id="%d">\n' % objid)
dumpxml(out, obj, codec=codec) dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n') out.write("\n</object>\n\n")
except PDFObjectNotFound as e: except PDFObjectNotFound as e:
print('not found: %r' % e) print("not found: %r" % e)
dumptrailers(out, doc, show_fallback_xref) dumptrailers(out, doc, show_fallback_xref)
out.write('</pdf>') out.write("</pdf>")
return return
@ -150,16 +146,18 @@ def dumpoutline(
fname: str, fname: str,
objids: Any, objids: Any,
pagenos: Container[int], pagenos: Container[int],
password: str = '', password: str = "",
dumpall: bool = False, dumpall: bool = False,
codec: Optional[str] = None, codec: Optional[str] = None,
extractdir: Optional[str] = None extractdir: Optional[str] = None,
) -> None: ) -> None:
fp = open(fname, 'rb') fp = open(fname, "rb")
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
pages = {page.pageid: pageno for (pageno, page) pages = {
in enumerate(PDFPage.create_pages(doc), 1)} page.pageid: pageno
for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
}
def resolve_dest(dest: object) -> Any: def resolve_dest(dest: object) -> Any:
if isinstance(dest, (str, bytes)): if isinstance(dest, (str, bytes)):
@ -167,14 +165,14 @@ def dumpoutline(
elif isinstance(dest, PSLiteral): elif isinstance(dest, PSLiteral):
dest = resolve1(doc.get_dest(dest.name)) dest = resolve1(doc.get_dest(dest.name))
if isinstance(dest, dict): if isinstance(dest, dict):
dest = dest['D'] dest = dest["D"]
if isinstance(dest, PDFObjRef): if isinstance(dest, PDFObjRef):
dest = dest.resolve() dest = dest.resolve()
return dest return dest
try: try:
outlines = doc.get_outlines() outlines = doc.get_outlines()
outfp.write('<outlines>\n') outfp.write("<outlines>\n")
for (level, title, dest, a, se) in outlines: for (level, title, dest, a, se) in outlines:
pageno = None pageno = None
if dest: if dest:
@ -183,21 +181,20 @@ def dumpoutline(
elif a: elif a:
action = a action = a
if isinstance(action, dict): if isinstance(action, dict):
subtype = action.get('S') subtype = action.get("S")
if subtype and repr(subtype) == '/\'GoTo\'' and action.get( if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
'D'): dest = resolve_dest(action["D"])
dest = resolve_dest(action['D'])
pageno = pages[dest[0].objid] pageno = pages[dest[0].objid]
s = escape(title) s = escape(title)
outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s)) outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
if dest is not None: if dest is not None:
outfp.write('<dest>') outfp.write("<dest>")
dumpxml(outfp, dest) dumpxml(outfp, dest)
outfp.write('</dest>\n') outfp.write("</dest>\n")
if pageno is not None: if pageno is not None:
outfp.write('<pageno>%r</pageno>\n' % pageno) outfp.write("<pageno>%r</pageno>\n" % pageno)
outfp.write('</outline>\n') outfp.write("</outline>\n")
outfp.write('</outlines>\n') outfp.write("</outlines>\n")
except PDFNoOutlines: except PDFNoOutlines:
pass pass
parser.close() parser.close()
@ -205,43 +202,48 @@ def dumpoutline(
return return
LITERAL_FILESPEC = LIT('Filespec') LITERAL_FILESPEC = LIT("Filespec")
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile') LITERAL_EMBEDDEDFILE = LIT("EmbeddedFile")
def extractembedded(fname: str, password: str, extractdir: str) -> None: def extractembedded(fname: str, password: str, extractdir: str) -> None:
def extract1(objid: int, obj: Dict[str, Any]) -> None: def extract1(objid: int, obj: Dict[str, Any]) -> None:
filename = os.path.basename(obj.get('UF') or filename = os.path.basename(obj.get("UF") or cast(bytes, obj.get("F")).decode())
cast(bytes, obj.get('F')).decode()) fileref = obj["EF"].get("UF") or obj["EF"].get("F")
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
fileobj = doc.getobj(fileref.objid) fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream): if not isinstance(fileobj, PDFStream):
error_msg = 'unable to process PDF: reference for %r is not a ' \ error_msg = (
'PDFStream' % filename "unable to process PDF: reference for %r is not a "
"PDFStream" % filename
)
raise PDFValueError(error_msg) raise PDFValueError(error_msg)
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE: if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
raise PDFValueError( raise PDFValueError(
'unable to process PDF: reference for %r ' "unable to process PDF: reference for %r "
'is not an EmbeddedFile' % (filename)) "is not an EmbeddedFile" % (filename)
path = os.path.join(extractdir, '%.6d-%s' % (objid, filename)) )
path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
if os.path.exists(path): if os.path.exists(path):
raise IOError('file exists: %r' % path) raise IOError("file exists: %r" % path)
print('extracting: %r' % path) print("extracting: %r" % path)
os.makedirs(os.path.dirname(path), exist_ok=True) os.makedirs(os.path.dirname(path), exist_ok=True)
out = open(path, 'wb') out = open(path, "wb")
out.write(fileobj.get_data()) out.write(fileobj.get_data())
out.close() out.close()
return return
with open(fname, 'rb') as fp: with open(fname, "rb") as fp:
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
extracted_objids = set() extracted_objids = set()
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xref.get_objids(): for objid in xref.get_objids():
obj = doc.getobj(objid) obj = doc.getobj(objid)
if objid not in extracted_objids and isinstance(obj, dict) \ if (
and obj.get('Type') is LITERAL_FILESPEC: objid not in extracted_objids
and isinstance(obj, dict)
and obj.get("Type") is LITERAL_FILESPEC
):
extracted_objids.add(objid) extracted_objids.add(objid)
extract1(objid, obj) extract1(objid, obj)
return return
@ -252,13 +254,13 @@ def dumppdf(
fname: str, fname: str,
objids: Iterable[int], objids: Iterable[int],
pagenos: Container[int], pagenos: Container[int],
password: str = '', password: str = "",
dumpall: bool = False, dumpall: bool = False,
codec: Optional[str] = None, codec: Optional[str] = None,
extractdir: Optional[str] = None, extractdir: Optional[str] = None,
show_fallback_xref: bool = False show_fallback_xref: bool = False,
) -> None: ) -> None:
fp = open(fname, 'rb') fp = open(fname, "rb")
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
if objids: if objids:
@ -279,71 +281,125 @@ def dumppdf(
if (not objids) and (not pagenos) and (not dumpall): if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc, show_fallback_xref) dumptrailers(outfp, doc, show_fallback_xref)
fp.close() fp.close()
if codec not in ('raw', 'binary'): if codec not in ("raw", "binary"):
outfp.write('\n') outfp.write("\n")
return return
def create_parser() -> ArgumentParser: def create_parser() -> ArgumentParser:
parser = ArgumentParser(description=__doc__, add_help=True) parser = ArgumentParser(description=__doc__, add_help=True)
parser.add_argument('files', type=str, default=None, nargs='+', parser.add_argument(
help='One or more paths to PDF files.') "files",
type=str,
default=None,
nargs="+",
help="One or more paths to PDF files.",
)
parser.add_argument( parser.add_argument(
"--version", "-v", action="version", "--version",
version="pdfminer.six v{}".format(pdfminer.__version__)) "-v",
action="version",
version="pdfminer.six v{}".format(pdfminer.__version__),
)
parser.add_argument( parser.add_argument(
'--debug', '-d', default=False, action='store_true', "--debug",
help='Use debug logging level.') "-d",
default=False,
action="store_true",
help="Use debug logging level.",
)
procedure_parser = parser.add_mutually_exclusive_group() procedure_parser = parser.add_mutually_exclusive_group()
procedure_parser.add_argument( procedure_parser.add_argument(
'--extract-toc', '-T', default=False, action='store_true', "--extract-toc",
help='Extract structure of outline') "-T",
default=False,
action="store_true",
help="Extract structure of outline",
)
procedure_parser.add_argument( procedure_parser.add_argument(
'--extract-embedded', '-E', type=str, "--extract-embedded", "-E", type=str, help="Extract embedded files"
help='Extract embedded files') )
parse_params = parser.add_argument_group( parse_params = parser.add_argument_group(
'Parser', description='Used during PDF parsing') "Parser", description="Used during PDF parsing"
)
parse_params.add_argument( parse_params.add_argument(
'--page-numbers', type=int, default=None, nargs='+', "--page-numbers",
help='A space-seperated list of page numbers to parse.') type=int,
default=None,
nargs="+",
help="A space-seperated list of page numbers to parse.",
)
parse_params.add_argument( parse_params.add_argument(
'--pagenos', '-p', type=str, "--pagenos",
help='A comma-separated list of page numbers to parse. Included for ' "-p",
'legacy applications, use --page-numbers for more idiomatic ' type=str,
'argument entry.') help="A comma-separated list of page numbers to parse. Included for "
"legacy applications, use --page-numbers for more idiomatic "
"argument entry.",
)
parse_params.add_argument( parse_params.add_argument(
'--objects', '-i', type=str, "--objects",
help='Comma separated list of object numbers to extract') "-i",
type=str,
help="Comma separated list of object numbers to extract",
)
parse_params.add_argument( parse_params.add_argument(
'--all', '-a', default=False, action='store_true', "--all",
help='If the structure of all objects should be extracted') "-a",
default=False,
action="store_true",
help="If the structure of all objects should be extracted",
)
parse_params.add_argument( parse_params.add_argument(
'--show-fallback-xref', action='store_true', "--show-fallback-xref",
help='Additionally show the fallback xref. Use this if the PDF ' action="store_true",
'has zero or only invalid xref\'s. This setting is ignored if ' help="Additionally show the fallback xref. Use this if the PDF "
'--extract-toc or --extract-embedded is used.') "has zero or only invalid xref's. This setting is ignored if "
"--extract-toc or --extract-embedded is used.",
)
parse_params.add_argument( parse_params.add_argument(
'--password', '-P', type=str, default='', "--password",
help='The password to use for decrypting PDF file.') "-P",
type=str,
default="",
help="The password to use for decrypting PDF file.",
)
output_params = parser.add_argument_group( output_params = parser.add_argument_group(
'Output', description='Used during output generation.') "Output", description="Used during output generation."
)
output_params.add_argument( output_params.add_argument(
'--outfile', '-o', type=str, default='-', "--outfile",
"-o",
type=str,
default="-",
help='Path to file where output is written. Or "-" (default) to ' help='Path to file where output is written. Or "-" (default) to '
'write to stdout.') "write to stdout.",
)
codec_parser = output_params.add_mutually_exclusive_group() codec_parser = output_params.add_mutually_exclusive_group()
codec_parser.add_argument( codec_parser.add_argument(
'--raw-stream', '-r', default=False, action='store_true', "--raw-stream",
help='Write stream objects without encoding') "-r",
default=False,
action="store_true",
help="Write stream objects without encoding",
)
codec_parser.add_argument( codec_parser.add_argument(
'--binary-stream', '-b', default=False, action='store_true', "--binary-stream",
help='Write stream objects with binary encoding') "-b",
default=False,
action="store_true",
help="Write stream objects with binary encoding",
)
codec_parser.add_argument( codec_parser.add_argument(
'--text-stream', '-t', default=False, action='store_true', "--text-stream",
help='Write stream objects as plain text') "-t",
default=False,
action="store_true",
help="Write stream objects as plain text",
)
return parser return parser
@ -355,53 +411,63 @@ def main(argv: Optional[List[str]] = None) -> None:
if args.debug: if args.debug:
logging.getLogger().setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG)
if args.outfile == '-': if args.outfile == "-":
outfp = sys.stdout outfp = sys.stdout
else: else:
outfp = open(args.outfile, 'w') outfp = open(args.outfile, "w")
if args.objects: if args.objects:
objids = [int(x) for x in args.objects.split(',')] objids = [int(x) for x in args.objects.split(",")]
else: else:
objids = [] objids = []
if args.page_numbers: if args.page_numbers:
pagenos = {x - 1 for x in args.page_numbers} pagenos = {x - 1 for x in args.page_numbers}
elif args.pagenos: elif args.pagenos:
pagenos = {int(x) - 1 for x in args.pagenos.split(',')} pagenos = {int(x) - 1 for x in args.pagenos.split(",")}
else: else:
pagenos = set() pagenos = set()
password = args.password password = args.password
if args.raw_stream: if args.raw_stream:
codec: Optional[str] = 'raw' codec: Optional[str] = "raw"
elif args.binary_stream: elif args.binary_stream:
codec = 'binary' codec = "binary"
elif args.text_stream: elif args.text_stream:
codec = 'text' codec = "text"
else: else:
codec = None codec = None
for fname in args.files: for fname in args.files:
if args.extract_toc: if args.extract_toc:
dumpoutline( dumpoutline(
outfp, fname, objids, pagenos, password=password, outfp,
dumpall=args.all, codec=codec, extractdir=None fname,
objids,
pagenos,
password=password,
dumpall=args.all,
codec=codec,
extractdir=None,
) )
elif args.extract_embedded: elif args.extract_embedded:
extractembedded( extractembedded(fname, password=password, extractdir=args.extract_embedded)
fname, password=password, extractdir=args.extract_embedded
)
else: else:
dumppdf( dumppdf(
outfp, fname, objids, pagenos, password=password, outfp,
dumpall=args.all, codec=codec, extractdir=None, fname,
show_fallback_xref=args.show_fallback_xref objids,
pagenos,
password=password,
dumpall=args.all,
codec=codec,
extractdir=None,
show_fallback_xref=args.show_fallback_xref,
) )
outfp.close() outfp.close()
if __name__ == '__main__': if __name__ == "__main__":
main() main()

View File

@ -12,10 +12,7 @@ from pdfminer.utils import AnyIO
logging.basicConfig() logging.basicConfig()
OUTPUT_TYPES = ((".htm", "html"), OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
(".html", "html"),
(".xml", "xml"),
(".tag", "tag"))
def float_or_disabled(x: str) -> Optional[float]: def float_or_disabled(x: str) -> Optional[float]:
@ -29,17 +26,17 @@ def float_or_disabled(x: str) -> Optional[float]:
def extract_text( def extract_text(
files: Iterable[str] = [], files: Iterable[str] = [],
outfile: str = '-', outfile: str = "-",
laparams: Optional[LAParams] = None, laparams: Optional[LAParams] = None,
output_type: str = 'text', output_type: str = "text",
codec: str = 'utf-8', codec: str = "utf-8",
strip_control: bool = False, strip_control: bool = False,
maxpages: int = 0, maxpages: int = 0,
page_numbers: Optional[Container[int]] = None, page_numbers: Optional[Container[int]] = None,
password: str = "", password: str = "",
scale: float = 1.0, scale: float = 1.0,
rotation: int = 0, rotation: int = 0,
layoutmode: str = 'normal', layoutmode: str = "normal",
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
debug: bool = False, debug: bool = False,
disable_caching: bool = False, disable_caching: bool = False,
@ -56,7 +53,7 @@ def extract_text(
if outfile == "-": if outfile == "-":
outfp: AnyIO = sys.stdout outfp: AnyIO = sys.stdout
if sys.stdout.encoding is not None: if sys.stdout.encoding is not None:
codec = 'utf-8' codec = "utf-8"
else: else:
outfp = open(outfile, "wb") outfp = open(outfile, "wb")
@ -69,118 +66,211 @@ def extract_text(
def parse_args(args: Optional[List[str]]) -> argparse.Namespace: def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser = argparse.ArgumentParser(description=__doc__, add_help=True)
parser.add_argument( parser.add_argument(
"files", type=str, default=None, nargs="+", "files",
help="One or more paths to PDF files.") type=str,
default=None,
nargs="+",
help="One or more paths to PDF files.",
)
parser.add_argument( parser.add_argument(
"--version", "-v", action="version", "--version",
version="pdfminer.six v{}".format(pdfminer.__version__)) "-v",
action="version",
version="pdfminer.six v{}".format(pdfminer.__version__),
)
parser.add_argument( parser.add_argument(
"--debug", "-d", default=False, action="store_true", "--debug",
help="Use debug logging level.") "-d",
default=False,
action="store_true",
help="Use debug logging level.",
)
parser.add_argument( parser.add_argument(
"--disable-caching", "-C", default=False, action="store_true", "--disable-caching",
help="If caching or resources, such as fonts, should be disabled.") "-C",
default=False,
action="store_true",
help="If caching or resources, such as fonts, should be disabled.",
)
parse_params = parser.add_argument_group( parse_params = parser.add_argument_group(
'Parser', description='Used during PDF parsing') "Parser", description="Used during PDF parsing"
)
parse_params.add_argument( parse_params.add_argument(
"--page-numbers", type=int, default=None, nargs="+", "--page-numbers",
help="A space-seperated list of page numbers to parse.") type=int,
default=None,
nargs="+",
help="A space-seperated list of page numbers to parse.",
)
parse_params.add_argument( parse_params.add_argument(
"--pagenos", "-p", type=str, "--pagenos",
"-p",
type=str,
help="A comma-separated list of page numbers to parse. " help="A comma-separated list of page numbers to parse. "
"Included for legacy applications, use --page-numbers " "Included for legacy applications, use --page-numbers "
"for more idiomatic argument entry.") "for more idiomatic argument entry.",
)
parse_params.add_argument( parse_params.add_argument(
"--maxpages", "-m", type=int, default=0, "--maxpages",
help="The maximum number of pages to parse.") "-m",
type=int,
default=0,
help="The maximum number of pages to parse.",
)
parse_params.add_argument( parse_params.add_argument(
"--password", "-P", type=str, default="", "--password",
help="The password to use for decrypting PDF file.") "-P",
type=str,
default="",
help="The password to use for decrypting PDF file.",
)
parse_params.add_argument( parse_params.add_argument(
"--rotation", "-R", default=0, type=int, "--rotation",
"-R",
default=0,
type=int,
help="The number of degrees to rotate the PDF " help="The number of degrees to rotate the PDF "
"before other types of processing.") "before other types of processing.",
)
la_params = LAParams() # will be used for defaults la_params = LAParams() # will be used for defaults
la_param_group = parser.add_argument_group( la_param_group = parser.add_argument_group(
'Layout analysis', description='Used during layout analysis.') "Layout analysis", description="Used during layout analysis."
)
la_param_group.add_argument( la_param_group.add_argument(
"--no-laparams", "-n", default=False, action="store_true", "--no-laparams",
help="If layout analysis parameters should be ignored.") "-n",
la_param_group.add_argument( default=False,
"--detect-vertical", "-V", default=la_params.detect_vertical,
action="store_true", action="store_true",
help="If vertical text should be considered during layout analysis") help="If layout analysis parameters should be ignored.",
)
la_param_group.add_argument( la_param_group.add_argument(
"--line-overlap", type=float, default=la_params.line_overlap, "--detect-vertical",
help='If two characters have more overlap than this they ' "-V",
'are considered to be on the same line. The overlap is specified ' default=la_params.detect_vertical,
'relative to the minimum height of both characters.') action="store_true",
help="If vertical text should be considered during layout analysis",
)
la_param_group.add_argument( la_param_group.add_argument(
"--char-margin", "-M", type=float, default=la_params.char_margin, "--line-overlap",
type=float,
default=la_params.line_overlap,
help="If two characters have more overlap than this they "
"are considered to be on the same line. The overlap is specified "
"relative to the minimum height of both characters.",
)
la_param_group.add_argument(
"--char-margin",
"-M",
type=float,
default=la_params.char_margin,
help="If two characters are closer together than this margin they " help="If two characters are closer together than this margin they "
"are considered to be part of the same line. The margin is " "are considered to be part of the same line. The margin is "
"specified relative to the width of the character.") "specified relative to the width of the character.",
)
la_param_group.add_argument( la_param_group.add_argument(
"--word-margin", "-W", type=float, default=la_params.word_margin, "--word-margin",
"-W",
type=float,
default=la_params.word_margin,
help="If two characters on the same line are further apart than this " help="If two characters on the same line are further apart than this "
"margin then they are considered to be two separate words, and " "margin then they are considered to be two separate words, and "
"an intermediate space will be added for readability. The margin " "an intermediate space will be added for readability. The margin "
"is specified relative to the width of the character.") "is specified relative to the width of the character.",
)
la_param_group.add_argument( la_param_group.add_argument(
"--line-margin", "-L", type=float, default=la_params.line_margin, "--line-margin",
"-L",
type=float,
default=la_params.line_margin,
help="If two lines are close together they are considered to " help="If two lines are close together they are considered to "
"be part of the same paragraph. The margin is specified " "be part of the same paragraph. The margin is specified "
"relative to the height of a line.") "relative to the height of a line.",
)
la_param_group.add_argument( la_param_group.add_argument(
"--boxes-flow", "-F", type=float_or_disabled, "--boxes-flow",
"-F",
type=float_or_disabled,
default=la_params.boxes_flow, default=la_params.boxes_flow,
help="Specifies how much a horizontal and vertical position of a " help="Specifies how much a horizontal and vertical position of a "
"text matters when determining the order of lines. The value " "text matters when determining the order of lines. The value "
"should be within the range of -1.0 (only horizontal position " "should be within the range of -1.0 (only horizontal position "
"matters) to +1.0 (only vertical position matters). You can also " "matters) to +1.0 (only vertical position matters). You can also "
"pass `disabled` to disable advanced layout analysis, and " "pass `disabled` to disable advanced layout analysis, and "
"instead return text based on the position of the bottom left " "instead return text based on the position of the bottom left "
"corner of the text box.") "corner of the text box.",
)
la_param_group.add_argument( la_param_group.add_argument(
"--all-texts", "-A", default=la_params.all_texts, action="store_true", "--all-texts",
help="If layout analysis should be performed on text in figures.") "-A",
default=la_params.all_texts,
action="store_true",
help="If layout analysis should be performed on text in figures.",
)
output_params = parser.add_argument_group( output_params = parser.add_argument_group(
'Output', description='Used during output generation.') "Output", description="Used during output generation."
)
output_params.add_argument( output_params.add_argument(
"--outfile", "-o", type=str, default="-", "--outfile",
"-o",
type=str,
default="-",
help="Path to file where output is written. " help="Path to file where output is written. "
"Or \"-\" (default) to write to stdout.") 'Or "-" (default) to write to stdout.',
)
output_params.add_argument( output_params.add_argument(
"--output_type", "-t", type=str, default="text", "--output_type",
help="Type of output to generate {text,html,xml,tag}.") "-t",
type=str,
default="text",
help="Type of output to generate {text,html,xml,tag}.",
)
output_params.add_argument( output_params.add_argument(
"--codec", "-c", type=str, default="utf-8", "--codec",
help="Text encoding to use in output file.") "-c",
type=str,
default="utf-8",
help="Text encoding to use in output file.",
)
output_params.add_argument( output_params.add_argument(
"--output-dir", "-O", default=None, "--output-dir",
"-O",
default=None,
help="The output directory to put extracted images in. If not given, " help="The output directory to put extracted images in. If not given, "
"images are not extracted.") "images are not extracted.",
)
output_params.add_argument( output_params.add_argument(
"--layoutmode", "-Y", default="normal", "--layoutmode",
type=str, help="Type of layout to use when generating html " "-Y",
"{normal,exact,loose}. If normal,each line is" default="normal",
" positioned separately in the html. If exact" type=str,
", each character is positioned separately in" help="Type of layout to use when generating html "
" the html. If loose, same result as normal " "{normal,exact,loose}. If normal,each line is"
"but with an additional newline after each " " positioned separately in the html. If exact"
"text line. Only used when output_type is html.") ", each character is positioned separately in"
" the html. If loose, same result as normal "
"but with an additional newline after each "
"text line. Only used when output_type is html.",
)
output_params.add_argument( output_params.add_argument(
"--scale", "-s", type=float, default=1.0, "--scale",
"-s",
type=float,
default=1.0,
help="The amount of zoom to use when generating html file. " help="The amount of zoom to use when generating html file. "
"Only used when output_type is html.") "Only used when output_type is html.",
)
output_params.add_argument( output_params.add_argument(
"--strip-control", "-S", default=False, action="store_true", "--strip-control",
"-S",
default=False,
action="store_true",
help="Remove control statement from text. " help="Remove control statement from text. "
"Only used when output_type is xml.") "Only used when output_type is xml.",
)
parsed_args = parser.parse_args(args=args) parsed_args = parser.parse_args(args=args)
@ -199,13 +289,10 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
) )
if parsed_args.page_numbers: if parsed_args.page_numbers:
parsed_args.page_numbers = {x-1 for x in parsed_args.page_numbers} parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers}
if parsed_args.pagenos: if parsed_args.pagenos:
parsed_args.page_numbers = { parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
int(x) - 1
for x in parsed_args.pagenos.split(",")
}
if parsed_args.output_type == "text" and parsed_args.outfile != "-": if parsed_args.output_type == "text" and parsed_args.outfile != "-":
for override, alttype in OUTPUT_TYPES: for override, alttype in OUTPUT_TYPES:
@ -222,5 +309,5 @@ def main(args: Optional[List[str]] = None) -> int:
return 0 return 0
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(main()) sys.exit(main())

View File

@ -21,14 +21,20 @@ def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
# If any LAParams group arguments were passed, # If any LAParams group arguments were passed,
# create an LAParams object and # create an LAParams object and
# populate with given args. Otherwise, set it to None. # populate with given args. Otherwise, set it to None.
if kwargs.get('laparams', None) is None: if kwargs.get("laparams", None) is None:
laparams = layout.LAParams() laparams = layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", for param in (
"char_margin", "line_margin", "boxes_flow"): "all_texts",
"detect_vertical",
"word_margin",
"char_margin",
"line_margin",
"boxes_flow",
):
paramv = kwargs.get(param, None) paramv = kwargs.get(param, None)
if paramv is not None: if paramv is not None:
setattr(laparams, param, paramv) setattr(laparams, param, paramv)
kwargs['laparams'] = laparams kwargs["laparams"] = laparams
s1 = io.StringIO() s1 = io.StringIO()
with open(file1, "rb") as fp: with open(file1, "rb") as fp:
@ -39,81 +45,140 @@ def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
high_level.extract_text_to_fp(fp, s2, **kwargs) high_level.extract_text_to_fp(fp, s2, **kwargs)
import difflib import difflib
s1.seek(0) s1.seek(0)
s2.seek(0) s2.seek(0)
s1_lines, s2_lines = s1.readlines(), s2.readlines() s1_lines, s2_lines = s1.readlines(), s2.readlines()
import os.path import os.path
try: try:
extension = os.path.splitext(kwargs['outfile'])[1][1:4] extension = os.path.splitext(kwargs["outfile"])[1][1:4]
if extension.lower() == 'htm': if extension.lower() == "htm":
return difflib.HtmlDiff().make_file(s1_lines, s2_lines) return difflib.HtmlDiff().make_file(s1_lines, s2_lines)
except KeyError: except KeyError:
pass pass
return difflib.unified_diff(s1_lines, s2_lines, n=kwargs['context_lines']) return difflib.unified_diff(s1_lines, s2_lines, n=kwargs["context_lines"])
# main # main
def main(args: Optional[List[str]] = None) -> int: def main(args: Optional[List[str]] = None) -> int:
import argparse import argparse
P = argparse.ArgumentParser(description=__doc__) P = argparse.ArgumentParser(description=__doc__)
P.add_argument("file1", type=str, default=None, help="File 1 to compare.") P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
P.add_argument("file2", type=str, default=None, help="File 2 to compare.") P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
P.add_argument("-o", "--outfile", type=str, default="-", P.add_argument(
help="Output file(default/'-' is stdout) if .htm or .html," "-o",
" create an HTML table (or a complete HTML file " "--outfile",
"containing the table) showing a side by side, " type=str,
"line by line comparison of text with inter-line and " default="-",
"intra-line change highlights. The table can be " help="Output file(default/'-' is stdout) if .htm or .html,"
"generated in either full or " " create an HTML table (or a complete HTML file "
"contextual difference mode.") "containing the table) showing a side by side, "
P.add_argument("-N", "--context-lines", default=3, type=int, "line by line comparison of text with inter-line and "
help="context lines shown") "intra-line change highlights. The table can be "
P.add_argument("-d", "--debug", default=False, action="store_true", "generated in either full or "
help="Debug output.") "contextual difference mode.",
)
P.add_argument(
"-N", "--context-lines", default=3, type=int, help="context lines shown"
)
P.add_argument(
"-d", "--debug", default=False, action="store_true", help="Debug output."
)
# params for pdf2txt # params for pdf2txt
P.add_argument("-p", "--pagenos", type=str, P.add_argument(
help="Comma-separated list of page numbers to parse. " "-p",
"Included for legacy applications, " "--pagenos",
"use --page-numbers for more " type=str,
"idiomatic argument entry.") help="Comma-separated list of page numbers to parse. "
P.add_argument("--page-numbers", type=int, default=None, nargs="+", "Included for legacy applications, "
help="Alternative to --pagenos with space-separated " "use --page-numbers for more "
"numbers; supercedes --pagenos where it is used.") "idiomatic argument entry.",
P.add_argument("-m", "--maxpages", type=int, default=0, )
help="Maximum pages to parse") P.add_argument(
P.add_argument("-P", "--password", type=str, default="", "--page-numbers",
help="Decryption password for both PDFs") type=int,
P.add_argument("-t", "--output_type", type=str, default="text", default=None,
help="pdf2txt type: text|html|xml|tag (default is text)") nargs="+",
P.add_argument("-c", "--codec", type=str, default="utf-8", help="Alternative to --pagenos with space-separated "
help="Text encoding") "numbers; supercedes --pagenos where it is used.",
)
P.add_argument(
"-m", "--maxpages", type=int, default=0, help="Maximum pages to parse"
)
P.add_argument(
"-P",
"--password",
type=str,
default="",
help="Decryption password for both PDFs",
)
P.add_argument(
"-t",
"--output_type",
type=str,
default="text",
help="pdf2txt type: text|html|xml|tag (default is text)",
)
P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale") P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", P.add_argument(
help="LAParams all texts") "-A",
P.add_argument("-V", "--detect-vertical", default=None, "--all-texts",
action="store_true", help="LAParams detect vertical") default=None,
P.add_argument("-W", "--word-margin", type=float, default=None, action="store_true",
help="LAParams word margin") help="LAParams all texts",
P.add_argument("-M", "--char-margin", type=float, default=None, )
help="LAParams char margin") P.add_argument(
P.add_argument("-L", "--line-margin", type=float, default=None, "-V",
help="LAParams line margin") "--detect-vertical",
P.add_argument("-F", "--boxes-flow", type=float, default=None, default=None,
help="LAParams boxes flow") action="store_true",
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="LAParams detect vertical",
help="HTML Layout Mode") )
P.add_argument("-n", "--no-laparams", default=False, P.add_argument(
action="store_true", help="Pass None as LAParams") "-W", "--word-margin", type=float, default=None, help="LAParams word margin"
P.add_argument("-R", "--rotation", default=0, type=int, )
help="Rotation") P.add_argument(
P.add_argument("-O", "--output-dir", default=None, "-M", "--char-margin", type=float, default=None, help="LAParams char margin"
help="Output directory for images") )
P.add_argument("-C", "--disable-caching", default=False, P.add_argument(
action="store_true", help="Disable caching") "-L", "--line-margin", type=float, default=None, help="LAParams line margin"
P.add_argument("-S", "--strip-control", default=False, )
action="store_true", help="Strip control in XML mode") P.add_argument(
"-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow"
)
P.add_argument(
"-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode"
)
P.add_argument(
"-n",
"--no-laparams",
default=False,
action="store_true",
help="Pass None as LAParams",
)
P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
P.add_argument(
"-O", "--output-dir", default=None, help="Output directory for images"
)
P.add_argument(
"-C",
"--disable-caching",
default=False,
action="store_true",
help="Disable caching",
)
P.add_argument(
"-S",
"--strip-control",
default=False,
action="store_true",
help="Strip control in XML mode",
)
A = P.parse_args(args=args) A = P.parse_args(args=args)
@ -121,26 +186,28 @@ def main(args: Optional[List[str]] = None) -> int:
logging.getLogger().setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG)
if A.page_numbers: if A.page_numbers:
A.page_numbers = {x-1 for x in A.page_numbers} A.page_numbers = {x - 1 for x in A.page_numbers}
if A.pagenos: if A.pagenos:
A.page_numbers = {int(x)-1 for x in A.pagenos.split(",")} A.page_numbers = {int(x) - 1 for x in A.pagenos.split(",")}
if A.output_type == "text" and A.outfile != "-": if A.output_type == "text" and A.outfile != "-":
for override, alttype in ((".htm", "html"), for override, alttype in (
(".html", "html"), (".htm", "html"),
(".xml", "xml"), (".html", "html"),
(".tag", "tag")): (".xml", "xml"),
(".tag", "tag"),
):
if A.outfile.endswith(override): if A.outfile.endswith(override):
A.output_type = alttype A.output_type = alttype
if A.outfile == "-": if A.outfile == "-":
outfp = sys.stdout outfp = sys.stdout
else: else:
outfp = open(A.outfile, "w", encoding='utf-8') outfp = open(A.outfile, "w", encoding="utf-8")
outfp.writelines(compare(**vars(A))) outfp.writelines(compare(**vars(A)))
outfp.close() outfp.close()
return 0 return 0
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(main()) sys.exit(main())

View File

@ -21,7 +21,7 @@ _, SCRIPT = os.path.split(__file__)
def msg(*args: object, **kwargs: Any) -> None: def msg(*args: object, **kwargs: Any) -> None:
print(' '.join(map(str, args)), **kwargs) # noqa E999 print(" ".join(map(str, args)), **kwargs) # noqa E999
def flat_iter(obj: object) -> Iterator[object]: def flat_iter(obj: object) -> Iterator[object]:
@ -35,22 +35,22 @@ def main(args: List[str]) -> int:
msg(SCRIPT, args) msg(SCRIPT, args)
if len(args) != 1: if len(args) != 1:
msg('Parse a PDF file and print some pdfminer-specific stats') msg("Parse a PDF file and print some pdfminer-specific stats")
msg('Usage:', SCRIPT, '<PDF-filename>') msg("Usage:", SCRIPT, "<PDF-filename>")
return 1 return 1
infilename, = args (infilename,) = args
lt_types: Counter[str] = collections.Counter() lt_types: Counter[str] = collections.Counter()
with open(infilename, 'rb') as pdf_file: with open(infilename, "rb") as pdf_file:
# Create a PDF parser object associated with the file object. # Create a PDF parser object associated with the file object.
parser = PDFParser(pdf_file) parser = PDFParser(pdf_file)
# Create a PDF document object that stores the document structure. # Create a PDF document object that stores the document structure.
# Supply the password for initialization. # Supply the password for initialization.
password = '' password = ""
document = PDFDocument(parser, password) document = PDFDocument(parser, password)
# Check if the document allows text extraction. # Check if the document allows text extraction.
if not document.is_extractable: if not document.is_extractable:
@ -63,7 +63,7 @@ def main(args: List[str]) -> int:
laparams = LAParams( laparams = LAParams(
detect_vertical=True, detect_vertical=True,
all_texts=True, all_texts=True,
) )
device = PDFPageAggregator(rsrcmgr, laparams=laparams) device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
@ -75,11 +75,11 @@ def main(args: List[str]) -> int:
lt_types.update(type(item).__name__ for item in flat_iter(layout)) lt_types.update(type(item).__name__ for item in flat_iter(layout))
msg('page_count', page_count) msg("page_count", page_count)
msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items())) msg("lt_types:", " ".join("{}:{}".format(*tc) for tc in lt_types.items()))
return 0 return 0
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(main(sys.argv[1:])) sys.exit(main(sys.argv[1:]))

View File

@ -7,15 +7,16 @@ def prof_main(argv: List[str]) -> int:
import hotshot.stats # type: ignore[import] import hotshot.stats # type: ignore[import]
def usage() -> int: def usage() -> int:
print('usage: %s module.function [args ...]' % argv[0]) print("usage: %s module.function [args ...]" % argv[0])
return 100 return 100
args = argv[1:] args = argv[1:]
if len(args) < 1: if len(args) < 1:
return usage() return usage()
name = args.pop(0) name = args.pop(0)
prof = name+'.prof' prof = name + ".prof"
i = name.rindex('.') i = name.rindex(".")
(modname, funcname) = (name[:i], name[i+1:]) (modname, funcname) = (name[:i], name[i + 1 :])
# Type error: fromlist expects sequence of strings; presumably the intent # Type error: fromlist expects sequence of strings; presumably the intent
# is to retrieve the named module rather than a top-level package (as in # is to retrieve the named module rather than a top-level package (as in
@ -31,10 +32,10 @@ def prof_main(argv: List[str]) -> int:
else: else:
stats = hotshot.stats.load(prof) stats = hotshot.stats.load(prof)
stats.strip_dirs() stats.strip_dirs()
stats.sort_stats('time', 'calls') stats.sort_stats("time", "calls")
stats.print_stats(1000) stats.print_stats(1000)
return 0 return 0
if __name__ == '__main__': if __name__ == "__main__":
sys.exit(prof_main(sys.argv)) sys.exit(prof_main(sys.argv))