Check blackness in github actions (#711)

* Check blackness in github actions

* Blacken code

* Update github action names

* Add contributing guidelines on using black

* Add to checklist for PR
pull/688/head^2
Pieter Marsman 2022-02-11 22:46:51 +01:00 committed by GitHub
parent 830acff94c
commit b9a8920cdf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
60 changed files with 12836 additions and 7435 deletions

5
.flake8 Normal file
View File

@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,

View File

@ -1,22 +1,17 @@
**Pull request**
Thanks for improving pdfminer.six! Please include the following information to
help us discuss and merge this PR:
- A description of why this PR is needed. What does it fix? What does it
improve?
- A summary of the things that this PR changes.
- Reference the issues that this PR fixes (use the fixes #(issue nr) syntax).
If this PR does not fix any issue, create the issue first and mention that
you are willing to work on it.
Please remove this paragraph and replace it with a description of your PR.
Also include links to the issues that it fixes.
**How Has This Been Tested?**
Please describe the tests that you ran to verify your changes. Provide
instructions so we can reproduce. Include an example pdf if you have one.
Please replace this paragraph with a description of how this PR has been
tested. Include the necessary instructions and files such that others can
reproduce it.
**Checklist**
- [ ] I have formatted my code with [black](https://github.com/psf/black).
- [ ] I have added tests that prove my fix is effective or that my feature
works
- [ ] I have added docstrings to newly created methods and classes

View File

@ -15,6 +15,15 @@ env:
jobs:
check-code-formatting:
name: Check code formatting
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Check code formatting
uses: psf/black@stable
check-coding-style:
name: Check coding style
runs-on: ubuntu-latest

View File

@ -31,7 +31,7 @@ Any contribution is appreciated! You might want to:
* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
of features, this will show that your code works correctly.
* Code should work for Python 3.6+.
* Code should conform to PEP8 coding style.
* Code should be formatted with [black](https://github.com/psf/black).
* New features should be well documented using docstrings.
* Check spelling and grammar.
* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased])
@ -68,3 +68,9 @@ Any contribution is appreciated! You might want to:
```sh
nox -e py36
```
4. After changing the code, run the black formatter.
```sh
black .
```

View File

@ -16,14 +16,13 @@ from typing import List
import pdfminer
sys.path.insert(0, os.path.join(
os.path.abspath(os.path.dirname(__file__)), '../../'))
sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../"))
# -- Project information -----------------------------------------------------
project = 'pdfminer.six'
copyright = '2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
author = 'Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman'
project = "pdfminer.six"
copyright = "2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
author = "Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
# The full version, including alpha/beta/rc tags
release = pdfminer.__version__
@ -35,16 +34,16 @@ release = pdfminer.__version__
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinxarg.ext',
'sphinx.ext.autodoc',
'sphinx.ext.doctest',
"sphinxarg.ext",
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
]
# Root rst file
master_doc = 'index'
master_doc = "index"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@ -57,9 +56,9 @@ exclude_patterns: List[str] = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = "alabaster"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ["_static"]

View File

@ -6,53 +6,30 @@ PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"]
@nox.session
def lint(session):
session.install('flake8')
session.run(
'flake8',
'pdfminer/',
'tools/',
'tests/',
'--count',
'--statistics'
)
session.install("flake8")
session.run("flake8", "pdfminer/", "tools/", "tests/", "--count", "--statistics")
@nox.session
def types(session):
session.install('mypy')
session.install("mypy")
session.run(
'mypy',
'--install-types',
'--non-interactive',
'--show-error-codes',
'.'
"mypy", "--install-types", "--non-interactive", "--show-error-codes", "."
)
@nox.session(python=PYTHON_ALL_VERSIONS)
def tests(session):
session.install("-e", ".[dev]")
session.run('pytest')
session.run("pytest")
@nox.session
def docs(session):
session.install("-e", ".[docs]")
session.run(
'python',
'-m',
'sphinx',
'-b',
'html',
'docs/source',
'docs/build/html'
"python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"
)
session.run(
'python',
'-m',
'sphinx',
'-b',
'doctest',
'docs/source',
'docs/build/doctest'
"python", "-m", "sphinx", "-b", "doctest", "docs/source", "docs/build/doctest"
)

View File

@ -1,4 +1,4 @@
__version__ = '20211012'
__version__ = "20211012"
if __name__ == '__main__':
if __name__ == "__main__":
print(__version__)

View File

@ -18,7 +18,7 @@
"""An implementation of RFC4013 SASLprep."""
__all__ = ['saslprep']
__all__ = ["saslprep"]
import stringprep
from typing import Callable, Tuple
@ -37,7 +37,8 @@ _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
stringprep.in_table_c6,
stringprep.in_table_c7,
stringprep.in_table_c8,
stringprep.in_table_c9)
stringprep.in_table_c9,
)
def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
@ -63,12 +64,12 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
in_table_c12 = stringprep.in_table_c12
in_table_b1 = stringprep.in_table_b1
data = "".join(
["\u0020" if in_table_c12(elt) else elt
for elt in data if not in_table_b1(elt)])
["\u0020" if in_table_c12(elt) else elt for elt in data if not in_table_b1(elt)]
)
# RFC3454 section 2, step 2 - Normalize
# RFC4013 section 2.2 normalization
data = unicodedata.ucd_3_2_0.normalize('NFKC', data)
data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
in_table_d1 = stringprep.in_table_d1
if in_table_d1(data[0]):
@ -89,7 +90,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
# RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
for char in data:
if any(in_table(char) for in_table in prohibited):
raise ValueError(
"SASLprep: failed prohibited character check")
raise ValueError("SASLprep: failed prohibited character check")
return data

View File

@ -9,7 +9,6 @@ from typing import Sequence
class Arcfour:
def __init__(self, key: Sequence[int]) -> None:
# because Py3 range is not indexable
s = [i for i in range(256)]
@ -24,7 +23,7 @@ class Arcfour:
def process(self, data: bytes) -> bytes:
(i, j) = (self.i, self.j)
s = self.s
r = b''
r = b""
for c in iter(data):
i = (i + 1) % 256
j = (j + s[i]) % 256

View File

@ -21,30 +21,30 @@ def ascii85decode(data: bytes) -> bytes:
"""
n = b = 0
out = b''
out = b""
for i in iter(data):
c = bytes((i,))
if b'!' <= c and c <= b'u':
if b"!" <= c and c <= b"u":
n += 1
b = b * 85 + (ord(c) - 33)
if n == 5:
out += struct.pack('>L', b)
out += struct.pack(">L", b)
n = b = 0
elif c == b'z':
elif c == b"z":
assert n == 0, str(n)
out += b'\0\0\0\0'
elif c == b'~':
out += b"\0\0\0\0"
elif c == b"~":
if n:
for _ in range(5 - n):
b = b * 85 + 84
out += struct.pack('>L', b)[:n-1]
out += struct.pack(">L", b)[: n - 1]
break
return out
# asciihexdecode(data)
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
def asciihexdecode(data: bytes) -> bytes:
@ -57,15 +57,16 @@ def asciihexdecode(data: bytes) -> bytes:
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
"""
def decode(x: bytes) -> bytes:
i = int(x, 16)
return bytes((i,))
out = b''
out = b""
for x in hex_re.findall(data):
out += decode(x)
m = trail_re.search(data)
if m:
out += decode(m.group(1)+b'0')
out += decode(m.group(1) + b"0")
return out

View File

@ -12,8 +12,18 @@
import array
from typing import (Any, Callable, Dict, Iterator, List, MutableSequence,
Optional, Sequence, Union, cast)
from typing import (
Any,
Callable,
Dict,
Iterator,
List,
MutableSequence,
Optional,
Sequence,
Union,
cast,
)
def get_bytes(data: bytes) -> Iterator[int]:
@ -46,7 +56,7 @@ class BitParser:
if p[b] is None:
p[b] = [None, None]
p = p[b]
if bits[i] == '1':
if bits[i] == "1":
b = 1
else:
b = 0
@ -74,252 +84,252 @@ class BitParser:
class CCITTG4Parser(BitParser):
MODE = [None, None]
BitParser.add(MODE, 0, '1')
BitParser.add(MODE, +1, '011')
BitParser.add(MODE, -1, '010')
BitParser.add(MODE, 'h', '001')
BitParser.add(MODE, 'p', '0001')
BitParser.add(MODE, +2, '000011')
BitParser.add(MODE, -2, '000010')
BitParser.add(MODE, +3, '0000011')
BitParser.add(MODE, -3, '0000010')
BitParser.add(MODE, 'u', '0000001111')
BitParser.add(MODE, 'x1', '0000001000')
BitParser.add(MODE, 'x2', '0000001001')
BitParser.add(MODE, 'x3', '0000001010')
BitParser.add(MODE, 'x4', '0000001011')
BitParser.add(MODE, 'x5', '0000001100')
BitParser.add(MODE, 'x6', '0000001101')
BitParser.add(MODE, 'x7', '0000001110')
BitParser.add(MODE, 'e', '000000000001000000000001')
BitParser.add(MODE, 0, "1")
BitParser.add(MODE, +1, "011")
BitParser.add(MODE, -1, "010")
BitParser.add(MODE, "h", "001")
BitParser.add(MODE, "p", "0001")
BitParser.add(MODE, +2, "000011")
BitParser.add(MODE, -2, "000010")
BitParser.add(MODE, +3, "0000011")
BitParser.add(MODE, -3, "0000010")
BitParser.add(MODE, "u", "0000001111")
BitParser.add(MODE, "x1", "0000001000")
BitParser.add(MODE, "x2", "0000001001")
BitParser.add(MODE, "x3", "0000001010")
BitParser.add(MODE, "x4", "0000001011")
BitParser.add(MODE, "x5", "0000001100")
BitParser.add(MODE, "x6", "0000001101")
BitParser.add(MODE, "x7", "0000001110")
BitParser.add(MODE, "e", "000000000001000000000001")
WHITE = [None, None]
BitParser.add(WHITE, 0, '00110101')
BitParser.add(WHITE, 1, '000111')
BitParser.add(WHITE, 2, '0111')
BitParser.add(WHITE, 3, '1000')
BitParser.add(WHITE, 4, '1011')
BitParser.add(WHITE, 5, '1100')
BitParser.add(WHITE, 6, '1110')
BitParser.add(WHITE, 7, '1111')
BitParser.add(WHITE, 8, '10011')
BitParser.add(WHITE, 9, '10100')
BitParser.add(WHITE, 10, '00111')
BitParser.add(WHITE, 11, '01000')
BitParser.add(WHITE, 12, '001000')
BitParser.add(WHITE, 13, '000011')
BitParser.add(WHITE, 14, '110100')
BitParser.add(WHITE, 15, '110101')
BitParser.add(WHITE, 16, '101010')
BitParser.add(WHITE, 17, '101011')
BitParser.add(WHITE, 18, '0100111')
BitParser.add(WHITE, 19, '0001100')
BitParser.add(WHITE, 20, '0001000')
BitParser.add(WHITE, 21, '0010111')
BitParser.add(WHITE, 22, '0000011')
BitParser.add(WHITE, 23, '0000100')
BitParser.add(WHITE, 24, '0101000')
BitParser.add(WHITE, 25, '0101011')
BitParser.add(WHITE, 26, '0010011')
BitParser.add(WHITE, 27, '0100100')
BitParser.add(WHITE, 28, '0011000')
BitParser.add(WHITE, 29, '00000010')
BitParser.add(WHITE, 30, '00000011')
BitParser.add(WHITE, 31, '00011010')
BitParser.add(WHITE, 32, '00011011')
BitParser.add(WHITE, 33, '00010010')
BitParser.add(WHITE, 34, '00010011')
BitParser.add(WHITE, 35, '00010100')
BitParser.add(WHITE, 36, '00010101')
BitParser.add(WHITE, 37, '00010110')
BitParser.add(WHITE, 38, '00010111')
BitParser.add(WHITE, 39, '00101000')
BitParser.add(WHITE, 40, '00101001')
BitParser.add(WHITE, 41, '00101010')
BitParser.add(WHITE, 42, '00101011')
BitParser.add(WHITE, 43, '00101100')
BitParser.add(WHITE, 44, '00101101')
BitParser.add(WHITE, 45, '00000100')
BitParser.add(WHITE, 46, '00000101')
BitParser.add(WHITE, 47, '00001010')
BitParser.add(WHITE, 48, '00001011')
BitParser.add(WHITE, 49, '01010010')
BitParser.add(WHITE, 50, '01010011')
BitParser.add(WHITE, 51, '01010100')
BitParser.add(WHITE, 52, '01010101')
BitParser.add(WHITE, 53, '00100100')
BitParser.add(WHITE, 54, '00100101')
BitParser.add(WHITE, 55, '01011000')
BitParser.add(WHITE, 56, '01011001')
BitParser.add(WHITE, 57, '01011010')
BitParser.add(WHITE, 58, '01011011')
BitParser.add(WHITE, 59, '01001010')
BitParser.add(WHITE, 60, '01001011')
BitParser.add(WHITE, 61, '00110010')
BitParser.add(WHITE, 62, '00110011')
BitParser.add(WHITE, 63, '00110100')
BitParser.add(WHITE, 64, '11011')
BitParser.add(WHITE, 128, '10010')
BitParser.add(WHITE, 192, '010111')
BitParser.add(WHITE, 256, '0110111')
BitParser.add(WHITE, 320, '00110110')
BitParser.add(WHITE, 384, '00110111')
BitParser.add(WHITE, 448, '01100100')
BitParser.add(WHITE, 512, '01100101')
BitParser.add(WHITE, 576, '01101000')
BitParser.add(WHITE, 640, '01100111')
BitParser.add(WHITE, 704, '011001100')
BitParser.add(WHITE, 768, '011001101')
BitParser.add(WHITE, 832, '011010010')
BitParser.add(WHITE, 896, '011010011')
BitParser.add(WHITE, 960, '011010100')
BitParser.add(WHITE, 1024, '011010101')
BitParser.add(WHITE, 1088, '011010110')
BitParser.add(WHITE, 1152, '011010111')
BitParser.add(WHITE, 1216, '011011000')
BitParser.add(WHITE, 1280, '011011001')
BitParser.add(WHITE, 1344, '011011010')
BitParser.add(WHITE, 1408, '011011011')
BitParser.add(WHITE, 1472, '010011000')
BitParser.add(WHITE, 1536, '010011001')
BitParser.add(WHITE, 1600, '010011010')
BitParser.add(WHITE, 1664, '011000')
BitParser.add(WHITE, 1728, '010011011')
BitParser.add(WHITE, 1792, '00000001000')
BitParser.add(WHITE, 1856, '00000001100')
BitParser.add(WHITE, 1920, '00000001101')
BitParser.add(WHITE, 1984, '000000010010')
BitParser.add(WHITE, 2048, '000000010011')
BitParser.add(WHITE, 2112, '000000010100')
BitParser.add(WHITE, 2176, '000000010101')
BitParser.add(WHITE, 2240, '000000010110')
BitParser.add(WHITE, 2304, '000000010111')
BitParser.add(WHITE, 2368, '000000011100')
BitParser.add(WHITE, 2432, '000000011101')
BitParser.add(WHITE, 2496, '000000011110')
BitParser.add(WHITE, 2560, '000000011111')
BitParser.add(WHITE, 0, "00110101")
BitParser.add(WHITE, 1, "000111")
BitParser.add(WHITE, 2, "0111")
BitParser.add(WHITE, 3, "1000")
BitParser.add(WHITE, 4, "1011")
BitParser.add(WHITE, 5, "1100")
BitParser.add(WHITE, 6, "1110")
BitParser.add(WHITE, 7, "1111")
BitParser.add(WHITE, 8, "10011")
BitParser.add(WHITE, 9, "10100")
BitParser.add(WHITE, 10, "00111")
BitParser.add(WHITE, 11, "01000")
BitParser.add(WHITE, 12, "001000")
BitParser.add(WHITE, 13, "000011")
BitParser.add(WHITE, 14, "110100")
BitParser.add(WHITE, 15, "110101")
BitParser.add(WHITE, 16, "101010")
BitParser.add(WHITE, 17, "101011")
BitParser.add(WHITE, 18, "0100111")
BitParser.add(WHITE, 19, "0001100")
BitParser.add(WHITE, 20, "0001000")
BitParser.add(WHITE, 21, "0010111")
BitParser.add(WHITE, 22, "0000011")
BitParser.add(WHITE, 23, "0000100")
BitParser.add(WHITE, 24, "0101000")
BitParser.add(WHITE, 25, "0101011")
BitParser.add(WHITE, 26, "0010011")
BitParser.add(WHITE, 27, "0100100")
BitParser.add(WHITE, 28, "0011000")
BitParser.add(WHITE, 29, "00000010")
BitParser.add(WHITE, 30, "00000011")
BitParser.add(WHITE, 31, "00011010")
BitParser.add(WHITE, 32, "00011011")
BitParser.add(WHITE, 33, "00010010")
BitParser.add(WHITE, 34, "00010011")
BitParser.add(WHITE, 35, "00010100")
BitParser.add(WHITE, 36, "00010101")
BitParser.add(WHITE, 37, "00010110")
BitParser.add(WHITE, 38, "00010111")
BitParser.add(WHITE, 39, "00101000")
BitParser.add(WHITE, 40, "00101001")
BitParser.add(WHITE, 41, "00101010")
BitParser.add(WHITE, 42, "00101011")
BitParser.add(WHITE, 43, "00101100")
BitParser.add(WHITE, 44, "00101101")
BitParser.add(WHITE, 45, "00000100")
BitParser.add(WHITE, 46, "00000101")
BitParser.add(WHITE, 47, "00001010")
BitParser.add(WHITE, 48, "00001011")
BitParser.add(WHITE, 49, "01010010")
BitParser.add(WHITE, 50, "01010011")
BitParser.add(WHITE, 51, "01010100")
BitParser.add(WHITE, 52, "01010101")
BitParser.add(WHITE, 53, "00100100")
BitParser.add(WHITE, 54, "00100101")
BitParser.add(WHITE, 55, "01011000")
BitParser.add(WHITE, 56, "01011001")
BitParser.add(WHITE, 57, "01011010")
BitParser.add(WHITE, 58, "01011011")
BitParser.add(WHITE, 59, "01001010")
BitParser.add(WHITE, 60, "01001011")
BitParser.add(WHITE, 61, "00110010")
BitParser.add(WHITE, 62, "00110011")
BitParser.add(WHITE, 63, "00110100")
BitParser.add(WHITE, 64, "11011")
BitParser.add(WHITE, 128, "10010")
BitParser.add(WHITE, 192, "010111")
BitParser.add(WHITE, 256, "0110111")
BitParser.add(WHITE, 320, "00110110")
BitParser.add(WHITE, 384, "00110111")
BitParser.add(WHITE, 448, "01100100")
BitParser.add(WHITE, 512, "01100101")
BitParser.add(WHITE, 576, "01101000")
BitParser.add(WHITE, 640, "01100111")
BitParser.add(WHITE, 704, "011001100")
BitParser.add(WHITE, 768, "011001101")
BitParser.add(WHITE, 832, "011010010")
BitParser.add(WHITE, 896, "011010011")
BitParser.add(WHITE, 960, "011010100")
BitParser.add(WHITE, 1024, "011010101")
BitParser.add(WHITE, 1088, "011010110")
BitParser.add(WHITE, 1152, "011010111")
BitParser.add(WHITE, 1216, "011011000")
BitParser.add(WHITE, 1280, "011011001")
BitParser.add(WHITE, 1344, "011011010")
BitParser.add(WHITE, 1408, "011011011")
BitParser.add(WHITE, 1472, "010011000")
BitParser.add(WHITE, 1536, "010011001")
BitParser.add(WHITE, 1600, "010011010")
BitParser.add(WHITE, 1664, "011000")
BitParser.add(WHITE, 1728, "010011011")
BitParser.add(WHITE, 1792, "00000001000")
BitParser.add(WHITE, 1856, "00000001100")
BitParser.add(WHITE, 1920, "00000001101")
BitParser.add(WHITE, 1984, "000000010010")
BitParser.add(WHITE, 2048, "000000010011")
BitParser.add(WHITE, 2112, "000000010100")
BitParser.add(WHITE, 2176, "000000010101")
BitParser.add(WHITE, 2240, "000000010110")
BitParser.add(WHITE, 2304, "000000010111")
BitParser.add(WHITE, 2368, "000000011100")
BitParser.add(WHITE, 2432, "000000011101")
BitParser.add(WHITE, 2496, "000000011110")
BitParser.add(WHITE, 2560, "000000011111")
BLACK = [None, None]
BitParser.add(BLACK, 0, '0000110111')
BitParser.add(BLACK, 1, '010')
BitParser.add(BLACK, 2, '11')
BitParser.add(BLACK, 3, '10')
BitParser.add(BLACK, 4, '011')
BitParser.add(BLACK, 5, '0011')
BitParser.add(BLACK, 6, '0010')
BitParser.add(BLACK, 7, '00011')
BitParser.add(BLACK, 8, '000101')
BitParser.add(BLACK, 9, '000100')
BitParser.add(BLACK, 10, '0000100')
BitParser.add(BLACK, 11, '0000101')
BitParser.add(BLACK, 12, '0000111')
BitParser.add(BLACK, 13, '00000100')
BitParser.add(BLACK, 14, '00000111')
BitParser.add(BLACK, 15, '000011000')
BitParser.add(BLACK, 16, '0000010111')
BitParser.add(BLACK, 17, '0000011000')
BitParser.add(BLACK, 18, '0000001000')
BitParser.add(BLACK, 19, '00001100111')
BitParser.add(BLACK, 20, '00001101000')
BitParser.add(BLACK, 21, '00001101100')
BitParser.add(BLACK, 22, '00000110111')
BitParser.add(BLACK, 23, '00000101000')
BitParser.add(BLACK, 24, '00000010111')
BitParser.add(BLACK, 25, '00000011000')
BitParser.add(BLACK, 26, '000011001010')
BitParser.add(BLACK, 27, '000011001011')
BitParser.add(BLACK, 28, '000011001100')
BitParser.add(BLACK, 29, '000011001101')
BitParser.add(BLACK, 30, '000001101000')
BitParser.add(BLACK, 31, '000001101001')
BitParser.add(BLACK, 32, '000001101010')
BitParser.add(BLACK, 33, '000001101011')
BitParser.add(BLACK, 34, '000011010010')
BitParser.add(BLACK, 35, '000011010011')
BitParser.add(BLACK, 36, '000011010100')
BitParser.add(BLACK, 37, '000011010101')
BitParser.add(BLACK, 38, '000011010110')
BitParser.add(BLACK, 39, '000011010111')
BitParser.add(BLACK, 40, '000001101100')
BitParser.add(BLACK, 41, '000001101101')
BitParser.add(BLACK, 42, '000011011010')
BitParser.add(BLACK, 43, '000011011011')
BitParser.add(BLACK, 44, '000001010100')
BitParser.add(BLACK, 45, '000001010101')
BitParser.add(BLACK, 46, '000001010110')
BitParser.add(BLACK, 47, '000001010111')
BitParser.add(BLACK, 48, '000001100100')
BitParser.add(BLACK, 49, '000001100101')
BitParser.add(BLACK, 50, '000001010010')
BitParser.add(BLACK, 51, '000001010011')
BitParser.add(BLACK, 52, '000000100100')
BitParser.add(BLACK, 53, '000000110111')
BitParser.add(BLACK, 54, '000000111000')
BitParser.add(BLACK, 55, '000000100111')
BitParser.add(BLACK, 56, '000000101000')
BitParser.add(BLACK, 57, '000001011000')
BitParser.add(BLACK, 58, '000001011001')
BitParser.add(BLACK, 59, '000000101011')
BitParser.add(BLACK, 60, '000000101100')
BitParser.add(BLACK, 61, '000001011010')
BitParser.add(BLACK, 62, '000001100110')
BitParser.add(BLACK, 63, '000001100111')
BitParser.add(BLACK, 64, '0000001111')
BitParser.add(BLACK, 128, '000011001000')
BitParser.add(BLACK, 192, '000011001001')
BitParser.add(BLACK, 256, '000001011011')
BitParser.add(BLACK, 320, '000000110011')
BitParser.add(BLACK, 384, '000000110100')
BitParser.add(BLACK, 448, '000000110101')
BitParser.add(BLACK, 512, '0000001101100')
BitParser.add(BLACK, 576, '0000001101101')
BitParser.add(BLACK, 640, '0000001001010')
BitParser.add(BLACK, 704, '0000001001011')
BitParser.add(BLACK, 768, '0000001001100')
BitParser.add(BLACK, 832, '0000001001101')
BitParser.add(BLACK, 896, '0000001110010')
BitParser.add(BLACK, 960, '0000001110011')
BitParser.add(BLACK, 1024, '0000001110100')
BitParser.add(BLACK, 1088, '0000001110101')
BitParser.add(BLACK, 1152, '0000001110110')
BitParser.add(BLACK, 1216, '0000001110111')
BitParser.add(BLACK, 1280, '0000001010010')
BitParser.add(BLACK, 1344, '0000001010011')
BitParser.add(BLACK, 1408, '0000001010100')
BitParser.add(BLACK, 1472, '0000001010101')
BitParser.add(BLACK, 1536, '0000001011010')
BitParser.add(BLACK, 1600, '0000001011011')
BitParser.add(BLACK, 1664, '0000001100100')
BitParser.add(BLACK, 1728, '0000001100101')
BitParser.add(BLACK, 1792, '00000001000')
BitParser.add(BLACK, 1856, '00000001100')
BitParser.add(BLACK, 1920, '00000001101')
BitParser.add(BLACK, 1984, '000000010010')
BitParser.add(BLACK, 2048, '000000010011')
BitParser.add(BLACK, 2112, '000000010100')
BitParser.add(BLACK, 2176, '000000010101')
BitParser.add(BLACK, 2240, '000000010110')
BitParser.add(BLACK, 2304, '000000010111')
BitParser.add(BLACK, 2368, '000000011100')
BitParser.add(BLACK, 2432, '000000011101')
BitParser.add(BLACK, 2496, '000000011110')
BitParser.add(BLACK, 2560, '000000011111')
BitParser.add(BLACK, 0, "0000110111")
BitParser.add(BLACK, 1, "010")
BitParser.add(BLACK, 2, "11")
BitParser.add(BLACK, 3, "10")
BitParser.add(BLACK, 4, "011")
BitParser.add(BLACK, 5, "0011")
BitParser.add(BLACK, 6, "0010")
BitParser.add(BLACK, 7, "00011")
BitParser.add(BLACK, 8, "000101")
BitParser.add(BLACK, 9, "000100")
BitParser.add(BLACK, 10, "0000100")
BitParser.add(BLACK, 11, "0000101")
BitParser.add(BLACK, 12, "0000111")
BitParser.add(BLACK, 13, "00000100")
BitParser.add(BLACK, 14, "00000111")
BitParser.add(BLACK, 15, "000011000")
BitParser.add(BLACK, 16, "0000010111")
BitParser.add(BLACK, 17, "0000011000")
BitParser.add(BLACK, 18, "0000001000")
BitParser.add(BLACK, 19, "00001100111")
BitParser.add(BLACK, 20, "00001101000")
BitParser.add(BLACK, 21, "00001101100")
BitParser.add(BLACK, 22, "00000110111")
BitParser.add(BLACK, 23, "00000101000")
BitParser.add(BLACK, 24, "00000010111")
BitParser.add(BLACK, 25, "00000011000")
BitParser.add(BLACK, 26, "000011001010")
BitParser.add(BLACK, 27, "000011001011")
BitParser.add(BLACK, 28, "000011001100")
BitParser.add(BLACK, 29, "000011001101")
BitParser.add(BLACK, 30, "000001101000")
BitParser.add(BLACK, 31, "000001101001")
BitParser.add(BLACK, 32, "000001101010")
BitParser.add(BLACK, 33, "000001101011")
BitParser.add(BLACK, 34, "000011010010")
BitParser.add(BLACK, 35, "000011010011")
BitParser.add(BLACK, 36, "000011010100")
BitParser.add(BLACK, 37, "000011010101")
BitParser.add(BLACK, 38, "000011010110")
BitParser.add(BLACK, 39, "000011010111")
BitParser.add(BLACK, 40, "000001101100")
BitParser.add(BLACK, 41, "000001101101")
BitParser.add(BLACK, 42, "000011011010")
BitParser.add(BLACK, 43, "000011011011")
BitParser.add(BLACK, 44, "000001010100")
BitParser.add(BLACK, 45, "000001010101")
BitParser.add(BLACK, 46, "000001010110")
BitParser.add(BLACK, 47, "000001010111")
BitParser.add(BLACK, 48, "000001100100")
BitParser.add(BLACK, 49, "000001100101")
BitParser.add(BLACK, 50, "000001010010")
BitParser.add(BLACK, 51, "000001010011")
BitParser.add(BLACK, 52, "000000100100")
BitParser.add(BLACK, 53, "000000110111")
BitParser.add(BLACK, 54, "000000111000")
BitParser.add(BLACK, 55, "000000100111")
BitParser.add(BLACK, 56, "000000101000")
BitParser.add(BLACK, 57, "000001011000")
BitParser.add(BLACK, 58, "000001011001")
BitParser.add(BLACK, 59, "000000101011")
BitParser.add(BLACK, 60, "000000101100")
BitParser.add(BLACK, 61, "000001011010")
BitParser.add(BLACK, 62, "000001100110")
BitParser.add(BLACK, 63, "000001100111")
BitParser.add(BLACK, 64, "0000001111")
BitParser.add(BLACK, 128, "000011001000")
BitParser.add(BLACK, 192, "000011001001")
BitParser.add(BLACK, 256, "000001011011")
BitParser.add(BLACK, 320, "000000110011")
BitParser.add(BLACK, 384, "000000110100")
BitParser.add(BLACK, 448, "000000110101")
BitParser.add(BLACK, 512, "0000001101100")
BitParser.add(BLACK, 576, "0000001101101")
BitParser.add(BLACK, 640, "0000001001010")
BitParser.add(BLACK, 704, "0000001001011")
BitParser.add(BLACK, 768, "0000001001100")
BitParser.add(BLACK, 832, "0000001001101")
BitParser.add(BLACK, 896, "0000001110010")
BitParser.add(BLACK, 960, "0000001110011")
BitParser.add(BLACK, 1024, "0000001110100")
BitParser.add(BLACK, 1088, "0000001110101")
BitParser.add(BLACK, 1152, "0000001110110")
BitParser.add(BLACK, 1216, "0000001110111")
BitParser.add(BLACK, 1280, "0000001010010")
BitParser.add(BLACK, 1344, "0000001010011")
BitParser.add(BLACK, 1408, "0000001010100")
BitParser.add(BLACK, 1472, "0000001010101")
BitParser.add(BLACK, 1536, "0000001011010")
BitParser.add(BLACK, 1600, "0000001011011")
BitParser.add(BLACK, 1664, "0000001100100")
BitParser.add(BLACK, 1728, "0000001100101")
BitParser.add(BLACK, 1792, "00000001000")
BitParser.add(BLACK, 1856, "00000001100")
BitParser.add(BLACK, 1920, "00000001101")
BitParser.add(BLACK, 1984, "000000010010")
BitParser.add(BLACK, 2048, "000000010011")
BitParser.add(BLACK, 2112, "000000010100")
BitParser.add(BLACK, 2176, "000000010101")
BitParser.add(BLACK, 2240, "000000010110")
BitParser.add(BLACK, 2304, "000000010111")
BitParser.add(BLACK, 2368, "000000011100")
BitParser.add(BLACK, 2432, "000000011101")
BitParser.add(BLACK, 2496, "000000011110")
BitParser.add(BLACK, 2560, "000000011111")
UNCOMPRESSED = [None, None]
BitParser.add(UNCOMPRESSED, '1', '1')
BitParser.add(UNCOMPRESSED, '01', '01')
BitParser.add(UNCOMPRESSED, '001', '001')
BitParser.add(UNCOMPRESSED, '0001', '0001')
BitParser.add(UNCOMPRESSED, '00001', '00001')
BitParser.add(UNCOMPRESSED, '00000', '000001')
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
BitParser.add(UNCOMPRESSED, "1", "1")
BitParser.add(UNCOMPRESSED, "01", "01")
BitParser.add(UNCOMPRESSED, "001", "001")
BitParser.add(UNCOMPRESSED, "0001", "0001")
BitParser.add(UNCOMPRESSED, "00001", "00001")
BitParser.add(UNCOMPRESSED, "00000", "000001")
BitParser.add(UNCOMPRESSED, "T00", "00000011")
BitParser.add(UNCOMPRESSED, "T10", "00000010")
BitParser.add(UNCOMPRESSED, "T000", "000000011")
BitParser.add(UNCOMPRESSED, "T100", "000000010")
BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
class EOFB(Exception):
pass
@ -352,21 +362,21 @@ class CCITTG4Parser(BitParser):
return
def _parse_mode(self, mode: object) -> BitParserState:
if mode == 'p':
if mode == "p":
self._do_pass()
self._flush_line()
return self.MODE
elif mode == 'h':
elif mode == "h":
self._n1 = 0
self._accept = self._parse_horiz1
if self._color:
return self.WHITE
else:
return self.BLACK
elif mode == 'u':
elif mode == "u":
self._accept = self._parse_uncompressed
return self.UNCOMPRESSED
elif mode == 'e':
elif mode == "e":
raise self.EOFB
elif isinstance(mode, int):
self._do_vertical(mode)
@ -406,7 +416,7 @@ class CCITTG4Parser(BitParser):
def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
if not bits:
raise self.InvalidData
if bits.startswith('T'):
if bits.startswith("T"):
self._accept = self._parse_mode
self._color = int(bits[1])
self._do_uncompressed(bits[2:])
@ -416,33 +426,37 @@ class CCITTG4Parser(BitParser):
return self.UNCOMPRESSED
def _get_bits(self) -> str:
return ''.join(str(b) for b in self._curline[:self._curpos])
return "".join(str(b) for b in self._curline[: self._curpos])
def _get_refline(self, i: int) -> str:
if i < 0:
return '[]'+''.join(str(b) for b in self._refline)
return "[]" + "".join(str(b) for b in self._refline)
elif len(self._refline) <= i:
return ''.join(str(b) for b in self._refline)+'[]'
return "".join(str(b) for b in self._refline) + "[]"
else:
return (''.join(str(b) for b in self._refline[:i]) +
'['+str(self._refline[i])+']' +
''.join(str(b) for b in self._refline[i+1:]))
return (
"".join(str(b) for b in self._refline[:i])
+ "["
+ str(self._refline[i])
+ "]"
+ "".join(str(b) for b in self._refline[i + 1 :])
)
def reset(self) -> None:
self._y = 0
self._curline = array.array('b', [1]*self.width)
self._curline = array.array("b", [1] * self.width)
self._reset_line()
self._accept = self._parse_mode
self._state = self.MODE
return
def output_line(self, y: int, bits: Sequence[int]) -> None:
print(y, ''.join(str(b) for b in bits))
print(y, "".join(str(b) for b in bits))
return
def _reset_line(self) -> None:
self._refline = self._curline
self._curline = array.array('b', [1]*self.width)
self._curline = array.array("b", [1] * self.width)
self._curpos = -1
self._color = 1
return
@ -460,12 +474,14 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos + 1
while 1:
if x1 == 0:
if (self._color == 1 and self._refline[x1] != self._color):
if self._color == 1 and self._refline[x1] != self._color:
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color):
elif (
self._refline[x1 - 1] == self._color
and self._refline[x1] != self._color
):
break
x1 += 1
x1 += dx
@ -485,22 +501,26 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos + 1
while 1:
if x1 == 0:
if (self._color == 1 and self._refline[x1] != self._color):
if self._color == 1 and self._refline[x1] != self._color:
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color):
elif (
self._refline[x1 - 1] == self._color
and self._refline[x1] != self._color
):
break
x1 += 1
while 1:
if x1 == 0:
if (self._color == 0 and self._refline[x1] == self._color):
if self._color == 0 and self._refline[x1] == self._color:
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] != self._color and
self._refline[x1] == self._color):
elif (
self._refline[x1 - 1] != self._color
and self._refline[x1] == self._color
):
break
x1 += 1
for x in range(self._curpos, x1):
@ -534,19 +554,19 @@ class CCITTG4Parser(BitParser):
class CCITTFaxDecoder(CCITTG4Parser):
def __init__(self, width: int, bytealign: bool = False,
reversed: bool = False) -> None:
def __init__(
self, width: int, bytealign: bool = False, reversed: bool = False
) -> None:
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.reversed = reversed
self._buf = b''
self._buf = b""
return
def close(self) -> bytes:
return self._buf
def output_line(self, y: int, bits: Sequence[int]) -> None:
arr = array.array('B', [0]*((len(bits)+7)//8))
arr = array.array("B", [0] * ((len(bits) + 7) // 8))
if self.reversed:
bits = [1 - b for b in bits]
for (i, b) in enumerate(bits):
@ -557,11 +577,11 @@ class CCITTFaxDecoder(CCITTG4Parser):
def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
K = params.get('K')
K = params.get("K")
if K == -1:
cols = cast(int, params.get('Columns'))
bytealign = cast(bool, params.get('EncodedByteAlign'))
reversed = cast(bool, params.get('BlackIs1'))
cols = cast(int, params.get("Columns"))
bytealign = cast(bool, params.get("EncodedByteAlign"))
reversed = cast(bool, params.get("BlackIs1"))
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
else:
raise ValueError(K)
@ -573,12 +593,14 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
def main(argv: List[str]) -> None:
if not argv[1:]:
import unittest
unittest.main()
return
class Parser(CCITTG4Parser):
def __init__(self, width: int, bytealign: bool = False) -> None:
import pygame # type: ignore[import]
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width, 1000))
return
@ -593,11 +615,13 @@ def main(argv: List[str]) -> None:
def close(self) -> None:
import pygame
pygame.image.save(self.img, 'out.bmp')
pygame.image.save(self.img, "out.bmp")
return
for path in argv[1:]:
fp = open(path, 'rb')
(_, _, k, w, h, _) = path.split('.')
fp = open(path, "rb")
(_, _, k, w, h, _) = path.split(".")
parser = Parser(int(w))
parser.feedbytes(fp.read())
parser.close()

View File

@ -16,8 +16,20 @@ import os.path
import pickle as pickle
import struct
import sys
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List,
MutableMapping, Optional, TextIO, Tuple, Union, cast)
from typing import (
Any,
BinaryIO,
Dict,
Iterable,
Iterator,
List,
MutableMapping,
Optional,
TextIO,
Tuple,
Union,
cast,
)
from .encodingdb import name2unicode
from .psparser import KWD
@ -45,7 +57,7 @@ class CMapBase:
self.attrs: MutableMapping[str, object] = kwargs.copy()
def is_vertical(self) -> bool:
return self.attrs.get('WMode', 0) != 0
return self.attrs.get("WMode", 0) != 0
def set_attr(self, k: str, v: object) -> None:
self.attrs[k] = v
@ -53,8 +65,7 @@ class CMapBase:
def add_code2cid(self, code: str, cid: int) -> None:
pass
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
) -> None:
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
pass
def use_cmap(self, cmap: "CMapBase") -> None:
@ -65,13 +76,12 @@ class CMapBase:
class CMap(CMapBase):
def __init__(self, **kwargs: Union[str, int]) -> None:
CMapBase.__init__(self, **kwargs)
self.code2cid: Dict[int, object] = {}
def __repr__(self) -> str:
return '<CMap: %s>' % self.attrs.get('CMapName')
return "<CMap: %s>" % self.attrs.get("CMapName")
def use_cmap(self, cmap: CMapBase) -> None:
assert isinstance(cmap, CMap), str(type(cmap))
@ -84,10 +94,11 @@ class CMap(CMapBase):
copy(d, v)
else:
dst[k] = v
copy(self.code2cid, cmap.code2cid)
def decode(self, code: bytes) -> Iterator[int]:
log.debug('decode: %r, %r', self, code)
log.debug("decode: %r, %r", self, code)
d = self.code2cid
for i in iter(code):
if i in d:
@ -100,70 +111,70 @@ class CMap(CMapBase):
else:
d = self.code2cid
def dump(self, out: TextIO = sys.stdout,
def dump(
self,
out: TextIO = sys.stdout,
code2cid: Optional[Dict[int, object]] = None,
code: Tuple[int, ...] = ()) -> None:
code: Tuple[int, ...] = (),
) -> None:
if code2cid is None:
code2cid = self.code2cid
code = ()
for (k, v) in sorted(code2cid.items()):
c = code + (k,)
if isinstance(v, int):
out.write('code %r = cid %d\n' % (c, v))
out.write("code %r = cid %d\n" % (c, v))
else:
self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)
class IdentityCMap(CMapBase):
def decode(self, code: bytes) -> Tuple[int, ...]:
n = len(code) // 2
if n:
return struct.unpack('>%dH' % n, code)
return struct.unpack(">%dH" % n, code)
else:
return ()
class IdentityCMapByte(IdentityCMap):
def decode(self, code: bytes) -> Tuple[int, ...]:
n = len(code)
if n:
return struct.unpack('>%dB' % n, code)
return struct.unpack(">%dB" % n, code)
else:
return ()
class UnicodeMap(CMapBase):
def __init__(self, **kwargs: Union[str, int]) -> None:
CMapBase.__init__(self, **kwargs)
self.cid2unichr: Dict[int, str] = {}
def __repr__(self) -> str:
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
return "<UnicodeMap: %s>" % self.attrs.get("CMapName")
def get_unichr(self, cid: int) -> str:
log.debug('get_unichr: %r, %r', self, cid)
log.debug("get_unichr: %r, %r", self, cid)
return self.cid2unichr[cid]
def dump(self, out: TextIO = sys.stdout) -> None:
for (k, v) in sorted(self.cid2unichr.items()):
out.write('cid %d = unicode %r\n' % (k, v))
out.write("cid %d = unicode %r\n" % (k, v))
class IdentityUnicodeMap(UnicodeMap):
def get_unichr(self, cid: int) -> str:
"""Interpret character id as unicode codepoint"""
log.debug('get_unichr: %r, %r', self, cid)
log.debug("get_unichr: %r, %r", self, cid)
return chr(cid)
class FileCMap(CMap):
def add_code2cid(self, code: str, cid: int) -> None:
assert isinstance(code, str) and isinstance(cid, int),\
str((type(code), type(cid)))
assert isinstance(code, str) and isinstance(cid, int), str(
(type(code), type(cid))
)
d = self.code2cid
for c in code[:-1]:
ci = ord(c)
@ -178,9 +189,7 @@ class FileCMap(CMap):
class FileUnicodeMap(UnicodeMap):
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
) -> None:
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
assert isinstance(cid, int), str(type(cid))
if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name.
@ -188,7 +197,7 @@ class FileUnicodeMap(UnicodeMap):
self.cid2unichr[cid] = name2unicode(code.name)
elif isinstance(code, bytes):
# Interpret as UTF-16BE.
self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore")
elif isinstance(code, int):
self.cid2unichr[cid] = chr(code)
else:
@ -196,21 +205,19 @@ class FileUnicodeMap(UnicodeMap):
class PyCMap(CMap):
def __init__(self, name: str, module: Any) -> None:
super().__init__(CMapName=name)
self.code2cid = module.CODE2CID
if module.IS_VERTICAL:
self.attrs['WMode'] = 1
self.attrs["WMode"] = 1
class PyUnicodeMap(UnicodeMap):
def __init__(self, name: str, module: Any, vertical: bool) -> None:
super().__init__(CMapName=name)
if vertical:
self.cid2unichr = module.CID2UNICHR_V
self.attrs['WMode'] = 1
self.attrs["WMode"] = 1
else:
self.cid2unichr = module.CID2UNICHR_H
@ -226,10 +233,12 @@ class CMapDB:
@classmethod
def _load_data(cls, name: str) -> Any:
name = name.replace("\0", "")
filename = '%s.pickle.gz' % name
log.debug('loading: %r', name)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),)
filename = "%s.pickle.gz" % name
log.debug("loading: %r", name)
cmap_paths = (
os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
os.path.join(os.path.dirname(__file__), "cmap"),
)
for directory in cmap_paths:
path = os.path.join(directory, filename)
if os.path.exists(path):
@ -243,13 +252,13 @@ class CMapDB:
@classmethod
def get_cmap(cls, name: str) -> CMapBase:
if name == 'Identity-H':
if name == "Identity-H":
return IdentityCMap(WMode=0)
elif name == 'Identity-V':
elif name == "Identity-V":
return IdentityCMap(WMode=1)
elif name == 'OneByteIdentityH':
elif name == "OneByteIdentityH":
return IdentityCMapByte(WMode=0)
elif name == 'OneByteIdentityV':
elif name == "OneByteIdentityV":
return IdentityCMapByte(WMode=1)
try:
return cls._cmap_cache[name]
@ -265,14 +274,12 @@ class CMapDB:
return cls._umap_cache[name][vertical]
except KeyError:
pass
data = cls._load_data('to-unicode-%s' % name)
cls._umap_cache[name] = [PyUnicodeMap(name, data, v)
for v in (False, True)]
data = cls._load_data("to-unicode-%s" % name)
cls._umap_cache[name] = [PyUnicodeMap(name, data, v) for v in (False, True)]
return cls._umap_cache[name][vertical]
class CMapParser(PSStackParser[PSKeyword]):
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
self.cmap = cmap
@ -287,22 +294,22 @@ class CMapParser(PSStackParser[PSKeyword]):
pass
return
KEYWORD_BEGINCMAP = KWD(b'begincmap')
KEYWORD_ENDCMAP = KWD(b'endcmap')
KEYWORD_USECMAP = KWD(b'usecmap')
KEYWORD_DEF = KWD(b'def')
KEYWORD_BEGINCODESPACERANGE = KWD(b'begincodespacerange')
KEYWORD_ENDCODESPACERANGE = KWD(b'endcodespacerange')
KEYWORD_BEGINCIDRANGE = KWD(b'begincidrange')
KEYWORD_ENDCIDRANGE = KWD(b'endcidrange')
KEYWORD_BEGINCIDCHAR = KWD(b'begincidchar')
KEYWORD_ENDCIDCHAR = KWD(b'endcidchar')
KEYWORD_BEGINBFRANGE = KWD(b'beginbfrange')
KEYWORD_ENDBFRANGE = KWD(b'endbfrange')
KEYWORD_BEGINBFCHAR = KWD(b'beginbfchar')
KEYWORD_ENDBFCHAR = KWD(b'endbfchar')
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')
KEYWORD_BEGINCMAP = KWD(b"begincmap")
KEYWORD_ENDCMAP = KWD(b"endcmap")
KEYWORD_USECMAP = KWD(b"usecmap")
KEYWORD_DEF = KWD(b"def")
KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BEGINCMAP:
@ -346,8 +353,12 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
not isinstance(cid, int) or len(s) != len(e)):
if (
not isinstance(s, bytes)
or not isinstance(e, bytes)
or not isinstance(cid, int)
or len(s) != len(e)
):
continue
sprefix = s[:-4]
eprefix = e[:-4]
@ -359,7 +370,7 @@ class CMapParser(PSStackParser[PSKeyword]):
e1 = nunpack(evar)
vlen = len(svar)
for i in range(e1 - s1 + 1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
x = sprefix + struct.pack(">L", s1 + i)[-vlen:]
self.cmap.add_cid2unichr(cid + i, x)
return
@ -379,8 +390,11 @@ class CMapParser(PSStackParser[PSKeyword]):
if token is self.KEYWORD_ENDBFRANGE:
objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs):
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
len(s) != len(e)):
if (
not isinstance(s, bytes)
or not isinstance(e, bytes)
or len(s) != len(e)
):
continue
s1 = nunpack(s)
e1 = nunpack(e)
@ -394,7 +408,7 @@ class CMapParser(PSStackParser[PSKeyword]):
prefix = code[:-4]
vlen = len(var)
for i in range(e1 - s1 + 1):
x = prefix+struct.pack('>L', base+i)[-vlen:]
x = prefix + struct.pack(">L", base + i)[-vlen:]
self.cmap.add_cid2unichr(s1 + i, x)
return
@ -422,7 +436,7 @@ class CMapParser(PSStackParser[PSKeyword]):
def main(argv: List[str]) -> None:
args = argv[1:]
for fname in args:
fp = open(fname, 'rb')
fp = open(fname, "rb")
cmap = FileUnicodeMap()
CMapParser(cmap, fp).run()
fp.close()
@ -430,5 +444,5 @@ def main(argv: List[str]) -> None:
return
if __name__ == '__main__':
if __name__ == "__main__":
main(sys.argv)

View File

@ -1,8 +1,19 @@
import io
import logging
import re
from typing import (BinaryIO, Dict, Generic, List, Optional, Sequence, TextIO,
Tuple, TypeVar, Union, cast)
from typing import (
BinaryIO,
Dict,
Generic,
List,
Optional,
Sequence,
TextIO,
Tuple,
TypeVar,
Union,
cast,
)
from pdfminer.pdfcolor import PDFColorSpace
from . import utils
@ -46,7 +57,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None
laparams: Optional[LAParams] = None,
) -> None:
PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno
@ -80,9 +91,11 @@ class PDFLayoutAnalyzer(PDFTextDevice):
def render_image(self, name: str, stream: PDFStream) -> None:
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
item = LTImage(name, stream,
(self.cur_item.x0, self.cur_item.y0,
self.cur_item.x1, self.cur_item.y1))
item = LTImage(
name,
stream,
(self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1),
)
self.cur_item.add(item)
def paint_path(
@ -91,14 +104,14 @@ class PDFLayoutAnalyzer(PDFTextDevice):
stroke: bool,
fill: bool,
evenodd: bool,
path: Sequence[PathSegment]
path: Sequence[PathSegment],
) -> None:
"""Paint paths described in section 4.4 of the PDF reference manual"""
shape = ''.join(x[0] for x in path)
shape = "".join(x[0] for x in path)
if shape.count('m') > 1:
if shape.count("m") > 1:
# recurse if there are multiple m's in this shape
for m in re.finditer(r'm[^m]+', shape):
for m in re.finditer(r"m[^m]+", shape):
subpath = path[m.start(0) : m.end(0)]
self.paint_path(gstate, stroke, fill, evenodd, subpath)
@ -110,38 +123,68 @@ class PDFLayoutAnalyzer(PDFTextDevice):
# And, per Section 4.4's Table 4.9, all other path commands place
# their point-position in their final two arguments. (Any preceding
# arguments represent control points on Bézier curves.)
raw_pts = [cast(Point, p[-2:] if p[0] != 'h' else path[0][-2:])
for p in path]
raw_pts = [
cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path
]
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
if shape in {'mlh', 'ml'}:
if shape in {"mlh", "ml"}:
# single line segment
#
# Note: 'ml', in conditional above, is a frequent anomaly
# that we want to support.
line = LTLine(gstate.linewidth, pts[0], pts[1], stroke,
fill, evenodd, gstate.scolor, gstate.ncolor)
line = LTLine(
gstate.linewidth,
pts[0],
pts[1],
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(line)
elif shape in {'mlllh', 'mllll'}:
elif shape in {"mlllh", "mllll"}:
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
is_closed_loop = (pts[0] == pts[4])
has_square_coordinates = \
(x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) \
or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
is_closed_loop = pts[0] == pts[4]
has_square_coordinates = (
x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0
) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
if is_closed_loop and has_square_coordinates:
rect = LTRect(gstate.linewidth, (*pts[0], *pts[2]), stroke,
fill, evenodd, gstate.scolor, gstate.ncolor)
rect = LTRect(
gstate.linewidth,
(*pts[0], *pts[2]),
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(rect)
else:
curve = LTCurve(gstate.linewidth, pts, stroke, fill,
evenodd, gstate.scolor, gstate.ncolor)
curve = LTCurve(
gstate.linewidth,
pts,
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(curve)
else:
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
gstate.scolor, gstate.ncolor)
curve = LTCurve(
gstate.linewidth,
pts,
stroke,
fill,
evenodd,
gstate.scolor,
gstate.ncolor,
)
self.cur_item.add(curve)
def render_char(
@ -153,7 +196,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
rise: float,
cid: int,
ncs: PDFColorSpace,
graphicstate: PDFGraphicState
graphicstate: PDFGraphicState,
) -> float:
try:
text = font.to_unichr(cid)
@ -162,14 +205,24 @@ class PDFLayoutAnalyzer(PDFTextDevice):
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
textdisp = font.char_disp(cid)
item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth,
textdisp, ncs, graphicstate)
item = LTChar(
matrix,
font,
fontsize,
scaling,
rise,
text,
textwidth,
textdisp,
ncs,
graphicstate,
)
self.cur_item.add(item)
return item.adv
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
log.debug('undefined: %r, %r', font, cid)
return '(cid:%d)' % cid
log.debug("undefined: %r, %r", font, cid)
return "(cid:%d)" % cid
def receive_layout(self, ltpage: LTPage) -> None:
pass
@ -180,10 +233,9 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None
laparams: Optional[LAParams] = None,
) -> None:
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
laparams=laparams)
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.result: Optional[LTPage] = None
def receive_layout(self, ltpage: LTPage) -> None:
@ -195,7 +247,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
# Some PDFConverter children support only binary I/O
IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO)
IOType = TypeVar("IOType", TextIO, BinaryIO, AnyIO)
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
@ -203,12 +255,11 @@ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
self,
rsrcmgr: PDFResourceManager,
outfp: IOType,
codec: str = 'utf-8',
codec: str = "utf-8",
pageno: int = 1,
laparams: Optional[LAParams] = None
laparams: Optional[LAParams] = None,
) -> None:
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
laparams=laparams)
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.outfp: IOType = outfp
self.codec = codec
self.outfp_binary = self._is_binary_stream(self.outfp)
@ -216,9 +267,9 @@ class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
@staticmethod
def _is_binary_stream(outfp: AnyIO) -> bool:
"""Test if an stream is binary or not"""
if 'b' in getattr(outfp, 'mode', ''):
if "b" in getattr(outfp, "mode", ""):
return True
elif hasattr(outfp, 'mode'):
elif hasattr(outfp, "mode"):
# output stream has a mode, but it does not contain 'b'
return False
elif isinstance(outfp, io.BytesIO):
@ -236,19 +287,18 @@ class TextConverter(PDFConverter[AnyIO]):
self,
rsrcmgr: PDFResourceManager,
outfp: AnyIO,
codec: str = 'utf-8',
codec: str = "utf-8",
pageno: int = 1,
laparams: Optional[LAParams] = None,
showpageno: bool = False,
imagewriter: Optional[ImageWriter] = None
imagewriter: Optional[ImageWriter] = None,
) -> None:
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.imagewriter = imagewriter
def write_text(self, text: str) -> None:
text = utils.compatible_encode_method(text, self.codec, 'ignore')
text = utils.compatible_encode_method(text, self.codec, "ignore")
if self.outfp_binary:
cast(BinaryIO, self.outfp).write(text.encode())
else:
@ -262,14 +312,15 @@ class TextConverter(PDFConverter[AnyIO]):
elif isinstance(item, LTText):
self.write_text(item.get_text())
if isinstance(item, LTTextBox):
self.write_text('\n')
self.write_text("\n")
elif isinstance(item, LTImage):
if self.imagewriter is not None:
self.imagewriter.export_image(item)
if self.showpageno:
self.write_text('Page %s\n' % ltpage.pageid)
self.write_text("Page %s\n" % ltpage.pageid)
render(ltpage)
self.write_text('\f')
self.write_text("\f")
# Some dummy functions to save memory/CPU when all that is wanted
# is text. This stops all the image and drawing output from being
@ -286,54 +337,55 @@ class TextConverter(PDFConverter[AnyIO]):
stroke: bool,
fill: bool,
evenodd: bool,
path: Sequence[PathSegment]
path: Sequence[PathSegment],
) -> None:
return
class HTMLConverter(PDFConverter[AnyIO]):
RECT_COLORS = {
'figure': 'yellow',
'textline': 'magenta',
'textbox': 'cyan',
'textgroup': 'red',
'curve': 'black',
'page': 'gray',
"figure": "yellow",
"textline": "magenta",
"textbox": "cyan",
"textgroup": "red",
"curve": "black",
"page": "gray",
}
TEXT_COLORS = {
'textbox': 'blue',
'char': 'black',
"textbox": "blue",
"char": "black",
}
def __init__(
self,
rsrcmgr: PDFResourceManager,
outfp: AnyIO,
codec: str = 'utf-8',
codec: str = "utf-8",
pageno: int = 1,
laparams: Optional[LAParams] = None,
scale: float = 1,
fontscale: float = 1.0,
layoutmode: str = 'normal',
layoutmode: str = "normal",
showpageno: bool = True,
pagemargin: int = 50,
imagewriter: Optional[ImageWriter] = None,
debug: int = 0,
rect_colors: Optional[Dict[str, str]] = None,
text_colors: Optional[Dict[str, str]] = None
text_colors: Optional[Dict[str, str]] = None,
) -> None:
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
PDFConverter.__init__(
self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
)
# write() assumes a codec for binary I/O, or no codec for text I/O.
if self.outfp_binary == (not self.codec):
raise ValueError("Codec is required for a binary I/O output")
if text_colors is None:
text_colors = {'char': 'black'}
text_colors = {"char": "black"}
if rect_colors is None:
rect_colors = {'curve': 'black', 'page': 'gray'}
rect_colors = {"curve": "black", "page": "gray"}
self.scale = scale
self.fontscale = fontscale
@ -360,23 +412,27 @@ class HTMLConverter(PDFConverter[AnyIO]):
return
def write_header(self) -> None:
self.write('<html><head>\n')
self.write("<html><head>\n")
if self.codec:
s = '<meta http-equiv="Content-Type" content="text/html; ' \
s = (
'<meta http-equiv="Content-Type" content="text/html; '
'charset=%s">\n' % self.codec
)
else:
s = '<meta http-equiv="Content-Type" content="text/html">\n'
self.write(s)
self.write('</head><body>\n')
self.write("</head><body>\n")
return
def write_footer(self) -> None:
page_links = ['<a href="#{}">{}</a>'.format(i, i)
for i in range(1, self.pageno)]
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
', '.join(page_links)
page_links = [
'<a href="#{}">{}</a>'.format(i, i) for i in range(1, self.pageno)
]
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % ", ".join(
page_links
)
self.write(s)
self.write('</body></html>\n')
self.write("</body></html>\n")
return
def write_text(self, text: str) -> None:
@ -384,71 +440,67 @@ class HTMLConverter(PDFConverter[AnyIO]):
return
def place_rect(
self,
color: str,
borderwidth: int,
x: float,
y: float,
w: float,
h: float
self, color: str, borderwidth: int, x: float, y: float, w: float, h: float
) -> None:
color2 = self.rect_colors.get(color)
if color2 is not None:
s = '<span style="position:absolute; border: %s %dpx solid; ' \
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % \
(color2, borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
self.write(
s)
s = (
'<span style="position:absolute; border: %s %dpx solid; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n'
% (
color2,
borderwidth,
x * self.scale,
(self._yoffset - y) * self.scale,
w * self.scale,
h * self.scale,
)
)
self.write(s)
return
def place_border(
self,
color: str,
borderwidth: int,
item: LTComponent
) -> None:
self.place_rect(color, borderwidth, item.x0, item.y1, item.width,
item.height)
def place_border(self, color: str, borderwidth: int, item: LTComponent) -> None:
self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
return
def place_image(
self,
item: LTImage,
borderwidth: int,
x: float,
y: float,
w: float,
h: float
self, item: LTImage, borderwidth: int, x: float, y: float, w: float, h: float
) -> None:
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
s = '<img src="%s" border="%d" style="position:absolute; ' \
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
(enc(name), borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
s = (
'<img src="%s" border="%d" style="position:absolute; '
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n'
% (
enc(name),
borderwidth,
x * self.scale,
(self._yoffset - y) * self.scale,
w * self.scale,
h * self.scale,
)
)
self.write(s)
return
def place_text(
self,
color: str,
text: str,
x: float,
y: float,
size: float
self, color: str, text: str, x: float, y: float, size: float
) -> None:
color2 = self.text_colors.get(color)
if color2 is not None:
s = '<span style="position:absolute; color:%s; left:%dpx; ' \
'top:%dpx; font-size:%dpx;">' % \
(color2, x * self.scale, (self._yoffset - y) * self.scale,
size * self.scale * self.fontscale)
s = (
'<span style="position:absolute; color:%s; left:%dpx; '
'top:%dpx; font-size:%dpx;">'
% (
color2,
x * self.scale,
(self._yoffset - y) * self.scale,
size * self.scale * self.fontscale,
)
)
self.write(s)
self.write_text(text)
self.write('</span>\n')
self.write("</span>\n")
return
def begin_div(
@ -459,47 +511,57 @@ class HTMLConverter(PDFConverter[AnyIO]):
y: float,
w: float,
h: float,
writing_mode: str = 'False'
writing_mode: str = "False",
) -> None:
self._fontstack.append(self._font)
self._font = None
s = '<div style="position:absolute; border: %s %dpx solid; ' \
'writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; ' \
'height:%dpx;">' % \
(color, borderwidth, writing_mode, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale, h * self.scale)
s = (
'<div style="position:absolute; border: %s %dpx solid; '
"writing-mode:%s; left:%dpx; top:%dpx; width:%dpx; "
'height:%dpx;">'
% (
color,
borderwidth,
writing_mode,
x * self.scale,
(self._yoffset - y) * self.scale,
w * self.scale,
h * self.scale,
)
)
self.write(s)
return
def end_div(self, color: str) -> None:
if self._font is not None:
self.write('</span>')
self.write("</span>")
self._font = self._fontstack.pop()
self.write('</div>')
self.write("</div>")
return
def put_text(self, text: str, fontname: str, fontsize: float) -> None:
font = (fontname, fontsize)
if font != self._font:
if self._font is not None:
self.write('</span>')
self.write("</span>")
# Remove subset tag from fontname, see PDF Reference 5.5.3
fontname_without_subset_tag = fontname.split('+')[-1]
self.write('<span style="font-family: %s; font-size:%dpx">' %
(fontname_without_subset_tag,
fontsize * self.scale * self.fontscale))
fontname_without_subset_tag = fontname.split("+")[-1]
self.write(
'<span style="font-family: %s; font-size:%dpx">'
% (fontname_without_subset_tag, fontsize * self.scale * self.fontscale)
)
self._font = font
self.write_text(text)
return
def put_newline(self) -> None:
self.write('<br>')
self.write("<br>")
return
def receive_layout(self, ltpage: LTPage) -> None:
def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
if isinstance(item, LTTextGroup):
self.place_border('textgroup', 1, item)
self.place_border("textgroup", 1, item)
for child in item:
show_group(child)
return
@ -508,63 +570,74 @@ class HTMLConverter(PDFConverter[AnyIO]):
child: LTItem
if isinstance(item, LTPage):
self._yoffset += item.y1
self.place_border('page', 1, item)
self.place_border("page", 1, item)
if self.showpageno:
self.write('<div style="position:absolute; top:%dpx;">' %
((self._yoffset-item.y1)*self.scale))
self.write('<a name="{}">Page {}</a></div>\n'
.format(item.pageid, item.pageid))
self.write(
'<div style="position:absolute; top:%dpx;">'
% ((self._yoffset - item.y1) * self.scale)
)
self.write(
'<a name="{}">Page {}</a></div>\n'.format(
item.pageid, item.pageid
)
)
for child in item:
render(child)
if item.groups is not None:
for group in item.groups:
show_group(group)
elif isinstance(item, LTCurve):
self.place_border('curve', 1, item)
self.place_border("curve", 1, item)
elif isinstance(item, LTFigure):
self.begin_div('figure', 1, item.x0, item.y1, item.width,
item.height)
self.begin_div("figure", 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
self.end_div('figure')
self.end_div("figure")
elif isinstance(item, LTImage):
self.place_image(item, 1, item.x0, item.y1, item.width,
item.height)
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
else:
if self.layoutmode == 'exact':
if self.layoutmode == "exact":
if isinstance(item, LTTextLine):
self.place_border('textline', 1, item)
self.place_border("textline", 1, item)
for child in item:
render(child)
elif isinstance(item, LTTextBox):
self.place_border('textbox', 1, item)
self.place_text('textbox', str(item.index+1), item.x0,
item.y1, 20)
self.place_border("textbox", 1, item)
self.place_text(
"textbox", str(item.index + 1), item.x0, item.y1, 20
)
for child in item:
render(child)
elif isinstance(item, LTChar):
self.place_border('char', 1, item)
self.place_text('char', item.get_text(), item.x0,
item.y1, item.size)
self.place_border("char", 1, item)
self.place_text(
"char", item.get_text(), item.x0, item.y1, item.size
)
else:
if isinstance(item, LTTextLine):
for child in item:
render(child)
if self.layoutmode != 'loose':
if self.layoutmode != "loose":
self.put_newline()
elif isinstance(item, LTTextBox):
self.begin_div('textbox', 1, item.x0, item.y1,
item.width, item.height,
item.get_writing_mode())
self.begin_div(
"textbox",
1,
item.x0,
item.y1,
item.width,
item.height,
item.get_writing_mode(),
)
for child in item:
render(child)
self.end_div('textbox')
self.end_div("textbox")
elif isinstance(item, LTChar):
self.put_text(item.get_text(), item.fontname,
item.size)
self.put_text(item.get_text(), item.fontname, item.size)
elif isinstance(item, LTText):
self.write_text(item.get_text())
return
render(ltpage)
self._yoffset += self.pagemargin
return
@ -576,20 +649,21 @@ class HTMLConverter(PDFConverter[AnyIO]):
class XMLConverter(PDFConverter[AnyIO]):
CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
CONTROL = re.compile("[\x00-\x08\x0b-\x0c\x0e-\x1f]")
def __init__(
self,
rsrcmgr: PDFResourceManager,
outfp: AnyIO,
codec: str = 'utf-8',
codec: str = "utf-8",
pageno: int = 1,
laparams: Optional[LAParams] = None,
imagewriter: Optional[ImageWriter] = None,
stripcontrol: bool = False
stripcontrol: bool = False,
) -> None:
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
laparams=laparams)
PDFConverter.__init__(
self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
)
# write() assumes a codec for binary I/O, or no codec for text I/O.
if self.outfp_binary == (not self.codec):
@ -612,100 +686,125 @@ class XMLConverter(PDFConverter[AnyIO]):
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
else:
self.write('<?xml version="1.0" ?>\n')
self.write('<pages>\n')
self.write("<pages>\n")
return
def write_footer(self) -> None:
self.write('</pages>\n')
self.write("</pages>\n")
return
def write_text(self, text: str) -> None:
if self.stripcontrol:
text = self.CONTROL.sub('', text)
text = self.CONTROL.sub("", text)
self.write(enc(text))
return
def receive_layout(self, ltpage: LTPage) -> None:
def show_group(item: LTItem) -> None:
if isinstance(item, LTTextBox):
self.write('<textbox id="%d" bbox="%s" />\n' %
(item.index, bbox2str(item.bbox)))
self.write(
'<textbox id="%d" bbox="%s" />\n'
% (item.index, bbox2str(item.bbox))
)
elif isinstance(item, LTTextGroup):
self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
show_group(child)
self.write('</textgroup>\n')
self.write("</textgroup>\n")
return
def render(item: LTItem) -> None:
child: LTItem
if isinstance(item, LTPage):
s = '<page id="%s" bbox="%s" rotate="%d">\n' % \
(item.pageid, bbox2str(item.bbox), item.rotate)
s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
item.pageid,
bbox2str(item.bbox),
item.rotate,
)
self.write(s)
for child in item:
render(child)
if item.groups is not None:
self.write('<layout>\n')
self.write("<layout>\n")
for group in item.groups:
show_group(group)
self.write('</layout>\n')
self.write('</page>\n')
self.write("</layout>\n")
self.write("</page>\n")
elif isinstance(item, LTLine):
s = '<line linewidth="%d" bbox="%s" />\n' % \
(item.linewidth, bbox2str(item.bbox))
s = '<line linewidth="%d" bbox="%s" />\n' % (
item.linewidth,
bbox2str(item.bbox),
)
self.write(s)
elif isinstance(item, LTRect):
s = '<rect linewidth="%d" bbox="%s" />\n' % \
(item.linewidth, bbox2str(item.bbox))
s = '<rect linewidth="%d" bbox="%s" />\n' % (
item.linewidth,
bbox2str(item.bbox),
)
self.write(s)
elif isinstance(item, LTCurve):
s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % \
(item.linewidth, bbox2str(item.bbox), item.get_pts())
s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
item.linewidth,
bbox2str(item.bbox),
item.get_pts(),
)
self.write(s)
elif isinstance(item, LTFigure):
s = '<figure name="%s" bbox="%s">\n' % \
(item.name, bbox2str(item.bbox))
s = '<figure name="%s" bbox="%s">\n' % (item.name, bbox2str(item.bbox))
self.write(s)
for child in item:
render(child)
self.write('</figure>\n')
self.write("</figure>\n")
elif isinstance(item, LTTextLine):
self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
render(child)
self.write('</textline>\n')
self.write("</textline>\n")
elif isinstance(item, LTTextBox):
wmode = ''
wmode = ""
if isinstance(item, LTTextBoxVertical):
wmode = ' wmode="vertical"'
s = '<textbox id="%d" bbox="%s"%s>\n' %\
(item.index, bbox2str(item.bbox), wmode)
s = '<textbox id="%d" bbox="%s"%s>\n' % (
item.index,
bbox2str(item.bbox),
wmode,
)
self.write(s)
for child in item:
render(child)
self.write('</textbox>\n')
self.write("</textbox>\n")
elif isinstance(item, LTChar):
s = '<text font="%s" bbox="%s" colourspace="%s" ' \
'ncolour="%s" size="%.3f">' % \
(enc(item.fontname), bbox2str(item.bbox),
item.ncs.name, item.graphicstate.ncolor, item.size)
s = (
'<text font="%s" bbox="%s" colourspace="%s" '
'ncolour="%s" size="%.3f">'
% (
enc(item.fontname),
bbox2str(item.bbox),
item.ncs.name,
item.graphicstate.ncolor,
item.size,
)
)
self.write(s)
self.write_text(item.get_text())
self.write('</text>\n')
self.write("</text>\n")
elif isinstance(item, LTText):
self.write('<text>%s</text>\n' % item.get_text())
self.write("<text>%s</text>\n" % item.get_text())
elif isinstance(item, LTImage):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name), item.width, item.height))
self.write(
'<image src="%s" width="%d" height="%d" />\n'
% (enc(name), item.width, item.height)
)
else:
self.write('<image width="%d" height="%d" />\n' %
(item.width, item.height))
self.write(
'<image width="%d" height="%d" />\n' % (item.width, item.height)
)
else:
assert False, str(('Unhandled', item))
assert False, str(("Unhandled", item))
return
render(ltpage)
return

View File

@ -11,18 +11,19 @@ class NumberTree:
See Section 3.8.6 of the PDF Reference.
"""
def __init__(self, obj: Any):
self._obj = dict_value(obj)
self.nums: Optional[Iterable[Any]] = None
self.kids: Optional[Iterable[Any]] = None
self.limits: Optional[Iterable[Any]] = None
if 'Nums' in self._obj:
self.nums = list_value(self._obj['Nums'])
if 'Kids' in self._obj:
self.kids = list_value(self._obj['Kids'])
if 'Limits' in self._obj:
self.limits = list_value(self._obj['Limits'])
if "Nums" in self._obj:
self.nums = list_value(self._obj["Nums"])
if "Kids" in self._obj:
self.kids = list_value(self._obj["Kids"])
if "Limits" in self._obj:
self.limits = list_value(self._obj["Limits"])
def _parse(self) -> List[Tuple[int, Any]]:
items = []
@ -44,7 +45,7 @@ class NumberTree:
if settings.STRICT:
if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
raise PDFSyntaxError('Number tree elements are out of order')
raise PDFSyntaxError("Number tree elements are out of order")
else:
values.sort(key=lambda t: t[0])

View File

@ -6,7 +6,7 @@ from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING
from .psparser import PSLiteral
HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
log = logging.getLogger(__name__)
@ -25,39 +25,41 @@ def name2unicode(name: str) -> str:
:returns unicode character if name resembles something,
otherwise a KeyError
"""
name = name.split('.')[0]
components = name.split('_')
name = name.split(".")[0]
components = name.split("_")
if len(components) > 1:
return ''.join(map(name2unicode, components))
return "".join(map(name2unicode, components))
else:
if name in glyphname2unicode:
return glyphname2unicode[name]
elif name.startswith('uni'):
name_without_uni = name.strip('uni')
elif name.startswith("uni"):
name_without_uni = name.strip("uni")
if HEXADECIMAL.match(name_without_uni) and \
len(name_without_uni) % 4 == 0:
unicode_digits = [int(name_without_uni[i:i + 4], base=16)
for i in range(0, len(name_without_uni), 4)]
if HEXADECIMAL.match(name_without_uni) and len(name_without_uni) % 4 == 0:
unicode_digits = [
int(name_without_uni[i : i + 4], base=16)
for i in range(0, len(name_without_uni), 4)
]
for digit in unicode_digits:
raise_key_error_for_invalid_unicode(digit)
characters = map(chr, unicode_digits)
return ''.join(characters)
return "".join(characters)
elif name.startswith('u'):
name_without_u = name.strip('u')
elif name.startswith("u"):
name_without_u = name.strip("u")
if HEXADECIMAL.match(name_without_u) and \
4 <= len(name_without_u) <= 6:
if HEXADECIMAL.match(name_without_u) and 4 <= len(name_without_u) <= 6:
unicode_digit = int(name_without_u, base=16)
raise_key_error_for_invalid_unicode(unicode_digit)
return chr(unicode_digit)
raise KeyError('Could not convert unicode name "%s" to character because '
'it does not match specification' % name)
raise KeyError(
'Could not convert unicode name "%s" to character because '
"it does not match specification" % name
)
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
@ -67,8 +69,10 @@ def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
:raises KeyError if unicode digit is invalid
"""
if 55295 < unicode_digit < 57344:
raise KeyError('Unicode digit %d is invalid because '
'it is in the range D800 through DFFF' % unicode_digit)
raise KeyError(
"Unicode digit %d is invalid because "
"it is in the range D800 through DFFF" % unicode_digit
)
class EncodingDB:
@ -89,17 +93,15 @@ class EncodingDB:
pdf2unicode[pdf] = c
encodings = {
'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode,
"StandardEncoding": std2unicode,
"MacRomanEncoding": mac2unicode,
"WinAnsiEncoding": win2unicode,
"PDFDocEncoding": pdf2unicode,
}
@classmethod
def get_encoding(
cls,
name: str,
diff: Optional[Iterable[object]] = None
cls, name: str, diff: Optional[Iterable[object]] = None
) -> Dict[int, str]:
cid2unicode = cls.encodings.get(name, cls.std2unicode)
if diff:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,8 +5,7 @@ import sys
from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
from .converter import XMLConverter, HTMLConverter, TextConverter, \
PDFPageAggregator
from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
from .image import ImageWriter
from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor
@ -18,20 +17,20 @@ from .utils import open_filename, FileOrName, AnyIO
def extract_text_to_fp(
inf: BinaryIO,
outfp: AnyIO,
output_type: str = 'text',
codec: str = 'utf-8',
output_type: str = "text",
codec: str = "utf-8",
laparams: Optional[LAParams] = None,
maxpages: int = 0,
page_numbers: Optional[Container[int]] = None,
password: str = "",
scale: float = 1.0,
rotation: int = 0,
layoutmode: str = 'normal',
layoutmode: str = "normal",
output_dir: Optional[str] = None,
strip_control: bool = False,
debug: bool = False,
disable_caching: bool = False,
**kwargs: Any
**kwargs: Any,
) -> None:
"""Parses text from inf-file and writes to outfp file-like object.
@ -72,39 +71,52 @@ def extract_text_to_fp(
rsrcmgr = PDFResourceManager(caching=not disable_caching)
device: Optional[PDFDevice] = None
if output_type != 'text' and outfp == sys.stdout:
if output_type != "text" and outfp == sys.stdout:
outfp = sys.stdout.buffer
if output_type == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
if output_type == "text":
device = TextConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter
)
elif output_type == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
elif output_type == "xml":
device = XMLConverter(
rsrcmgr,
outfp,
codec=codec,
laparams=laparams,
imagewriter=imagewriter,
stripcontrol=strip_control)
stripcontrol=strip_control,
)
elif output_type == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
elif output_type == "html":
device = HTMLConverter(
rsrcmgr,
outfp,
codec=codec,
scale=scale,
layoutmode=layoutmode,
laparams=laparams,
imagewriter=imagewriter,
)
elif output_type == 'tag':
elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
else:
msg = f"Output type can be text, html, xml or tag but is " \
f"{output_type}"
msg = f"Output type can be text, html, xml or tag but is " f"{output_type}"
raise ValueError(msg)
assert device is not None
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(inf,
for page in PDFPage.get_pages(
inf,
page_numbers,
maxpages=maxpages,
password=password,
caching=not disable_caching):
caching=not disable_caching,
):
page.rotate = (page.rotate + rotation) % 360
interpreter.process_page(page)
@ -113,12 +125,12 @@ def extract_text_to_fp(
def extract_text(
pdf_file: FileOrName,
password: str = '',
password: str = "",
page_numbers: Optional[Container[int]] = None,
maxpages: int = 0,
caching: bool = True,
codec: str = 'utf-8',
laparams: Optional[LAParams] = None
codec: str = "utf-8",
laparams: Optional[LAParams] = None,
) -> str:
"""Parse and return the text contained in a PDF file.
@ -139,8 +151,7 @@ def extract_text(
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
fp = cast(BinaryIO, fp) # we opened in binary mode
rsrcmgr = PDFResourceManager(caching=caching)
device = TextConverter(rsrcmgr, output_string, codec=codec,
laparams=laparams)
device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(
@ -157,11 +168,11 @@ def extract_text(
def extract_pages(
pdf_file: FileOrName,
password: str = '',
password: str = "",
page_numbers: Optional[Container[int]] = None,
maxpages: int = 0,
caching: bool = True,
laparams: Optional[LAParams] = None
laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
"""Extract and yield LTPage objects
@ -183,8 +194,9 @@ def extract_pages(
resource_manager = PDFResourceManager(caching=caching)
device = PDFPageAggregator(resource_manager, laparams=laparams)
interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
password=password, caching=caching):
for page in PDFPage.get_pages(
fp, page_numbers, maxpages=maxpages, password=password, caching=caching
):
interpreter.process_page(page)
layout = device.get_result()
yield layout

View File

@ -9,8 +9,7 @@ from .layout import LTImage
from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, \
LITERALS_JPX_DECODE
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE, LITERALS_JPX_DECODE
def align32(x: int) -> int:
@ -18,13 +17,7 @@ def align32(x: int) -> int:
class BMPWriter:
def __init__(
self,
fp: BinaryIO,
bits: int,
width: int,
height: int
) -> None:
def __init__(self, fp: BinaryIO, bits: int, width: int, height: int) -> None:
self.fp = fp
self.bits = bits
self.width = width
@ -40,22 +33,35 @@ class BMPWriter:
self.linesize = align32((self.width * self.bits + 7) // 8)
self.datasize = self.linesize * self.height
headersize = 14 + 40 + ncols * 4
info = struct.pack('<IiiHHIIIIII', 40, self.width, self.height,
1, self.bits, 0, self.datasize, 0, 0, ncols, 0)
info = struct.pack(
"<IiiHHIIIIII",
40,
self.width,
self.height,
1,
self.bits,
0,
self.datasize,
0,
0,
ncols,
0,
)
assert len(info) == 40, str(len(info))
header = struct.pack('<ccIHHI', b'B', b'M',
headersize+self.datasize, 0, 0, headersize)
header = struct.pack(
"<ccIHHI", b"B", b"M", headersize + self.datasize, 0, 0, headersize
)
assert len(header) == 14, str(len(header))
self.fp.write(header)
self.fp.write(info)
if ncols == 2:
# B&W color table
for i in (0, 255):
self.fp.write(struct.pack('BBBx', i, i, i))
self.fp.write(struct.pack("BBBx", i, i, i))
elif ncols == 256:
# grayscale color table
for i in range(256):
self.fp.write(struct.pack('BBBx', i, i, i))
self.fp.write(struct.pack("BBBx", i, i, i))
self.pos0 = self.fp.tell()
self.pos1 = self.pos0 + self.datasize
@ -80,43 +86,46 @@ class ImageWriter:
is_jbig2 = self.is_jbig2_image(image)
ext = self._get_image_extension(image, width, height, is_jbig2)
name, path = self._create_unique_image_name(self.outdir,
image.name, ext)
name, path = self._create_unique_image_name(self.outdir, image.name, ext)
fp = open(path, 'wb')
if ext == '.jpg':
fp = open(path, "wb")
if ext == ".jpg":
raw_data = image.stream.get_rawdata()
assert raw_data is not None
if LITERAL_DEVICE_CMYK in image.colorspace:
from PIL import Image # type: ignore[import]
from PIL import ImageChops
ifp = BytesIO(raw_data)
i = Image.open(ifp)
i = ImageChops.invert(i)
i = i.convert('RGB')
i.save(fp, 'JPEG')
i = i.convert("RGB")
i.save(fp, "JPEG")
else:
fp.write(raw_data)
elif ext == '.jp2':
elif ext == ".jp2":
# if we just write the raw data, most image programs
# that I have tried cannot open the file. However,
# open and saving with PIL produces a file that
# seems to be easily opened by other programs
from PIL import Image
raw_data = image.stream.get_rawdata()
assert raw_data is not None
ifp = BytesIO(raw_data)
i = Image.open(ifp)
i.save(fp, 'JPEG2000')
i.save(fp, "JPEG2000")
elif is_jbig2:
input_stream = BytesIO()
global_streams = self.jbig2_global(image)
if len(global_streams) > 1:
msg = 'There should never be more than one JBIG2Globals ' \
'associated with a JBIG2 embedded image'
msg = (
"There should never be more than one JBIG2Globals "
"associated with a JBIG2 embedded image"
)
raise ValueError(msg)
if len(global_streams) == 1:
input_stream.write(global_streams[0].get_data().rstrip(b'\n'))
input_stream.write(global_streams[0].get_data().rstrip(b"\n"))
input_stream.write(image.stream.get_data())
input_stream.seek(0)
reader = JBIG2StreamReader(input_stream)
@ -168,43 +177,42 @@ class ImageWriter:
filters = image.stream.get_filters()
for filter_name, params in filters:
if filter_name in LITERALS_JBIG2_DECODE:
global_streams.append(params['JBIG2Globals'].resolve())
global_streams.append(params["JBIG2Globals"].resolve())
return global_streams
@staticmethod
def _get_image_extension(
image: LTImage,
width: int,
height: int,
is_jbig2: bool
image: LTImage, width: int, height: int, is_jbig2: bool
) -> str:
filters = image.stream.get_filters()
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
ext = '.jpg'
ext = ".jpg"
elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
ext = '.jp2'
ext = ".jp2"
elif is_jbig2:
ext = '.jb2'
elif (image.bits == 1 or
image.bits == 8 and
(LITERAL_DEVICE_RGB in image.colorspace or
LITERAL_DEVICE_GRAY in image.colorspace)):
ext = '.%dx%d.bmp' % (width, height)
ext = ".jb2"
elif (
image.bits == 1
or image.bits == 8
and (
LITERAL_DEVICE_RGB in image.colorspace
or LITERAL_DEVICE_GRAY in image.colorspace
)
):
ext = ".%dx%d.bmp" % (width, height)
else:
ext = '.%d.%dx%d.img' % (image.bits, width, height)
ext = ".%d.%dx%d.img" % (image.bits, width, height)
return ext
@staticmethod
def _create_unique_image_name(
dirname: str,
image_name: str,
ext: str
dirname: str, image_name: str, ext: str
) -> Tuple[str, str]:
name = image_name + ext
path = os.path.join(dirname, name)
img_index = 0
while os.path.exists(path):
name = '%s.%d%s' % (image_name, img_index, ext)
name = "%s.%d%s" % (image_name, img_index, ext)
path = os.path.join(dirname, name)
img_index += 1
return name, path

View File

@ -19,10 +19,10 @@ HEADER_FLAG_PAGE_ASSOC_LONG = 0b01000000
SEG_TYPE_MASK = 0b00111111
REF_COUNT_SHORT_MASK = 0b11100000
REF_COUNT_LONG_MASK = 0x1fffffff
REF_COUNT_LONG_MASK = 0x1FFFFFFF
REF_COUNT_LONG = 7
DATA_LEN_UNKNOWN = 0xffffffff
DATA_LEN_UNKNOWN = 0xFFFFFFFF
# segment types
SEG_TYPE_IMMEDIATE_GEN_REGION = 38
@ -30,7 +30,7 @@ SEG_TYPE_END_OF_PAGE = 49
SEG_TYPE_END_OF_FILE = 51
# file literals
FILE_HEADER_ID = b'\x97\x4A\x42\x32\x0D\x0A\x1A\x0A'
FILE_HEADER_ID = b"\x97\x4A\x42\x32\x0D\x0A\x1A\x0A"
FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
@ -66,12 +66,14 @@ def unpack_int(format: str, buffer: bytes) -> int:
JBIG2SegmentFlags = Dict[str, Union[int, bool]]
JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
JBIG2Segment = Dict[str, Union[bool, int, bytes, JBIG2SegmentFlags,
JBIG2RetentionFlags]]
JBIG2Segment = Dict[
str, Union[bool, int, bytes, JBIG2SegmentFlags, JBIG2RetentionFlags]
]
class JBIG2StreamReader:
"""Read segments from a JBIG2 byte stream"""
def __init__(self, stream: BinaryIO) -> None:
self.stream = stream
@ -96,29 +98,23 @@ class JBIG2StreamReader:
return segments
def is_eof(self) -> bool:
if self.stream.read(1) == b'':
if self.stream.read(1) == b"":
return True
else:
self.stream.seek(-1, os.SEEK_CUR)
return False
def parse_flags(
self,
segment: JBIG2Segment,
flags: int,
field: bytes
self, segment: JBIG2Segment, flags: int, field: bytes
) -> JBIG2SegmentFlags:
return {
"deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
"page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
"type": masked_value(SEG_TYPE_MASK, flags)
"type": masked_value(SEG_TYPE_MASK, flags),
}
def parse_retention_flags(
self,
segment: JBIG2Segment,
flags: int,
field: bytes
self, segment: JBIG2Segment, flags: int, field: bytes
) -> JBIG2RetentionFlags:
ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
retain_segments = []
@ -159,31 +155,23 @@ class JBIG2StreamReader:
"ref_segments": ref_segments,
}
def parse_page_assoc(
self,
segment: JBIG2Segment,
page: int,
field: bytes
) -> int:
def parse_page_assoc(self, segment: JBIG2Segment, page: int, field: bytes) -> int:
if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
field += self.stream.read(3)
page = unpack_int(">L", field)
return page
def parse_data_length(
self,
segment: JBIG2Segment,
length: int,
field: bytes
self, segment: JBIG2Segment, length: int, field: bytes
) -> int:
if length:
if (cast(JBIG2SegmentFlags, segment["flags"])["type"] ==
SEG_TYPE_IMMEDIATE_GEN_REGION) \
and (length == DATA_LEN_UNKNOWN):
if (
cast(JBIG2SegmentFlags, segment["flags"])["type"]
== SEG_TYPE_IMMEDIATE_GEN_REGION
) and (length == DATA_LEN_UNKNOWN):
raise NotImplementedError(
"Working with unknown segment length "
"is not implemented yet"
"Working with unknown segment length " "is not implemented yet"
)
else:
segment["raw_data"] = self.stream.read(length)
@ -195,18 +183,16 @@ class JBIG2StreamWriter:
"""Write JBIG2 segments to a file in JBIG2 format"""
EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
'ref_count': 0,
'ref_segments': cast(List[int], []),
'retain_segments': cast(List[bool], [])
"ref_count": 0,
"ref_segments": cast(List[int], []),
"retain_segments": cast(List[bool], []),
}
def __init__(self, stream: BinaryIO) -> None:
self.stream = stream
def write_segments(
self,
segments: Iterable[JBIG2Segment],
fix_last_page: bool = True
self, segments: Iterable[JBIG2Segment], fix_last_page: bool = True
) -> int:
data_len = 0
current_page: Optional[int] = None
@ -222,8 +208,10 @@ class JBIG2StreamWriter:
if fix_last_page:
seg_page = cast(int, segment.get("page_assoc"))
if cast(JBIG2SegmentFlags, segment["flags"])["type"] == \
SEG_TYPE_END_OF_PAGE:
if (
cast(JBIG2SegmentFlags, segment["flags"])["type"]
== SEG_TYPE_END_OF_PAGE
):
current_page = None
elif seg_page:
current_page = seg_page
@ -237,9 +225,7 @@ class JBIG2StreamWriter:
return data_len
def write_file(
self,
segments: Iterable[JBIG2Segment],
fix_last_page: bool = True
self, segments: Iterable[JBIG2Segment], fix_last_page: bool = True
) -> int:
header = FILE_HEADER_ID
header_flags = FILE_HEAD_FLAG_SEQUENTIAL
@ -270,7 +256,7 @@ class JBIG2StreamWriter:
return data_len
def encode_segment(self, segment: JBIG2Segment) -> bytes:
data = b''
data = b""
for field_format, name in SEG_STRUCT:
value = segment.get(name)
encoder = getattr(self, "encode_%s" % name, None)
@ -281,27 +267,26 @@ class JBIG2StreamWriter:
data += field
return data
def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment
) -> bytes:
def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment) -> bytes:
flags = 0
if value.get("deferred"):
flags |= HEADER_FLAG_DEFERRED
if "page_assoc_long" in value:
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \
if value["page_assoc_long"] else flags
flags |= HEADER_FLAG_PAGE_ASSOC_LONG if value["page_assoc_long"] else flags
else:
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \
if cast(int, segment.get("page", 0)) > 255 else flags
flags |= (
HEADER_FLAG_PAGE_ASSOC_LONG
if cast(int, segment.get("page", 0)) > 255
else flags
)
flags |= mask_value(SEG_TYPE_MASK, value["type"])
return pack(">B", flags)
def encode_retention_flags(
self,
value: JBIG2RetentionFlags,
segment: JBIG2Segment
self, value: JBIG2RetentionFlags, segment: JBIG2Segment
) -> bytes:
flags = []
flags_format = ">B"
@ -318,10 +303,7 @@ class JBIG2StreamWriter:
else:
bytes_count = math.ceil((ref_count + 1) / 8)
flags_format = ">L" + ("B" * bytes_count)
flags_dword = mask_value(
REF_COUNT_SHORT_MASK,
REF_COUNT_LONG
) << 24
flags_dword = mask_value(REF_COUNT_SHORT_MASK, REF_COUNT_LONG) << 24
flags.append(flags_dword)
for byte_index in range(bytes_count):
@ -353,26 +335,22 @@ class JBIG2StreamWriter:
data += cast(bytes, segment["raw_data"])
return data
def get_eop_segment(
self,
seg_number: int,
page_number: int
) -> JBIG2Segment:
def get_eop_segment(self, seg_number: int, page_number: int) -> JBIG2Segment:
return {
'data_length': 0,
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
'number': seg_number,
'page_assoc': page_number,
'raw_data': b'',
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS
"data_length": 0,
"flags": {"deferred": False, "type": SEG_TYPE_END_OF_PAGE},
"number": seg_number,
"page_assoc": page_number,
"raw_data": b"",
"retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
}
def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
return {
'data_length': 0,
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
'number': seg_number,
'page_assoc': 0,
'raw_data': b'',
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS
"data_length": 0,
"flags": {"deferred": False, "type": SEG_TYPE_END_OF_FILE},
"number": seg_number,
"page_assoc": 0,
"raw_data": b"",
"retention_flags": JBIG2StreamWriter.EMPTY_RETENTION_FLAGS,
}

View File

@ -7,241 +7,240 @@ This table is extracted from PDF Reference Manual 1.6, pp.925
from typing import List, Optional, Tuple
EncodingRow = \
Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
EncodingRow = Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
ENCODING: List[EncodingRow] = [
# (name, std, mac, win, pdf)
('A', 65, 65, 65, 65),
('AE', 225, 174, 198, 198),
('Aacute', None, 231, 193, 193),
('Acircumflex', None, 229, 194, 194),
('Adieresis', None, 128, 196, 196),
('Agrave', None, 203, 192, 192),
('Aring', None, 129, 197, 197),
('Atilde', None, 204, 195, 195),
('B', 66, 66, 66, 66),
('C', 67, 67, 67, 67),
('Ccedilla', None, 130, 199, 199),
('D', 68, 68, 68, 68),
('E', 69, 69, 69, 69),
('Eacute', None, 131, 201, 201),
('Ecircumflex', None, 230, 202, 202),
('Edieresis', None, 232, 203, 203),
('Egrave', None, 233, 200, 200),
('Eth', None, None, 208, 208),
('Euro', None, None, 128, 160),
('F', 70, 70, 70, 70),
('G', 71, 71, 71, 71),
('H', 72, 72, 72, 72),
('I', 73, 73, 73, 73),
('Iacute', None, 234, 205, 205),
('Icircumflex', None, 235, 206, 206),
('Idieresis', None, 236, 207, 207),
('Igrave', None, 237, 204, 204),
('J', 74, 74, 74, 74),
('K', 75, 75, 75, 75),
('L', 76, 76, 76, 76),
('Lslash', 232, None, None, 149),
('M', 77, 77, 77, 77),
('N', 78, 78, 78, 78),
('Ntilde', None, 132, 209, 209),
('O', 79, 79, 79, 79),
('OE', 234, 206, 140, 150),
('Oacute', None, 238, 211, 211),
('Ocircumflex', None, 239, 212, 212),
('Odieresis', None, 133, 214, 214),
('Ograve', None, 241, 210, 210),
('Oslash', 233, 175, 216, 216),
('Otilde', None, 205, 213, 213),
('P', 80, 80, 80, 80),
('Q', 81, 81, 81, 81),
('R', 82, 82, 82, 82),
('S', 83, 83, 83, 83),
('Scaron', None, None, 138, 151),
('T', 84, 84, 84, 84),
('Thorn', None, None, 222, 222),
('U', 85, 85, 85, 85),
('Uacute', None, 242, 218, 218),
('Ucircumflex', None, 243, 219, 219),
('Udieresis', None, 134, 220, 220),
('Ugrave', None, 244, 217, 217),
('V', 86, 86, 86, 86),
('W', 87, 87, 87, 87),
('X', 88, 88, 88, 88),
('Y', 89, 89, 89, 89),
('Yacute', None, None, 221, 221),
('Ydieresis', None, 217, 159, 152),
('Z', 90, 90, 90, 90),
('Zcaron', None, None, 142, 153),
('a', 97, 97, 97, 97),
('aacute', None, 135, 225, 225),
('acircumflex', None, 137, 226, 226),
('acute', 194, 171, 180, 180),
('adieresis', None, 138, 228, 228),
('ae', 241, 190, 230, 230),
('agrave', None, 136, 224, 224),
('ampersand', 38, 38, 38, 38),
('aring', None, 140, 229, 229),
('asciicircum', 94, 94, 94, 94),
('asciitilde', 126, 126, 126, 126),
('asterisk', 42, 42, 42, 42),
('at', 64, 64, 64, 64),
('atilde', None, 139, 227, 227),
('b', 98, 98, 98, 98),
('backslash', 92, 92, 92, 92),
('bar', 124, 124, 124, 124),
('braceleft', 123, 123, 123, 123),
('braceright', 125, 125, 125, 125),
('bracketleft', 91, 91, 91, 91),
('bracketright', 93, 93, 93, 93),
('breve', 198, 249, None, 24),
('brokenbar', None, None, 166, 166),
('bullet', 183, 165, 149, 128),
('c', 99, 99, 99, 99),
('caron', 207, 255, None, 25),
('ccedilla', None, 141, 231, 231),
('cedilla', 203, 252, 184, 184),
('cent', 162, 162, 162, 162),
('circumflex', 195, 246, 136, 26),
('colon', 58, 58, 58, 58),
('comma', 44, 44, 44, 44),
('copyright', None, 169, 169, 169),
('currency', 168, 219, 164, 164),
('d', 100, 100, 100, 100),
('dagger', 178, 160, 134, 129),
('daggerdbl', 179, 224, 135, 130),
('degree', None, 161, 176, 176),
('dieresis', 200, 172, 168, 168),
('divide', None, 214, 247, 247),
('dollar', 36, 36, 36, 36),
('dotaccent', 199, 250, None, 27),
('dotlessi', 245, 245, None, 154),
('e', 101, 101, 101, 101),
('eacute', None, 142, 233, 233),
('ecircumflex', None, 144, 234, 234),
('edieresis', None, 145, 235, 235),
('egrave', None, 143, 232, 232),
('eight', 56, 56, 56, 56),
('ellipsis', 188, 201, 133, 131),
('emdash', 208, 209, 151, 132),
('endash', 177, 208, 150, 133),
('equal', 61, 61, 61, 61),
('eth', None, None, 240, 240),
('exclam', 33, 33, 33, 33),
('exclamdown', 161, 193, 161, 161),
('f', 102, 102, 102, 102),
('fi', 174, 222, None, 147),
('five', 53, 53, 53, 53),
('fl', 175, 223, None, 148),
('florin', 166, 196, 131, 134),
('four', 52, 52, 52, 52),
('fraction', 164, 218, None, 135),
('g', 103, 103, 103, 103),
('germandbls', 251, 167, 223, 223),
('grave', 193, 96, 96, 96),
('greater', 62, 62, 62, 62),
('guillemotleft', 171, 199, 171, 171),
('guillemotright', 187, 200, 187, 187),
('guilsinglleft', 172, 220, 139, 136),
('guilsinglright', 173, 221, 155, 137),
('h', 104, 104, 104, 104),
('hungarumlaut', 205, 253, None, 28),
('hyphen', 45, 45, 45, 45),
('i', 105, 105, 105, 105),
('iacute', None, 146, 237, 237),
('icircumflex', None, 148, 238, 238),
('idieresis', None, 149, 239, 239),
('igrave', None, 147, 236, 236),
('j', 106, 106, 106, 106),
('k', 107, 107, 107, 107),
('l', 108, 108, 108, 108),
('less', 60, 60, 60, 60),
('logicalnot', None, 194, 172, 172),
('lslash', 248, None, None, 155),
('m', 109, 109, 109, 109),
('macron', 197, 248, 175, 175),
('minus', None, None, None, 138),
('mu', None, 181, 181, 181),
('multiply', None, None, 215, 215),
('n', 110, 110, 110, 110),
('nbspace', None, 202, 160, None),
('nine', 57, 57, 57, 57),
('ntilde', None, 150, 241, 241),
('numbersign', 35, 35, 35, 35),
('o', 111, 111, 111, 111),
('oacute', None, 151, 243, 243),
('ocircumflex', None, 153, 244, 244),
('odieresis', None, 154, 246, 246),
('oe', 250, 207, 156, 156),
('ogonek', 206, 254, None, 29),
('ograve', None, 152, 242, 242),
('one', 49, 49, 49, 49),
('onehalf', None, None, 189, 189),
('onequarter', None, None, 188, 188),
('onesuperior', None, None, 185, 185),
('ordfeminine', 227, 187, 170, 170),
('ordmasculine', 235, 188, 186, 186),
('oslash', 249, 191, 248, 248),
('otilde', None, 155, 245, 245),
('p', 112, 112, 112, 112),
('paragraph', 182, 166, 182, 182),
('parenleft', 40, 40, 40, 40),
('parenright', 41, 41, 41, 41),
('percent', 37, 37, 37, 37),
('period', 46, 46, 46, 46),
('periodcentered', 180, 225, 183, 183),
('perthousand', 189, 228, 137, 139),
('plus', 43, 43, 43, 43),
('plusminus', None, 177, 177, 177),
('q', 113, 113, 113, 113),
('question', 63, 63, 63, 63),
('questiondown', 191, 192, 191, 191),
('quotedbl', 34, 34, 34, 34),
('quotedblbase', 185, 227, 132, 140),
('quotedblleft', 170, 210, 147, 141),
('quotedblright', 186, 211, 148, 142),
('quoteleft', 96, 212, 145, 143),
('quoteright', 39, 213, 146, 144),
('quotesinglbase', 184, 226, 130, 145),
('quotesingle', 169, 39, 39, 39),
('r', 114, 114, 114, 114),
('registered', None, 168, 174, 174),
('ring', 202, 251, None, 30),
('s', 115, 115, 115, 115),
('scaron', None, None, 154, 157),
('section', 167, 164, 167, 167),
('semicolon', 59, 59, 59, 59),
('seven', 55, 55, 55, 55),
('six', 54, 54, 54, 54),
('slash', 47, 47, 47, 47),
('space', 32, 32, 32, 32),
('space', None, 202, 160, None),
('space', None, 202, 173, None),
('sterling', 163, 163, 163, 163),
('t', 116, 116, 116, 116),
('thorn', None, None, 254, 254),
('three', 51, 51, 51, 51),
('threequarters', None, None, 190, 190),
('threesuperior', None, None, 179, 179),
('tilde', 196, 247, 152, 31),
('trademark', None, 170, 153, 146),
('two', 50, 50, 50, 50),
('twosuperior', None, None, 178, 178),
('u', 117, 117, 117, 117),
('uacute', None, 156, 250, 250),
('ucircumflex', None, 158, 251, 251),
('udieresis', None, 159, 252, 252),
('ugrave', None, 157, 249, 249),
('underscore', 95, 95, 95, 95),
('v', 118, 118, 118, 118),
('w', 119, 119, 119, 119),
('x', 120, 120, 120, 120),
('y', 121, 121, 121, 121),
('yacute', None, None, 253, 253),
('ydieresis', None, 216, 255, 255),
('yen', 165, 180, 165, 165),
('z', 122, 122, 122, 122),
('zcaron', None, None, 158, 158),
('zero', 48, 48, 48, 48),
("A", 65, 65, 65, 65),
("AE", 225, 174, 198, 198),
("Aacute", None, 231, 193, 193),
("Acircumflex", None, 229, 194, 194),
("Adieresis", None, 128, 196, 196),
("Agrave", None, 203, 192, 192),
("Aring", None, 129, 197, 197),
("Atilde", None, 204, 195, 195),
("B", 66, 66, 66, 66),
("C", 67, 67, 67, 67),
("Ccedilla", None, 130, 199, 199),
("D", 68, 68, 68, 68),
("E", 69, 69, 69, 69),
("Eacute", None, 131, 201, 201),
("Ecircumflex", None, 230, 202, 202),
("Edieresis", None, 232, 203, 203),
("Egrave", None, 233, 200, 200),
("Eth", None, None, 208, 208),
("Euro", None, None, 128, 160),
("F", 70, 70, 70, 70),
("G", 71, 71, 71, 71),
("H", 72, 72, 72, 72),
("I", 73, 73, 73, 73),
("Iacute", None, 234, 205, 205),
("Icircumflex", None, 235, 206, 206),
("Idieresis", None, 236, 207, 207),
("Igrave", None, 237, 204, 204),
("J", 74, 74, 74, 74),
("K", 75, 75, 75, 75),
("L", 76, 76, 76, 76),
("Lslash", 232, None, None, 149),
("M", 77, 77, 77, 77),
("N", 78, 78, 78, 78),
("Ntilde", None, 132, 209, 209),
("O", 79, 79, 79, 79),
("OE", 234, 206, 140, 150),
("Oacute", None, 238, 211, 211),
("Ocircumflex", None, 239, 212, 212),
("Odieresis", None, 133, 214, 214),
("Ograve", None, 241, 210, 210),
("Oslash", 233, 175, 216, 216),
("Otilde", None, 205, 213, 213),
("P", 80, 80, 80, 80),
("Q", 81, 81, 81, 81),
("R", 82, 82, 82, 82),
("S", 83, 83, 83, 83),
("Scaron", None, None, 138, 151),
("T", 84, 84, 84, 84),
("Thorn", None, None, 222, 222),
("U", 85, 85, 85, 85),
("Uacute", None, 242, 218, 218),
("Ucircumflex", None, 243, 219, 219),
("Udieresis", None, 134, 220, 220),
("Ugrave", None, 244, 217, 217),
("V", 86, 86, 86, 86),
("W", 87, 87, 87, 87),
("X", 88, 88, 88, 88),
("Y", 89, 89, 89, 89),
("Yacute", None, None, 221, 221),
("Ydieresis", None, 217, 159, 152),
("Z", 90, 90, 90, 90),
("Zcaron", None, None, 142, 153),
("a", 97, 97, 97, 97),
("aacute", None, 135, 225, 225),
("acircumflex", None, 137, 226, 226),
("acute", 194, 171, 180, 180),
("adieresis", None, 138, 228, 228),
("ae", 241, 190, 230, 230),
("agrave", None, 136, 224, 224),
("ampersand", 38, 38, 38, 38),
("aring", None, 140, 229, 229),
("asciicircum", 94, 94, 94, 94),
("asciitilde", 126, 126, 126, 126),
("asterisk", 42, 42, 42, 42),
("at", 64, 64, 64, 64),
("atilde", None, 139, 227, 227),
("b", 98, 98, 98, 98),
("backslash", 92, 92, 92, 92),
("bar", 124, 124, 124, 124),
("braceleft", 123, 123, 123, 123),
("braceright", 125, 125, 125, 125),
("bracketleft", 91, 91, 91, 91),
("bracketright", 93, 93, 93, 93),
("breve", 198, 249, None, 24),
("brokenbar", None, None, 166, 166),
("bullet", 183, 165, 149, 128),
("c", 99, 99, 99, 99),
("caron", 207, 255, None, 25),
("ccedilla", None, 141, 231, 231),
("cedilla", 203, 252, 184, 184),
("cent", 162, 162, 162, 162),
("circumflex", 195, 246, 136, 26),
("colon", 58, 58, 58, 58),
("comma", 44, 44, 44, 44),
("copyright", None, 169, 169, 169),
("currency", 168, 219, 164, 164),
("d", 100, 100, 100, 100),
("dagger", 178, 160, 134, 129),
("daggerdbl", 179, 224, 135, 130),
("degree", None, 161, 176, 176),
("dieresis", 200, 172, 168, 168),
("divide", None, 214, 247, 247),
("dollar", 36, 36, 36, 36),
("dotaccent", 199, 250, None, 27),
("dotlessi", 245, 245, None, 154),
("e", 101, 101, 101, 101),
("eacute", None, 142, 233, 233),
("ecircumflex", None, 144, 234, 234),
("edieresis", None, 145, 235, 235),
("egrave", None, 143, 232, 232),
("eight", 56, 56, 56, 56),
("ellipsis", 188, 201, 133, 131),
("emdash", 208, 209, 151, 132),
("endash", 177, 208, 150, 133),
("equal", 61, 61, 61, 61),
("eth", None, None, 240, 240),
("exclam", 33, 33, 33, 33),
("exclamdown", 161, 193, 161, 161),
("f", 102, 102, 102, 102),
("fi", 174, 222, None, 147),
("five", 53, 53, 53, 53),
("fl", 175, 223, None, 148),
("florin", 166, 196, 131, 134),
("four", 52, 52, 52, 52),
("fraction", 164, 218, None, 135),
("g", 103, 103, 103, 103),
("germandbls", 251, 167, 223, 223),
("grave", 193, 96, 96, 96),
("greater", 62, 62, 62, 62),
("guillemotleft", 171, 199, 171, 171),
("guillemotright", 187, 200, 187, 187),
("guilsinglleft", 172, 220, 139, 136),
("guilsinglright", 173, 221, 155, 137),
("h", 104, 104, 104, 104),
("hungarumlaut", 205, 253, None, 28),
("hyphen", 45, 45, 45, 45),
("i", 105, 105, 105, 105),
("iacute", None, 146, 237, 237),
("icircumflex", None, 148, 238, 238),
("idieresis", None, 149, 239, 239),
("igrave", None, 147, 236, 236),
("j", 106, 106, 106, 106),
("k", 107, 107, 107, 107),
("l", 108, 108, 108, 108),
("less", 60, 60, 60, 60),
("logicalnot", None, 194, 172, 172),
("lslash", 248, None, None, 155),
("m", 109, 109, 109, 109),
("macron", 197, 248, 175, 175),
("minus", None, None, None, 138),
("mu", None, 181, 181, 181),
("multiply", None, None, 215, 215),
("n", 110, 110, 110, 110),
("nbspace", None, 202, 160, None),
("nine", 57, 57, 57, 57),
("ntilde", None, 150, 241, 241),
("numbersign", 35, 35, 35, 35),
("o", 111, 111, 111, 111),
("oacute", None, 151, 243, 243),
("ocircumflex", None, 153, 244, 244),
("odieresis", None, 154, 246, 246),
("oe", 250, 207, 156, 156),
("ogonek", 206, 254, None, 29),
("ograve", None, 152, 242, 242),
("one", 49, 49, 49, 49),
("onehalf", None, None, 189, 189),
("onequarter", None, None, 188, 188),
("onesuperior", None, None, 185, 185),
("ordfeminine", 227, 187, 170, 170),
("ordmasculine", 235, 188, 186, 186),
("oslash", 249, 191, 248, 248),
("otilde", None, 155, 245, 245),
("p", 112, 112, 112, 112),
("paragraph", 182, 166, 182, 182),
("parenleft", 40, 40, 40, 40),
("parenright", 41, 41, 41, 41),
("percent", 37, 37, 37, 37),
("period", 46, 46, 46, 46),
("periodcentered", 180, 225, 183, 183),
("perthousand", 189, 228, 137, 139),
("plus", 43, 43, 43, 43),
("plusminus", None, 177, 177, 177),
("q", 113, 113, 113, 113),
("question", 63, 63, 63, 63),
("questiondown", 191, 192, 191, 191),
("quotedbl", 34, 34, 34, 34),
("quotedblbase", 185, 227, 132, 140),
("quotedblleft", 170, 210, 147, 141),
("quotedblright", 186, 211, 148, 142),
("quoteleft", 96, 212, 145, 143),
("quoteright", 39, 213, 146, 144),
("quotesinglbase", 184, 226, 130, 145),
("quotesingle", 169, 39, 39, 39),
("r", 114, 114, 114, 114),
("registered", None, 168, 174, 174),
("ring", 202, 251, None, 30),
("s", 115, 115, 115, 115),
("scaron", None, None, 154, 157),
("section", 167, 164, 167, 167),
("semicolon", 59, 59, 59, 59),
("seven", 55, 55, 55, 55),
("six", 54, 54, 54, 54),
("slash", 47, 47, 47, 47),
("space", 32, 32, 32, 32),
("space", None, 202, 160, None),
("space", None, 202, 173, None),
("sterling", 163, 163, 163, 163),
("t", 116, 116, 116, 116),
("thorn", None, None, 254, 254),
("three", 51, 51, 51, 51),
("threequarters", None, None, 190, 190),
("threesuperior", None, None, 179, 179),
("tilde", 196, 247, 152, 31),
("trademark", None, 170, 153, 146),
("two", 50, 50, 50, 50),
("twosuperior", None, None, 178, 178),
("u", 117, 117, 117, 117),
("uacute", None, 156, 250, 250),
("ucircumflex", None, 158, 251, 251),
("udieresis", None, 159, 252, 252),
("ugrave", None, 157, 249, 249),
("underscore", 95, 95, 95, 95),
("v", 118, 118, 118, 118),
("w", 119, 119, 119, 119),
("x", 120, 120, 120, 120),
("y", 121, 121, 121, 121),
("yacute", None, None, 253, 253),
("ydieresis", None, 216, 255, 255),
("yen", 165, 180, 165, 165),
("z", 122, 122, 122, 122),
("zcaron", None, None, 158, 158),
("zero", 48, 48, 48, 48),
]

View File

@ -1,7 +1,19 @@
import heapq
import logging
from typing import (Dict, Generic, Iterable, Iterator, List, Optional,
Sequence, Set, Tuple, TypeVar, Union, cast)
from typing import (
Dict,
Generic,
Iterable,
Iterator,
List,
Optional,
Sequence,
Set,
Tuple,
TypeVar,
Union,
cast,
)
from .pdfcolor import PDFColorSpace
from .pdffont import PDFFont
@ -25,7 +37,6 @@ logger = logging.getLogger(__name__)
class IndexAssigner:
def __init__(self, index: int = 0) -> None:
self.index = index
@ -74,7 +85,7 @@ class LAParams:
word_margin: float = 0.1,
boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = False,
all_texts: bool = False
all_texts: bool = False,
) -> None:
self.line_overlap = line_overlap
self.char_margin = char_margin
@ -88,19 +99,22 @@ class LAParams:
def _validate(self) -> None:
if self.boxes_flow is not None:
boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
"number between -1 and +1")
if not (isinstance(self.boxes_flow, int) or
isinstance(self.boxes_flow, float)):
boxes_flow_err_msg = (
"LAParam boxes_flow should be None, or a " "number between -1 and +1"
)
if not (
isinstance(self.boxes_flow, int) or isinstance(self.boxes_flow, float)
):
raise TypeError(boxes_flow_err_msg)
if not -1 <= self.boxes_flow <= 1:
raise ValueError(boxes_flow_err_msg)
def __repr__(self) -> str:
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
'word_margin=%.1f all_texts=%r>' % \
(self.char_margin, self.line_margin, self.word_margin,
self.all_texts)
return (
"<LAParams: char_margin=%.1f, line_margin=%.1f, "
"word_margin=%.1f all_texts=%r>"
% (self.char_margin, self.line_margin, self.word_margin, self.all_texts)
)
class LTItem:
@ -115,8 +129,7 @@ class LTText:
"""Interface for things that have text"""
def __repr__(self) -> str:
return ('<%s %r>' %
(self.__class__.__name__, self.get_text()))
return "<%s %r>" % (self.__class__.__name__, self.get_text())
def get_text(self) -> str:
"""Text contained in this object"""
@ -131,8 +144,7 @@ class LTComponent(LTItem):
self.set_bbox(bbox)
def __repr__(self) -> str:
return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox)))
return "<%s %s>" % (self.__class__.__name__, bbox2str(self.bbox))
# Disable comparison.
def __lt__(self, _: object) -> bool:
@ -208,7 +220,7 @@ class LTCurve(LTComponent):
fill: bool = False,
evenodd: bool = False,
stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None
non_stroking_color: Optional[Color] = None,
) -> None:
LTComponent.__init__(self, get_bound(pts))
self.pts = pts
@ -220,7 +232,7 @@ class LTCurve(LTComponent):
self.non_stroking_color = non_stroking_color
def get_pts(self) -> str:
return ','.join('%.3f,%.3f' % p for p in self.pts)
return ",".join("%.3f,%.3f" % p for p in self.pts)
class LTLine(LTCurve):
@ -238,10 +250,18 @@ class LTLine(LTCurve):
fill: bool = False,
evenodd: bool = False,
stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None
non_stroking_color: Optional[Color] = None,
) -> None:
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd,
stroking_color, non_stroking_color)
LTCurve.__init__(
self,
linewidth,
[p0, p1],
stroke,
fill,
evenodd,
stroking_color,
non_stroking_color,
)
class LTRect(LTCurve):
@ -258,12 +278,19 @@ class LTRect(LTCurve):
fill: bool = False,
evenodd: bool = False,
stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None
non_stroking_color: Optional[Color] = None,
) -> None:
(x0, y0, x1, y1) = bbox
LTCurve.__init__(self, linewidth,
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke,
fill, evenodd, stroking_color, non_stroking_color)
LTCurve.__init__(
self,
linewidth,
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)],
stroke,
fill,
evenodd,
stroking_color,
non_stroking_color,
)
class LTImage(LTComponent):
@ -276,18 +303,20 @@ class LTImage(LTComponent):
LTComponent.__init__(self, bbox)
self.name = name
self.stream = stream
self.srcsize = (stream.get_any(('W', 'Width')),
stream.get_any(('H', 'Height')))
self.imagemask = stream.get_any(('IM', 'ImageMask'))
self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
self.colorspace = stream.get_any(('CS', 'ColorSpace'))
self.srcsize = (stream.get_any(("W", "Width")), stream.get_any(("H", "Height")))
self.imagemask = stream.get_any(("IM", "ImageMask"))
self.bits = stream.get_any(("BPC", "BitsPerComponent"), 1)
self.colorspace = stream.get_any(("CS", "ColorSpace"))
if not isinstance(self.colorspace, list):
self.colorspace = [self.colorspace]
def __repr__(self) -> str:
return ('<%s(%s) %s %r>' %
(self.__class__.__name__, self.name,
bbox2str(self.bbox), self.srcsize))
return "<%s(%s) %s %r>" % (
self.__class__.__name__,
self.name,
bbox2str(self.bbox),
self.srcsize,
)
class LTAnno(LTItem, LTText):
@ -320,7 +349,7 @@ class LTChar(LTComponent, LTText):
textwidth: float,
textdisp: Union[float, Tuple[Optional[float], float]],
ncs: PDFColorSpace,
graphicstate: PDFGraphicState
graphicstate: PDFGraphicState,
) -> None:
LTText.__init__(self)
self._text = text
@ -337,8 +366,8 @@ class LTChar(LTComponent, LTText):
if vx is None:
vx = fontsize * 0.5
else:
vx = vx * fontsize * .001
vy = (1000 - vy) * fontsize * .001
vx = vx * fontsize * 0.001
vy = (1000 - vy) * fontsize * 0.001
bbox_lower_left = (-vx, vy + rise + self.adv)
bbox_upper_right = (-vx + fontsize, vy + rise)
else:
@ -347,7 +376,7 @@ class LTChar(LTComponent, LTText):
bbox_lower_left = (0, descent + rise)
bbox_upper_right = (self.adv, descent + rise + fontsize)
(a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0)
self.upright = 0 < a * d * scaling and b * c <= 0
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
if x1 < x0:
@ -362,10 +391,14 @@ class LTChar(LTComponent, LTText):
return
def __repr__(self) -> str:
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
(self.__class__.__name__, bbox2str(self.bbox),
matrix2str(self.matrix), self.fontname, self.adv,
self.get_text()))
return "<%s %s matrix=%s font=%r adv=%s text=%r>" % (
self.__class__.__name__,
bbox2str(self.bbox),
matrix2str(self.matrix),
self.fontname,
self.adv,
self.get_text(),
)
def get_text(self) -> str:
return self._text
@ -375,7 +408,7 @@ class LTChar(LTComponent, LTText):
return True
LTItemT = TypeVar('LTItemT', bound=LTItem)
LTItemT = TypeVar("LTItemT", bound=LTItem)
class LTContainer(LTComponent, Generic[LTItemT]):
@ -416,8 +449,14 @@ class LTExpandableContainer(LTContainer[LTItemT]):
# super() LTContainer only considers LTItem (no bounding box).
def add(self, obj: LTComponent) -> None: # type: ignore[override]
LTContainer.add(self, cast(LTItemT, obj))
self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0),
max(self.x1, obj.x1), max(self.y1, obj.y1)))
self.set_bbox(
(
min(self.x0, obj.x0),
min(self.y0, obj.y0),
max(self.x1, obj.x1),
max(self.y1, obj.y1),
)
)
return
@ -428,8 +467,9 @@ class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
return
def get_text(self) -> str:
return ''.join(cast(LTText, obj).get_text() for obj in self
if isinstance(obj, LTText))
return "".join(
cast(LTText, obj).get_text() for obj in self if isinstance(obj, LTText)
)
TextLineElement = Union[LTChar, LTAnno]
@ -448,16 +488,19 @@ class LTTextLine(LTTextContainer[TextLineElement]):
return
def __repr__(self) -> str:
return ('<%s %s %r>' %
(self.__class__.__name__, bbox2str(self.bbox),
self.get_text()))
return "<%s %s %r>" % (
self.__class__.__name__,
bbox2str(self.bbox),
self.get_text(),
)
def analyze(self, laparams: LAParams) -> None:
LTTextContainer.analyze(self, laparams)
LTContainer.add(self, LTAnno('\n'))
LTContainer.add(self, LTAnno("\n"))
return
def find_neighbors(self, plane: Plane[LTComponentT], ratio: float
def find_neighbors(
self, plane: Plane[LTComponentT], ratio: float
) -> List["LTTextLine"]:
raise NotImplementedError
@ -474,15 +517,13 @@ class LTTextLineHorizontal(LTTextLine):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * max(obj.width, obj.height)
if self._x1 < obj.x0 - margin:
LTContainer.add(self, LTAnno(' '))
LTContainer.add(self, LTAnno(" "))
self._x1 = obj.x1
super().add(obj)
return
def find_neighbors(
self,
plane: Plane[LTComponentT],
ratio: float
self, plane: Plane[LTComponentT], ratio: float
) -> List[LTTextLine]:
"""
Finds neighboring LTTextLineHorizontals in the plane.
@ -494,49 +535,41 @@ class LTTextLineHorizontal(LTTextLine):
"""
d = ratio * self.height
objs = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
return [obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and
self._is_same_height_as(obj, tolerance=d) and
(self._is_left_aligned_with(obj, tolerance=d) or
self._is_right_aligned_with(obj, tolerance=d) or
self._is_centrally_aligned_with(obj, tolerance=d)))]
return [
obj
for obj in objs
if (
isinstance(obj, LTTextLineHorizontal)
and self._is_same_height_as(obj, tolerance=d)
and (
self._is_left_aligned_with(obj, tolerance=d)
or self._is_right_aligned_with(obj, tolerance=d)
or self._is_centrally_aligned_with(obj, tolerance=d)
)
)
]
def _is_left_aligned_with(
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
def _is_left_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
"""
Whether the left-hand edge of `other` is within `tolerance`.
"""
return abs(other.x0 - self.x0) <= tolerance
def _is_right_aligned_with(
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
def _is_right_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
"""
Whether the right-hand edge of `other` is within `tolerance`.
"""
return abs(other.x1 - self.x1) <= tolerance
def _is_centrally_aligned_with(
self,
other: LTComponent,
tolerance: float = 0
self, other: LTComponent, tolerance: float = 0
) -> bool:
"""
Whether the horizontal center of `other` is within `tolerance`.
"""
return abs(
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
return abs((other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
def _is_same_height_as(
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
def _is_same_height_as(self, other: LTComponent, tolerance: float = 0) -> bool:
return abs(other.height - self.height) <= tolerance
@ -552,15 +585,13 @@ class LTTextLineVertical(LTTextLine):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * max(obj.width, obj.height)
if obj.y1 + margin < self._y0:
LTContainer.add(self, LTAnno(' '))
LTContainer.add(self, LTAnno(" "))
self._y0 = obj.y0
super().add(obj)
return
def find_neighbors(
self,
plane: Plane[LTComponentT],
ratio: float
self, plane: Plane[LTComponentT], ratio: float
) -> List[LTTextLine]:
"""
Finds neighboring LTTextLineVerticals in the plane.
@ -572,43 +603,39 @@ class LTTextLineVertical(LTTextLine):
"""
d = ratio * self.width
objs = plane.find((self.x0 - d, self.y0, self.x1 + d, self.y1))
return [obj for obj in objs
if (isinstance(obj, LTTextLineVertical) and
self._is_same_width_as(obj, tolerance=d) and
(self._is_lower_aligned_with(obj, tolerance=d) or
self._is_upper_aligned_with(obj, tolerance=d) or
self._is_centrally_aligned_with(obj, tolerance=d)))]
return [
obj
for obj in objs
if (
isinstance(obj, LTTextLineVertical)
and self._is_same_width_as(obj, tolerance=d)
and (
self._is_lower_aligned_with(obj, tolerance=d)
or self._is_upper_aligned_with(obj, tolerance=d)
or self._is_centrally_aligned_with(obj, tolerance=d)
)
)
]
def _is_lower_aligned_with(
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
def _is_lower_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
"""
Whether the lower edge of `other` is within `tolerance`.
"""
return abs(other.y0 - self.y0) <= tolerance
def _is_upper_aligned_with(
self,
other: LTComponent,
tolerance: float = 0
) -> bool:
def _is_upper_aligned_with(self, other: LTComponent, tolerance: float = 0) -> bool:
"""
Whether the upper edge of `other` is within `tolerance`.
"""
return abs(other.y1 - self.y1) <= tolerance
def _is_centrally_aligned_with(
self,
other: LTComponent,
tolerance: float = 0
self, other: LTComponent, tolerance: float = 0
) -> bool:
"""
Whether the vertical center of `other` is within `tolerance`.
"""
return abs(
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
return abs((other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
return abs(other.width - self.width) <= tolerance
@ -628,9 +655,12 @@ class LTTextBox(LTTextContainer[LTTextLine]):
return
def __repr__(self) -> str:
return ('<%s(%s) %s %r>' %
(self.__class__.__name__,
self.index, bbox2str(self.bbox), self.get_text()))
return "<%s(%s) %s %r>" % (
self.__class__.__name__,
self.index,
bbox2str(self.bbox),
self.get_text(),
)
def get_writing_mode(self) -> str:
raise NotImplementedError
@ -643,7 +673,7 @@ class LTTextBoxHorizontal(LTTextBox):
return
def get_writing_mode(self) -> str:
return 'lr-tb'
return "lr-tb"
class LTTextBoxVertical(LTTextBox):
@ -653,7 +683,7 @@ class LTTextBoxVertical(LTTextBox):
return
def get_writing_mode(self) -> str:
return 'tb-rl'
return "tb-rl"
TextGroupElement = Union[LTTextBox, "LTTextGroup"]
@ -674,7 +704,8 @@ class LTTextGroupLRTB(LTTextGroup):
# reorder the objects from top-left to bottom-right.
self._objs.sort(
key=lambda obj: (1 - boxes_flow) * obj.x0
- (1 + boxes_flow) * (obj.y0 + obj.y1))
- (1 + boxes_flow) * (obj.y0 + obj.y1)
)
return
@ -686,7 +717,8 @@ class LTTextGroupTBRL(LTTextGroup):
# reorder the objects from top-right to bottom-left.
self._objs.sort(
key=lambda obj: -(1 + boxes_flow) * (obj.x0 + obj.x1)
- (1 - boxes_flow) * obj.y1)
- (1 - boxes_flow) * obj.y1
)
return
@ -698,9 +730,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
# group_objects: group text object to textlines.
def group_objects(
self,
laparams: LAParams,
objs: Iterable[LTComponent]
self, laparams: LAParams, objs: Iterable[LTComponent]
) -> Iterator[LTTextLine]:
obj0 = None
line = None
@ -716,13 +746,14 @@ class LTLayoutContainer(LTContainer[LTComponent]):
#
# |<--->|
# (char_margin)
halign = \
obj0.is_compatible(obj1) \
and obj0.is_voverlap(obj1) \
and min(obj0.height, obj1.height) * laparams.line_overlap \
< obj0.voverlap(obj1) \
and obj0.hdistance(obj1) \
halign = (
obj0.is_compatible(obj1)
and obj0.is_voverlap(obj1)
and min(obj0.height, obj1.height) * laparams.line_overlap
< obj0.voverlap(obj1)
and obj0.hdistance(obj1)
< max(obj0.width, obj1.width) * laparams.char_margin
)
# valign: obj0 and obj1 is vertically aligned.
#
@ -738,17 +769,19 @@ class LTLayoutContainer(LTContainer[LTComponent]):
#
# |<-->|
# (line_overlap)
valign = \
laparams.detect_vertical \
and obj0.is_compatible(obj1) \
and obj0.is_hoverlap(obj1) \
and min(obj0.width, obj1.width) * laparams.line_overlap \
< obj0.hoverlap(obj1) \
and obj0.vdistance(obj1) \
valign = (
laparams.detect_vertical
and obj0.is_compatible(obj1)
and obj0.is_hoverlap(obj1)
and min(obj0.width, obj1.width) * laparams.line_overlap
< obj0.hoverlap(obj1)
and obj0.vdistance(obj1)
< max(obj0.height, obj1.height) * laparams.char_margin
)
if ((halign and isinstance(line, LTTextLineHorizontal)) or
(valign and isinstance(line, LTTextLineVertical))):
if (halign and isinstance(line, LTTextLineHorizontal)) or (
valign and isinstance(line, LTTextLineVertical)
):
line.add(obj1)
elif line is not None:
@ -777,9 +810,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
return
def group_textlines(
self,
laparams: LAParams,
lines: Iterable[LTTextLine]
self, laparams: LAParams, lines: Iterable[LTTextLine]
) -> Iterator[LTTextBox]:
"""Group neighboring lines to textboxes"""
plane: Plane[LTTextLine] = Plane(self.bbox)
@ -812,9 +843,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
return
def group_textboxes(
self,
laparams: LAParams,
boxes: Sequence[LTTextBox]
self, laparams: LAParams, boxes: Sequence[LTTextBox]
) -> List[LTTextGroup]:
"""Group textboxes hierarchically.
@ -853,8 +882,11 @@ class LTLayoutContainer(LTContainer[LTComponent]):
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
return (x1 - x0) * (y1 - y0) \
- obj1.width*obj1.height - obj2.width*obj2.height
return (
(x1 - x0) * (y1 - y0)
- obj1.width * obj1.height
- obj2.width * obj2.height
)
def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
"""Check if there's any other object between obj1 and obj2."""
@ -870,8 +902,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
box1 = boxes[i]
for j in range(i + 1, len(boxes)):
box2 = boxes[j]
dists.append((False, dist(box1, box2), id(box1), id(box2),
box1, box2))
dists.append((False, dist(box1, box2), id(box1), id(box2), box1, box2))
heapq.heapify(dists)
plane.extend(boxes)
@ -883,8 +914,9 @@ class LTLayoutContainer(LTContainer[LTComponent]):
if not skip_isany and isany(obj1, obj2):
heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
continue
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(
obj2, (LTTextBoxVertical, LTTextGroupTBRL)
):
group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1, obj2])
@ -893,8 +925,10 @@ class LTLayoutContainer(LTContainer[LTComponent]):
done.update([id1, id2])
for other in plane:
heapq.heappush(dists, (False, dist(group, other),
id(group), id(other), group, other))
heapq.heappush(
dists,
(False, dist(group, other), id(group), id(other), group, other),
)
plane.add(group)
# By now only groups are in the plane
return list(cast(LTTextGroup, g) for g in plane)
@ -902,8 +936,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
def analyze(self, laparams: LAParams) -> None:
# textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page.
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
self)
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
for obj in otherobjs:
obj.analyze(laparams)
if not textobjs:
@ -922,6 +955,7 @@ class LTLayoutContainer(LTContainer[LTComponent]):
return (0, -box.x1, -box.y0)
else:
return (1, -box.y0, box.x0)
textboxes.sort(key=getkey)
else:
self.groups = self.group_textboxes(laparams, textboxes)
@ -930,8 +964,11 @@ class LTLayoutContainer(LTContainer[LTComponent]):
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box: box.index)
self._objs = (cast(List[LTComponent], textboxes) + otherobjs
+ cast(List[LTComponent], empties))
self._objs = (
cast(List[LTComponent], textboxes)
+ otherobjs
+ cast(List[LTComponent], empties)
)
return
@ -953,9 +990,12 @@ class LTFigure(LTLayoutContainer):
return
def __repr__(self) -> str:
return ('<%s(%s) %s matrix=%s>' %
(self.__class__.__name__, self.name,
bbox2str(self.bbox), matrix2str(self.matrix)))
return "<%s(%s) %s matrix=%s>" % (
self.__class__.__name__,
self.name,
bbox2str(self.bbox),
matrix2str(self.matrix),
)
def analyze(self, laparams: LAParams) -> None:
if not laparams.all_texts:
@ -978,6 +1018,9 @@ class LTPage(LTLayoutContainer):
return
def __repr__(self) -> str:
return ('<%s(%r) %s rotate=%r>' %
(self.__class__.__name__, self.pageid,
bbox2str(self.bbox), self.rotate))
return "<%s(%r) %s rotate=%r>" % (
self.__class__.__name__,
self.pageid,
bbox2str(self.bbox),
self.rotate,
)

View File

@ -10,7 +10,6 @@ class CorruptDataError(Exception):
class LZWDecoder:
def __init__(self, fp: BinaryIO) -> None:
self.fp = fp
self.buff = 0
@ -46,12 +45,12 @@ class LZWDecoder:
return v
def feed(self, code: int) -> bytes:
x = b''
x = b""
if code == 256:
self.table = [bytes((c,)) for c in range(256)] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = b''
self.prevbuf = b""
self.nbits = 9
elif code == 257:
pass
@ -91,11 +90,13 @@ class LZWDecoder:
break
yield x
assert self.table is not None
logger.debug('nbits=%d, code=%d, output=%r, table=%r'
% (self.nbits, code, x, self.table[258:]))
logger.debug(
"nbits=%d, code=%d, output=%r, table=%r"
% (self.nbits, code, x, self.table[258:])
)
def lzwdecode(data: bytes) -> bytes:
fp = BytesIO(data)
s = LZWDecoder(fp).run()
return b''.join(s)
return b"".join(s)

View File

@ -3,33 +3,31 @@ from typing import Dict
from .psparser import LIT
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
LITERAL_DEVICE_GRAY = LIT("DeviceGray")
LITERAL_DEVICE_RGB = LIT("DeviceRGB")
LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
class PDFColorSpace:
def __init__(self, name: str, ncomponents: int) -> None:
self.name = name
self.ncomponents = ncomponents
def __repr__(self) -> str:
return '<PDFColorSpace: %s, ncomponents=%d>' % \
(self.name, self.ncomponents)
return "<PDFColorSpace: %s, ncomponents=%d>" % (self.name, self.ncomponents)
PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
for (name, n) in [
('DeviceGray', 1), # default value first
('CalRGB', 3),
('CalGray', 1),
('Lab', 3),
('DeviceRGB', 3),
('DeviceCMYK', 4),
('Separation', 1),
('Indexed', 1),
('Pattern', 1),
("DeviceGray", 1), # default value first
("CalRGB", 3),
("CalGray", 1),
("Lab", 3),
("DeviceRGB", 3),
("DeviceCMYK", 4),
("Separation", 1),
("Indexed", 1),
("Pattern", 1),
]:
PREDEFINED_COLORSPACE[name] = PDFColorSpace(name, n)

View File

@ -1,5 +1,13 @@
from typing import (BinaryIO, Iterable, List, Optional, Sequence,
TYPE_CHECKING, Union, cast)
from typing import (
BinaryIO,
Iterable,
List,
Optional,
Sequence,
TYPE_CHECKING,
Union,
cast,
)
from pdfminer.psparser import PSLiteral
from . import utils
@ -21,25 +29,19 @@ PDFTextSeq = Iterable[Union[int, float, bytes]]
class PDFDevice:
"""Translate the output of PDFPageInterpreter to the output that is needed
"""
"""Translate the output of PDFPageInterpreter to the output that is needed"""
def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
self.rsrcmgr = rsrcmgr
self.ctm: Optional[Matrix] = None
def __repr__(self) -> str:
return '<PDFDevice>'
return "<PDFDevice>"
def __enter__(self) -> "PDFDevice":
return self
def __exit__(
self,
exc_type: object,
exc_val: object,
exc_tb: object
) -> None:
def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
self.close()
def close(self) -> None:
@ -48,21 +50,13 @@ class PDFDevice:
def set_ctm(self, ctm: Matrix) -> None:
self.ctm = ctm
def begin_tag(
self,
tag: PSLiteral,
props: Optional["PDFStackT"] = None
) -> None:
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
pass
def end_tag(self) -> None:
pass
def do_tag(
self,
tag: PSLiteral,
props: Optional["PDFStackT"] = None
) -> None:
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
pass
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
@ -83,7 +77,7 @@ class PDFDevice:
stroke: bool,
fill: bool,
evenodd: bool,
path: Sequence[PathSegment]
path: Sequence[PathSegment],
) -> None:
pass
@ -95,42 +89,61 @@ class PDFDevice:
textstate: "PDFTextState",
seq: PDFTextSeq,
ncs: PDFColorSpace,
graphicstate: "PDFGraphicState"
graphicstate: "PDFGraphicState",
) -> None:
pass
class PDFTextDevice(PDFDevice):
def render_string(
self,
textstate: "PDFTextState",
seq: PDFTextSeq,
ncs: PDFColorSpace,
graphicstate: "PDFGraphicState"
graphicstate: "PDFGraphicState",
) -> None:
assert self.ctm is not None
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
font = textstate.font
fontsize = textstate.fontsize
scaling = textstate.scaling * .01
scaling = textstate.scaling * 0.01
charspace = textstate.charspace * scaling
wordspace = textstate.wordspace * scaling
rise = textstate.rise
assert font is not None
if font.is_multibyte():
wordspace = 0
dxscale = .001 * fontsize * scaling
dxscale = 0.001 * fontsize * scaling
if font.is_vertical():
textstate.linematrix = self.render_string_vertical(
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale, ncs,
graphicstate)
seq,
matrix,
textstate.linematrix,
font,
fontsize,
scaling,
charspace,
wordspace,
rise,
dxscale,
ncs,
graphicstate,
)
else:
textstate.linematrix = self.render_string_horizontal(
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale, ncs,
graphicstate)
seq,
matrix,
textstate.linematrix,
font,
fontsize,
scaling,
charspace,
wordspace,
rise,
dxscale,
ncs,
graphicstate,
)
def render_string_horizontal(
self,
@ -145,7 +158,7 @@ class PDFTextDevice(PDFDevice):
rise: float,
dxscale: float,
ncs: PDFColorSpace,
graphicstate: "PDFGraphicState"
graphicstate: "PDFGraphicState",
) -> Point:
(x, y) = pos
needcharspace = False
@ -158,8 +171,15 @@ class PDFTextDevice(PDFDevice):
if needcharspace:
x += charspace
x += self.render_char(
utils.translate_matrix(matrix, (x, y)), font,
fontsize, scaling, rise, cid, ncs, graphicstate)
utils.translate_matrix(matrix, (x, y)),
font,
fontsize,
scaling,
rise,
cid,
ncs,
graphicstate,
)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
@ -178,7 +198,7 @@ class PDFTextDevice(PDFDevice):
rise: float,
dxscale: float,
ncs: PDFColorSpace,
graphicstate: "PDFGraphicState"
graphicstate: "PDFGraphicState",
) -> Point:
(x, y) = pos
needcharspace = False
@ -191,8 +211,15 @@ class PDFTextDevice(PDFDevice):
if needcharspace:
y += charspace
y += self.render_char(
utils.translate_matrix(matrix, (x, y)), font, fontsize,
scaling, rise, cid, ncs, graphicstate)
utils.translate_matrix(matrix, (x, y)),
font,
fontsize,
scaling,
rise,
cid,
ncs,
graphicstate,
)
if cid == 32 and wordspace:
y += wordspace
needcharspace = True
@ -207,18 +234,14 @@ class PDFTextDevice(PDFDevice):
rise: float,
cid: int,
ncs: PDFColorSpace,
graphicstate: "PDFGraphicState"
graphicstate: "PDFGraphicState",
) -> float:
return 0
class TagExtractor(PDFDevice):
def __init__(
self,
rsrcmgr: "PDFResourceManager",
outfp: BinaryIO,
codec: str = 'utf-8'
self, rsrcmgr: "PDFResourceManager", outfp: BinaryIO, codec: str = "utf-8"
) -> None:
PDFDevice.__init__(self, rsrcmgr)
self.outfp = outfp
@ -231,11 +254,11 @@ class TagExtractor(PDFDevice):
textstate: "PDFTextState",
seq: PDFTextSeq,
ncs: PDFColorSpace,
graphicstate: "PDFGraphicState"
graphicstate: "PDFGraphicState",
) -> None:
font = textstate.font
assert font is not None
text = ''
text = ""
for obj in seq:
if isinstance(obj, str):
obj = utils.make_compat_bytes(obj)
@ -251,25 +274,29 @@ class TagExtractor(PDFDevice):
self._write(utils.enc(text))
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
output = '<page id="%s" bbox="%s" rotate="%d">' %\
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
output = '<page id="%s" bbox="%s" rotate="%d">' % (
self.pageno,
utils.bbox2str(page.mediabox),
page.rotate,
)
self._write(output)
return
def end_page(self, page: PDFPage) -> None:
self._write('</page>\n')
self._write("</page>\n")
self.pageno += 1
return
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None
) -> None:
s = ''
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
s = ""
if isinstance(props, dict):
s = ''.join([
s = "".join(
[
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
for (k, v) in sorted(props.items())
])
out_s = '<{}{}>'.format(utils.enc(cast(str, tag.name)), s)
]
)
out_s = "<{}{}>".format(utils.enc(cast(str, tag.name)), s)
self._write(out_s)
self._stack.append(tag)
return
@ -277,12 +304,11 @@ class TagExtractor(PDFDevice):
def end_tag(self) -> None:
assert self._stack, str(self.pageno)
tag = self._stack.pop(-1)
out_s = '</%s>' % utils.enc(cast(str, tag.name))
out_s = "</%s>" % utils.enc(cast(str, tag.name))
self._write(out_s)
return
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None
) -> None:
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None:
self.begin_tag(tag, props)
self._stack.pop(-1)
return

View File

@ -3,8 +3,21 @@ import logging
import re
import struct
from hashlib import sha256, md5, sha384, sha512
from typing import (Any, Callable, Dict, Iterable, Iterator, KeysView, List,
Optional, Sequence, Tuple, Type, Union, cast)
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
KeysView,
List,
Optional,
Sequence,
Tuple,
Type,
Union,
cast,
)
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
@ -13,12 +26,22 @@ from . import settings
from .arcfour import Arcfour
from .data_structures import NumberTree
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, \
PDFStream, PDFObjectNotFound, decipher_all, int_value, str_value, \
list_value, uint_value, dict_value, stream_value
from .pdftypes import (
DecipherCallable,
PDFException,
PDFTypeError,
PDFStream,
PDFObjectNotFound,
decipher_all,
int_value,
str_value,
list_value,
uint_value,
dict_value,
stream_value,
)
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, decode_text, nunpack, format_int_roman, \
format_int_alpha
from .utils import choplist, decode_text, nunpack, format_int_roman, format_int_alpha
log = logging.getLogger(__name__)
@ -32,6 +55,7 @@ class PDFNoValidXRefWarning(SyntaxWarning):
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
"""
pass
@ -60,6 +84,7 @@ class PDFEncryptionWarning(UserWarning):
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
"""
pass
@ -68,6 +93,7 @@ class PDFTextExtractionNotAllowedWarning(UserWarning):
Not used anymore because warnings.warn is replaced by logger.Logger.warn.
"""
pass
@ -78,15 +104,19 @@ class PDFTextExtractionNotAllowed(PDFEncryptionError):
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
def __init__(self, *args: object) -> None:
from warnings import warn
warn('PDFTextExtractionNotAllowedError will be removed in the future. '
'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning)
warn(
"PDFTextExtractionNotAllowedError will be removed in the future. "
"Use PDFTextExtractionNotAllowed instead.",
DeprecationWarning,
)
super().__init__(*args)
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_CATALOG = LIT('Catalog')
LITERAL_OBJSTM = LIT("ObjStm")
LITERAL_XREF = LIT("XRef")
LITERAL_CATALOG = LIT("Catalog")
class PDFBaseXRef:
@ -107,13 +137,12 @@ class PDFBaseXRef:
class PDFXRef(PDFBaseXRef):
def __init__(self) -> None:
self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
self.trailer: Dict[str, Any] = {}
def __repr__(self) -> str:
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
return "<PDFXRef: offsets=%r>" % (self.offsets.keys())
def load(self, parser: PDFParser) -> None:
while True:
@ -123,51 +152,50 @@ class PDFXRef(PDFBaseXRef):
if not line:
continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if line.startswith(b'trailer'):
raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
if line.startswith(b"trailer"):
parser.seek(pos)
break
f = line.split(b' ')
f = line.split(b" ")
if len(f) != 2:
error_msg = 'Trailer not found: {!r}: line={!r}'\
.format(parser, line)
error_msg = "Trailer not found: {!r}: line={!r}".format(parser, line)
raise PDFNoValidXRef(error_msg)
try:
(start, nobjs) = map(int, f)
except ValueError:
error_msg = 'Invalid line: {!r}: line={!r}'\
.format(parser, line)
error_msg = "Invalid line: {!r}: line={!r}".format(parser, line)
raise PDFNoValidXRef(error_msg)
for objid in range(start, start + nobjs):
try:
(_, line) = parser.nextline()
line = line.strip()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
f = line.split(b' ')
raise PDFNoValidXRef("Unexpected EOF - file corrupted?")
f = line.split(b" ")
if len(f) != 3:
error_msg = 'Invalid XRef format: {!r}, line={!r}'\
.format(parser, line)
error_msg = "Invalid XRef format: {!r}, line={!r}".format(
parser, line
)
raise PDFNoValidXRef(error_msg)
(pos_b, genno_b, use_b) = f
if use_b != b'n':
if use_b != b"n":
continue
self.offsets[objid] = (None, int(pos_b), int(genno_b))
log.debug('xref objects: %r', self.offsets)
log.debug("xref objects: %r", self.offsets)
self.load_trailer(parser)
def load_trailer(self, parser: PDFParser) -> None:
try:
(_, kwd) = parser.nexttoken()
assert kwd is KWD(b'trailer'), str(kwd)
assert kwd is KWD(b"trailer"), str(kwd)
(_, dic) = parser.nextobject()
except PSEOF:
x = parser.pop(1)
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
raise PDFNoValidXRef("Unexpected EOF - file corrupted")
(_, dic) = x[0]
self.trailer.update(dict_value(dic))
log.debug('trailer=%r', self.trailer)
log.debug("trailer=%r", self.trailer)
def get_trailer(self) -> Dict[str, Any]:
return self.trailer
@ -183,11 +211,10 @@ class PDFXRef(PDFBaseXRef):
class PDFXRefFallback(PDFXRef):
def __repr__(self) -> str:
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
return "<PDFXRefFallback: offsets=%r>" % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
PDFOBJ_CUE = re.compile(r"^(\d+)\s+(\d+)\s+obj\b")
def load(self, parser: PDFParser) -> None:
parser.seek(0)
@ -196,12 +223,12 @@ class PDFXRefFallback(PDFXRef):
(pos, line_bytes) = parser.nextline()
except PSEOF:
break
if line_bytes.startswith(b'trailer'):
if line_bytes.startswith(b"trailer"):
parser.seek(pos)
self.load_trailer(parser)
log.debug('trailer: %r', self.trailer)
log.debug("trailer: %r", self.trailer)
break
line = line_bytes.decode('latin-1') # default pdf encoding
line = line_bytes.decode("latin-1") # default pdf encoding
m = self.PDFOBJ_CUE.match(line)
if not m:
continue
@ -212,14 +239,13 @@ class PDFXRefFallback(PDFXRef):
# expand ObjStm.
parser.seek(pos)
(_, obj) = parser.nextobject()
if isinstance(obj, PDFStream) \
and obj.get('Type') is LITERAL_OBJSTM:
if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM:
stream = stream_value(obj)
try:
n = stream['N']
n = stream["N"]
except KeyError:
if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
raise PDFSyntaxError("N is not defined: %r" % stream)
n = 0
parser1 = PDFStreamParser(stream.get_data())
objs: List[int] = []
@ -236,7 +262,6 @@ class PDFXRefFallback(PDFXRef):
class PDFXRefStream(PDFBaseXRef):
def __init__(self) -> None:
self.data: Optional[bytes] = None
self.entlen: Optional[int] = None
@ -246,31 +271,32 @@ class PDFXRefStream(PDFBaseXRef):
self.ranges: List[Tuple[int, int]] = []
def __repr__(self) -> str:
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
return "<PDFXRefStream: ranges=%r>" % (self.ranges)
def load(self, parser: PDFParser) -> None:
(_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken()
(_, stream) = parser.nextobject()
if not isinstance(stream, PDFStream) \
or stream.get('Type') is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (0, size))
if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF:
raise PDFNoValidXRef("Invalid PDF stream spec.")
size = stream["Size"]
index_array = stream.get("Index", (0, size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.ranges.extend(cast(Iterator[Tuple[int, int]],
choplist(2, index_array)))
(self.fl1, self.fl2, self.fl3) = stream['W']
assert (self.fl1 is not None and self.fl2 is not None
and self.fl3 is not None)
raise PDFSyntaxError("Invalid index number")
self.ranges.extend(cast(Iterator[Tuple[int, int]], choplist(2, index_array)))
(self.fl1, self.fl2, self.fl3) = stream["W"]
assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
self.data = stream.get_data()
self.entlen = self.fl1 + self.fl2 + self.fl3
self.trailer = stream.attrs
log.debug('xref stream: objid=%s, fields=%d,%d,%d',
', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)
log.debug(
"xref stream: objid=%s, fields=%d,%d,%d",
", ".join(map(repr, self.ranges)),
self.fl1,
self.fl2,
self.fl3,
)
return
def get_trailer(self) -> Dict[str, Any]:
@ -300,8 +326,7 @@ class PDFXRefStream(PDFBaseXRef):
raise KeyError(objid)
assert self.entlen is not None
assert self.data is not None
assert (self.fl1 is not None and self.fl2 is not None
and self.fl3 is not None)
assert self.fl1 is not None and self.fl2 is not None and self.fl3 is not None
offset = self.entlen * index
ent = self.data[offset : offset + self.entlen]
f1 = nunpack(ent[: self.fl1], 1)
@ -318,15 +343,14 @@ class PDFXRefStream(PDFBaseXRef):
class PDFStandardSecurityHandler:
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
PASSWORD_PADDING = (
b"(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08"
b"..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz"
)
supported_revisions: Tuple[int, ...] = (2, 3)
def __init__(
self,
docid: Sequence[bytes],
param: Dict[str, Any],
password: str = ''
self, docid: Sequence[bytes], param: Dict[str, Any], password: str = ""
) -> None:
self.docid = docid
self.param = param
@ -337,18 +361,18 @@ class PDFStandardSecurityHandler:
def init(self) -> None:
self.init_params()
if self.r not in self.supported_revisions:
error_msg = 'Unsupported revision: param=%r' % self.param
error_msg = "Unsupported revision: param=%r" % self.param
raise PDFEncryptionError(error_msg)
self.init_key()
return
def init_params(self) -> None:
self.v = int_value(self.param.get('V', 0))
self.r = int_value(self.param['R'])
self.p = uint_value(self.param['P'], 32)
self.o = str_value(self.param['O'])
self.u = str_value(self.param['U'])
self.length = int_value(self.param.get('Length', 40))
self.v = int_value(self.param.get("V", 0))
self.r = int_value(self.param["R"])
self.p = uint_value(self.param["P"], 32)
self.o = str_value(self.param["O"])
self.u = str_value(self.param["U"])
self.length = int_value(self.param.get("Length", 40))
return
def init_key(self) -> None:
@ -376,7 +400,7 @@ class PDFStandardSecurityHandler:
hash.update(self.docid[0]) # 3
result = Arcfour(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
k = b''.join(bytes((c ^ i,)) for c in iter(key))
k = b"".join(bytes((c ^ i,)) for c in iter(key))
result = Arcfour(k).encrypt(result)
result += result # 6
return result
@ -387,11 +411,11 @@ class PDFStandardSecurityHandler:
hash = md5(password) # 2
hash.update(self.o) # 3
# See https://github.com/pdfminer/pdfminer.six/issues/186
hash.update(struct.pack('<L', self.p)) # 4
hash.update(struct.pack("<L", self.p)) # 4
hash.update(self.docid[0]) # 5
if self.r >= 4:
if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
hash.update(b'\xff\xff\xff\xff')
hash.update(b"\xff\xff\xff\xff")
result = hash.digest()
n = 5
if self.r >= 3:
@ -437,7 +461,7 @@ class PDFStandardSecurityHandler:
else:
user_password = self.o
for i in range(19, -1, -1):
k = b''.join(bytes((c ^ i,)) for c in iter(key))
k = b"".join(bytes((c ^ i,)) for c in iter(key))
user_password = Arcfour(k).decrypt(user_password)
return self.authenticate_user_password(user_password)
@ -446,14 +470,13 @@ class PDFStandardSecurityHandler:
objid: int,
genno: int,
data: bytes,
attrs: Optional[Dict[str, Any]] = None
attrs: Optional[Dict[str, Any]] = None,
) -> bytes:
return self.decrypt_rc4(objid, genno, data)
def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
assert self.key is not None
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2]
key = self.key + struct.pack("<L", objid)[:3] + struct.pack("<L", genno)[:2]
hash = md5(key)
key = hash.digest()[: min(len(key), 16)]
return Arcfour(key).decrypt(data)
@ -466,34 +489,30 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
def init_params(self) -> None:
super().init_params()
self.length = 128
self.cf = dict_value(self.param.get('CF'))
self.stmf = literal_name(self.param['StmF'])
self.strf = literal_name(self.param['StrF'])
self.encrypt_metadata = bool(self.param.get('EncryptMetadata', True))
self.cf = dict_value(self.param.get("CF"))
self.stmf = literal_name(self.param["StmF"])
self.strf = literal_name(self.param["StrF"])
self.encrypt_metadata = bool(self.param.get("EncryptMetadata", True))
if self.stmf != self.strf:
error_msg = 'Unsupported crypt filter: param=%r' % self.param
error_msg = "Unsupported crypt filter: param=%r" % self.param
raise PDFEncryptionError(error_msg)
self.cfm = {}
for k, v in self.cf.items():
f = self.get_cfm(literal_name(v['CFM']))
f = self.get_cfm(literal_name(v["CFM"]))
if f is None:
error_msg = 'Unknown crypt filter method: param=%r' \
% self.param
error_msg = "Unknown crypt filter method: param=%r" % self.param
raise PDFEncryptionError(error_msg)
self.cfm[k] = f
self.cfm['Identity'] = self.decrypt_identity
self.cfm["Identity"] = self.decrypt_identity
if self.strf not in self.cfm:
error_msg = 'Undefined crypt filter: param=%r' % self.param
error_msg = "Undefined crypt filter: param=%r" % self.param
raise PDFEncryptionError(error_msg)
return
def get_cfm(
self,
name: str
) -> Optional[Callable[[int, int, bytes], bytes]]:
if name == 'V2':
def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
if name == "V2":
return self.decrypt_rc4
elif name == 'AESV2':
elif name == "AESV2":
return self.decrypt_aes128
else:
return None
@ -504,11 +523,11 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
genno: int,
data: bytes,
attrs: Optional[Dict[str, Any]] = None,
name: Optional[str] = None
name: Optional[str] = None,
) -> bytes:
if not self.encrypt_metadata and attrs is not None:
t = attrs.get('Type')
if t is not None and literal_name(t) == 'Metadata':
t = attrs.get("Type")
if t is not None and literal_name(t) == "Metadata":
return data
if name is None:
name = self.strf
@ -519,15 +538,21 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
assert self.key is not None
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2] + b'sAlT'
key = (
self.key
+ struct.pack("<L", objid)[:3]
+ struct.pack("<L", genno)[:2]
+ b"sAlT"
)
hash = md5(key)
key = hash.digest()[: min(len(key), 16)]
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(key),
cipher = Cipher(
algorithms.AES(key),
modes.CBC(initialization_vector),
backend=default_backend()) # type: ignore
backend=default_backend(),
) # type: ignore
return cipher.decryptor().update(ciphertext) # type: ignore
@ -538,8 +563,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
def init_params(self) -> None:
super().init_params()
self.length = 256
self.oe = str_value(self.param['OE'])
self.ue = str_value(self.param['UE'])
self.oe = str_value(self.param["OE"])
self.ue = str_value(self.param["UE"])
self.o_hash = self.o[:32]
self.o_validation_salt = self.o[32:40]
self.o_key_salt = self.o[40:]
@ -548,11 +573,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
self.u_key_salt = self.u[40:]
return
def get_cfm(
self,
name: str
) -> Optional[Callable[[int, int, bytes], bytes]]:
if name == 'AESV3':
def get_cfm(self, name: str) -> Optional[Callable[[int, int, bytes], bytes]]:
if name == "AESV3":
return self.decrypt_aes256
else:
return None
@ -562,16 +584,16 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
hash = self._password_hash(password_b, self.o_validation_salt, self.u)
if hash == self.o_hash:
hash = self._password_hash(password_b, self.o_key_salt, self.u)
cipher = Cipher(algorithms.AES(hash),
modes.CBC(b'\0' * 16),
backend=default_backend()) # type: ignore
cipher = Cipher(
algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend()
) # type: ignore
return cipher.decryptor().update(self.oe) # type: ignore
hash = self._password_hash(password_b, self.u_validation_salt)
if hash == self.u_hash:
hash = self._password_hash(password_b, self.u_key_salt)
cipher = Cipher(algorithms.AES(hash),
modes.CBC(b'\0' * 16),
backend=default_backend()) # type: ignore
cipher = Cipher(
algorithms.AES(hash), modes.CBC(b"\0" * 16), backend=default_backend()
) # type: ignore
return cipher.decryptor().update(self.ue) # type: ignore
return None
@ -579,16 +601,14 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
if self.r == 6:
# saslprep expects non-empty strings, apparently
if not password:
return b''
return b""
from ._saslprep import saslprep
password = saslprep(password)
return password.encode('utf-8')[:127]
return password.encode("utf-8")[:127]
def _password_hash(
self,
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
self, password: bytes, salt: bytes, vector: Optional[bytes] = None
) -> bytes:
"""
Compute password hash depending on revision number
@ -598,10 +618,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
return self._r6_password(password, salt[0:8], vector)
def _r5_password(
self,
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
self, password: bytes, salt: bytes, vector: Optional[bytes] = None
) -> bytes:
"""
Compute the password for revision 5
@ -613,10 +630,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
return hash.digest()
def _r6_password(
self,
password: bytes,
salt: bytes,
vector: Optional[bytes] = None
self, password: bytes, salt: bytes, vector: Optional[bytes] = None
) -> bytes:
"""
Compute the password for revision 6
@ -629,10 +643,8 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
hashes = (sha256, sha384, sha512)
round_no = last_byte_val = 0
while round_no < 64 or last_byte_val > round_no - 32:
k1 = (password + k + (vector or b'')) * 64
e = self._aes_cbc_encrypt(
key=k[:16], iv=k[16:32], data=k1
)
k1 = (password + k + (vector or b"")) * 64
e = self._aes_cbc_encrypt(key=k[:16], iv=k[16:32], data=k1)
# compute the first 16 bytes of e,
# interpreted as an unsigned integer mod 3
next_hash = hashes[self._bytes_mod_3(e[:16])]
@ -646,12 +658,7 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
# 256 is 1 mod 3, so we can just sum 'em
return sum(b % 3 for b in input_bytes) % 3
def _aes_cbc_encrypt(
self,
key: bytes,
iv: bytes,
data: bytes
) -> bytes:
def _aes_cbc_encrypt(self, key: bytes, iv: bytes, data: bytes) -> bytes:
cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
encryptor = cipher.encryptor() # type: ignore
return encryptor.update(data) + encryptor.finalize() # type: ignore
@ -660,9 +667,11 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
initialization_vector = data[:16]
ciphertext = data[16:]
assert self.key is not None
cipher = Cipher(algorithms.AES(self.key),
cipher = Cipher(
algorithms.AES(self.key),
modes.CBC(initialization_vector),
backend=default_backend()) # type: ignore
backend=default_backend(),
) # type: ignore
return cipher.decryptor().update(ciphertext) # type: ignore
@ -689,9 +698,9 @@ class PDFDocument:
def __init__(
self,
parser: PDFParser,
password: str = '',
password: str = "",
caching: bool = True,
fallback: bool = True
fallback: bool = True,
) -> None:
"Set the document to use a given PDFParser object."
self.caching = caching
@ -723,43 +732,42 @@ class PDFDocument:
if not trailer:
continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
if 'ID' in trailer:
id_value = list_value(trailer['ID'])
if "Encrypt" in trailer:
if "ID" in trailer:
id_value = list_value(trailer["ID"])
else:
# Some documents may not have a /ID, use two empty
# byte strings instead. Solves
# https://github.com/pdfminer/pdfminer.six/issues/594
id_value = (b'', b'')
self.encryption = (id_value,
dict_value(trailer['Encrypt']))
id_value = (b"", b"")
self.encryption = (id_value, dict_value(trailer["Encrypt"]))
self._initialize_password(password)
if 'Info' in trailer:
self.info.append(dict_value(trailer['Info']))
if 'Root' in trailer:
if "Info" in trailer:
self.info.append(dict_value(trailer["Info"]))
if "Root" in trailer:
# Every PDF file must have exactly one /Root dictionary.
self.catalog = dict_value(trailer['Root'])
self.catalog = dict_value(trailer["Root"])
break
else:
raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
if self.catalog.get('Type') is not LITERAL_CATALOG:
raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
if self.catalog.get("Type") is not LITERAL_CATALOG:
if settings.STRICT:
raise PDFSyntaxError('Catalog not found!')
raise PDFSyntaxError("Catalog not found!")
return
KEYWORD_OBJ = KWD(b'obj')
KEYWORD_OBJ = KWD(b"obj")
# _initialize_password(password=b'')
# Perform the initialization with a given password.
def _initialize_password(self, password: str = '') -> None:
def _initialize_password(self, password: str = "") -> None:
assert self.encryption is not None
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)
v = int_value(param.get('V', 0))
if literal_name(param.get("Filter")) != "Standard":
raise PDFEncryptionError("Unknown filter: param=%r" % param)
v = int_value(param.get("V", 0))
factory = self.security_handler_registry.get(v)
if factory is None:
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
handler = factory(docid, param, password)
self.decipher = handler.decrypt
self.is_printable = handler.is_printable()
@ -769,12 +777,7 @@ class PDFDocument:
self._parser.fallback = False # need to read streams with exact length
return
def _getobj_objstm(
self,
stream: PDFStream,
index: int,
objid: int
) -> object:
def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object:
if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid]
else:
@ -786,18 +789,18 @@ class PDFDocument:
try:
obj = objs[i]
except IndexError:
raise PDFSyntaxError('index too big: %r' % index)
raise PDFSyntaxError("index too big: %r" % index)
return obj
def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
if stream.get('Type') is not LITERAL_OBJSTM:
if stream.get("Type") is not LITERAL_OBJSTM:
if settings.STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
raise PDFSyntaxError("Not a stream object: %r" % stream)
try:
n = cast(int, stream['N'])
n = cast(int, stream["N"])
except KeyError:
if settings.STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
raise PDFSyntaxError("N is not defined: %r" % stream)
n = 0
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
@ -830,11 +833,10 @@ class PDFDocument:
objid1 = x[-2]
# #### end hack around malformed pdf files
if objid1 != objid:
raise PDFSyntaxError('objid mismatch: {!r}={!r}'
.format(objid1, objid))
raise PDFSyntaxError("objid mismatch: {!r}={!r}".format(objid1, objid))
if kwd != KWD(b'obj'):
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
if kwd != KWD(b"obj"):
raise PDFSyntaxError("Invalid object spec: offset=%r" % pos)
(_, obj) = self._parser.nextobject()
return obj
@ -846,8 +848,8 @@ class PDFDocument:
:raises PDFObjectNotFound if objid does not exist in PDF
"""
if not self.xrefs:
raise PDFException('PDFDocument is not initialized')
log.debug('getobj: objid=%r', objid)
raise PDFException("PDFDocument is not initialized")
log.debug("getobj: objid=%r", objid)
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
@ -863,8 +865,7 @@ class PDFDocument:
else:
obj = self._getobj_parse(index, objid)
if self.decipher:
obj = decipher_all(self.decipher, objid, genno,
obj)
obj = decipher_all(self.decipher, objid, genno, obj)
if isinstance(obj, PDFStream):
obj.set_objid(objid, genno)
@ -873,7 +874,7 @@ class PDFDocument:
continue
else:
raise PDFObjectNotFound(objid)
log.debug('register: objid=%r: %r', objid, obj)
log.debug("register: objid=%r: %r", objid, obj)
if self.caching:
self._cached_objs[objid] = (obj, genno)
return obj
@ -881,25 +882,25 @@ class PDFDocument:
OutlineType = Tuple[Any, Any, Any, Any, Any]
def get_outlines(self) -> Iterator[OutlineType]:
if 'Outlines' not in self.catalog:
if "Outlines" not in self.catalog:
raise PDFNoOutlines
def search(entry: object, level: int
) -> Iterator[PDFDocument.OutlineType]:
def search(entry: object, level: int) -> Iterator[PDFDocument.OutlineType]:
entry = dict_value(entry)
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
if "Title" in entry:
if "A" in entry or "Dest" in entry:
title = decode_text(str_value(entry["Title"]))
dest = entry.get("Dest")
action = entry.get("A")
se = entry.get("SE")
yield (level, title, dest, action, se)
if 'First' in entry and 'Last' in entry:
yield from search(entry['First'], level+1)
if 'Next' in entry:
yield from search(entry['Next'], level)
if "First" in entry and "Last" in entry:
yield from search(entry["First"], level + 1)
if "Next" in entry:
yield from search(entry["Next"], level)
return
return search(self.catalog['Outlines'], 0)
return search(self.catalog["Outlines"], 0)
def get_page_labels(self) -> Iterator[str]:
"""
@ -913,51 +914,49 @@ class PDFDocument:
assert self.catalog is not None
try:
page_labels = PageLabels(self.catalog['PageLabels'])
page_labels = PageLabels(self.catalog["PageLabels"])
except (PDFTypeError, KeyError):
raise PDFNoPageLabels
return page_labels.labels
def lookup_name(
self,
cat: str,
key: Union[str, bytes]
) -> Any:
def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
try:
names = dict_value(self.catalog['Names'])
names = dict_value(self.catalog["Names"])
except (PDFTypeError, KeyError):
raise KeyError((cat, key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d: Dict[str, Any]) -> Any:
if 'Limits' in d:
(k1, k2) = list_value(d['Limits'])
if "Limits" in d:
(k1, k2) = list_value(d["Limits"])
if key < k1 or k2 < key:
return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(cast(Iterator[Tuple[Union[str, bytes], Any]],
choplist(2, objs)))
if "Names" in d:
objs = list_value(d["Names"])
names = dict(
cast(Iterator[Tuple[Union[str, bytes], Any]], choplist(2, objs))
)
return names[key]
if 'Kids' in d:
for c in list_value(d['Kids']):
if "Kids" in d:
for c in list_value(d["Kids"]):
v = lookup(dict_value(c))
if v:
return v
raise KeyError((cat, key))
return lookup(d0)
def get_dest(self, name: Union[str, bytes]) -> Any:
try:
# PDF-1.2 or later
obj = self.lookup_name('Dests', name)
obj = self.lookup_name("Dests", name)
except KeyError:
# PDF-1.1 or prior
if 'Dests' not in self.catalog:
if "Dests" not in self.catalog:
raise PDFDestinationNotFound(name)
d0 = dict_value(self.catalog['Dests'])
d0 = dict_value(self.catalog["Dests"])
if name not in d0:
raise PDFDestinationNotFound(name)
obj = d0[name]
@ -970,23 +969,20 @@ class PDFDocument:
prev = None
for line in parser.revreadlines():
line = line.strip()
log.debug('find_xref: %r', line)
if line == b'startxref':
log.debug("find_xref: %r", line)
if line == b"startxref":
break
if line:
prev = line
else:
raise PDFNoValidXRef('Unexpected EOF')
log.debug('xref found: pos=%r', prev)
raise PDFNoValidXRef("Unexpected EOF")
log.debug("xref found: pos=%r", prev)
assert prev is not None
return int(prev)
# read xref table
def read_xref_from(
self,
parser: PDFParser,
start: int,
xrefs: List[PDFBaseXRef]
self, parser: PDFParser, start: int, xrefs: List[PDFBaseXRef]
) -> None:
"""Reads XRefs from the given location."""
parser.seek(start)
@ -994,8 +990,8 @@ class PDFDocument:
try:
(pos, token) = parser.nexttoken()
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF')
log.debug('read_xref_from: start=%d, token=%r', start, token)
raise PDFNoValidXRef("Unexpected EOF")
log.debug("read_xref_from: start=%d, token=%r", start, token)
if isinstance(token, int):
# XRefStream: PDF-1.5
parser.seek(pos)
@ -1009,13 +1005,13 @@ class PDFDocument:
xref.load(parser)
xrefs.append(xref)
trailer = xref.get_trailer()
log.debug('trailer: %r', trailer)
if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm'])
log.debug("trailer: %r", trailer)
if "XRefStm" in trailer:
pos = int_value(trailer["XRefStm"])
self.read_xref_from(parser, pos, xrefs)
if 'Prev' in trailer:
if "Prev" in trailer:
# find previous xref
pos = int_value(trailer['Prev'])
pos = int_value(trailer["Prev"])
self.read_xref_from(parser, pos, xrefs)
return
@ -1033,16 +1029,16 @@ class PageLabels(NumberTree):
# The tree must begin with page index 0
if len(ranges) == 0 or ranges[0][0] != 0:
if settings.STRICT:
raise PDFSyntaxError('PageLabels is missing page index 0')
raise PDFSyntaxError("PageLabels is missing page index 0")
else:
# Try to cope, by assuming empty labels for the initial pages
ranges.insert(0, (0, {}))
for (next, (start, label_dict_unchecked)) in enumerate(ranges, 1):
label_dict = dict_value(label_dict_unchecked)
style = label_dict.get('S')
prefix = decode_text(str_value(label_dict.get('P', b'')))
first_value = int_value(label_dict.get('St', 1))
style = label_dict.get("S")
prefix = decode_text(str_value(label_dict.get("P", b"")))
first_value = int_value(label_dict.get("St", 1))
if next == len(ranges):
# This is the last specified range. It continues until the end
@ -1061,18 +1057,18 @@ class PageLabels(NumberTree):
def _format_page_label(value: int, style: Any) -> str:
"""Format page label value in a specific style"""
if style is None:
label = ''
elif style is LIT('D'): # Decimal arabic numerals
label = ""
elif style is LIT("D"): # Decimal arabic numerals
label = str(value)
elif style is LIT('R'): # Uppercase roman numerals
elif style is LIT("R"): # Uppercase roman numerals
label = format_int_roman(value).upper()
elif style is LIT('r'): # Lowercase roman numerals
elif style is LIT("r"): # Lowercase roman numerals
label = format_int_roman(value)
elif style is LIT('A'): # Uppercase letters A-Z, AA-ZZ...
elif style is LIT("A"): # Uppercase letters A-Z, AA-ZZ...
label = format_int_alpha(value).upper()
elif style is LIT('a'): # Lowercase letters a-z, aa-zz...
elif style is LIT("a"): # Lowercase letters a-z, aa-zz...
label = format_int_alpha(value)
else:
log.warning('Unknown page label style: %r', style)
label = ''
log.warning("Unknown page label style: %r", style)
label = ""
return label

File diff suppressed because it is too large Load Diff

View File

@ -50,11 +50,11 @@ class PDFInterpreterError(PDFException):
pass
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')
LITERAL_PDF = LIT("PDF")
LITERAL_TEXT = LIT("Text")
LITERAL_FONT = LIT("Font")
LITERAL_FORM = LIT("Form")
LITERAL_IMAGE = LIT("Image")
class PDFTextState:
@ -75,12 +75,23 @@ class PDFTextState:
# self.linematrix is set
def __repr__(self) -> str:
return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \
'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \
'matrix=%r, linematrix=%r>' \
% (self.font, self.fontsize, self.charspace, self.wordspace,
self.scaling, self.leading, self.render, self.rise,
self.matrix, self.linematrix)
return (
"<PDFTextState: font=%r, fontsize=%r, charspace=%r, "
"wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, "
"matrix=%r, linematrix=%r>"
% (
self.font,
self.fontsize,
self.charspace,
self.wordspace,
self.scaling,
self.leading,
self.render,
self.rise,
self.matrix,
self.linematrix,
)
)
def copy(self) -> "PDFTextState":
obj = PDFTextState()
@ -104,11 +115,11 @@ class PDFTextState:
Color = Union[
float, # Greyscale
Tuple[float, float, float], # R, G, B
Tuple[float, float, float, float]] # C, M, Y, K
Tuple[float, float, float, float],
] # C, M, Y, K
class PDFGraphicState:
def __init__(self) -> None:
self.linewidth: float = 0
self.linecap: Optional[object] = None
@ -138,12 +149,22 @@ class PDFGraphicState:
return obj
def __repr__(self) -> str:
return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
' stroking color=%r, non stroking color=%r>' %
(self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness,
self.scolor, self.ncolor))
return (
"<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, "
" miterlimit=%r, dash=%r, intent=%r, flatness=%r, "
" stroking color=%r, non stroking color=%r>"
% (
self.linewidth,
self.linecap,
self.linejoin,
self.miterlimit,
self.dash,
self.intent,
self.flatness,
self.scolor,
self.ncolor,
)
)
class PDFResourceManager:
@ -179,41 +200,41 @@ class PDFResourceManager:
if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid]
else:
log.debug('get_font: create: objid=%r, spec=%r', objid, spec)
log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
if settings.STRICT:
if spec['Type'] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font')
if spec["Type"] is not LITERAL_FONT:
raise PDFFontError("Type is not /Font")
# Create a Font object.
if 'Subtype' in spec:
subtype = literal_name(spec['Subtype'])
if "Subtype" in spec:
subtype = literal_name(spec["Subtype"])
else:
if settings.STRICT:
raise PDFFontError('Font Subtype is not specified.')
subtype = 'Type1'
if subtype in ('Type1', 'MMType1'):
raise PDFFontError("Font Subtype is not specified.")
subtype = "Type1"
if subtype in ("Type1", "MMType1"):
# Type1 Font
font = PDFType1Font(self, spec)
elif subtype == 'TrueType':
elif subtype == "TrueType":
# TrueType Font
font = PDFTrueTypeFont(self, spec)
elif subtype == 'Type3':
elif subtype == "Type3":
# Type3 Font
font = PDFType3Font(self, spec)
elif subtype in ('CIDFontType0', 'CIDFontType2'):
elif subtype in ("CIDFontType0", "CIDFontType2"):
# CID Font
font = PDFCIDFont(self, spec)
elif subtype == 'Type0':
elif subtype == "Type0":
# Type0 Font
dfonts = list_value(spec['DescendantFonts'])
dfonts = list_value(spec["DescendantFonts"])
assert dfonts
subspec = dict_value(dfonts[0]).copy()
for k in ('Encoding', 'ToUnicode'):
for k in ("Encoding", "ToUnicode"):
if k in spec:
subspec[k] = resolve1(spec[k])
font = self.get_font(None, subspec)
else:
if settings.STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec)
raise PDFFontError("Invalid Font spec: %r" % spec)
font = PDFType1Font(self, spec) # this is so wrong!
if objid and self.caching:
self._cached_fonts[objid] = font
@ -221,7 +242,6 @@ class PDFResourceManager:
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
def __init__(self, streams: Sequence[object]) -> None:
self.streams = streams
self.istream = 0
@ -236,7 +256,7 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
strm = stream_value(self.streams[self.istream])
self.istream += 1
else:
raise PSEOF('Unexpected EOF, file truncated?')
raise PSEOF("Unexpected EOF, file truncated?")
self.fp = BytesIO(strm.get_data())
def seek(self, pos: int) -> None:
@ -255,14 +275,10 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
self.fp = None # type: ignore[assignment]
self.charpos = 0
def get_inline_data(
self,
pos: int,
target: bytes = b'EI'
) -> Tuple[int, bytes]:
def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
self.seek(pos)
i = 0
data = b''
data = b""
while i <= len(target):
self.fillbuf()
if i:
@ -286,29 +302,28 @@ class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
data += self.buf[self.charpos :]
self.charpos = len(self.buf)
data = data[: -(len(target) + 1)] # strip the last part
data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
return (pos, data)
def flush(self) -> None:
self.add_results(*self.popall())
KEYWORD_BI = KWD(b'BI')
KEYWORD_ID = KWD(b'ID')
KEYWORD_EI = KWD(b'EI')
KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")
def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BI:
# inline image within a content stream
self.start_type(pos, 'inline')
self.start_type(pos, "inline")
elif token is self.KEYWORD_ID:
try:
(_, objs) = self.end_type('inline')
(_, objs) = self.end_type("inline")
if len(objs) % 2 != 0:
error_msg = 'Invalid dictionary construct: {!r}' \
.format(objs)
error_msg = "Invalid dictionary construct: {!r}".format(objs)
raise PSTypeError(error_msg)
d = {literal_name(k): v for (k, v) in choplist(2, objs)}
(pos, data) = self.get_inline_data(pos+len(b'ID '))
(pos, data) = self.get_inline_data(pos + len(b"ID "))
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
@ -351,32 +366,30 @@ class PDFPageInterpreter:
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased' and isinstance(spec, list) \
and 2 <= len(spec):
return PDFColorSpace(name, stream_value(spec[1])['N'])
elif name == 'DeviceN' and isinstance(spec, list) \
and 2 <= len(spec):
if name == "ICCBased" and isinstance(spec, list) and 2 <= len(spec):
return PDFColorSpace(name, stream_value(spec[1])["N"])
elif name == "DeviceN" and isinstance(spec, list) and 2 <= len(spec):
return PDFColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE.get(name)
for (k, v) in dict_value(resources).items():
log.debug('Resource: %r: %r', k, v)
if k == 'Font':
log.debug("Resource: %r: %r", k, v)
if k == "Font":
for (fontid, spec) in dict_value(v).items():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace':
elif k == "ColorSpace":
for (csid, spec) in dict_value(v).items():
colorspace = get_colorspace(resolve1(spec))
if colorspace is not None:
self.csmap[csid] = colorspace
elif k == 'ProcSet':
elif k == "ProcSet":
self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject':
elif k == "XObject":
for (xobjid, xobjstrm) in dict_value(v).items():
self.xobjmap[xobjid] = xobjstrm
return
@ -410,14 +423,11 @@ class PDFPageInterpreter:
self.argstack = self.argstack[:-n]
return x
def get_current_state(
self
) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
def set_current_state(
self,
state: Tuple[Matrix, PDFTextState, PDFGraphicState]
self, state: Tuple[Matrix, PDFTextState, PDFGraphicState]
) -> None:
(self.ctm, self.textstate, self.graphicstate) = state
self.device.set_ctm(self.ctm)
@ -441,11 +451,10 @@ class PDFPageInterpreter:
c1: PDFStackT,
d1: PDFStackT,
e1: PDFStackT,
f1: PDFStackT
f1: PDFStackT,
) -> None:
"""Concatenate matrix to current transformation matrix"""
self.ctm = \
mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
self.device.set_ctm(self.ctm)
return
@ -491,12 +500,12 @@ class PDFPageInterpreter:
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
"""Begin new subpath"""
self.curpath.append(('m', cast(float, x), cast(float, y)))
self.curpath.append(("m", cast(float, x), cast(float, y)))
return
def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
"""Append straight line segment to path"""
self.curpath.append(('l', cast(float, x), cast(float, y)))
self.curpath.append(("l", cast(float, x), cast(float, y)))
return
def do_c(
@ -506,66 +515,57 @@ class PDFPageInterpreter:
x2: PDFStackT,
y2: PDFStackT,
x3: PDFStackT,
y3: PDFStackT
y3: PDFStackT,
) -> None:
"""Append curved segment to path (three control points)"""
self.curpath.append(('c', cast(float, x1), cast(float, y1),
cast(float, x2), cast(float, y2),
cast(float, x3), cast(float, y3)))
self.curpath.append(
(
"c",
cast(float, x1),
cast(float, y1),
cast(float, x2),
cast(float, y2),
cast(float, x3),
cast(float, y3),
)
)
return
def do_v(
self,
x2: PDFStackT,
y2: PDFStackT,
x3: PDFStackT,
y3: PDFStackT
) -> None:
def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
"""Append curved segment to path (initial point replicated)"""
self.curpath.append(('v', cast(float, x2), cast(float, y2),
cast(float, x3), cast(float, y3)))
self.curpath.append(
("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3))
)
return
def do_y(
self,
x1: PDFStackT,
y1: PDFStackT,
x3: PDFStackT,
y3: PDFStackT
) -> None:
def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None:
"""Append curved segment to path (final point replicated)"""
self.curpath.append(('y', cast(float, x1), cast(float, y1),
cast(float, x3), cast(float, y3)))
self.curpath.append(
("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3))
)
return
def do_h(self) -> None:
"""Close subpath"""
self.curpath.append(('h',))
self.curpath.append(("h",))
return
def do_re(
self,
x: PDFStackT,
y: PDFStackT,
w: PDFStackT,
h: PDFStackT
) -> None:
def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
"""Append rectangle to path"""
x = cast(float, x)
y = cast(float, y)
w = cast(float, w)
h = cast(float, h)
self.curpath.append(('m', x, y))
self.curpath.append(('l', x+w, y))
self.curpath.append(('l', x+w, y+h))
self.curpath.append(('l', x, y+h))
self.curpath.append(('h',))
self.curpath.append(("m", x, y))
self.curpath.append(("l", x + w, y))
self.curpath.append(("l", x + w, y + h))
self.curpath.append(("l", x, y + h))
self.curpath.append(("h",))
return
def do_S(self) -> None:
"""Stroke path"""
self.device.paint_path(self.graphicstate, True, False, False,
self.curpath)
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = []
return
@ -577,8 +577,7 @@ class PDFPageInterpreter:
def do_f(self) -> None:
"""Fill path using nonzero winding number rule"""
self.device.paint_path(self.graphicstate, False, True, False,
self.curpath)
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
self.curpath = []
return
@ -588,22 +587,19 @@ class PDFPageInterpreter:
def do_f_a(self) -> None:
"""Fill path using even-odd rule"""
self.device.paint_path(self.graphicstate, False, True, True,
self.curpath)
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = []
return
def do_B(self) -> None:
"""Fill and stroke path using nonzero winding number rule"""
self.device.paint_path(self.graphicstate, True, True, False,
self.curpath)
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = []
return
def do_B_a(self) -> None:
"""Fill and stroke path using even-odd rule"""
self.device.paint_path(self.graphicstate, True, True, True,
self.curpath)
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = []
return
@ -641,7 +637,7 @@ class PDFPageInterpreter:
self.scs = self.csmap[literal_name(name)]
except KeyError:
if settings.STRICT:
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
return
def do_cs(self, name: PDFStackT) -> None:
@ -650,7 +646,7 @@ class PDFPageInterpreter:
self.ncs = self.csmap[literal_name(name)]
except KeyError:
if settings.STRICT:
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
raise PDFInterpreterError("Undefined ColorSpace: %r" % name)
return
def do_G(self, gray: PDFStackT) -> None:
@ -665,38 +661,32 @@ class PDFPageInterpreter:
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
"""Set RGB color for stroking operations"""
self.graphicstate.scolor = \
(cast(float, r), cast(float, g), cast(float, b))
self.graphicstate.scolor = (cast(float, r), cast(float, g), cast(float, b))
return
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
"""Set RGB color for nonstroking operations"""
self.graphicstate.ncolor = \
(cast(float, r), cast(float, g), cast(float, b))
self.graphicstate.ncolor = (cast(float, r), cast(float, g), cast(float, b))
return
def do_K(
self,
c: PDFStackT,
m: PDFStackT,
y: PDFStackT,
k: PDFStackT
) -> None:
def do_K(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
"""Set CMYK color for stroking operations"""
self.graphicstate.scolor = \
(cast(float, c), cast(float, m), cast(float, y), cast(float, k))
self.graphicstate.scolor = (
cast(float, c),
cast(float, m),
cast(float, y),
cast(float, k),
)
return
def do_k(
self,
c: PDFStackT,
m: PDFStackT,
y: PDFStackT,
k: PDFStackT
) -> None:
def do_k(self, c: PDFStackT, m: PDFStackT, y: PDFStackT, k: PDFStackT) -> None:
"""Set CMYK color for nonstroking operations"""
self.graphicstate.ncolor = \
(cast(float, c), cast(float, m), cast(float, y), cast(float, k))
self.graphicstate.ncolor = (
cast(float, c),
cast(float, m),
cast(float, y),
cast(float, k),
)
return
def do_SCN(self) -> None:
@ -705,7 +695,7 @@ class PDFPageInterpreter:
n = self.scs.ncomponents
else:
if settings.STRICT:
raise PDFInterpreterError('No colorspace specified!')
raise PDFInterpreterError("No colorspace specified!")
n = 1
self.graphicstate.scolor = cast(Color, self.pop(n))
return
@ -716,7 +706,7 @@ class PDFPageInterpreter:
n = self.ncs.ncomponents
else:
if settings.STRICT:
raise PDFInterpreterError('No colorspace specified!')
raise PDFInterpreterError("No colorspace specified!")
n = 1
self.graphicstate.ncolor = cast(Color, self.pop(n))
return
@ -831,7 +821,7 @@ class PDFPageInterpreter:
self.textstate.font = self.fontmap[literal_name(fontid)]
except KeyError:
if settings.STRICT:
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
raise PDFInterpreterError("Undefined Font id: %r" % fontid)
self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = cast(float, fontsize)
return
@ -875,7 +865,7 @@ class PDFPageInterpreter:
c: PDFStackT,
d: PDFStackT,
e: PDFStackT,
f: PDFStackT
f: PDFStackT,
) -> None:
"""Set text matrix and text line matrix"""
self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
@ -885,8 +875,14 @@ class PDFPageInterpreter:
def do_T_a(self) -> None:
"""Move to start of next text line"""
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e,
self.textstate.leading*d+f)
self.textstate.matrix = (
a,
b,
c,
d,
self.textstate.leading * c + e,
self.textstate.leading * d + f,
)
self.textstate.linematrix = (0, 0)
return
@ -894,11 +890,12 @@ class PDFPageInterpreter:
"""Show text, allowing individual glyph positioning"""
if self.textstate.font is None:
if settings.STRICT:
raise PDFInterpreterError('No font specified!')
raise PDFInterpreterError("No font specified!")
return
assert self.ncs is not None
self.device.render_string(self.textstate, cast(PDFTextSeq, seq),
self.ncs, self.graphicstate.copy())
self.device.render_string(
self.textstate, cast(PDFTextSeq, seq), self.ncs, self.graphicstate.copy()
)
return
def do_Tj(self, s: PDFStackT) -> None:
@ -935,7 +932,7 @@ class PDFPageInterpreter:
def do_EI(self, obj: PDFStackT) -> None:
"""End inline image object"""
if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj:
if isinstance(obj, PDFStream) and "W" in obj and "H" in obj:
iobjid = str(id(obj))
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(iobjid, obj)
@ -949,28 +946,28 @@ class PDFPageInterpreter:
xobj = stream_value(self.xobjmap[xobjid])
except KeyError:
if settings.STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
return
log.debug('Processing xobj: %r', xobj)
subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj:
log.debug("Processing xobj: %r", xobj)
subtype = xobj.get("Subtype")
if subtype is LITERAL_FORM and "BBox" in xobj:
interpreter = self.dup()
bbox = cast(Rect, list_value(xobj['BBox']))
matrix = cast(Matrix, list_value(
xobj.get('Matrix', MATRIX_IDENTITY)))
bbox = cast(Rect, list_value(xobj["BBox"]))
matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
# According to PDF reference 1.7 section 4.9.1, XObjects in
# earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry.
xobjres = xobj.get('Resources')
xobjres = xobj.get("Resources")
if xobjres:
resources = dict_value(xobjres)
else:
resources = self.resources.copy()
self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(resources, [xobj],
ctm=mult_matrix(matrix, self.ctm))
interpreter.render_contents(
resources, [xobj], ctm=mult_matrix(matrix, self.ctm)
)
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(xobjid, xobj)
self.device.end_figure(xobjid)
@ -980,7 +977,7 @@ class PDFPageInterpreter:
return
def process_page(self, page: PDFPage) -> None:
log.debug('Processing page: %r', page)
log.debug("Processing page: %r", page)
(x0, y0, x1, y1) = page.mediabox
if page.rotate == 90:
ctm = (0, -1, 1, 0, -y0, x1)
@ -999,14 +996,15 @@ class PDFPageInterpreter:
self,
resources: Dict[object, object],
streams: Sequence[object],
ctm: Matrix = MATRIX_IDENTITY
ctm: Matrix = MATRIX_IDENTITY,
) -> None:
"""Render the content streams.
This method may be called recursively.
"""
log.debug('render_contents: resources=%r, streams=%r, ctm=%r',
resources, streams, ctm)
log.debug(
"render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
)
self.init_resources(resources)
self.init_state(ctm)
self.execute(list_value(streams))
@ -1025,22 +1023,23 @@ class PDFPageInterpreter:
break
if isinstance(obj, PSKeyword):
name = keyword_name(obj)
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w')\
.replace("'", '_q')
method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
"'", "_q"
)
if hasattr(self, method):
func = getattr(self, method)
nargs = func.__code__.co_argcount - 1
if nargs:
args = self.pop(nargs)
log.debug('exec: %s %r', name, args)
log.debug("exec: %s %r", name, args)
if len(args) == nargs:
func(*args)
else:
log.debug('exec: %s', name)
log.debug("exec: %s", name)
func()
else:
if settings.STRICT:
error_msg = 'Unknown operator: %r' % name
error_msg = "Unknown operator: %r" % name
raise PDFInterpreterError(error_msg)
else:
self.push(obj)

View File

@ -4,8 +4,7 @@ from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
from pdfminer.utils import Rect
from . import settings
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
PDFNoPageLabels
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, PDFNoPageLabels
from .pdfparser import PDFParser
from .pdftypes import PDFObjectNotFound
from .pdftypes import dict_value
@ -17,8 +16,8 @@ from .psparser import LIT
log = logging.getLogger(__name__)
# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
class PDFPage:
@ -44,11 +43,7 @@ class PDFPage:
"""
def __init__(
self,
doc: PDFDocument,
pageid: object,
attrs: object,
label: Optional[str]
self, doc: PDFDocument, pageid: object, attrs: object, label: Optional[str]
) -> None:
"""Initialize a page object.
@ -61,19 +56,20 @@ class PDFPage:
self.pageid = pageid
self.attrs = dict_value(attrs)
self.label = label
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources: Dict[object, object] = \
resolve1(self.attrs.get('Resources', dict()))
self.mediabox: Rect = resolve1(self.attrs['MediaBox'])
if 'CropBox' in self.attrs:
self.cropbox: Rect = resolve1(self.attrs['CropBox'])
self.lastmod = resolve1(self.attrs.get("LastModified"))
self.resources: Dict[object, object] = resolve1(
self.attrs.get("Resources", dict())
)
self.mediabox: Rect = resolve1(self.attrs["MediaBox"])
if "CropBox" in self.attrs:
self.cropbox: Rect = resolve1(self.attrs["CropBox"])
else:
self.cropbox = self.mediabox
self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents'])
self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
self.annots = self.attrs.get("Annots")
self.beads = self.attrs.get("B")
if "Contents" in self.attrs:
contents = resolve1(self.attrs["Contents"])
else:
contents = []
if not isinstance(contents, list):
@ -81,16 +77,16 @@ class PDFPage:
self.contents: List[object] = contents
def __repr__(self) -> str:
return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
.format(self.resources, self.mediabox)
return "<PDFPage: Resources={!r}, MediaBox={!r}>".format(
self.resources, self.mediabox
)
INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}
INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
@classmethod
def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
def search(
obj: object,
parent: Dict[str, object]
obj: object, parent: Dict[str, object]
) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]:
if isinstance(obj, int):
objid = obj
@ -104,16 +100,16 @@ class PDFPage:
if k in cls.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
tree_type = tree.get('Type')
tree_type = tree.get("Type")
if tree_type is None and not settings.STRICT: # See #64
tree_type = tree.get('type')
tree_type = tree.get("type")
if tree_type is LITERAL_PAGES and 'Kids' in tree:
log.debug('Pages: Kids=%r', tree['Kids'])
for c in list_value(tree['Kids']):
if tree_type is LITERAL_PAGES and "Kids" in tree:
log.debug("Pages: Kids=%r", tree["Kids"])
for c in list_value(tree["Kids"]):
yield from search(c, tree)
elif tree_type is LITERAL_PAGE:
log.debug('Page: %r', tree)
log.debug("Page: %r", tree)
yield (objid, tree)
try:
@ -122,8 +118,8 @@ class PDFPage:
page_labels = itertools.repeat(None)
pages = False
if 'Pages' in document.catalog:
objects = search(document.catalog['Pages'], document.catalog)
if "Pages" in document.catalog:
objects = search(document.catalog["Pages"], document.catalog)
for (objid, tree) in objects:
yield cls(document, objid, tree, next(page_labels))
pages = True
@ -133,8 +129,7 @@ class PDFPage:
for objid in xref.get_objids():
try:
obj = document.getobj(objid)
if isinstance(obj, dict) \
and obj.get('Type') is LITERAL_PAGE:
if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
yield cls(document, objid, obj, next(page_labels))
except PDFObjectNotFound:
pass
@ -146,9 +141,9 @@ class PDFPage:
fp: BinaryIO,
pagenos: Optional[Container[int]] = None,
maxpages: int = 0,
password: str = '',
password: str = "",
caching: bool = True,
check_extractable: bool = False
check_extractable: bool = False,
) -> Iterator["PDFPage"]:
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
@ -158,14 +153,16 @@ class PDFPage:
# If not, warn the user and proceed.
if not doc.is_extractable:
if check_extractable:
error_msg = 'Text extraction is not allowed: %r' % fp
error_msg = "Text extraction is not allowed: %r" % fp
raise PDFTextExtractionNotAllowed(error_msg)
else:
warning_msg = 'The PDF %r contains a metadata field '\
'indicating that it should not allow ' \
'text extraction. Ignoring this field ' \
'and proceeding. Use the check_extractable ' \
'if you want to raise an error in this case' % fp
warning_msg = (
"The PDF %r contains a metadata field "
"indicating that it should not allow "
"text extraction. Ignoring this field "
"and proceeding. Use the check_extractable "
"if you want to raise an error in this case" % fp
)
log.warning(warning_msg)
# Process each page contained in the document.
for (pageno, page) in enumerate(cls.create_pages(doc)):

View File

@ -51,12 +51,12 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
"""Associates the parser with a PDFDocument object."""
self.doc = doc
KEYWORD_R = KWD(b'R')
KEYWORD_NULL = KWD(b'null')
KEYWORD_ENDOBJ = KWD(b'endobj')
KEYWORD_STREAM = KWD(b'stream')
KEYWORD_XREF = KWD(b'xref')
KEYWORD_STARTXREF = KWD(b'startxref')
KEYWORD_R = KWD(b"R")
KEYWORD_NULL = KWD(b"null")
KEYWORD_ENDOBJ = KWD(b"endobj")
KEYWORD_STREAM = KWD(b"stream")
KEYWORD_XREF = KWD(b"xref")
KEYWORD_STARTXREF = KWD(b"startxref")
def do_keyword(self, pos: int, token: PSKeyword) -> None:
"""Handles PDF-related keywords."""
@ -76,8 +76,7 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
if len(self.curstack) >= 2:
try:
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (
int(objid), int(genno)) # type: ignore[arg-type]
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
assert self.doc is not None
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
@ -90,16 +89,16 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
objlen = 0
if not self.fallback:
try:
objlen = int_value(dic['Length'])
objlen = int_value(dic["Length"])
except KeyError:
if settings.STRICT:
raise PDFSyntaxError('/Length is undefined: %r' % dic)
raise PDFSyntaxError("/Length is undefined: %r" % dic)
self.seek(pos)
try:
(_, line) = self.nextline() # 'stream'
except PSEOF:
if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF')
raise PDFSyntaxError("Unexpected EOF")
return
pos += len(line)
self.fp.seek(pos)
@ -110,10 +109,10 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
(linepos, line) = self.nextline()
except PSEOF:
if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF')
raise PDFSyntaxError("Unexpected EOF")
break
if b'endstream' in line:
i = line.index(b'endstream')
if b"endstream" in line:
i = line.index(b"endstream")
objlen += i
if self.fallback:
data += line[:i]
@ -123,8 +122,13 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
data += line
self.seek(pos + objlen)
# XXX limit objlen not to exceed object boundary
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
objlen, dic, data[:10])
log.debug(
"Stream: pos=%d, objlen=%d, dic=%r, data=%r...",
pos,
objlen,
dic,
data[:10],
)
assert self.doc is not None
stream = PDFStream(dic, bytes(data), self.doc.decipher)
self.push((pos, stream))
@ -149,15 +153,14 @@ class PDFStreamParser(PDFParser):
def flush(self) -> None:
self.add_results(*self.popall())
KEYWORD_OBJ = KWD(b'obj')
KEYWORD_OBJ = KWD(b"obj")
def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_R:
# reference to indirect object
try:
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (
int(objid), int(genno)) # type: ignore[arg-type]
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
except PSSyntaxError:
@ -167,7 +170,7 @@ class PDFStreamParser(PDFParser):
if settings.STRICT:
# See PDF Spec 3.4.6: Only the object values are stored in the
# stream; the obj and endobj keywords are not used.
raise PDFSyntaxError('Keyword endobj found in stream')
raise PDFSyntaxError("Keyword endobj found in stream")
return
# others
self.push((pos, token))

View File

@ -2,8 +2,17 @@ import io
import logging
import sys
import zlib
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List,
Tuple, cast)
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Optional,
Union,
List,
Tuple,
cast,
)
from . import settings
from .ascii85 import ascii85decode
@ -21,18 +30,18 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
LITERAL_CRYPT = LIT('Crypt')
LITERAL_CRYPT = LIT("Crypt")
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
LITERALS_JPX_DECODE = (LIT('JPXDecode'),)
LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl"))
LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW"))
LITERALS_ASCII85_DECODE = (LIT("ASCII85Decode"), LIT("A85"))
LITERALS_ASCIIHEX_DECODE = (LIT("ASCIIHexDecode"), LIT("AHx"))
LITERALS_RUNLENGTH_DECODE = (LIT("RunLengthDecode"), LIT("RL"))
LITERALS_CCITTFAX_DECODE = (LIT("CCITTFaxDecode"), LIT("CCF"))
LITERALS_DCT_DECODE = (LIT("DCTDecode"), LIT("DCT"))
LITERALS_JBIG2_DECODE = (LIT("JBIG2Decode"),)
LITERALS_JPX_DECODE = (LIT("JPXDecode"),)
if sys.version_info >= (3, 8):
@ -40,8 +49,14 @@ if sys.version_info >= (3, 8):
class DecipherCallable(Protocol):
"""Fully typed a decipher callback, with optional parameter."""
def __call__(self, objid: int, genno: int, data: bytes,
attrs: Optional[Dict[str, Any]] = None) -> bytes:
def __call__(
self,
objid: int,
genno: int,
data: bytes,
attrs: Optional[Dict[str, Any]] = None,
) -> bytes:
raise NotImplementedError
else: # Fallback for older Python
@ -75,21 +90,15 @@ class PDFNotImplementedError(PDFException):
class PDFObjRef(PDFObject):
def __init__(
self,
doc: Optional["PDFDocument"],
objid: int,
_: object
) -> None:
def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object) -> None:
if objid == 0:
if settings.STRICT:
raise PDFValueError('PDF object id cannot be 0.')
raise PDFValueError("PDF object id cannot be 0.")
self.doc = doc
self.objid = objid
def __repr__(self) -> str:
return '<PDFObjRef:%d>' % (self.objid)
return "<PDFObjRef:%d>" % (self.objid)
def resolve(self, default: object = None) -> Any:
assert self.doc is not None
@ -126,14 +135,8 @@ def resolve_all(x: object, default: object = None) -> Any:
return x
def decipher_all(
decipher: DecipherCallable,
objid: int,
genno: int,
x: object
) -> Any:
"""Recursively deciphers the given object.
"""
def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any:
"""Recursively deciphers the given object."""
if isinstance(x, bytes):
return decipher(objid, genno, x)
if isinstance(x, list):
@ -148,7 +151,7 @@ def int_value(x: object) -> int:
x = resolve1(x)
if not isinstance(x, int):
if settings.STRICT:
raise PDFTypeError('Integer required: %r' % x)
raise PDFTypeError("Integer required: %r" % x)
return 0
return x
@ -157,7 +160,7 @@ def float_value(x: object) -> float:
x = resolve1(x)
if not isinstance(x, float):
if settings.STRICT:
raise PDFTypeError('Float required: %r' % x)
raise PDFTypeError("Float required: %r" % x)
return 0.0
return x
@ -166,7 +169,7 @@ def num_value(x: object) -> float:
x = resolve1(x)
if not isinstance(x, (int, float)): # == utils.isnumber(x)
if settings.STRICT:
raise PDFTypeError('Int or Float required: %r' % x)
raise PDFTypeError("Int or Float required: %r" % x)
return 0
return x
@ -184,8 +187,8 @@ def str_value(x: object) -> bytes:
x = resolve1(x)
if not isinstance(x, bytes):
if settings.STRICT:
raise PDFTypeError('String required: %r' % x)
return b''
raise PDFTypeError("String required: %r" % x)
return b""
return x
@ -193,7 +196,7 @@ def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
x = resolve1(x)
if not isinstance(x, (list, tuple)):
if settings.STRICT:
raise PDFTypeError('List required: %r' % x)
raise PDFTypeError("List required: %r" % x)
return []
return x
@ -202,8 +205,8 @@ def dict_value(x: object) -> Dict[Any, Any]:
x = resolve1(x)
if not isinstance(x, dict):
if settings.STRICT:
logger.error('PDFTypeError : Dict required: %r', x)
raise PDFTypeError('Dict required: %r' % x)
logger.error("PDFTypeError : Dict required: %r", x)
raise PDFTypeError("Dict required: %r" % x)
return {}
return x
@ -212,8 +215,8 @@ def stream_value(x: object) -> "PDFStream":
x = resolve1(x)
if not isinstance(x, PDFStream):
if settings.STRICT:
raise PDFTypeError('PDFStream required: %r' % x)
return PDFStream({}, b'')
raise PDFTypeError("PDFStream required: %r" % x)
return PDFStream({}, b"")
return x
@ -223,7 +226,7 @@ def decompress_corrupted(data: bytes) -> bytes:
"""
d = zlib.decompressobj()
f = io.BytesIO(data)
result_str = b''
result_str = b""
buffer = f.read(1)
i = 0
try:
@ -239,12 +242,11 @@ def decompress_corrupted(data: bytes) -> bytes:
class PDFStream(PDFObject):
def __init__(
self,
attrs: Dict[str, Any],
rawdata: bytes,
decipher: Optional[DecipherCallable] = None
decipher: Optional[DecipherCallable] = None,
) -> None:
assert isinstance(attrs, dict), str(type(attrs))
self.attrs = attrs
@ -261,12 +263,18 @@ class PDFStream(PDFObject):
def __repr__(self) -> str:
if self.data is None:
assert self.rawdata is not None
return '<PDFStream(%r): raw=%d, %r>' % \
(self.objid, len(self.rawdata), self.attrs)
return "<PDFStream(%r): raw=%d, %r>" % (
self.objid,
len(self.rawdata),
self.attrs,
)
else:
assert self.data is not None
return '<PDFStream(%r): len=%d, %r>' % \
(self.objid, len(self.data), self.attrs)
return "<PDFStream(%r): len=%d, %r>" % (
self.objid,
len(self.data),
self.attrs,
)
def __contains__(self, name: object) -> bool:
return name in self.attrs
@ -284,8 +292,8 @@ class PDFStream(PDFObject):
return default
def get_filters(self) -> List[Tuple[Any, Any]]:
filters = self.get_any(('F', 'Filter'))
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
filters = self.get_any(("F", "Filter"))
params = self.get_any(("DP", "DecodeParms", "FDecodeParms"), {})
if not filters:
return []
if not isinstance(filters, list):
@ -298,15 +306,16 @@ class PDFStream(PDFObject):
# resolve filter if possible
_filters = []
for fltr in filters:
if hasattr(fltr, 'resolve'):
if hasattr(fltr, "resolve"):
fltr = fltr.resolve()[0]
_filters.append(fltr)
# return list solves https://github.com/pdfminer/pdfminer.six/issues/15
return list(zip(_filters, params))
def decode(self) -> None:
assert self.data is None \
and self.rawdata is not None, str((self.data, self.rawdata))
assert self.data is None and self.rawdata is not None, str(
(self.data, self.rawdata)
)
data = self.rawdata
if self.decipher:
# Handle encryption
@ -326,14 +335,13 @@ class PDFStream(PDFObject):
except zlib.error as e:
if settings.STRICT:
error_msg = 'Invalid zlib bytes: {!r}, {!r}'\
.format(e, data)
error_msg = "Invalid zlib bytes: {!r}, {!r}".format(e, data)
raise PDFException(error_msg)
try:
data = decompress_corrupted(data)
except zlib.error:
data = b''
data = b""
elif f in LITERALS_LZW_DECODE:
data = lzwdecode(data)
@ -356,25 +364,26 @@ class PDFStream(PDFObject):
pass
elif f == LITERAL_CRYPT:
# not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported')
raise PDFNotImplementedError("/Crypt filter is unsupported")
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
raise PDFNotImplementedError("Unsupported filter: %r" % f)
# apply predictors
if params and 'Predictor' in params:
pred = int_value(params['Predictor'])
if params and "Predictor" in params:
pred = int_value(params["Predictor"])
if pred == 1:
# no predictor
pass
elif 10 <= pred:
# PNG predictor
colors = int_value(params.get('Colors', 1))
columns = int_value(params.get('Columns', 1))
raw_bits_per_component = params.get('BitsPerComponent', 8)
colors = int_value(params.get("Colors", 1))
columns = int_value(params.get("Columns", 1))
raw_bits_per_component = params.get("BitsPerComponent", 8)
bitspercomponent = int_value(raw_bits_per_component)
data = apply_png_predictor(pred, colors, columns,
bitspercomponent, data)
data = apply_png_predictor(
pred, colors, columns, bitspercomponent, data
)
else:
error_msg = 'Unsupported predictor: %r' % pred
error_msg = "Unsupported predictor: %r" % pred
raise PDFNotImplementedError(error_msg)
self.data = data
self.rawdata = None

View File

@ -4,8 +4,19 @@
import logging
import re
from typing import (Any, BinaryIO, Dict, Generic, Iterator, List,
Optional, Tuple, Type, TypeVar, Union)
from typing import (
Any,
BinaryIO,
Dict,
Generic,
Iterator,
List,
Optional,
Tuple,
Type,
TypeVar,
Union,
)
from . import settings
from .utils import choplist
@ -59,7 +70,7 @@ class PSLiteral(PSObject):
def __repr__(self) -> str:
name = self.name
return '/%r' % name
return "/%r" % name
class PSKeyword(PSObject):
@ -79,10 +90,10 @@ class PSKeyword(PSObject):
def __repr__(self) -> str:
name = self.name
return '/%r' % name
return "/%r" % name
_SymbolT = TypeVar('_SymbolT', PSLiteral, PSKeyword)
_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword)
class PSSymbolTable(Generic[_SymbolT]):
@ -110,25 +121,25 @@ PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
KEYWORD_PROC_BEGIN = KWD(b'{')
KEYWORD_PROC_END = KWD(b'}')
KEYWORD_ARRAY_BEGIN = KWD(b'[')
KEYWORD_ARRAY_END = KWD(b']')
KEYWORD_DICT_BEGIN = KWD(b'<<')
KEYWORD_DICT_END = KWD(b'>>')
KEYWORD_PROC_BEGIN = KWD(b"{")
KEYWORD_PROC_END = KWD(b"}")
KEYWORD_ARRAY_BEGIN = KWD(b"[")
KEYWORD_ARRAY_END = KWD(b"]")
KEYWORD_DICT_BEGIN = KWD(b"<<")
KEYWORD_DICT_END = KWD(b">>")
def literal_name(x: object) -> Any:
if not isinstance(x, PSLiteral):
if settings.STRICT:
raise PSTypeError('Literal required: {!r}'.format(x))
raise PSTypeError("Literal required: {!r}".format(x))
else:
name = x
else:
name = x.name
if not isinstance(name, str):
try:
name = str(name, 'utf-8')
name = str(name, "utf-8")
except Exception:
pass
return name
@ -137,34 +148,34 @@ def literal_name(x: object) -> Any:
def keyword_name(x: object) -> Any:
if not isinstance(x, PSKeyword):
if settings.STRICT:
raise PSTypeError('Keyword required: %r' % x)
raise PSTypeError("Keyword required: %r" % x)
else:
name = x
else:
name = str(x.name, 'utf-8', 'ignore')
name = str(x.name, "utf-8", "ignore")
return name
EOL = re.compile(br'[\r\n]')
SPC = re.compile(br'\s')
NONSPC = re.compile(br'\S')
HEX = re.compile(br'[0-9a-fA-F]')
END_LITERAL = re.compile(br'[#/%\[\]()<>{}\s]')
END_HEX_STRING = re.compile(br'[^\s0-9a-fA-F]')
HEX_PAIR = re.compile(br'[0-9a-fA-F]{2}|.')
END_NUMBER = re.compile(br'[^0-9]')
END_KEYWORD = re.compile(br'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(br'[()\134]')
OCT_STRING = re.compile(br'[0-7]')
EOL = re.compile(rb"[\r\n]")
SPC = re.compile(rb"\s")
NONSPC = re.compile(rb"\S")
HEX = re.compile(rb"[0-9a-fA-F]")
END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]")
END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]")
HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.")
END_NUMBER = re.compile(rb"[^0-9]")
END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]")
END_STRING = re.compile(rb"[()\134]")
OCT_STRING = re.compile(rb"[0-7]")
ESC_STRING = {
b'b': 8,
b't': 9,
b'n': 10,
b'f': 12,
b'r': 13,
b'(': 40,
b')': 41,
b'\\': 92
b"b": 8,
b"t": 9,
b"n": 10,
b"f": 12,
b"r": 13,
b"(": 40,
b")": 41,
b"\\": 92,
}
@ -173,8 +184,8 @@ PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
class PSBaseParser:
"""Most basic PostScript parser that performs only tokenization.
"""
"""Most basic PostScript parser that performs only tokenization."""
BUFSIZ = 4096
def __init__(self, fp: BinaryIO) -> None:
@ -182,8 +193,7 @@ class PSBaseParser:
self.seek(0)
def __repr__(self) -> str:
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
self.bufpos)
return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos)
def flush(self) -> None:
return
@ -200,22 +210,21 @@ class PSBaseParser:
if not pos:
pos = self.bufpos + self.charpos
self.fp.seek(pos)
log.debug('poll(%d): %r', pos, self.fp.read(n))
log.debug("poll(%d): %r", pos, self.fp.read(n))
self.fp.seek(pos0)
return
def seek(self, pos: int) -> None:
"""Seeks the parser to the given position.
"""
log.debug('seek: %r', pos)
"""Seeks the parser to the given position."""
log.debug("seek: %r", pos)
self.fp.seek(pos)
# reset the status for nextline()
self.bufpos = pos
self.buf = b''
self.buf = b""
self.charpos = 0
# reset the status for nexttoken()
self._parse1 = self._parse_main
self._curtoken = b''
self._curtoken = b""
self._curtokenpos = 0
self._tokens: List[Tuple[int, PSBaseParserToken]] = []
return
@ -227,14 +236,13 @@ class PSBaseParser:
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if not self.buf:
raise PSEOF('Unexpected EOF')
raise PSEOF("Unexpected EOF")
self.charpos = 0
return
def nextline(self) -> Tuple[int, bytes]:
"""Fetches a next line that ends either with \\r or \\n.
"""
linebuf = b''
"""Fetches a next line that ends either with \\r or \\n."""
linebuf = b""
linepos = self.bufpos + self.charpos
eol = False
while 1:
@ -242,7 +250,7 @@ class PSBaseParser:
if eol:
c = self.buf[self.charpos : self.charpos + 1]
# handle b'\r\n'
if c == b'\n':
if c == b"\n":
linebuf += c
self.charpos += 1
break
@ -250,14 +258,14 @@ class PSBaseParser:
if m:
linebuf += self.buf[self.charpos : m.end(0)]
self.charpos = m.end(0)
if linebuf[-1:] == b'\r':
if linebuf[-1:] == b"\r":
eol = True
else:
break
else:
linebuf += self.buf[self.charpos :]
self.charpos = len(self.buf)
log.debug('nextline: %r, %r', linepos, linebuf)
log.debug("nextline: %r, %r", linepos, linebuf)
return (linepos, linebuf)
@ -268,7 +276,7 @@ class PSBaseParser:
"""
self.fp.seek(0, 2)
pos = self.fp.tell()
buf = b''
buf = b""
while 0 < pos:
prevpos = pos
pos = max(0, pos - self.BUFSIZ)
@ -277,13 +285,13 @@ class PSBaseParser:
if not s:
break
while 1:
n = max(s.rfind(b'\r'), s.rfind(b'\n'))
n = max(s.rfind(b"\r"), s.rfind(b"\n"))
if n == -1:
buf = s + buf
break
yield s[n:] + buf
s = s[:n]
buf = b''
buf = b""
return
def _parse_main(self, s: bytes, i: int) -> int:
@ -293,19 +301,19 @@ class PSBaseParser:
j = m.start(0)
c = s[j : j + 1]
self._curtokenpos = self.bufpos + j
if c == b'%':
self._curtoken = b'%'
if c == b"%":
self._curtoken = b"%"
self._parse1 = self._parse_comment
return j + 1
elif c == b'/':
self._curtoken = b''
elif c == b"/":
self._curtoken = b""
self._parse1 = self._parse_literal
return j + 1
elif c in b'-+' or c.isdigit():
elif c in b"-+" or c.isdigit():
self._curtoken = c
self._parse1 = self._parse_number
return j + 1
elif c == b'.':
elif c == b".":
self._curtoken = c
self._parse1 = self._parse_float
return j + 1
@ -313,17 +321,17 @@ class PSBaseParser:
self._curtoken = c
self._parse1 = self._parse_keyword
return j + 1
elif c == b'(':
self._curtoken = b''
elif c == b"(":
self._curtoken = b""
self.paren = 1
self._parse1 = self._parse_string
return j + 1
elif c == b'<':
self._curtoken = b''
elif c == b"<":
self._curtoken = b""
self._parse1 = self._parse_wopen
return j + 1
elif c == b'>':
self._curtoken = b''
elif c == b">":
self._curtoken = b""
self._parse1 = self._parse_wclose
return j + 1
else:
@ -354,12 +362,12 @@ class PSBaseParser:
j = m.start(0)
self._curtoken += s[i:j]
c = s[j : j + 1]
if c == b'#':
self.hex = b''
if c == b"#":
self.hex = b""
self._parse1 = self._parse_literal_hex
return j + 1
try:
name: Union[str, bytes] = str(self._curtoken, 'utf-8')
name: Union[str, bytes] = str(self._curtoken, "utf-8")
except Exception:
name = self._curtoken
self._add_token(LIT(name))
@ -384,7 +392,7 @@ class PSBaseParser:
j = m.start(0)
self._curtoken += s[i:j]
c = s[j : j + 1]
if c == b'.':
if c == b".":
self._curtoken += c
self._parse1 = self._parse_float
return j + 1
@ -416,9 +424,9 @@ class PSBaseParser:
return len(s)
j = m.start(0)
self._curtoken += s[i:j]
if self._curtoken == b'true':
if self._curtoken == b"true":
token: Union[bool, PSKeyword] = True
elif self._curtoken == b'false':
elif self._curtoken == b"false":
token = False
else:
token = KWD(self._curtoken)
@ -434,15 +442,15 @@ class PSBaseParser:
j = m.start(0)
self._curtoken += s[i:j]
c = s[j : j + 1]
if c == b'\\':
self.oct = b''
if c == b"\\":
self.oct = b""
self._parse1 = self._parse_string_1
return j + 1
if c == b'(':
if c == b"(":
self.paren += 1
self._curtoken += c
return j + 1
if c == b')':
if c == b")":
self.paren -= 1
if self.paren:
# WTF, they said balanced parens need no special treatment.
@ -470,7 +478,7 @@ class PSBaseParser:
elif c in ESC_STRING:
self._curtoken += bytes((ESC_STRING[c],))
elif c == b'\r' and len(s) > i+1 and s[i+1:i+2] == b'\n':
elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n":
# If current and next character is \r\n skip both because enters
# after a \ are ignored
i += 1
@ -481,7 +489,7 @@ class PSBaseParser:
def _parse_wopen(self, s: bytes, i: int) -> int:
c = s[i : i + 1]
if c == b'<':
if c == b"<":
self._add_token(KEYWORD_DICT_BEGIN)
self._parse1 = self._parse_main
i += 1
@ -491,7 +499,7 @@ class PSBaseParser:
def _parse_wclose(self, s: bytes, i: int) -> int:
c = s[i : i + 1]
if c == b'>':
if c == b">":
self._add_token(KEYWORD_DICT_END)
i += 1
self._parse1 = self._parse_main
@ -504,8 +512,9 @@ class PSBaseParser:
return len(s)
j = m.start(0)
self._curtoken += s[i:j]
token = HEX_PAIR.sub(lambda m: bytes((int(m.group(0), 16),)),
SPC.sub(b'', self._curtoken))
token = HEX_PAIR.sub(
lambda m: bytes((int(m.group(0), 16),)), SPC.sub(b"", self._curtoken)
)
self._add_token(token)
self._parse1 = self._parse_main
return j
@ -515,7 +524,7 @@ class PSBaseParser:
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
token = self._tokens.pop(0)
log.debug('nexttoken: %r', token)
log.debug("nexttoken: %r", token)
return token
@ -530,15 +539,13 @@ PSStackEntry = Tuple[int, PSStackType[ExtraT]]
class PSStackParser(PSBaseParser, Generic[ExtraT]):
def __init__(self, fp: BinaryIO) -> None:
PSBaseParser.__init__(self, fp)
self.reset()
return
def reset(self) -> None:
self.context: List[Tuple[int, Optional[str],
List[PSStackEntry[ExtraT]]]] = []
self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = []
self.curtype: Optional[str] = None
self.curstack: List[PSStackEntry[ExtraT]] = []
self.results: List[PSStackEntry[ExtraT]] = []
@ -565,25 +572,24 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
try:
log.debug('add_results: %r', objs)
log.debug("add_results: %r", objs)
except Exception:
log.debug('add_results: (unprintable object)')
log.debug("add_results: (unprintable object)")
self.results.extend(objs)
return
def start_type(self, pos: int, type: str) -> None:
self.context.append((pos, self.curtype, self.curstack))
(self.curtype, self.curstack) = (type, [])
log.debug('start_type: pos=%r, type=%r', pos, type)
log.debug("start_type: pos=%r, type=%r", pos, type)
return
def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
if self.curtype != type:
raise PSTypeError('Type mismatch: {!r} != {!r}'
.format(self.curtype, type))
raise PSTypeError("Type mismatch: {!r} != {!r}".format(self.curtype, type))
objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop()
log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs)
return (pos, objs)
def do_keyword(self, pos: int, token: PSKeyword) -> None:
@ -604,47 +610,55 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
self.push((pos, token))
elif token == KEYWORD_ARRAY_BEGIN:
# begin array
self.start_type(pos, 'a')
self.start_type(pos, "a")
elif token == KEYWORD_ARRAY_END:
# end array
try:
self.push(self.end_type('a'))
self.push(self.end_type("a"))
except PSTypeError:
if settings.STRICT:
raise
elif token == KEYWORD_DICT_BEGIN:
# begin dictionary
self.start_type(pos, 'd')
self.start_type(pos, "d")
elif token == KEYWORD_DICT_END:
# end dictionary
try:
(pos, objs) = self.end_type('d')
(pos, objs) = self.end_type("d")
if len(objs) % 2 != 0:
error_msg = 'Invalid dictionary construct: %r' % objs
error_msg = "Invalid dictionary construct: %r" % objs
raise PSSyntaxError(error_msg)
d = {literal_name(k): v
for (k, v) in choplist(2, objs) if v is not None}
d = {
literal_name(k): v
for (k, v) in choplist(2, objs)
if v is not None
}
self.push((pos, d))
except PSTypeError:
if settings.STRICT:
raise
elif token == KEYWORD_PROC_BEGIN:
# begin proc
self.start_type(pos, 'p')
self.start_type(pos, "p")
elif token == KEYWORD_PROC_END:
# end proc
try:
self.push(self.end_type('p'))
self.push(self.end_type("p"))
except PSTypeError:
if settings.STRICT:
raise
elif isinstance(token, PSKeyword):
log.debug('do_keyword: pos=%r, token=%r, stack=%r', pos,
token, self.curstack)
log.debug(
"do_keyword: pos=%r, token=%r, stack=%r", pos, token, self.curstack
)
self.do_keyword(pos, token)
else:
log.error('unknown token: pos=%r, token=%r, stack=%r', pos,
token, self.curstack)
log.error(
"unknown token: pos=%r, token=%r, stack=%r",
pos,
token,
self.curstack,
)
self.do_keyword(pos, token)
raise
if self.context:
@ -653,7 +667,7 @@ class PSStackParser(PSBaseParser, Generic[ExtraT]):
self.flush()
obj = self.results.pop(0)
try:
log.debug('nextobject: %r', obj)
log.debug("nextobject: %r", obj)
except Exception:
log.debug('nextobject: (unprintable object)')
log.debug("nextobject: (unprintable object)")
return obj

View File

@ -20,7 +20,7 @@ def rldecode(data: bytes) -> bytes:
(2 to 128) times during decompression. A length value of 128
denotes EOD.
"""
decoded = b''
decoded = b""
i = 0
while i < len(data):
length = data[i]

View File

@ -6,9 +6,24 @@ import pathlib
import string
import struct
from html import escape
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
List, Optional, Set, TextIO, Tuple, TypeVar, Union,
TYPE_CHECKING, cast)
from typing import (
Any,
BinaryIO,
Callable,
Dict,
Generic,
Iterable,
Iterator,
List,
Optional,
Set,
TextIO,
Tuple,
TypeVar,
Union,
TYPE_CHECKING,
cast,
)
if TYPE_CHECKING:
from .layout import LTComponent
@ -30,12 +45,8 @@ class open_filename(object):
(str or pathlib.PurePath type is supported) and closes it on exit,
(just like `open`), but does nothing for file-like objects.
"""
def __init__(
self,
filename: FileOrName,
*args: Any,
**kwargs: Any
) -> None:
def __init__(self, filename: FileOrName, *args: Any, **kwargs: Any) -> None:
if isinstance(filename, pathlib.PurePath):
filename = str(filename)
if isinstance(filename, str):
@ -45,17 +56,12 @@ class open_filename(object):
self.file_handler = cast(AnyIO, filename)
self.closing = False
else:
raise TypeError('Unsupported input type: %s' % type(filename))
raise TypeError("Unsupported input type: %s" % type(filename))
def __enter__(self) -> AnyIO:
return self.file_handler
def __exit__(
self,
exc_type: object,
exc_val: object,
exc_tb: object
) -> None:
def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
if self.closing:
self.file_handler.close()
@ -70,7 +76,7 @@ def make_compat_str(o: object) -> str:
"""Converts everything to string, if bytes guessing the encoding."""
if isinstance(o, bytes):
enc = chardet.detect(o)
return o.decode(enc['encoding'])
return o.decode(enc["encoding"])
else:
return str(o)
@ -80,15 +86,13 @@ def shorten_str(s: str, size: int) -> str:
return s[:size]
if len(s) > size:
length = (size - 5) // 2
return '{} ... {}'.format(s[:length], s[-length:])
return "{} ... {}".format(s[:length], s[-length:])
else:
return s
def compatible_encode_method(
bytesorstring: Union[bytes, str],
encoding: str = 'utf-8',
erraction: str = 'ignore'
bytesorstring: Union[bytes, str], encoding: str = "utf-8", erraction: str = "ignore"
) -> str:
"""When Py2 str.encode is called, it often means bytes.encode in Py3.
@ -119,11 +123,7 @@ def paeth_predictor(left: int, above: int, upper_left: int) -> int:
def apply_png_predictor(
pred: int,
colors: int,
columns: int,
bitspercomponent: int,
data: bytes
pred: int, colors: int, columns: int, bitspercomponent: int, data: bytes
) -> bytes:
"""Reverse the effect of the PNG predictor
@ -135,12 +135,12 @@ def apply_png_predictor(
nbytes = colors * columns * bitspercomponent // 8
bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel
buf = b''
line_above = b'\x00' * columns
buf = b""
line_above = b"\x00" * columns
for scanline_i in range(0, len(data), nbytes + 1):
filter_type = data[scanline_i]
line_encoded = data[scanline_i + 1 : scanline_i + 1 + nbytes]
raw = b''
raw = b""
if filter_type == 0:
# Filter type 0: None
@ -226,7 +226,8 @@ PathSegment = Union[
Tuple[str], # Literal['h']
Tuple[str, float, float], # Literal['m', 'l']
Tuple[str, float, float, float, float], # Literal['v', 'y']
Tuple[str, float, float, float, float, float, float]] # Literal['c']
Tuple[str, float, float, float, float, float, float],
] # Literal['c']
# Matrix operations
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
@ -236,9 +237,14 @@ def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
(a1, b1, c1, d1, e1, f1) = m1
(a0, b0, c0, d0, e0, f0) = m0
"""Returns the multiplication of two matrices."""
return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1,
a0 * c1 + c0 * d1, b0 * c1 + d0 * d1,
a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
return (
a0 * a1 + c0 * b1,
b0 * a1 + d0 * b1,
a0 * c1 + c0 * d1,
b0 * c1 + d0 * d1,
a0 * e1 + c0 * f1 + e0,
b0 * e1 + d0 * f1 + f0,
)
def translate_matrix(m: Matrix, v: Point) -> Matrix:
@ -264,11 +270,12 @@ def apply_matrix_norm(m: Matrix, v: Point) -> Point:
# Utility functions
def isnumber(x: object) -> bool:
return isinstance(x, (int, float))
_T = TypeVar('_T')
_T = TypeVar("_T")
def uniq(objs: Iterable[_T]) -> Iterator[_T]:
@ -282,10 +289,7 @@ def uniq(objs: Iterable[_T]) -> Iterator[_T]:
return
def fsplit(
pred: Callable[[_T], bool],
objs: Iterable[_T]
) -> Tuple[List[_T], List[_T]]:
def fsplit(pred: Callable[[_T], bool], objs: Iterable[_T]) -> Tuple[List[_T], List[_T]]:
"""Split a list into two classes according to the predicate."""
t = []
f = []
@ -315,9 +319,7 @@ def get_bound(pts: Iterable[Point]) -> Rect:
def pick(
seq: Iterable[_T],
func: Callable[[_T], float],
maxobj: Optional[_T] = None
seq: Iterable[_T], func: Callable[[_T], float], maxobj: Optional[_T] = None
) -> Optional[_T]:
"""Picks the object obj where func(obj) has the highest value."""
maxscore = None
@ -347,77 +349,303 @@ def nunpack(s: bytes, default: int = 0) -> int:
elif length == 1:
return ord(s)
elif length == 2:
return cast(int, struct.unpack('>H', s)[0])
return cast(int, struct.unpack(">H", s)[0])
elif length == 3:
return cast(int, struct.unpack('>L', b'\x00' + s)[0])
return cast(int, struct.unpack(">L", b"\x00" + s)[0])
elif length == 4:
return cast(int, struct.unpack('>L', s)[0])
return cast(int, struct.unpack(">L", s)[0])
elif length == 8:
return cast(int, struct.unpack('>Q', s)[0])
return cast(int, struct.unpack(">Q", s)[0])
else:
raise TypeError('invalid length: %d' % length)
raise TypeError("invalid length: %d" % length)
PDFDocEncoding = ''.join(chr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
PDFDocEncoding = "".join(
chr(x)
for x in (
0x0000,
0x0001,
0x0002,
0x0003,
0x0004,
0x0005,
0x0006,
0x0007,
0x0008,
0x0009,
0x000A,
0x000B,
0x000C,
0x000D,
0x000E,
0x000F,
0x0010,
0x0011,
0x0012,
0x0013,
0x0014,
0x0015,
0x0017,
0x0017,
0x02D8,
0x02C7,
0x02C6,
0x02D9,
0x02DD,
0x02DB,
0x02DA,
0x02DC,
0x0020,
0x0021,
0x0022,
0x0023,
0x0024,
0x0025,
0x0026,
0x0027,
0x0028,
0x0029,
0x002A,
0x002B,
0x002C,
0x002D,
0x002E,
0x002F,
0x0030,
0x0031,
0x0032,
0x0033,
0x0034,
0x0035,
0x0036,
0x0037,
0x0038,
0x0039,
0x003A,
0x003B,
0x003C,
0x003D,
0x003E,
0x003F,
0x0040,
0x0041,
0x0042,
0x0043,
0x0044,
0x0045,
0x0046,
0x0047,
0x0048,
0x0049,
0x004A,
0x004B,
0x004C,
0x004D,
0x004E,
0x004F,
0x0050,
0x0051,
0x0052,
0x0053,
0x0054,
0x0055,
0x0056,
0x0057,
0x0058,
0x0059,
0x005A,
0x005B,
0x005C,
0x005D,
0x005E,
0x005F,
0x0060,
0x0061,
0x0062,
0x0063,
0x0064,
0x0065,
0x0066,
0x0067,
0x0068,
0x0069,
0x006A,
0x006B,
0x006C,
0x006D,
0x006E,
0x006F,
0x0070,
0x0071,
0x0072,
0x0073,
0x0074,
0x0075,
0x0076,
0x0077,
0x0078,
0x0079,
0x007A,
0x007B,
0x007C,
0x007D,
0x007E,
0x0000,
0x2022,
0x2020,
0x2021,
0x2026,
0x2014,
0x2013,
0x0192,
0x2044,
0x2039,
0x203A,
0x2212,
0x2030,
0x201E,
0x201C,
0x201D,
0x2018,
0x2019,
0x201A,
0x2122,
0xFB01,
0xFB02,
0x0141,
0x0152,
0x0160,
0x0178,
0x017D,
0x0131,
0x0142,
0x0153,
0x0161,
0x017E,
0x0000,
0x20AC,
0x00A1,
0x00A2,
0x00A3,
0x00A4,
0x00A5,
0x00A6,
0x00A7,
0x00A8,
0x00A9,
0x00AA,
0x00AB,
0x00AC,
0x0000,
0x00AE,
0x00AF,
0x00B0,
0x00B1,
0x00B2,
0x00B3,
0x00B4,
0x00B5,
0x00B6,
0x00B7,
0x00B8,
0x00B9,
0x00BA,
0x00BB,
0x00BC,
0x00BD,
0x00BE,
0x00BF,
0x00C0,
0x00C1,
0x00C2,
0x00C3,
0x00C4,
0x00C5,
0x00C6,
0x00C7,
0x00C8,
0x00C9,
0x00CA,
0x00CB,
0x00CC,
0x00CD,
0x00CE,
0x00CF,
0x00D0,
0x00D1,
0x00D2,
0x00D3,
0x00D4,
0x00D5,
0x00D6,
0x00D7,
0x00D8,
0x00D9,
0x00DA,
0x00DB,
0x00DC,
0x00DD,
0x00DE,
0x00DF,
0x00E0,
0x00E1,
0x00E2,
0x00E3,
0x00E4,
0x00E5,
0x00E6,
0x00E7,
0x00E8,
0x00E9,
0x00EA,
0x00EB,
0x00EC,
0x00ED,
0x00EE,
0x00EF,
0x00F0,
0x00F1,
0x00F2,
0x00F3,
0x00F4,
0x00F5,
0x00F6,
0x00F7,
0x00F8,
0x00F9,
0x00FA,
0x00FB,
0x00FC,
0x00FD,
0x00FE,
0x00FF,
)
)
def decode_text(s: bytes) -> str:
"""Decodes a PDFDocEncoding string to Unicode."""
if s.startswith(b'\xfe\xff'):
return str(s[2:], 'utf-16be', 'ignore')
if s.startswith(b"\xfe\xff"):
return str(s[2:], "utf-16be", "ignore")
else:
return ''.join(PDFDocEncoding[c] for c in s)
return "".join(PDFDocEncoding[c] for c in s)
def enc(x: str) -> str:
"""Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes):
return ''
return ""
return escape(x)
def bbox2str(bbox: Rect) -> str:
(x0, y0, x1, y1) = bbox
return '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x0, y0, x1, y1)
return "{:.3f},{:.3f},{:.3f},{:.3f}".format(x0, y0, x1, y1)
def matrix2str(m: Matrix) -> str:
(a, b, c, d, e, f) = m
return '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'\
.format(a, b, c, d, e, f)
return "[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]".format(a, b, c, d, e, f)
def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
@ -446,7 +674,7 @@ def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
return max(0, iw), max(0, ih)
LTComponentT = TypeVar('LTComponentT', bound='LTComponent')
LTComponentT = TypeVar("LTComponentT", bound="LTComponent")
class Plane(Generic[LTComponentT]):
@ -465,7 +693,7 @@ class Plane(Generic[LTComponentT]):
(self.x0, self.y0, self.x1, self.y1) = bbox
def __repr__(self) -> str:
return '<Plane objs=%r>' % list(self)
return "<Plane objs=%r>" % list(self)
def __iter__(self) -> Iterator[LTComponentT]:
return (obj for obj in self._seq if obj in self._objs)
@ -524,14 +752,13 @@ class Plane(Generic[LTComponentT]):
if obj in done:
continue
done.add(obj)
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 \
or y1 <= obj.y0:
if obj.x1 <= x0 or x1 <= obj.x0 or obj.y1 <= y0 or y1 <= obj.y0:
continue
yield obj
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']
ROMAN_ONES = ["i", "x", "c", "m"]
ROMAN_FIVES = ["v", "l", "d"]
def format_int_roman(value: int) -> str:
@ -557,7 +784,7 @@ def format_int_roman(value: int) -> str:
result.insert(1 if over_five else 0, ROMAN_ONES[index] * remainder)
index += 1
return ''.join(result)
return "".join(result)
def format_int_alpha(value: int) -> str:
@ -571,4 +798,4 @@ def format_int_alpha(value: int) -> str:
result.append(string.ascii_lowercase[remainder])
result.reverse()
return ''.join(result)
return "".join(result)

View File

@ -8,52 +8,52 @@ sys.path.append(str(Path(__file__).parent))
import pdfminer as package
with open(path.join(path.abspath(path.dirname(__file__)), 'README.md')) as f:
with open(path.join(path.abspath(path.dirname(__file__)), "README.md")) as f:
readme = f.read()
setup(
name='pdfminer.six',
name="pdfminer.six",
version=package.__version__,
packages=['pdfminer'],
package_data={'pdfminer': ['cmap/*.pickle.gz', 'py.typed']},
packages=["pdfminer"],
package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]},
install_requires=[
'chardet ; python_version > "3.0"',
'cryptography',
"cryptography",
],
extras_require={
"dev": ["pytest", "nox", "mypy == 0.931"],
"dev": ["pytest", "nox", "black", "mypy == 0.931"],
"docs": ["sphinx", "sphinx-argparse"],
},
description='PDF parser and analyzer',
description="PDF parser and analyzer",
long_description=readme,
long_description_content_type='text/markdown',
license='MIT/X',
author='Yusuke Shinyama + Philippe Guglielmetti',
author_email='pdfminer@goulu.net',
url='https://github.com/pdfminer/pdfminer.six',
long_description_content_type="text/markdown",
license="MIT/X",
author="Yusuke Shinyama + Philippe Guglielmetti",
author_email="pdfminer@goulu.net",
url="https://github.com/pdfminer/pdfminer.six",
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py',
"tools/pdf2txt.py",
"tools/dumppdf.py",
],
keywords=[
'pdf parser',
'pdf converter',
'layout analysis',
'text mining',
"pdf parser",
"pdf converter",
"layout analysis",
"text mining",
],
python_requires='>=3.6',
python_requires=">=3.6",
classifiers=[
'Programming Language :: Python',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3 :: Only',
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Topic :: Text Processing',
"Programming Language :: Python",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3 :: Only",
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Topic :: Text Processing",
],
)

View File

@ -2,7 +2,6 @@ import os
def absolute_sample_path(relative_sample_path):
sample_dir = os.path.abspath(
os.path.join(os.path.dirname(__file__), '../samples'))
sample_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../samples"))
sample_file = os.path.join(sample_dir, relative_sample_path)
return sample_file

View File

@ -4,7 +4,7 @@ import tempfile
import os
class TemporaryFilePath():
class TemporaryFilePath:
"""Context manager class, which generates temporary file name
Coonroraly to standard tempfile.NamedTemporaryFile(), it does not
@ -40,9 +40,9 @@ class TemporaryFilePath():
`tempfile.NamedTemporaryFile` will create and delete a file, and
this method only returns the filepath of the non-existing file.
"""
with tempfile.NamedTemporaryFile(suffix=self.suffix,
prefix=self.prefix,
dir=self.dir) as file:
with tempfile.NamedTemporaryFile(
suffix=self.suffix, prefix=self.prefix, dir=self.dir
) as file:
self.temp_file_name = file.name
return self.temp_file_name

View File

@ -9,14 +9,14 @@ from pdfminer.pdfinterp import PDFGraphicState
class TestPaintPath:
def test_paint_path(self):
path = [('m', 6, 7), ('l', 7, 7)]
path = [("m", 6, 7), ("l", 7, 7)]
analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 100, 0, 100])
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
assert len(analyzer.cur_item._objs) == 1
def test_paint_path_mlllh(self):
path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)]
path = [("m", 6, 7), ("l", 7, 7), ("l", 7, 91), ("l", 6, 91), ("h",)]
analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 100, 0, 100])
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
@ -25,9 +25,21 @@ class TestPaintPath:
def test_paint_path_multiple_mlllh(self):
"""Path from samples/contrib/issue-00369-excel.pdf"""
path = [
('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',),
('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',),
('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',)
("m", 6, 7),
("l", 7, 7),
("l", 7, 91),
("l", 6, 91),
("h",),
("m", 4, 7),
("l", 6, 7),
("l", 6, 91),
("l", 4, 91),
("h",),
("m", 67, 2),
("l", 68, 2),
("l", 68, 3),
("l", 67, 3),
("h",),
]
analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 100, 0, 100])
@ -177,34 +189,34 @@ class TestPaintPath:
return analyzer.cur_item._objs
# "c" operator
assert parse([
assert parse(
[
("m", 72.41, 433.89),
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
])[0].pts == [
]
)[0].pts == [
(72.41, 433.89),
(71.41, 434.89),
]
# "v" operator
assert parse([
("m", 72.41, 433.89),
("v", 71.96, 434.89, 71.41, 434.89),
])[0].pts == [
assert parse([("m", 72.41, 433.89), ("v", 71.96, 434.89, 71.41, 434.89)])[
0
].pts == [
(72.41, 433.89),
(71.41, 434.89),
]
# "y" operator
assert parse([
("m", 72.41, 433.89),
("y", 72.41, 434.45, 71.41, 434.89),
])[0].pts == [
assert parse([("m", 72.41, 433.89), ("y", 72.41, 434.45, 71.41, 434.89)])[
0
].pts == [
(72.41, 433.89),
(71.41, 434.89),
]
class TestBinaryDetector():
class TestBinaryDetector:
def test_stringio(self):
assert not PDFConverter._is_binary_stream(io.StringIO())
@ -212,11 +224,11 @@ class TestBinaryDetector():
assert PDFConverter._is_binary_stream(io.BytesIO())
def test_tmpfile(self):
with TemporaryFile(mode='w') as f:
with TemporaryFile(mode="w") as f:
assert not PDFConverter._is_binary_stream(f)
def test_binary_tmpfile(self):
with TemporaryFile(mode='wb') as f:
with TemporaryFile(mode="wb") as f:
assert PDFConverter._is_binary_stream(f)
def test_non_file_like_object_defaults_to_binary(self):

View File

@ -13,31 +13,31 @@ from pdfminer.psparser import PSLiteral
def test_name2unicode_name_in_agl():
"""The name "Lcommaaccent" has a single component,
which is mapped to the string U+013B by AGL"""
assert '\u013B' == name2unicode('Lcommaaccent')
assert "\u013B" == name2unicode("Lcommaaccent")
def test_name2unicode_uni():
"""The components "Lcommaaccent," "uni013B," and "u013B"
all map to the string U+013B"""
assert '\u013B' == name2unicode('uni013B')
assert "\u013B" == name2unicode("uni013B")
def test_name2unicode_uni_lowercase():
"""The components "Lcommaaccent," "uni013B," and "u013B"
all map to the string U+013B"""
assert '\u013B' == name2unicode('uni013b')
assert "\u013B" == name2unicode("uni013b")
def test_name2unicode_uni_with_sequence_of_digits():
"""The name "uni20AC0308" has a single component,
which is mapped to the string U+20AC U+0308"""
assert '\u20AC\u0308' == name2unicode('uni20AC0308')
assert "\u20AC\u0308" == name2unicode("uni20AC0308")
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
"""The name "uni20AC0308" has a single component,
which is mapped to the string U+20AC U+0308"""
assert '\u20AC\u0308' == name2unicode('uni20ac0308')
assert "\u20AC\u0308" == name2unicode("uni20ac0308")
def test_name2unicode_uni_empty_string():
@ -46,7 +46,7 @@ def test_name2unicode_uni_empty_string():
According to the specification this should be mapped to an empty string,
but we also want to support lowercase hexadecimals"""
assert '\u20ac' == name2unicode('uni20ac')
assert "\u20ac" == name2unicode("uni20ac")
def test_name2unicode_uni_empty_string_long():
@ -60,7 +60,7 @@ def test_name2unicode_uni_empty_string_long():
glyph name "u1040C.
"""
with pytest.raises(KeyError):
name2unicode('uniD801DC0C')
name2unicode("uniD801DC0C")
def test_name2unicode_uni_empty_string_long_lowercase():
@ -73,57 +73,59 @@ def test_name2unicode_uni_empty_string_long_lowercase():
This character can be correctly mapped by using the
glyph name "u1040C."""
with pytest.raises(KeyError):
name2unicode('uniD801DC0C')
name2unicode("uniD801DC0C")
def test_name2unicode_uni_pua():
""" "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to
U+F6FB."""
assert '\uF6FB' == name2unicode('uniF6FB')
assert "\uF6FB" == name2unicode("uniF6FB")
def test_name2unicode_uni_pua_lowercase():
""" "Ogoneksmall" and "uniF6FB" both map to the string that corresponds to
U+F6FB."""
assert '\uF6FB' == name2unicode('unif6fb')
assert "\uF6FB" == name2unicode("unif6fb")
def test_name2unicode_u_with_4_digits():
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the
string U+013B"""
assert '\u013B' == name2unicode('u013B')
assert "\u013B" == name2unicode("u013B")
def test_name2unicode_u_with_4_digits_lowercase():
"""The components "Lcommaaccent," "uni013B," and "u013B" all map to the
string U+013B"""
assert '\u013B' == name2unicode('u013b')
assert "\u013B" == name2unicode("u013b")
def test_name2unicode_u_with_5_digits():
"""The name "u1040C" has a single component, which is mapped to the string
U+1040C"""
assert '\U0001040C' == name2unicode('u1040C')
assert "\U0001040C" == name2unicode("u1040C")
def test_name2unicode_u_with_5_digits_lowercase():
"""The name "u1040C" has a single component, which is mapped to the string
U+1040C"""
assert '\U0001040C' == name2unicode('u1040c')
assert "\U0001040C" == name2unicode("u1040c")
def test_name2unicode_multiple_components():
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
string U+013B U+20AC U+0308 U+1040C"""
assert '\u013B\u20AC\u0308\U0001040C' == \
name2unicode('Lcommaaccent_uni20AC0308_u1040C.alternate')
assert "\u013B\u20AC\u0308\U0001040C" == name2unicode(
"Lcommaaccent_uni20AC0308_u1040C.alternate"
)
def test_name2unicode_multiple_components_lowercase():
"""The name "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the
string U+013B U+20AC U+0308 U+1040C"""
assert '\u013B\u20AC\u0308\U0001040C' == \
name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
assert "\u013B\u20AC\u0308\U0001040C" == name2unicode(
"Lcommaaccent_uni20ac0308_u1040c.alternate"
)
def test_name2unicode_foo():
@ -131,26 +133,26 @@ def test_name2unicode_foo():
because 'foo' is not in AGL,
and because it does not start with a 'u.'"""
with pytest.raises(KeyError):
name2unicode('foo')
name2unicode("foo")
def test_name2unicode_notdef():
"""The name ".notdef" is reduced to an empty string (step 1)
and mapped to an empty string (step 3)"""
with pytest.raises(KeyError):
name2unicode('.notdef')
name2unicode(".notdef")
def test_name2unicode_pua_ogoneksmall():
""" "
Ogoneksmall" and "uniF6FB" both map to the string
that corresponds to U+F6FB."""
assert '\uF6FB' == name2unicode('Ogoneksmall')
assert "\uF6FB" == name2unicode("Ogoneksmall")
def test_name2unicode_overflow_error():
with pytest.raises(KeyError):
name2unicode('226215240241240240240240')
name2unicode("226215240241240240240240")
def test_get_encoding_with_invalid_differences():
@ -158,5 +160,5 @@ def test_get_encoding_with_invalid_differences():
Regression test for https://github.com/pdfminer/pdfminer.six/issues/385
"""
invalid_differences = [PSLiteral('ubuntu'), PSLiteral('1234')]
EncodingDB.get_encoding('StandardEncoding', invalid_differences)
invalid_differences = [PSLiteral("ubuntu"), PSLiteral("1234")]
EncodingDB.get_encoding("StandardEncoding", invalid_differences)

View File

@ -4,7 +4,7 @@ from pdfminer.layout import LTChar, LTTextBox
def test_font_size():
path = absolute_sample_path('font-size-test.pdf')
path = absolute_sample_path("font-size-test.pdf")
for page in extract_pages(path):
for text_box in page:
if isinstance(text_box, LTTextBox):

View File

@ -129,37 +129,43 @@ class TestExtractPages(unittest.TestCase):
def test_line_margin(self):
# The lines have margin 0.2 relative to the height.
# Extract with line_margin 0.19 should break into 3 separate textboxes.
pages = list(extract_pages(
self._get_test_file_path(), laparams=LAParams(line_margin=0.19)))
pages = list(
extract_pages(
self._get_test_file_path(), laparams=LAParams(line_margin=0.19)
)
)
self.assertEqual(len(pages), 1)
page = pages[0]
elements = [element for element in page
if isinstance(element, LTTextContainer)]
elements = [element for element in page if isinstance(element, LTTextContainer)]
self.assertEqual(len(elements), 3)
self.assertEqual(elements[0].get_text(), "Text1\n")
self.assertEqual(elements[1].get_text(), "Text2\n")
self.assertEqual(elements[2].get_text(), "Text3\n")
# Extract with line_margin 0.21 should merge into one textbox.
pages = list(extract_pages(
self._get_test_file_path(), laparams=LAParams(line_margin=0.21)))
pages = list(
extract_pages(
self._get_test_file_path(), laparams=LAParams(line_margin=0.21)
)
)
self.assertEqual(len(pages), 1)
page = pages[0]
elements = [element for element in page
if isinstance(element, LTTextContainer)]
elements = [element for element in page if isinstance(element, LTTextContainer)]
self.assertEqual(len(elements), 1)
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")
def test_no_boxes_flow(self):
pages = list(extract_pages(
self._get_test_file_path(), laparams=LAParams(boxes_flow=None)))
pages = list(
extract_pages(
self._get_test_file_path(), laparams=LAParams(boxes_flow=None)
)
)
self.assertEqual(len(pages), 1)
page = pages[0]
elements = [element for element in page
if isinstance(element, LTTextContainer)]
elements = [element for element in page if isinstance(element, LTTextContainer)]
self.assertEqual(len(elements), 1)
self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n")

View File

@ -46,8 +46,7 @@ class TestFindNeigbors(unittest.TestCase):
right_aligned_below.set_bbox((15, 2, 20, 4))
plane.add(right_aligned_below)
centrally_aligned_overlapping = LTTextLineHorizontal(
laparams.word_margin)
centrally_aligned_overlapping = LTTextLineHorizontal(laparams.word_margin)
centrally_aligned_overlapping.set_bbox((13, 5, 17, 7))
plane.add(centrally_aligned_overlapping)
@ -86,8 +85,7 @@ class TestFindNeigbors(unittest.TestCase):
top_aligned_left.set_bbox((2, 15, 4, 20))
plane.add(top_aligned_left)
centrally_aligned_overlapping = LTTextLineVertical(
laparams.word_margin)
centrally_aligned_overlapping = LTTextLineVertical(laparams.word_margin)
centrally_aligned_overlapping.set_bbox((5, 13, 7, 17))
plane.add(centrally_aligned_overlapping)

View File

@ -9,9 +9,8 @@ from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
class TestPdfDocument(object):
def test_get_zero_objid_raises_pdfobjectnotfound(self):
with open(absolute_sample_path('simple1.pdf'), 'rb') as in_file:
with open(absolute_sample_path("simple1.pdf"), "rb") as in_file:
parser = PDFParser(in_file)
doc = PDFDocument(parser)
with pytest.raises(PDFObjectNotFound):
@ -21,24 +20,29 @@ class TestPdfDocument(object):
# Some documents may be encrypted but not have an /ID key in
# their trailer. Tests
# https://github.com/pdfminer/pdfminer.six/issues/594
path = absolute_sample_path('encryption/encrypted_doc_no_id.pdf')
with open(path, 'rb') as fp:
path = absolute_sample_path("encryption/encrypted_doc_no_id.pdf")
with open(path, "rb") as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
assert doc.info == [{'Producer': b'European Patent Office'}]
assert doc.info == [{"Producer": b"European Patent Office"}]
def test_page_labels(self):
path = absolute_sample_path('contrib/pagelabels.pdf')
with open(path, 'rb') as fp:
path = absolute_sample_path("contrib/pagelabels.pdf")
with open(path, "rb") as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
total_pages = int_value(dict_value(doc.catalog['Pages'])['Count'])
assert list(itertools.islice(doc.get_page_labels(), total_pages)) \
== ['iii', 'iv', '1', '2', '1']
total_pages = int_value(dict_value(doc.catalog["Pages"])["Count"])
assert list(itertools.islice(doc.get_page_labels(), total_pages)) == [
"iii",
"iv",
"1",
"2",
"1",
]
def test_no_page_labels(self):
path = absolute_sample_path('simple1.pdf')
with open(path, 'rb') as fp:
path = absolute_sample_path("simple1.pdf")
with open(path, "rb") as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)

View File

@ -9,96 +9,95 @@ from pdfminer.psparser import PSLiteral
class TestPDFEncoding:
def test_cmapname_onebyteidentityV(self):
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("OneByteIdentityV")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMapByte)
def test_cmapname_onebyteidentityH(self):
stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("OneByteIdentityH")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMapByte)
def test_cmapname_V(self):
stream = PDFStream({'CMapName': PSLiteral('V')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("V")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, CMap)
def test_cmapname_H(self):
stream = PDFStream({'CMapName': PSLiteral('H')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("H")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, CMap)
def test_encoding_identityH(self):
spec = {'Encoding': PSLiteral('Identity-H')}
spec = {"Encoding": PSLiteral("Identity-H")}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityV(self):
spec = {'Encoding': PSLiteral('Identity-V')}
spec = {"Encoding": PSLiteral("Identity-V")}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityH_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('Identity-H')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("Identity-H")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityV_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('Identity-V')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("Identity-V")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityH_as_stream(self):
stream = PDFStream({'CMapName': 'Identity-H'}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": "Identity-H"}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_identityV_as_stream(self):
stream = PDFStream({'CMapName': 'Identity-V'}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": "Identity-V"}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentH(self):
spec = {'Encoding': PSLiteral('DLIdent-H')}
spec = {"Encoding": PSLiteral("DLIdent-H")}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentV(self):
spec = {'Encoding': PSLiteral('DLIdent-V')}
spec = {"Encoding": PSLiteral("DLIdent-V")}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentH_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("DLIdent-H")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentV_as_PSLiteral_stream(self):
stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": PSLiteral("DLIdent-V")}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentH_as_stream(self):
stream = PDFStream({'CMapName': 'DLIdent-H'}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": "DLIdent-H"}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)
def test_encoding_DLIdentV_as_stream(self):
stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
spec = {'Encoding': stream}
stream = PDFStream({"CMapName": "DLIdent-V"}, "")
spec = {"Encoding": stream}
font = PDFCIDFont(None, spec)
assert isinstance(font.cmap, IdentityCMap)

View File

@ -8,12 +8,12 @@ def test_get_cmap_from_pickle():
Regression test for https://github.com/pdfminer/pdfminer.six/issues/391
"""
cmap_name = 'UniGB-UCS2-H'
spec = {'Encoding': PSLiteral(cmap_name)}
cmap_name = "UniGB-UCS2-H"
spec = {"Encoding": PSLiteral(cmap_name)}
resource_manager = PDFResourceManager()
font = PDFCIDFont(resource_manager, spec)
cmap = font.get_cmap_from_spec(spec, False)
assert cmap.attrs.get('CMapName') == cmap_name
assert cmap.attrs.get("CMapName") == cmap_name
assert len(cmap.code2cid) > 0

View File

@ -1,7 +1,7 @@
from pdfminer.ccitt import CCITTG4Parser, CCITTFaxDecoder
class TestCCITTG4Parser():
class TestCCITTG4Parser:
def get_parser(self, bits):
parser = CCITTG4Parser(len(bits))
parser._curline = [int(c) for c in bits]
@ -9,60 +9,60 @@ class TestCCITTG4Parser():
return parser
def test_b1(self):
parser = self.get_parser('00000')
parser = self.get_parser("00000")
parser._do_vertical(0)
assert parser._curpos == 0
return
def test_b2(self):
parser = self.get_parser('10000')
parser = self.get_parser("10000")
parser._do_vertical(-1)
assert parser._curpos == 0
return
def test_b3(self):
parser = self.get_parser('000111')
parser = self.get_parser("000111")
parser._do_pass()
assert parser._curpos == 3
assert parser._get_bits() == '111'
assert parser._get_bits() == "111"
return
def test_b4(self):
parser = self.get_parser('00000')
parser = self.get_parser("00000")
parser._do_vertical(+2)
assert parser._curpos == 2
assert parser._get_bits() == '11'
assert parser._get_bits() == "11"
return
def test_b5(self):
parser = self.get_parser('11111111100')
parser = self.get_parser("11111111100")
parser._do_horizontal(0, 3)
assert parser._curpos == 3
parser._do_vertical(1)
assert parser._curpos == 10
assert parser._get_bits() == '0001111111'
assert parser._get_bits() == "0001111111"
return
def test_e1(self):
parser = self.get_parser('10000')
parser = self.get_parser("10000")
parser._do_vertical(0)
assert parser._curpos == 1
parser._do_vertical(0)
assert parser._curpos == 5
assert parser._get_bits() == '10000'
assert parser._get_bits() == "10000"
return
def test_e2(self):
parser = self.get_parser('10011')
parser = self.get_parser("10011")
parser._do_vertical(0)
assert parser._curpos == 1
parser._do_vertical(2)
assert parser._curpos == 5
assert parser._get_bits() == '10000'
assert parser._get_bits() == "10000"
return
def test_e3(self):
parser = self.get_parser('011111')
parser = self.get_parser("011111")
parser._color = 0
parser._do_vertical(0)
assert parser._color == 1
@ -72,90 +72,90 @@ class TestCCITTG4Parser():
assert parser._curpos == 4
parser._do_vertical(0)
assert parser._curpos == 6
assert parser._get_bits() == '011100'
assert parser._get_bits() == "011100"
return
def test_e4(self):
parser = self.get_parser('10000')
parser = self.get_parser("10000")
parser._do_vertical(0)
assert parser._curpos == 1
parser._do_vertical(-2)
assert parser._curpos == 3
parser._do_vertical(0)
assert parser._curpos == 5
assert parser._get_bits() == '10011'
assert parser._get_bits() == "10011"
return
def test_e5(self):
parser = self.get_parser('011000')
parser = self.get_parser("011000")
parser._color = 0
parser._do_vertical(0)
assert parser._curpos == 1
parser._do_vertical(3)
assert parser._curpos == 6
assert parser._get_bits() == '011111'
assert parser._get_bits() == "011111"
return
def test_e6(self):
parser = self.get_parser('11001')
parser = self.get_parser("11001")
parser._do_pass()
assert parser._curpos == 4
parser._do_vertical(0)
assert parser._curpos == 5
assert parser._get_bits() == '11111'
assert parser._get_bits() == "11111"
return
def test_e7(self):
parser = self.get_parser('0000000000')
parser = self.get_parser("0000000000")
parser._curpos = 2
parser._color = 1
parser._do_horizontal(2, 6)
assert parser._curpos == 10
assert parser._get_bits() == '1111000000'
assert parser._get_bits() == "1111000000"
return
def test_e8(self):
parser = self.get_parser('001100000')
parser = self.get_parser("001100000")
parser._curpos = 1
parser._color = 0
parser._do_vertical(0)
assert parser._curpos == 2
parser._do_horizontal(7, 0)
assert parser._curpos == 9
assert parser._get_bits() == '101111111'
assert parser._get_bits() == "101111111"
return
def test_m1(self):
parser = self.get_parser('10101')
parser = self.get_parser("10101")
parser._do_pass()
assert parser._curpos == 2
parser._do_pass()
assert parser._curpos == 4
assert parser._get_bits() == '1111'
assert parser._get_bits() == "1111"
return
def test_m2(self):
parser = self.get_parser('101011')
parser = self.get_parser("101011")
parser._do_vertical(-1)
parser._do_vertical(-1)
parser._do_vertical(1)
parser._do_horizontal(1, 1)
assert parser._get_bits() == '011101'
assert parser._get_bits() == "011101"
return
def test_m3(self):
parser = self.get_parser('10111011')
parser = self.get_parser("10111011")
parser._do_vertical(-1)
parser._do_pass()
parser._do_vertical(1)
parser._do_vertical(1)
assert parser._get_bits() == '00000001'
assert parser._get_bits() == "00000001"
return
class TestCCITTFaxDecoder:
def test_b1(self):
decoder = CCITTFaxDecoder(5)
decoder.output_line(0, b'0')
assert decoder.close() == b'\x80'
decoder.output_line(0, b"0")
assert decoder.close() == b"\x80"
return

View File

@ -18,36 +18,37 @@ def dehex(b):
return binascii.unhexlify(b)
class TestAscii85():
class TestAscii85:
def test_ascii85decode(self):
"""The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85"""
assert ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') \
== b'Man is distinguished'
assert ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
assert ascii85decode(b"9jqo^BlbD-BleB1DJ+*+F(f,q") == b"Man is distinguished"
assert ascii85decode(b"E,9)oF*2M7/c~>") == b"pleasure."
def test_asciihexdecode(self):
assert asciihexdecode(b'61 62 2e6364 65') == b'ab.cde'
assert asciihexdecode(b'61 62 2e6364 657>') == b'ab.cdep'
assert asciihexdecode(b'7>') == b'p'
assert asciihexdecode(b"61 62 2e6364 65") == b"ab.cde"
assert asciihexdecode(b"61 62 2e6364 657>") == b"ab.cdep"
assert asciihexdecode(b"7>") == b"p"
class TestArcfour():
class TestArcfour:
def test(self):
assert hex(Arcfour(b'Key').process(b'Plaintext')) \
== b'bbf316e8d940af0ad3'
assert hex(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420'
assert hex(Arcfour(b'Secret').process(b'Attack at dawn')) \
== b'45a01f645fc35b383552544b9bf5'
assert hex(Arcfour(b"Key").process(b"Plaintext")) == b"bbf316e8d940af0ad3"
assert hex(Arcfour(b"Wiki").process(b"pedia")) == b"1021bf0420"
assert (
hex(Arcfour(b"Secret").process(b"Attack at dawn"))
== b"45a01f645fc35b383552544b9bf5"
)
class TestLzw():
class TestLzw:
def test_lzwdecode(self):
assert lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') \
== b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
assert (
lzwdecode(b"\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01")
== b"\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42"
)
class TestRunlength():
class TestRunlength:
def test_rldecode(self):
assert rldecode(b'\x05123456\xfa7\x04abcde\x80junk') \
== b'1234567777777abcde'
assert rldecode(b"\x05123456\xfa7\x04abcde\x80junk") == b"1234567777777abcde"

View File

@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
class TestPSBaseParser:
"""Simplistic Test cases"""
TESTDATA = br'''%!PS
TESTDATA = rb"""%!PS
begin end
" @ #
/a/BCD /Some_Name /foo#5f#xbaa
@ -26,33 +26,83 @@ baa)
func/a/b{(c)do*}def
[ 1 (z) ! ]
<< /foo (bar) >>
'''
"""
TOKENS = [
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')),
(19, KWD(b'@')), (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')),
(30, LIT('Some_Name')), (41, LIT('foo_xbaa')), (54, 0), (56, 1),
(59, -2), (62, 0.5), (65, 1.234), (71, b'abc'), (77, b''),
(80, b'abc ( def ) ghi'), (98, b'def \x00 4ghi'),
(118, b'bach\\slask'), (132, b'foo\nbaa'),
(143, b'this % is not a comment.'), (170, b'foo\nbaa'),
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '),
(211, b'\xab\xcd\x00\x124\x05'), (226, KWD(b'func')), (230, LIT('a')),
(232, LIT('b')), (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')),
(241, KWD(b'}')), (242, KWD(b'def')), (246, KWD(b'[')), (248, 1),
(250, b'z'), (254, KWD(b'!')), (256, KWD(b']')), (258, KWD(b'<<')),
(261, LIT('foo')), (266, b'bar'), (272, KWD(b'>>'))
(5, KWD(b"begin")),
(11, KWD(b"end")),
(16, KWD(b'"')),
(19, KWD(b"@")),
(21, KWD(b"#")),
(23, LIT("a")),
(25, LIT("BCD")),
(30, LIT("Some_Name")),
(41, LIT("foo_xbaa")),
(54, 0),
(56, 1),
(59, -2),
(62, 0.5),
(65, 1.234),
(71, b"abc"),
(77, b""),
(80, b"abc ( def ) ghi"),
(98, b"def \x00 4ghi"),
(118, b"bach\\slask"),
(132, b"foo\nbaa"),
(143, b"this % is not a comment."),
(170, b"foo\nbaa"),
(180, b"foobaa"),
(191, b""),
(194, b" "),
(199, b"@@ "),
(211, b"\xab\xcd\x00\x124\x05"),
(226, KWD(b"func")),
(230, LIT("a")),
(232, LIT("b")),
(234, KWD(b"{")),
(235, b"c"),
(238, KWD(b"do*")),
(241, KWD(b"}")),
(242, KWD(b"def")),
(246, KWD(b"[")),
(248, 1),
(250, b"z"),
(254, KWD(b"!")),
(256, KWD(b"]")),
(258, KWD(b"<<")),
(261, LIT("foo")),
(266, b"bar"),
(272, KWD(b">>")),
]
OBJS = [
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
(143, b'this % is not a comment.'), (170, b'foo\nbaa'),
(180, b'foobaa'), (191, b''), (194, b' '), (199, b'@@ '),
(211, b'\xab\xcd\x00\x124\x05'), (230, LIT('a')), (232, LIT('b')),
(234, [b'c']), (246, [1, b'z']), (258, {'foo': b'bar'}),
(23, LIT("a")),
(25, LIT("BCD")),
(30, LIT("Some_Name")),
(41, LIT("foo_xbaa")),
(54, 0),
(56, 1),
(59, -2),
(62, 0.5),
(65, 1.234),
(71, b"abc"),
(77, b""),
(80, b"abc ( def ) ghi"),
(98, b"def \x00 4ghi"),
(118, b"bach\\slask"),
(132, b"foo\nbaa"),
(143, b"this % is not a comment."),
(170, b"foo\nbaa"),
(180, b"foobaa"),
(191, b""),
(194, b" "),
(199, b"@@ "),
(211, b"\xab\xcd\x00\x124\x05"),
(230, LIT("a")),
(232, LIT("b")),
(234, [b"c"]),
(246, [1, b"z"]),
(258, {"foo": b"bar"}),
]
def get_tokens(self, s):

View File

@ -6,10 +6,10 @@ from pdfminer.pdfparser import PDFParser
class TestPdfPage(object):
def test_page_labels(self):
path = absolute_sample_path('contrib/pagelabels.pdf')
expected_labels = ['iii', 'iv', '1', '2', '1']
path = absolute_sample_path("contrib/pagelabels.pdf")
expected_labels = ["iii", "iv", "1", "2", "1"]
with open(path, 'rb') as fp:
with open(path, "rb") as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
for (i, page) in enumerate(PDFPage.create_pages(doc)):

View File

@ -11,48 +11,47 @@ def run(filename, options=None):
absolute_path = absolute_sample_path(filename)
with TemporaryFilePath() as output_file_name:
if options:
s = 'dumppdf -o %s %s %s' % (output_file_name,
options, absolute_path)
s = "dumppdf -o %s %s %s" % (output_file_name, options, absolute_path)
else:
s = 'dumppdf -o %s %s' % (output_file_name, absolute_path)
s = "dumppdf -o %s %s" % (output_file_name, absolute_path)
dumppdf.main(s.split(' ')[1:])
dumppdf.main(s.split(" ")[1:])
class TestDumpPDF(unittest.TestCase):
def test_simple1(self):
run('simple1.pdf', '-t -a')
run("simple1.pdf", "-t -a")
def test_simple2(self):
run('simple2.pdf', '-t -a')
run("simple2.pdf", "-t -a")
def test_jo(self):
run('jo.pdf', '-t -a')
run("jo.pdf", "-t -a")
def test_simple3(self):
run('simple3.pdf', '-t -a')
run("simple3.pdf", "-t -a")
def test_2(self):
run('nonfree/dmca.pdf', '-t -a')
run("nonfree/dmca.pdf", "-t -a")
def test_3(self):
run('nonfree/f1040nr.pdf')
run("nonfree/f1040nr.pdf")
def test_4(self):
run('nonfree/i1040nr.pdf')
run("nonfree/i1040nr.pdf")
def test_5(self):
run('nonfree/kampo.pdf', '-t -a')
run("nonfree/kampo.pdf", "-t -a")
def test_6(self):
run('nonfree/naacl06-shinyama.pdf', '-t -a')
run("nonfree/naacl06-shinyama.pdf", "-t -a")
def test_simple1_raw(self):
"""Known issue: crash in dumpxml writing binary to text stream."""
with pytest.raises(TypeError):
run('simple1.pdf', '-r -a')
run("simple1.pdf", "-r -a")
def test_simple1_binary(self):
"""Known issue: crash in dumpxml writing binary to text stream."""
with pytest.raises(TypeError):
run('simple1.pdf', '-b -a')
run("simple1.pdf", "-b -a")

View File

@ -12,115 +12,119 @@ def run(sample_path, options=None):
absolute_path = absolute_sample_path(sample_path)
with TemporaryFilePath() as output_file_name:
if options:
s = 'pdf2txt -o{} {} {}' \
.format(output_file_name, options, absolute_path)
s = "pdf2txt -o{} {} {}".format(output_file_name, options, absolute_path)
else:
s = 'pdf2txt -o{} {}'.format(output_file_name, absolute_path)
s = "pdf2txt -o{} {}".format(output_file_name, absolute_path)
pdf2txt.main(s.split(' ')[1:])
pdf2txt.main(s.split(" ")[1:])
class TestPdf2Txt():
class TestPdf2Txt:
def test_jo(self):
run('jo.pdf')
run("jo.pdf")
def test_simple1(self):
run('simple1.pdf')
run("simple1.pdf")
def test_simple2(self):
run('simple2.pdf')
run("simple2.pdf")
def test_simple3(self):
run('simple3.pdf')
run("simple3.pdf")
def test_sample_one_byte_identity_encode(self):
run('sampleOneByteIdentityEncode.pdf')
run("sampleOneByteIdentityEncode.pdf")
def test_nonfree_175(self):
"""Regression test for:
https://github.com/pdfminer/pdfminer.six/issues/65
"""
run('nonfree/175.pdf')
run("nonfree/175.pdf")
def test_nonfree_dmca(self):
run('nonfree/dmca.pdf')
run("nonfree/dmca.pdf")
def test_nonfree_f1040nr(self):
run('nonfree/f1040nr.pdf', '-p 1')
run("nonfree/f1040nr.pdf", "-p 1")
def test_nonfree_i1040nr(self):
run('nonfree/i1040nr.pdf', '-p 1')
run("nonfree/i1040nr.pdf", "-p 1")
def test_nonfree_kampo(self):
run('nonfree/kampo.pdf')
run("nonfree/kampo.pdf")
def test_nonfree_naacl06_shinyama(self):
run('nonfree/naacl06-shinyama.pdf')
run("nonfree/naacl06-shinyama.pdf")
def test_nlp2004slides(self):
run('nonfree/nlp2004slides.pdf', '-p 1')
run("nonfree/nlp2004slides.pdf", "-p 1")
def test_contrib_2b(self):
run('contrib/2b.pdf', '-A -t xml')
run("contrib/2b.pdf", "-A -t xml")
def test_contrib_issue_350(self):
"""Regression test for
https://github.com/pdfminer/pdfminer.six/issues/350"""
run('contrib/issue-00352-asw-oct96-p41.pdf')
run("contrib/issue-00352-asw-oct96-p41.pdf")
def test_scancode_patchelf(self):
"""Regression test for https://github.com/euske/pdfminer/issues/96"""
run('scancode/patchelf.pdf')
run("scancode/patchelf.pdf")
def test_contrib_hash_two_complement(self):
"""Check that unsigned integer is added correctly to encryption hash.et
See https://github.com/pdfminer/pdfminer.six/issues/186
"""
run('contrib/issue-00352-hash-twos-complement.pdf')
run("contrib/issue-00352-hash-twos-complement.pdf")
def test_contrib_excel(self):
"""Regression test for
https://github.com/pdfminer/pdfminer.six/issues/369
"""
run('contrib/issue-00369-excel.pdf', '-t html')
run("contrib/issue-00369-excel.pdf", "-t html")
def test_encryption_aes128(self):
run('encryption/aes-128.pdf', '-P foo')
run("encryption/aes-128.pdf", "-P foo")
def test_encryption_aes128m(self):
run('encryption/aes-128-m.pdf', '-P foo')
run("encryption/aes-128-m.pdf", "-P foo")
def test_encryption_aes256(self):
run('encryption/aes-256.pdf', '-P foo')
run("encryption/aes-256.pdf", "-P foo")
def test_encryption_aes256m(self):
run('encryption/aes-256-m.pdf', '-P foo')
run("encryption/aes-256-m.pdf", "-P foo")
def test_encryption_aes256_r6_user(self):
run('encryption/aes-256-r6.pdf', '-P usersecret')
run("encryption/aes-256-r6.pdf", "-P usersecret")
def test_encryption_aes256_r6_owner(self):
run('encryption/aes-256-r6.pdf', '-P ownersecret')
run("encryption/aes-256-r6.pdf", "-P ownersecret")
def test_encryption_base(self):
run('encryption/base.pdf', '-P foo')
run("encryption/base.pdf", "-P foo")
def test_encryption_rc4_40(self):
run('encryption/rc4-40.pdf', '-P foo')
run("encryption/rc4-40.pdf", "-P foo")
def test_encryption_rc4_128(self):
run('encryption/rc4-128.pdf', '-P foo')
run("encryption/rc4-128.pdf", "-P foo")
class TestDumpImages:
@staticmethod
def extract_images(input_file, *args):
output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir',
output_dir, input_file, *args]
commands = [
"-o",
output_file_name,
"--output-dir",
output_dir,
input_file,
*args,
]
pdf2txt.main(commands)
image_files = os.listdir(output_dir)
rmtree(output_dir)
@ -132,39 +136,38 @@ class TestDumpImages:
Regression test for:
https://github.com/pdfminer/pdfminer.six/issues/131
"""
filepath = absolute_sample_path('../samples/nonfree/dmca.pdf')
image_files = self.extract_images(filepath, '-p', '1')
assert image_files[0].endswith('bmp')
filepath = absolute_sample_path("../samples/nonfree/dmca.pdf")
image_files = self.extract_images(filepath, "-p", "1")
assert image_files[0].endswith("bmp")
def test_nonfree_175(self):
"""Extract images of pdf containing jpg images"""
self.extract_images(absolute_sample_path('../samples/nonfree/175.pdf'))
self.extract_images(absolute_sample_path("../samples/nonfree/175.pdf"))
def test_jbig2_image_export(self):
"""Extract images of pdf containing jbig2 images
Feature test for: https://github.com/pdfminer/pdfminer.six/pull/46
"""
input_file = absolute_sample_path(
'../samples/contrib/pdf-with-jbig2.pdf')
input_file = absolute_sample_path("../samples/contrib/pdf-with-jbig2.pdf")
output_dir = mkdtemp()
with TemporaryFilePath() as output_file_name:
commands = ['-o', output_file_name, '--output-dir',
output_dir, input_file]
commands = ["-o", output_file_name, "--output-dir", output_dir, input_file]
pdf2txt.main(commands)
image_files = os.listdir(output_dir)
try:
assert image_files[0].endswith('.jb2')
assert filecmp.cmp(output_dir + '/' + image_files[0],
absolute_sample_path(
'../samples/contrib/XIPLAYER0.jb2'))
assert image_files[0].endswith(".jb2")
assert filecmp.cmp(
output_dir + "/" + image_files[0],
absolute_sample_path("../samples/contrib/XIPLAYER0.jb2"),
)
finally:
rmtree(output_dir)
def test_contrib_matplotlib(self):
"""Test a pdf with Type3 font"""
run('contrib/matplotlib.pdf')
run("contrib/matplotlib.pdf")
def test_nonfree_cmp_itext_logo(self):
"""Test a pdf with Type3 font"""
run('nonfree/cmp_itext_logo.pdf')
run("nonfree/cmp_itext_logo.pdf")

View File

@ -4,8 +4,13 @@ import pytest
from helpers import absolute_sample_path
from pdfminer.layout import LTComponent
from pdfminer.utils import open_filename, Plane, shorten_str, \
format_int_roman, format_int_alpha
from pdfminer.utils import (
open_filename,
Plane,
shorten_str,
format_int_roman,
format_int_alpha,
)
class TestOpenFilename:
@ -48,14 +53,12 @@ class TestPlane:
assert result == [obj]
def test_find_if_object_is_smaller_than_gridsize(self):
plane, obj = self.given_plane_with_one_object(object_size=1,
gridsize=100)
plane, obj = self.given_plane_with_one_object(object_size=1, gridsize=100)
result = list(plane.find((0, 0, 100, 100)))
assert result == [obj]
def test_find_object_if_much_larger_than_gridsize(self):
plane, obj = self.given_plane_with_one_object(object_size=100,
gridsize=10)
plane, obj = self.given_plane_with_one_object(object_size=100, gridsize=10)
result = list(plane.find((0, 0, 100, 100)))
assert result == [obj]
@ -70,43 +73,43 @@ class TestPlane:
class TestFunctions(object):
def test_shorten_str(self):
s = shorten_str('Hello there World', 15)
assert s == 'Hello ... World'
s = shorten_str("Hello there World", 15)
assert s == "Hello ... World"
def test_shorten_short_str_is_same(self):
s = 'Hello World'
s = "Hello World"
assert shorten_str(s, 50) == s
def test_shorten_to_really_short(self):
assert shorten_str('Hello World', 5) == 'Hello'
assert shorten_str("Hello World", 5) == "Hello"
def test_format_int_alpha(self):
assert format_int_alpha(1) == 'a'
assert format_int_alpha(2) == 'b'
assert format_int_alpha(26) == 'z'
assert format_int_alpha(27) == 'aa'
assert format_int_alpha(28) == 'ab'
assert format_int_alpha(26 * 2) == 'az'
assert format_int_alpha(26 * 2 + 1) == 'ba'
assert format_int_alpha(26 * 27) == 'zz'
assert format_int_alpha(26 * 27 + 1) == 'aaa'
assert format_int_alpha(1) == "a"
assert format_int_alpha(2) == "b"
assert format_int_alpha(26) == "z"
assert format_int_alpha(27) == "aa"
assert format_int_alpha(28) == "ab"
assert format_int_alpha(26 * 2) == "az"
assert format_int_alpha(26 * 2 + 1) == "ba"
assert format_int_alpha(26 * 27) == "zz"
assert format_int_alpha(26 * 27 + 1) == "aaa"
def test_format_int_roman(self):
assert format_int_roman(1) == 'i'
assert format_int_roman(2) == 'ii'
assert format_int_roman(3) == 'iii'
assert format_int_roman(4) == 'iv'
assert format_int_roman(5) == 'v'
assert format_int_roman(6) == 'vi'
assert format_int_roman(7) == 'vii'
assert format_int_roman(8) == 'viii'
assert format_int_roman(9) == 'ix'
assert format_int_roman(10) == 'x'
assert format_int_roman(11) == 'xi'
assert format_int_roman(20) == 'xx'
assert format_int_roman(40) == 'xl'
assert format_int_roman(45) == 'xlv'
assert format_int_roman(50) == 'l'
assert format_int_roman(90) == 'xc'
assert format_int_roman(91) == 'xci'
assert format_int_roman(100) == 'c'
assert format_int_roman(1) == "i"
assert format_int_roman(2) == "ii"
assert format_int_roman(3) == "iii"
assert format_int_roman(4) == "iv"
assert format_int_roman(5) == "v"
assert format_int_roman(6) == "vi"
assert format_int_roman(7) == "vii"
assert format_int_roman(8) == "viii"
assert format_int_roman(9) == "ix"
assert format_int_roman(10) == "x"
assert format_int_roman(11) == "xi"
assert format_int_roman(20) == "xx"
assert format_int_roman(40) == "xl"
assert format_int_roman(45) == "xlv"
assert format_int_roman(50) == "l"
assert format_int_roman(90) == "xc"
assert format_int_roman(91) == "xci"
assert format_int_roman(100) == "c"

View File

@ -7,39 +7,38 @@ import fileinput
def main(argv):
fonts = {}
for line in fileinput.input():
f = line.strip().split(' ')
f = line.strip().split(" ")
if not f:
continue
k = f[0]
if k == 'FontName':
if k == "FontName":
fontname = f[1]
props = {'FontName': fontname, 'Flags': 0}
props = {"FontName": fontname, "Flags": 0}
chars = {}
fonts[fontname] = (props, chars)
elif k == 'C':
elif k == "C":
cid = int(f[1])
if 0 <= cid and cid <= 255:
width = int(f[4])
chars[cid] = width
elif k in ('CapHeight', 'XHeight', 'ItalicAngle',
'Ascender', 'Descender'):
k = {'Ascender': 'Ascent', 'Descender': 'Descent'}.get(k, k)
elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"):
k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k)
props[k] = float(f[1])
elif k in ('FontName', 'FamilyName', 'Weight'):
k = {'FamilyName': 'FontFamily', 'Weight': 'FontWeight'}.get(k, k)
elif k in ("FontName", "FamilyName", "Weight"):
k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k)
props[k] = f[1]
elif k == 'IsFixedPitch':
if f[1].lower() == 'true':
props['Flags'] = 64
elif k == 'FontBBox':
elif k == "IsFixedPitch":
if f[1].lower() == "true":
props["Flags"] = 64
elif k == "FontBBox":
props[k] = tuple(map(float, f[1:5]))
print('# -*- python -*-')
print('FONT_METRICS = {')
print("# -*- python -*-")
print("FONT_METRICS = {")
for (fontname, (props, chars)) in fonts.items():
print(' {!r}: {!r},'.format(fontname, (props, chars)))
print('}')
print(" {!r}: {!r},".format(fontname, (props, chars)))
print("}")
return 0
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]

View File

@ -6,7 +6,6 @@ import codecs
class CMapConverter:
def __init__(self, enc2codec={}):
self.enc2codec = enc2codec
self.code2cid = {} # {'cmapname': ...}
@ -19,12 +18,12 @@ class CMapConverter:
return self.code2cid.keys()
def get_maps(self, enc):
if enc.endswith('-H'):
if enc.endswith("-H"):
(hmapenc, vmapenc) = (enc, None)
elif enc == 'H':
(hmapenc, vmapenc) = ('H', 'V')
elif enc == "H":
(hmapenc, vmapenc) = ("H", "V")
else:
(hmapenc, vmapenc) = (enc+'-H', enc+'-V')
(hmapenc, vmapenc) = (enc + "-H", enc + "-V")
if hmapenc in self.code2cid:
hmap = self.code2cid[hmapenc]
else:
@ -43,12 +42,12 @@ class CMapConverter:
def load(self, fp):
encs = None
for line in fp:
(line, _, _) = line.strip().partition('#')
(line, _, _) = line.strip().partition("#")
if not line:
continue
values = line.split('\t')
values = line.split("\t")
if encs is None:
assert values[0] == 'CID', str(values)
assert values[0] == "CID", str(values)
encs = values
continue
@ -68,7 +67,7 @@ class CMapConverter:
def add(unimap, enc, code):
try:
codec = self.enc2codec[enc]
c = code.decode(codec, 'strict')
c = code.decode(codec, "strict")
if len(c) == 1:
if c not in unimap:
unimap[c] = 0
@ -89,20 +88,20 @@ class CMapConverter:
unimap_h = {}
unimap_v = {}
for (enc, value) in zip(encs, values):
if enc == 'CID':
if enc == "CID":
continue
if value == '*':
if value == "*":
continue
# hcodes, vcodes: encoded bytes for each writing mode.
hcodes = []
vcodes = []
for code in value.split(','):
vertical = code.endswith('v')
for code in value.split(","):
vertical = code.endswith("v")
if vertical:
code = code[:-1]
try:
code = codecs.decode(code, 'hex_codec')
code = codecs.decode(code, "hex_codec")
except Exception:
code = chr(int(code, 16))
if vertical:
@ -155,17 +154,19 @@ def main(argv):
import os.path
def usage():
print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]'
% argv[0])
print(
"usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]" % argv[0]
)
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'c:')
(opts, args) = getopt.getopt(argv[1:], "c:")
except getopt.GetoptError:
return usage()
enc2codec = {}
for (k, v) in opts:
if k == '-c':
(enc, _, codec) = v.partition('=')
if k == "-c":
(enc, _, codec) = v.partition("=")
enc2codec[enc] = codec
if not args:
return usage()
@ -176,27 +177,27 @@ def main(argv):
converter = CMapConverter(enc2codec)
for path in args:
print('reading: %r...' % path)
print("reading: %r..." % path)
fp = open(path)
converter.load(fp)
fp.close()
for enc in converter.get_encs():
fname = '%s.pickle.gz' % enc
fname = "%s.pickle.gz" % enc
path = os.path.join(outdir, fname)
print('writing: %r...' % path)
fp = gzip.open(path, 'wb')
print("writing: %r..." % path)
fp = gzip.open(path, "wb")
converter.dump_cmap(fp, enc)
fp.close()
fname = 'to-unicode-%s.pickle.gz' % regname
fname = "to-unicode-%s.pickle.gz" % regname
path = os.path.join(outdir, fname)
print('writing: %r...' % path)
fp = gzip.open(path, 'wb')
print("writing: %r..." % path)
fp = gzip.open(path, "wb")
converter.dump_unicodemap(fp)
fp.close()
return
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]

View File

@ -8,20 +8,19 @@ def main(argv):
state = 0
for line in fileinput.input():
line = line.strip()
if not line or line.startswith('#'):
if not line or line.startswith("#"):
if state == 1:
state = 2
print('}\n')
print("}\n")
print(line)
continue
if state == 0:
print('\nglyphname2unicode = {')
print("\nglyphname2unicode = {")
state = 1
(name, x) = line.split(';')
codes = x.split(' ')
print(' {!r}: u\'{}\','
.format(name, ''.join('\\u%s' % code for code in codes)))
(name, x) = line.split(";")
codes = x.split(" ")
print(" {!r}: u'{}',".format(name, "".join("\\u%s" % code for code in codes)))
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]

View File

@ -4,8 +4,7 @@ import logging
import os.path
import re
import sys
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \
Union, cast
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, Union, cast
from argparse import ArgumentParser
import pdfminer
@ -25,33 +24,33 @@ ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
def escape(s: Union[str, bytes]) -> str:
if isinstance(s, bytes):
us = str(s, 'latin-1')
us = str(s, "latin-1")
else:
us = s
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), us)
return ESC_PAT.sub(lambda m: "&#%d;" % ord(m.group(0)), us)
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
if obj is None:
out.write('<null />')
out.write("<null />")
return
if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj))
for (k, v) in obj.items():
out.write('<key>%s</key>\n' % k)
out.write('<value>')
out.write("<key>%s</key>\n" % k)
out.write("<value>")
dumpxml(out, v)
out.write('</value>\n')
out.write('</dict>')
out.write("</value>\n")
out.write("</dict>")
return
if isinstance(obj, list):
out.write('<list size="%d">\n' % len(obj))
for v in obj:
dumpxml(out, v)
out.write('\n')
out.write('</list>')
out.write("\n")
out.write("</list>")
return
if isinstance(obj, (str, bytes)):
@ -59,21 +58,20 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
return
if isinstance(obj, PDFStream):
if codec == 'raw':
if codec == "raw":
# Bug: writing bytes to text I/O. This will raise TypeError.
out.write(obj.get_rawdata()) # type: ignore [arg-type]
elif codec == 'binary':
elif codec == "binary":
# Bug: writing bytes to text I/O. This will raise TypeError.
out.write(obj.get_data()) # type: ignore [arg-type]
else:
out.write('<stream>\n<props>\n')
out.write("<stream>\n<props>\n")
dumpxml(out, obj.attrs)
out.write('\n</props>\n')
if codec == 'text':
out.write("\n</props>\n")
if codec == "text":
data = obj.get_data()
out.write('<data size="%d">%s</data>\n'
% (len(data), escape(data)))
out.write('</stream>')
out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
out.write("</stream>")
return
if isinstance(obj, PDFObjRef):
@ -82,38 +80,36 @@ def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
if isinstance(obj, PSKeyword):
# Likely bug: obj.name is bytes, not str
out.write('<keyword>%s</keyword>'
% obj.name) # type: ignore [str-bytes-safe]
out.write("<keyword>%s</keyword>" % obj.name) # type: ignore [str-bytes-safe]
return
if isinstance(obj, PSLiteral):
# Likely bug: obj.name may be bytes, not str
out.write('<literal>%s</literal>'
% obj.name) # type: ignore [str-bytes-safe]
out.write("<literal>%s</literal>" % obj.name) # type: ignore [str-bytes-safe]
return
if isnumber(obj):
out.write('<number>%s</number>' % obj)
out.write("<number>%s</number>" % obj)
return
raise TypeError(obj)
def dumptrailers(
out: TextIO,
doc: PDFDocument,
show_fallback_xref: bool = False
out: TextIO, doc: PDFDocument, show_fallback_xref: bool = False
) -> None:
for xref in doc.xrefs:
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
out.write('<trailer>\n')
out.write("<trailer>\n")
dumpxml(out, xref.get_trailer())
out.write('\n</trailer>\n\n')
out.write("\n</trailer>\n\n")
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
if no_xrefs and not show_fallback_xref:
msg = 'This PDF does not have an xref. Use --show-fallback-xref if ' \
'you want to display the content of a fallback xref that ' \
'contains all objects.'
msg = (
"This PDF does not have an xref. Use --show-fallback-xref if "
"you want to display the content of a fallback xref that "
"contains all objects."
)
logger.warning(msg)
return
@ -122,10 +118,10 @@ def dumpallobjs(
out: TextIO,
doc: PDFDocument,
codec: Optional[str] = None,
show_fallback_xref: bool = False
show_fallback_xref: bool = False,
) -> None:
visited = set()
out.write('<pdf>')
out.write("<pdf>")
for xref in doc.xrefs:
for objid in xref.get_objids():
if objid in visited:
@ -137,11 +133,11 @@ def dumpallobjs(
continue
out.write('<object id="%d">\n' % objid)
dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n')
out.write("\n</object>\n\n")
except PDFObjectNotFound as e:
print('not found: %r' % e)
print("not found: %r" % e)
dumptrailers(out, doc, show_fallback_xref)
out.write('</pdf>')
out.write("</pdf>")
return
@ -150,16 +146,18 @@ def dumpoutline(
fname: str,
objids: Any,
pagenos: Container[int],
password: str = '',
password: str = "",
dumpall: bool = False,
codec: Optional[str] = None,
extractdir: Optional[str] = None
extractdir: Optional[str] = None,
) -> None:
fp = open(fname, 'rb')
fp = open(fname, "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
pages = {page.pageid: pageno for (pageno, page)
in enumerate(PDFPage.create_pages(doc), 1)}
pages = {
page.pageid: pageno
for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
}
def resolve_dest(dest: object) -> Any:
if isinstance(dest, (str, bytes)):
@ -167,14 +165,14 @@ def dumpoutline(
elif isinstance(dest, PSLiteral):
dest = resolve1(doc.get_dest(dest.name))
if isinstance(dest, dict):
dest = dest['D']
dest = dest["D"]
if isinstance(dest, PDFObjRef):
dest = dest.resolve()
return dest
try:
outlines = doc.get_outlines()
outfp.write('<outlines>\n')
outfp.write("<outlines>\n")
for (level, title, dest, a, se) in outlines:
pageno = None
if dest:
@ -183,21 +181,20 @@ def dumpoutline(
elif a:
action = a
if isinstance(action, dict):
subtype = action.get('S')
if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
'D'):
dest = resolve_dest(action['D'])
subtype = action.get("S")
if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
dest = resolve_dest(action["D"])
pageno = pages[dest[0].objid]
s = escape(title)
outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
if dest is not None:
outfp.write('<dest>')
outfp.write("<dest>")
dumpxml(outfp, dest)
outfp.write('</dest>\n')
outfp.write("</dest>\n")
if pageno is not None:
outfp.write('<pageno>%r</pageno>\n' % pageno)
outfp.write('</outline>\n')
outfp.write('</outlines>\n')
outfp.write("<pageno>%r</pageno>\n" % pageno)
outfp.write("</outline>\n")
outfp.write("</outlines>\n")
except PDFNoOutlines:
pass
parser.close()
@ -205,43 +202,48 @@ def dumpoutline(
return
LITERAL_FILESPEC = LIT('Filespec')
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
LITERAL_FILESPEC = LIT("Filespec")
LITERAL_EMBEDDEDFILE = LIT("EmbeddedFile")
def extractembedded(fname: str, password: str, extractdir: str) -> None:
def extract1(objid: int, obj: Dict[str, Any]) -> None:
filename = os.path.basename(obj.get('UF') or
cast(bytes, obj.get('F')).decode())
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
filename = os.path.basename(obj.get("UF") or cast(bytes, obj.get("F")).decode())
fileref = obj["EF"].get("UF") or obj["EF"].get("F")
fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream):
error_msg = 'unable to process PDF: reference for %r is not a ' \
'PDFStream' % filename
error_msg = (
"unable to process PDF: reference for %r is not a "
"PDFStream" % filename
)
raise PDFValueError(error_msg)
if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
raise PDFValueError(
'unable to process PDF: reference for %r '
'is not an EmbeddedFile' % (filename))
path = os.path.join(extractdir, '%.6d-%s' % (objid, filename))
"unable to process PDF: reference for %r "
"is not an EmbeddedFile" % (filename)
)
path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
if os.path.exists(path):
raise IOError('file exists: %r' % path)
print('extracting: %r' % path)
raise IOError("file exists: %r" % path)
print("extracting: %r" % path)
os.makedirs(os.path.dirname(path), exist_ok=True)
out = open(path, 'wb')
out = open(path, "wb")
out.write(fileobj.get_data())
out.close()
return
with open(fname, 'rb') as fp:
with open(fname, "rb") as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
extracted_objids = set()
for xref in doc.xrefs:
for objid in xref.get_objids():
obj = doc.getobj(objid)
if objid not in extracted_objids and isinstance(obj, dict) \
and obj.get('Type') is LITERAL_FILESPEC:
if (
objid not in extracted_objids
and isinstance(obj, dict)
and obj.get("Type") is LITERAL_FILESPEC
):
extracted_objids.add(objid)
extract1(objid, obj)
return
@ -252,13 +254,13 @@ def dumppdf(
fname: str,
objids: Iterable[int],
pagenos: Container[int],
password: str = '',
password: str = "",
dumpall: bool = False,
codec: Optional[str] = None,
extractdir: Optional[str] = None,
show_fallback_xref: bool = False
show_fallback_xref: bool = False,
) -> None:
fp = open(fname, 'rb')
fp = open(fname, "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
if objids:
@ -279,71 +281,125 @@ def dumppdf(
if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc, show_fallback_xref)
fp.close()
if codec not in ('raw', 'binary'):
outfp.write('\n')
if codec not in ("raw", "binary"):
outfp.write("\n")
return
def create_parser() -> ArgumentParser:
parser = ArgumentParser(description=__doc__, add_help=True)
parser.add_argument('files', type=str, default=None, nargs='+',
help='One or more paths to PDF files.')
parser.add_argument(
"files",
type=str,
default=None,
nargs="+",
help="One or more paths to PDF files.",
)
parser.add_argument(
"--version", "-v", action="version",
version="pdfminer.six v{}".format(pdfminer.__version__))
"--version",
"-v",
action="version",
version="pdfminer.six v{}".format(pdfminer.__version__),
)
parser.add_argument(
'--debug', '-d', default=False, action='store_true',
help='Use debug logging level.')
"--debug",
"-d",
default=False,
action="store_true",
help="Use debug logging level.",
)
procedure_parser = parser.add_mutually_exclusive_group()
procedure_parser.add_argument(
'--extract-toc', '-T', default=False, action='store_true',
help='Extract structure of outline')
"--extract-toc",
"-T",
default=False,
action="store_true",
help="Extract structure of outline",
)
procedure_parser.add_argument(
'--extract-embedded', '-E', type=str,
help='Extract embedded files')
"--extract-embedded", "-E", type=str, help="Extract embedded files"
)
parse_params = parser.add_argument_group(
'Parser', description='Used during PDF parsing')
"Parser", description="Used during PDF parsing"
)
parse_params.add_argument(
'--page-numbers', type=int, default=None, nargs='+',
help='A space-seperated list of page numbers to parse.')
"--page-numbers",
type=int,
default=None,
nargs="+",
help="A space-seperated list of page numbers to parse.",
)
parse_params.add_argument(
'--pagenos', '-p', type=str,
help='A comma-separated list of page numbers to parse. Included for '
'legacy applications, use --page-numbers for more idiomatic '
'argument entry.')
"--pagenos",
"-p",
type=str,
help="A comma-separated list of page numbers to parse. Included for "
"legacy applications, use --page-numbers for more idiomatic "
"argument entry.",
)
parse_params.add_argument(
'--objects', '-i', type=str,
help='Comma separated list of object numbers to extract')
"--objects",
"-i",
type=str,
help="Comma separated list of object numbers to extract",
)
parse_params.add_argument(
'--all', '-a', default=False, action='store_true',
help='If the structure of all objects should be extracted')
"--all",
"-a",
default=False,
action="store_true",
help="If the structure of all objects should be extracted",
)
parse_params.add_argument(
'--show-fallback-xref', action='store_true',
help='Additionally show the fallback xref. Use this if the PDF '
'has zero or only invalid xref\'s. This setting is ignored if '
'--extract-toc or --extract-embedded is used.')
"--show-fallback-xref",
action="store_true",
help="Additionally show the fallback xref. Use this if the PDF "
"has zero or only invalid xref's. This setting is ignored if "
"--extract-toc or --extract-embedded is used.",
)
parse_params.add_argument(
'--password', '-P', type=str, default='',
help='The password to use for decrypting PDF file.')
"--password",
"-P",
type=str,
default="",
help="The password to use for decrypting PDF file.",
)
output_params = parser.add_argument_group(
'Output', description='Used during output generation.')
"Output", description="Used during output generation."
)
output_params.add_argument(
'--outfile', '-o', type=str, default='-',
"--outfile",
"-o",
type=str,
default="-",
help='Path to file where output is written. Or "-" (default) to '
'write to stdout.')
"write to stdout.",
)
codec_parser = output_params.add_mutually_exclusive_group()
codec_parser.add_argument(
'--raw-stream', '-r', default=False, action='store_true',
help='Write stream objects without encoding')
"--raw-stream",
"-r",
default=False,
action="store_true",
help="Write stream objects without encoding",
)
codec_parser.add_argument(
'--binary-stream', '-b', default=False, action='store_true',
help='Write stream objects with binary encoding')
"--binary-stream",
"-b",
default=False,
action="store_true",
help="Write stream objects with binary encoding",
)
codec_parser.add_argument(
'--text-stream', '-t', default=False, action='store_true',
help='Write stream objects as plain text')
"--text-stream",
"-t",
default=False,
action="store_true",
help="Write stream objects as plain text",
)
return parser
@ -355,53 +411,63 @@ def main(argv: Optional[List[str]] = None) -> None:
if args.debug:
logging.getLogger().setLevel(logging.DEBUG)
if args.outfile == '-':
if args.outfile == "-":
outfp = sys.stdout
else:
outfp = open(args.outfile, 'w')
outfp = open(args.outfile, "w")
if args.objects:
objids = [int(x) for x in args.objects.split(',')]
objids = [int(x) for x in args.objects.split(",")]
else:
objids = []
if args.page_numbers:
pagenos = {x - 1 for x in args.page_numbers}
elif args.pagenos:
pagenos = {int(x) - 1 for x in args.pagenos.split(',')}
pagenos = {int(x) - 1 for x in args.pagenos.split(",")}
else:
pagenos = set()
password = args.password
if args.raw_stream:
codec: Optional[str] = 'raw'
codec: Optional[str] = "raw"
elif args.binary_stream:
codec = 'binary'
codec = "binary"
elif args.text_stream:
codec = 'text'
codec = "text"
else:
codec = None
for fname in args.files:
if args.extract_toc:
dumpoutline(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=None
outfp,
fname,
objids,
pagenos,
password=password,
dumpall=args.all,
codec=codec,
extractdir=None,
)
elif args.extract_embedded:
extractembedded(
fname, password=password, extractdir=args.extract_embedded
)
extractembedded(fname, password=password, extractdir=args.extract_embedded)
else:
dumppdf(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=None,
show_fallback_xref=args.show_fallback_xref
outfp,
fname,
objids,
pagenos,
password=password,
dumpall=args.all,
codec=codec,
extractdir=None,
show_fallback_xref=args.show_fallback_xref,
)
outfp.close()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -12,10 +12,7 @@ from pdfminer.utils import AnyIO
logging.basicConfig()
OUTPUT_TYPES = ((".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag"))
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
def float_or_disabled(x: str) -> Optional[float]:
@ -29,17 +26,17 @@ def float_or_disabled(x: str) -> Optional[float]:
def extract_text(
files: Iterable[str] = [],
outfile: str = '-',
outfile: str = "-",
laparams: Optional[LAParams] = None,
output_type: str = 'text',
codec: str = 'utf-8',
output_type: str = "text",
codec: str = "utf-8",
strip_control: bool = False,
maxpages: int = 0,
page_numbers: Optional[Container[int]] = None,
password: str = "",
scale: float = 1.0,
rotation: int = 0,
layoutmode: str = 'normal',
layoutmode: str = "normal",
output_dir: Optional[str] = None,
debug: bool = False,
disable_caching: bool = False,
@ -56,7 +53,7 @@ def extract_text(
if outfile == "-":
outfp: AnyIO = sys.stdout
if sys.stdout.encoding is not None:
codec = 'utf-8'
codec = "utf-8"
else:
outfp = open(outfile, "wb")
@ -69,73 +66,133 @@ def extract_text(
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
parser.add_argument(
"files", type=str, default=None, nargs="+",
help="One or more paths to PDF files.")
"files",
type=str,
default=None,
nargs="+",
help="One or more paths to PDF files.",
)
parser.add_argument(
"--version", "-v", action="version",
version="pdfminer.six v{}".format(pdfminer.__version__))
"--version",
"-v",
action="version",
version="pdfminer.six v{}".format(pdfminer.__version__),
)
parser.add_argument(
"--debug", "-d", default=False, action="store_true",
help="Use debug logging level.")
"--debug",
"-d",
default=False,
action="store_true",
help="Use debug logging level.",
)
parser.add_argument(
"--disable-caching", "-C", default=False, action="store_true",
help="If caching of resources, such as fonts, should be disabled.")
"--disable-caching",
"-C",
default=False,
action="store_true",
help="If caching of resources, such as fonts, should be disabled.",
)
parse_params = parser.add_argument_group(
'Parser', description='Used during PDF parsing')
"Parser", description="Used during PDF parsing"
)
parse_params.add_argument(
"--page-numbers", type=int, default=None, nargs="+",
help="A space-separated list of page numbers to parse.")
"--page-numbers",
type=int,
default=None,
nargs="+",
help="A space-separated list of page numbers to parse.",
)
parse_params.add_argument(
"--pagenos", "-p", type=str,
"--pagenos",
"-p",
type=str,
help="A comma-separated list of page numbers to parse. "
"Included for legacy applications, use --page-numbers "
"for more idiomatic argument entry.")
"for more idiomatic argument entry.",
)
parse_params.add_argument(
"--maxpages", "-m", type=int, default=0,
help="The maximum number of pages to parse.")
"--maxpages",
"-m",
type=int,
default=0,
help="The maximum number of pages to parse.",
)
parse_params.add_argument(
"--password", "-P", type=str, default="",
help="The password to use for decrypting PDF file.")
"--password",
"-P",
type=str,
default="",
help="The password to use for decrypting PDF file.",
)
parse_params.add_argument(
"--rotation", "-R", default=0, type=int,
"--rotation",
"-R",
default=0,
type=int,
help="The number of degrees to rotate the PDF "
"before other types of processing.")
"before other types of processing.",
)
la_params = LAParams() # will be used for defaults
la_param_group = parser.add_argument_group(
'Layout analysis', description='Used during layout analysis.')
"Layout analysis", description="Used during layout analysis."
)
la_param_group.add_argument(
"--no-laparams", "-n", default=False, action="store_true",
help="If layout analysis parameters should be ignored.")
la_param_group.add_argument(
"--detect-vertical", "-V", default=la_params.detect_vertical,
"--no-laparams",
"-n",
default=False,
action="store_true",
help="If vertical text should be considered during layout analysis")
help="If layout analysis parameters should be ignored.",
)
la_param_group.add_argument(
"--line-overlap", type=float, default=la_params.line_overlap,
help='If two characters have more overlap than this they '
'are considered to be on the same line. The overlap is specified '
'relative to the minimum height of both characters.')
"--detect-vertical",
"-V",
default=la_params.detect_vertical,
action="store_true",
help="If vertical text should be considered during layout analysis",
)
la_param_group.add_argument(
"--char-margin", "-M", type=float, default=la_params.char_margin,
"--line-overlap",
type=float,
default=la_params.line_overlap,
help="If two characters have more overlap than this they "
"are considered to be on the same line. The overlap is specified "
"relative to the minimum height of both characters.",
)
la_param_group.add_argument(
"--char-margin",
"-M",
type=float,
default=la_params.char_margin,
help="If two characters are closer together than this margin they "
"are considered to be part of the same line. The margin is "
"specified relative to the width of the character.")
"specified relative to the width of the character.",
)
la_param_group.add_argument(
"--word-margin", "-W", type=float, default=la_params.word_margin,
"--word-margin",
"-W",
type=float,
default=la_params.word_margin,
help="If two characters on the same line are further apart than this "
"margin then they are considered to be two separate words, and "
"an intermediate space will be added for readability. The margin "
"is specified relative to the width of the character.")
"is specified relative to the width of the character.",
)
la_param_group.add_argument(
"--line-margin", "-L", type=float, default=la_params.line_margin,
"--line-margin",
"-L",
type=float,
default=la_params.line_margin,
help="If two lines are close together they are considered to "
"be part of the same paragraph. The margin is specified "
"relative to the height of a line.")
"relative to the height of a line.",
)
la_param_group.add_argument(
"--boxes-flow", "-F", type=float_or_disabled,
"--boxes-flow",
"-F",
type=float_or_disabled,
default=la_params.boxes_flow,
help="Specifies how much a horizontal and vertical position of a "
"text matters when determining the order of lines. The value "
@ -143,44 +200,77 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
"matters) to +1.0 (only vertical position matters). You can also "
"pass `disabled` to disable advanced layout analysis, and "
"instead return text based on the position of the bottom left "
"corner of the text box.")
"corner of the text box.",
)
la_param_group.add_argument(
"--all-texts", "-A", default=la_params.all_texts, action="store_true",
help="If layout analysis should be performed on text in figures.")
"--all-texts",
"-A",
default=la_params.all_texts,
action="store_true",
help="If layout analysis should be performed on text in figures.",
)
output_params = parser.add_argument_group(
'Output', description='Used during output generation.')
"Output", description="Used during output generation."
)
output_params.add_argument(
"--outfile", "-o", type=str, default="-",
"--outfile",
"-o",
type=str,
default="-",
help="Path to file where output is written. "
"Or \"-\" (default) to write to stdout.")
'Or "-" (default) to write to stdout.',
)
output_params.add_argument(
"--output_type", "-t", type=str, default="text",
help="Type of output to generate {text,html,xml,tag}.")
"--output_type",
"-t",
type=str,
default="text",
help="Type of output to generate {text,html,xml,tag}.",
)
output_params.add_argument(
"--codec", "-c", type=str, default="utf-8",
help="Text encoding to use in output file.")
"--codec",
"-c",
type=str,
default="utf-8",
help="Text encoding to use in output file.",
)
output_params.add_argument(
"--output-dir", "-O", default=None,
"--output-dir",
"-O",
default=None,
help="The output directory to put extracted images in. If not given, "
"images are not extracted.")
"images are not extracted.",
)
output_params.add_argument(
"--layoutmode", "-Y", default="normal",
type=str, help="Type of layout to use when generating html "
"--layoutmode",
"-Y",
default="normal",
type=str,
help="Type of layout to use when generating html "
"{normal,exact,loose}. If normal,each line is"
" positioned separately in the html. If exact"
", each character is positioned separately in"
" the html. If loose, same result as normal "
"but with an additional newline after each "
"text line. Only used when output_type is html.")
"text line. Only used when output_type is html.",
)
output_params.add_argument(
"--scale", "-s", type=float, default=1.0,
"--scale",
"-s",
type=float,
default=1.0,
help="The amount of zoom to use when generating html file. "
"Only used when output_type is html.")
"Only used when output_type is html.",
)
output_params.add_argument(
"--strip-control", "-S", default=False, action="store_true",
"--strip-control",
"-S",
default=False,
action="store_true",
help="Remove control statement from text. "
"Only used when output_type is xml.")
"Only used when output_type is xml.",
)
parsed_args = parser.parse_args(args=args)
@ -202,10 +292,7 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers}
if parsed_args.pagenos:
parsed_args.page_numbers = {
int(x) - 1
for x in parsed_args.pagenos.split(",")
}
parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")}
if parsed_args.output_type == "text" and parsed_args.outfile != "-":
for override, alttype in OUTPUT_TYPES:
@ -222,5 +309,5 @@ def main(args: Optional[List[str]] = None) -> int:
return 0
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@ -21,14 +21,20 @@ def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
# If any LAParams group arguments were passed,
# create an LAParams object and
# populate with given args. Otherwise, set it to None.
if kwargs.get('laparams', None) is None:
if kwargs.get("laparams", None) is None:
laparams = layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin",
"char_margin", "line_margin", "boxes_flow"):
for param in (
"all_texts",
"detect_vertical",
"word_margin",
"char_margin",
"line_margin",
"boxes_flow",
):
paramv = kwargs.get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
kwargs['laparams'] = laparams
kwargs["laparams"] = laparams
s1 = io.StringIO()
with open(file1, "rb") as fp:
@ -39,81 +45,140 @@ def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
high_level.extract_text_to_fp(fp, s2, **kwargs)
import difflib
s1.seek(0)
s2.seek(0)
s1_lines, s2_lines = s1.readlines(), s2.readlines()
import os.path
try:
extension = os.path.splitext(kwargs['outfile'])[1][1:4]
if extension.lower() == 'htm':
extension = os.path.splitext(kwargs["outfile"])[1][1:4]
if extension.lower() == "htm":
return difflib.HtmlDiff().make_file(s1_lines, s2_lines)
except KeyError:
pass
return difflib.unified_diff(s1_lines, s2_lines, n=kwargs['context_lines'])
return difflib.unified_diff(s1_lines, s2_lines, n=kwargs["context_lines"])
# main
def main(args: Optional[List[str]] = None) -> int:
import argparse
P = argparse.ArgumentParser(description=__doc__)
P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
P.add_argument("-o", "--outfile", type=str, default="-",
P.add_argument(
"-o",
"--outfile",
type=str,
default="-",
help="Output file (default/'-' is stdout) if .htm or .html,"
" create an HTML table (or a complete HTML file "
"containing the table) showing a side by side, "
"line by line comparison of text with inter-line and "
"intra-line change highlights. The table can be "
"generated in either full or "
"contextual difference mode.")
P.add_argument("-N", "--context-lines", default=3, type=int,
help="context lines shown")
P.add_argument("-d", "--debug", default=False, action="store_true",
help="Debug output.")
"contextual difference mode.",
)
P.add_argument(
"-N", "--context-lines", default=3, type=int, help="context lines shown"
)
P.add_argument(
"-d", "--debug", default=False, action="store_true", help="Debug output."
)
# params for pdf2txt
P.add_argument("-p", "--pagenos", type=str,
P.add_argument(
"-p",
"--pagenos",
type=str,
help="Comma-separated list of page numbers to parse. "
"Included for legacy applications, "
"use --page-numbers for more "
"idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+",
"idiomatic argument entry.",
)
P.add_argument(
"--page-numbers",
type=int,
default=None,
nargs="+",
help="Alternative to --pagenos with space-separated "
"numbers; supersedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0,
help="Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="",
help="Decryption password for both PDFs")
P.add_argument("-t", "--output_type", type=str, default="text",
help="pdf2txt type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8",
help="Text encoding")
"numbers; supersedes --pagenos where it is used.",
)
P.add_argument(
"-m", "--maxpages", type=int, default=0, help="Maximum pages to parse"
)
P.add_argument(
"-P",
"--password",
type=str,
default="",
help="Decryption password for both PDFs",
)
P.add_argument(
"-t",
"--output_type",
type=str,
default="text",
help="pdf2txt type: text|html|xml|tag (default is text)",
)
P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true",
help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None,
action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None,
help="LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None,
help="LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None,
help="LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None,
help="LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str,
help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False,
action="store_true", help="Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int,
help="Rotation")
P.add_argument("-O", "--output-dir", default=None,
help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False,
action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False,
action="store_true", help="Strip control in XML mode")
P.add_argument(
"-A",
"--all-texts",
default=None,
action="store_true",
help="LAParams all texts",
)
P.add_argument(
"-V",
"--detect-vertical",
default=None,
action="store_true",
help="LAParams detect vertical",
)
P.add_argument(
"-W", "--word-margin", type=float, default=None, help="LAParams word margin"
)
P.add_argument(
"-M", "--char-margin", type=float, default=None, help="LAParams char margin"
)
P.add_argument(
"-L", "--line-margin", type=float, default=None, help="LAParams line margin"
)
P.add_argument(
"-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow"
)
P.add_argument(
"-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode"
)
P.add_argument(
"-n",
"--no-laparams",
default=False,
action="store_true",
help="Pass None as LAParams",
)
P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
P.add_argument(
"-O", "--output-dir", default=None, help="Output directory for images"
)
P.add_argument(
"-C",
"--disable-caching",
default=False,
action="store_true",
help="Disable caching",
)
P.add_argument(
"-S",
"--strip-control",
default=False,
action="store_true",
help="Strip control in XML mode",
)
A = P.parse_args(args=args)
@ -126,21 +191,23 @@ def main(args: Optional[List[str]] = None) -> int:
A.page_numbers = {int(x) - 1 for x in A.pagenos.split(",")}
if A.output_type == "text" and A.outfile != "-":
for override, alttype in ((".htm", "html"),
for override, alttype in (
(".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag")):
(".tag", "tag"),
):
if A.outfile.endswith(override):
A.output_type = alttype
if A.outfile == "-":
outfp = sys.stdout
else:
outfp = open(A.outfile, "w", encoding='utf-8')
outfp = open(A.outfile, "w", encoding="utf-8")
outfp.writelines(compare(**vars(A)))
outfp.close()
return 0
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@ -21,7 +21,7 @@ _, SCRIPT = os.path.split(__file__)
def msg(*args: object, **kwargs: Any) -> None:
print(' '.join(map(str, args)), **kwargs) # noqa E999
print(" ".join(map(str, args)), **kwargs) # noqa E999
def flat_iter(obj: object) -> Iterator[object]:
@ -35,22 +35,22 @@ def main(args: List[str]) -> int:
msg(SCRIPT, args)
if len(args) != 1:
msg('Parse a PDF file and print some pdfminer-specific stats')
msg('Usage:', SCRIPT, '<PDF-filename>')
msg("Parse a PDF file and print some pdfminer-specific stats")
msg("Usage:", SCRIPT, "<PDF-filename>")
return 1
infilename, = args
(infilename,) = args
lt_types: Counter[str] = collections.Counter()
with open(infilename, 'rb') as pdf_file:
with open(infilename, "rb") as pdf_file:
# Create a PDF parser object associated with the file object.
parser = PDFParser(pdf_file)
# Create a PDF document object that stores the document structure.
# Supply the password for initialization.
password = ''
password = ""
document = PDFDocument(parser, password)
# Check if the document allows text extraction.
if not document.is_extractable:
@ -75,11 +75,11 @@ def main(args: List[str]) -> int:
lt_types.update(type(item).__name__ for item in flat_iter(layout))
msg('page_count', page_count)
msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
msg("page_count", page_count)
msg("lt_types:", " ".join("{}:{}".format(*tc) for tc in lt_types.items()))
return 0
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))

View File

@ -7,14 +7,15 @@ def prof_main(argv: List[str]) -> int:
import hotshot.stats # type: ignore[import]
def usage() -> int:
print('usage: %s module.function [args ...]' % argv[0])
print("usage: %s module.function [args ...]" % argv[0])
return 100
args = argv[1:]
if len(args) < 1:
return usage()
name = args.pop(0)
prof = name+'.prof'
i = name.rindex('.')
prof = name + ".prof"
i = name.rindex(".")
(modname, funcname) = (name[:i], name[i + 1 :])
# Type error: fromlist expects sequence of strings; presumably the intent
@ -31,10 +32,10 @@ def prof_main(argv: List[str]) -> int:
else:
stats = hotshot.stats.load(prof)
stats.strip_dirs()
stats.sort_stats('time', 'calls')
stats.sort_stats("time", "calls")
stats.print_stats(1000)
return 0
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(prof_main(sys.argv))