Add type annotations (#661)
Squashed commit of the following: commit fa229f7b7591c07aea4e5a4545f9e0c34246e1cd Merge: eaab3c6 c3e3499
Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 20:33:06 2021 -0700 Merge branch 'develop' into mypy (and fixed types) commit eaab3c65e2e3ab5f1f400cfc5186a3834c4ffe34 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 20:00:45 2021 -0700 reformat all multi-line function defs to one-arg-per-line commit 3fe2b69eed9197009d9da6776462f580ebf0dfa3 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 15:58:48 2021 -0700 ccitt nit -- avoid casting needlessly commit 15983d8c1e7162632fde43752c9d1c15938cd980 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 15:58:36 2021 -0700 tweak CHANGELOG commit 13dc0babf782938e7d5b5e482d4c5adf92d82702 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 15:43:46 2021 -0700 add failing tests for dumppdf crash commit 6b509c517876b8c15ac5a98a963884e23bd2e4d8 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 15:24:23 2021 -0700 ccitt: apply misc PR feedback commit feb031ba86d3f22e41cfbbda13f17c039359f1e6 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 15:18:26 2021 -0700 add missing None return type to all __init__ methods commit c0d62d6c54c7ec37b40bea54a3f6a7a618ec0ec6 Author: Andrew Baumann <ab@ab.id.au> Date: Mon Sep 6 15:13:08 2021 -0700 minor cleanup, remove a few more Any types commit b52a0594e1998a492c172538a9b35491c5fc5f52 Author: Andrew Baumann <ab@ab.id.au> Date: Sun Sep 5 22:37:28 2021 -0700 tighten up types, avoid Any in favour of explicit casts commit e58fd48bd14f31bebd2de8259f12630ac02756d6 Author: Andrew Baumann <ab@ab.id.au> Date: Sun Sep 5 14:10:49 2021 -0700 annotate ccitt.py, and fix one definite bug (array.tostring was renamed tobytes) commit 605290633e55595e5e0045840df5c5b1d9de843a Author: Andrew Baumann <ab@ab.id.au> Date: Sat Sep 4 22:37:38 2021 -0700 python 3.7 back-compat commit 4dbcf8760f8a1d3e3d99f085476f86e6a043c80c Author: Andrew Baumann <ab@ab.id.au> Date: Sat Sep 4 22:32:43 2021 -0700 annotate pdfminer.jbig2 commit 0d40b7c03a8028dc44acd3f457eac71abd681827 Author: Andrew Baumann <ab@ab.id.au> 
Date: Sat Sep 4 22:31:33 2021 -0700 annotate pdf2txt.py commit 5f82eb4f5646b5d1285252689191e0a14557ec7b Author: Andrew Baumann <ab@ab.id.au> Date: Sat Sep 4 09:16:31 2021 -0700 cleanup: make Plane generic commit 624fc92b88473ff36a174760883f34c22109da2b Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 23:16:51 2021 -0700 bluntly ignore calls to cryptography.hazmat commit 96b20439c169f40dbb114cabba6a582ad1ebe91e Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 23:01:06 2021 -0700 finish annotating, and disallow_untyped_defs for pdfminer.* _except_ ccitt and jbig2 commit 0ab586347861b72b1d16880dc9293f9ad597e20a Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 21:51:56 2021 -0700 annotate pdffont commit 4b689f1bcbdaf654feb9de81023e318ca310a12e Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 18:30:02 2021 -0700 annotate a couple more scripts; document sketchy code commit 291981ff3d273952ec9c92ef8ab948473558b787 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 15:02:01 2021 -0700 pacify flake8 commit 45d2ce91ff333f3b7e34322b16e9c52b99b7a972 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 14:31:48 2021 -0700 annotate dumppdf, and comment likely bugs commit 7278d83851cb336a1be3803a0993b5ec0ad39b4c Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 13:49:58 2021 -0700 enable mypy on tests and tools, fix one implicit reexport bug commit 4a83166ef4e4733cd2113f43188b585a4fda392b Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 13:25:59 2021 -0700 pdfdocument: per dumppdf.py, get_dest accepts either bytes or str commit 43701e1bee068df98f378a253c9c2150ee4ad9f7 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 13:25:00 2021 -0700 layout: LAParams.boxes_flow may be None commit 164f81652f1788e74837466f0ab593e94079bc0f Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 09:45:09 2021 -0700 add whitespace, pacify flake8 commit 893b9fb9ec918032b36a30456fc0b7a217da86d8 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 09:40:33 2021 -0700 support 
old Python without typing.Protocol commit dc245084102b7b04c3f5599d75b5d62ba4290787 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Sep 3 09:12:03 2021 -0700 Move "# type: ignore" comments to fix mypy on Python < 3.8 The placement of these comments got more flexible in 3.8 due to https://github.com/python/mypy/issues/1032 Satisfying older Python and fitting in flake8's 79-character line limit was quite a challenge! commit da03afe7bd2cf3336e611f467f1c901455940ae8 Author: Andrew Baumann <ab@ab.id.au> Date: Thu Sep 2 22:59:58 2021 -0700 fix text output from HTMLConverter commit 5401276a2ed3b74a385ebcab5152485224146161 Author: Andrew Baumann <ab@ab.id.au> Date: Thu Sep 2 22:40:22 2021 -0700 annotate high_level.py and the immediately-reachable internal APIs (mostly converters) commit cc490513f8f17a7adc0bcbab2e0e86f37e832300 Author: Andrew Baumann <ab@ab.id.au> Date: Thu Sep 2 17:04:35 2021 -0700 * expand and improve annotations in cmap, encryption/decompression and fonts * disallow untyped calls; this way, we have a core set of typed code that can grow over time (just not for ccitt, because there's a ton of work lurking there) * expand "typing: none" comments to suppress a specific error code commit 92df54ba1d53d5dbbd5442757dd85be5b1851f99 Author: Andrew Baumann <ab@ab.id.au> Date: Wed Sep 1 20:50:59 2021 -0700 update CHANGELOG commit f72aaead45d0615e472a9b3190c9551a6b67b36e Merge: ff787a9 8ea9f10
Author: Andrew Baumann <ab@ab.id.au> Date: Wed Sep 1 20:47:03 2021 -0700 Merge branch 'develop' into mypy commit ff787a93986c60361536a97182a41774f4a53ac3 Author: Andrew Baumann <ab@ab.id.au> Date: Sat Aug 21 21:46:14 2021 -0700 be more precise about types on ps/pdf stacks, remove most of the Any annotations commit be1550189e10717f6827dbb7009d6e8c8b3f4c62 Author: Andrew Baumann <ab@ab.id.au> Date: Sat Aug 21 10:13:58 2021 -0700 silence missing imports, (maybe?) hook to tox commit ff4b6a9bd46b352583d823d39065652c9a6f05f4 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Aug 20 22:49:06 2021 -0700 turn on more strict checks, and untangle the layout mess with generics Status: $ mypy pdfminer pdfminer/ccitt.py:565: error: Cannot find implementation or library stub for module named "pygame" pdfminer/ccitt.py:565: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports pdfminer/pdfdocument.py:7: error: Skipping analyzing "cryptography.hazmat.backends": found module but no type hints or library stubs pdfminer/pdfdocument.py:8: error: Skipping analyzing "cryptography.hazmat.primitives.ciphers": found module but no type hints or library stubs pdfminer/pdfdevice.py:191: error: Argument 1 to "write" of "IO" has incompatible type "str"; expected "bytes" pdfminer/image.py:84: error: Cannot find implementation or library stub for module named "PIL" Found 5 errors in 4 files (checked 27 source files) pdfdevice.py:191 appears to be a real bug commit 5c9c0b19d26ae391aea0e69c2c819261cc04460c Author: Andrew Baumann <ab@ab.id.au> Date: Fri Aug 20 17:22:41 2021 -0700 finish annotating layout commit 0e6871c16abb29df2868ab145b4ce451b4b6c777 Author: Andrew Baumann <ab@ab.id.au> Date: Fri Aug 20 16:54:46 2021 -0700 general progress on annotations * finish utils * annotate more of pdfinterp, pdfdevice * document reason for # type: ignore comments * fix cyclic imports * satisfy flake8 commit 17d59f42917fbf9b2b2eb844d3e83a8f2a3f123a Author: Andrew Baumann <ab@ab.id.au> 
Date: Thu Aug 19 21:38:50 2021 -0700 WIP on type annotations With the possible exception of psparser.py, this is far from complete. $ mypy pdfminer pdfminer/ccitt.py:565: error: Cannot find implementation or library stub for module named "pygame" pdfminer/ccitt.py:565: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports pdfminer/pdfdocument.py:7: error: Skipping analyzing "cryptography.hazmat.backends": found module but no type hints or library stubs pdfminer/pdfdocument.py:8: error: Skipping analyzing "cryptography.hazmat.primitives.ciphers": found module but no type hints or library stubs pdfminer/image.py:84: error: Cannot find implementation or library stub for module named "PIL"
parent
33d7dde4d1
commit
9406040d8e
|
@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Added
|
### Added
|
||||||
- Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614))
|
- Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614))
|
||||||
- Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537))
|
- Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537))
|
||||||
|
- Type annotations ([#661](https://github.com/pdfminer/pdfminer.six/pull/661))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- `KeyError` when `'Encrypt'` but not `'ID'` present in `trailer` ([#594](https://github.com/pdfminer/pdfminer.six/pull/594))
|
- `KeyError` when `'Encrypt'` but not `'ID'` present in `trailer` ([#594](https://github.com/pdfminer/pdfminer.six/pull/594))
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import pdfminer
|
import pdfminer
|
||||||
|
|
||||||
|
@ -48,7 +49,7 @@ templates_path = ['_templates']
|
||||||
# List of patterns, relative to source directory, that match files and
|
# List of patterns, relative to source directory, that match files and
|
||||||
# directories to ignore when looking for source files.
|
# directories to ignore when looking for source files.
|
||||||
# This pattern also affects html_static_path and html_extra_path.
|
# This pattern also affects html_static_path and html_extra_path.
|
||||||
exclude_patterns = []
|
exclude_patterns: List[str] = []
|
||||||
|
|
||||||
|
|
||||||
# -- Options for HTML output -------------------------------------------------
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
[mypy]
|
||||||
|
warn_unused_configs = True
|
||||||
|
disallow_any_generics = True
|
||||||
|
disallow_subclassing_any = True
|
||||||
|
disallow_untyped_calls = True
|
||||||
|
disallow_incomplete_defs = True
|
||||||
|
disallow_untyped_decorators = True
|
||||||
|
no_implicit_optional = True
|
||||||
|
warn_redundant_casts = True
|
||||||
|
warn_return_any = True
|
||||||
|
no_implicit_reexport = True
|
||||||
|
strict_equality = True
|
||||||
|
|
||||||
|
# This seems impossible to turn on in a version-independent manner
|
||||||
|
warn_unused_ignores = False
|
||||||
|
|
||||||
|
[mypy-pdfminer.*]
|
||||||
|
disallow_untyped_defs = True
|
||||||
|
|
||||||
|
[mypy-cryptography.hazmat.*]
|
||||||
|
ignore_missing_imports = True
|
||||||
|
|
||||||
|
[mypy-nose.*]
|
||||||
|
ignore_missing_imports = True
|
||||||
|
|
||||||
|
[mypy-setuptools]
|
||||||
|
ignore_missing_imports = True
|
|
@ -21,10 +21,11 @@
|
||||||
__all__ = ['saslprep']
|
__all__ = ['saslprep']
|
||||||
|
|
||||||
import stringprep
|
import stringprep
|
||||||
|
from typing import Callable, Tuple
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
# RFC4013 section 2.3 prohibited output.
|
# RFC4013 section 2.3 prohibited output.
|
||||||
_PROHIBITED = (
|
_PROHIBITED: Tuple[Callable[[str], bool], ...] = (
|
||||||
# A strict reading of RFC 4013 requires table c12 here, but
|
# A strict reading of RFC 4013 requires table c12 here, but
|
||||||
# characters from it are mapped to SPACE in the Map step. Can
|
# characters from it are mapped to SPACE in the Map step. Can
|
||||||
# normalization reintroduce them somehow?
|
# normalization reintroduce them somehow?
|
||||||
|
@ -39,7 +40,7 @@ _PROHIBITED = (
|
||||||
stringprep.in_table_c9)
|
stringprep.in_table_c9)
|
||||||
|
|
||||||
|
|
||||||
def saslprep(data: str, prohibit_unassigned_code_points=True) -> str:
|
def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
|
||||||
"""An implementation of RFC4013 SASLprep.
|
"""An implementation of RFC4013 SASLprep.
|
||||||
:param data:
|
:param data:
|
||||||
The string to SASLprep.
|
The string to SASLprep.
|
||||||
|
|
|
@ -5,9 +5,12 @@ This code is in the public domain.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
|
||||||
class Arcfour:
|
class Arcfour:
|
||||||
|
|
||||||
def __init__(self, key):
|
def __init__(self, key: Sequence[int]) -> None:
|
||||||
# because Py3 range is not indexable
|
# because Py3 range is not indexable
|
||||||
s = [i for i in range(256)]
|
s = [i for i in range(256)]
|
||||||
j = 0
|
j = 0
|
||||||
|
@ -19,7 +22,7 @@ class Arcfour:
|
||||||
(self.i, self.j) = (0, 0)
|
(self.i, self.j) = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def process(self, data):
|
def process(self, data: bytes) -> bytes:
|
||||||
(i, j) = (self.i, self.j)
|
(i, j) = (self.i, self.j)
|
||||||
s = self.s
|
s = self.s
|
||||||
r = b''
|
r = b''
|
||||||
|
|
|
@ -9,7 +9,7 @@ import struct
|
||||||
|
|
||||||
|
|
||||||
# ascii85decode(data)
|
# ascii85decode(data)
|
||||||
def ascii85decode(data):
|
def ascii85decode(data: bytes) -> bytes:
|
||||||
"""
|
"""
|
||||||
In ASCII85 encoding, every four bytes are encoded with five ASCII
|
In ASCII85 encoding, every four bytes are encoded with five ASCII
|
||||||
letters, using 85 different types of characters (as 256**4 < 85**5).
|
letters, using 85 different types of characters (as 256**4 < 85**5).
|
||||||
|
@ -47,7 +47,7 @@ hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
|
||||||
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def asciihexdecode(data):
|
def asciihexdecode(data: bytes) -> bytes:
|
||||||
"""
|
"""
|
||||||
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
||||||
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
|
For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
|
||||||
|
@ -57,7 +57,7 @@ def asciihexdecode(data):
|
||||||
the EOD marker after reading an odd number of hexadecimal digits, it
|
the EOD marker after reading an odd number of hexadecimal digits, it
|
||||||
will behave as if a 0 followed the last digit.
|
will behave as if a 0 followed the last digit.
|
||||||
"""
|
"""
|
||||||
def decode(x):
|
def decode(x: bytes) -> bytes:
|
||||||
i = int(x, 16)
|
i = int(x, 16)
|
||||||
return bytes((i,))
|
return bytes((i,))
|
||||||
|
|
||||||
|
|
|
@ -11,25 +11,39 @@
|
||||||
# FOR GROUP 4 FACSIMILE APPARATUS"
|
# FOR GROUP 4 FACSIMILE APPARATUS"
|
||||||
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import array
|
import array
|
||||||
|
from typing import (Any, Callable, Dict, Iterator, List, MutableSequence,
|
||||||
|
Optional, Sequence, Union, cast)
|
||||||
|
|
||||||
|
|
||||||
def get_bytes(data):
|
def get_bytes(data: bytes) -> Iterator[int]:
|
||||||
yield from data
|
yield from data
|
||||||
|
|
||||||
|
|
||||||
|
# Workaround https://github.com/python/mypy/issues/731
|
||||||
|
BitParserState = MutableSequence[Any]
|
||||||
|
# A better definition (not supported by mypy) would be:
|
||||||
|
# BitParserState = MutableSequence[Union["BitParserState", int, str, None]]
|
||||||
|
|
||||||
|
|
||||||
class BitParser:
|
class BitParser:
|
||||||
def __init__(self):
|
_state: BitParserState
|
||||||
|
|
||||||
|
# _accept is declared Optional solely as a workaround for
|
||||||
|
# https://github.com/python/mypy/issues/708
|
||||||
|
_accept: Optional[Callable[[Any], BitParserState]]
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
self._pos = 0
|
self._pos = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def add(cls, root, v, bits):
|
def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None:
|
||||||
p = root
|
p: BitParserState = root
|
||||||
b = None
|
b = None
|
||||||
for i in range(len(bits)):
|
for i in range(len(bits)):
|
||||||
if 0 < i:
|
if 0 < i:
|
||||||
|
assert b is not None
|
||||||
if p[b] is None:
|
if p[b] is None:
|
||||||
p[b] = [None, None]
|
p[b] = [None, None]
|
||||||
p = p[b]
|
p = p[b]
|
||||||
|
@ -37,16 +51,17 @@ class BitParser:
|
||||||
b = 1
|
b = 1
|
||||||
else:
|
else:
|
||||||
b = 0
|
b = 0
|
||||||
|
assert b is not None
|
||||||
p[b] = v
|
p[b] = v
|
||||||
return
|
return
|
||||||
|
|
||||||
def feedbytes(self, data):
|
def feedbytes(self, data: bytes) -> None:
|
||||||
for byte in get_bytes(data):
|
for byte in get_bytes(data):
|
||||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||||
self._parse_bit(byte & m)
|
self._parse_bit(byte & m)
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_bit(self, x):
|
def _parse_bit(self, x: object) -> None:
|
||||||
if x:
|
if x:
|
||||||
v = self._state[1]
|
v = self._state[1]
|
||||||
else:
|
else:
|
||||||
|
@ -55,6 +70,7 @@ class BitParser:
|
||||||
if isinstance(v, list):
|
if isinstance(v, list):
|
||||||
self._state = v
|
self._state = v
|
||||||
else:
|
else:
|
||||||
|
assert self._accept is not None
|
||||||
self._state = self._accept(v)
|
self._state = self._accept(v)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -318,14 +334,16 @@ class CCITTG4Parser(BitParser):
|
||||||
class ByteSkip(Exception):
|
class ByteSkip(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def __init__(self, width, bytealign=False):
|
_color: int
|
||||||
|
|
||||||
|
def __init__(self, width: int, bytealign: bool = False) -> None:
|
||||||
BitParser.__init__(self)
|
BitParser.__init__(self)
|
||||||
self.width = width
|
self.width = width
|
||||||
self.bytealign = bytealign
|
self.bytealign = bytealign
|
||||||
self.reset()
|
self.reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
def feedbytes(self, data):
|
def feedbytes(self, data: bytes) -> None:
|
||||||
for byte in get_bytes(data):
|
for byte in get_bytes(data):
|
||||||
try:
|
try:
|
||||||
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
for m in (128, 64, 32, 16, 8, 4, 2, 1):
|
||||||
|
@ -337,7 +355,7 @@ class CCITTG4Parser(BitParser):
|
||||||
break
|
break
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_mode(self, mode):
|
def _parse_mode(self, mode: object) -> BitParserState:
|
||||||
if mode == 'p':
|
if mode == 'p':
|
||||||
self._do_pass()
|
self._do_pass()
|
||||||
self._flush_line()
|
self._flush_line()
|
||||||
|
@ -361,7 +379,7 @@ class CCITTG4Parser(BitParser):
|
||||||
else:
|
else:
|
||||||
raise self.InvalidData(mode)
|
raise self.InvalidData(mode)
|
||||||
|
|
||||||
def _parse_horiz1(self, n):
|
def _parse_horiz1(self, n: Any) -> BitParserState:
|
||||||
if n is None:
|
if n is None:
|
||||||
raise self.InvalidData
|
raise self.InvalidData
|
||||||
self._n1 += n
|
self._n1 += n
|
||||||
|
@ -374,7 +392,7 @@ class CCITTG4Parser(BitParser):
|
||||||
else:
|
else:
|
||||||
return self.BLACK
|
return self.BLACK
|
||||||
|
|
||||||
def _parse_horiz2(self, n):
|
def _parse_horiz2(self, n: Any) -> BitParserState:
|
||||||
if n is None:
|
if n is None:
|
||||||
raise self.InvalidData
|
raise self.InvalidData
|
||||||
self._n2 += n
|
self._n2 += n
|
||||||
|
@ -389,7 +407,7 @@ class CCITTG4Parser(BitParser):
|
||||||
else:
|
else:
|
||||||
return self.BLACK
|
return self.BLACK
|
||||||
|
|
||||||
def _parse_uncompressed(self, bits):
|
def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
|
||||||
if not bits:
|
if not bits:
|
||||||
raise self.InvalidData
|
raise self.InvalidData
|
||||||
if bits.startswith('T'):
|
if bits.startswith('T'):
|
||||||
|
@ -401,10 +419,10 @@ class CCITTG4Parser(BitParser):
|
||||||
self._do_uncompressed(bits)
|
self._do_uncompressed(bits)
|
||||||
return self.UNCOMPRESSED
|
return self.UNCOMPRESSED
|
||||||
|
|
||||||
def _get_bits(self):
|
def _get_bits(self) -> str:
|
||||||
return ''.join(str(b) for b in self._curline[:self._curpos])
|
return ''.join(str(b) for b in self._curline[:self._curpos])
|
||||||
|
|
||||||
def _get_refline(self, i):
|
def _get_refline(self, i: int) -> str:
|
||||||
if i < 0:
|
if i < 0:
|
||||||
return '[]'+''.join(str(b) for b in self._refline)
|
return '[]'+''.join(str(b) for b in self._refline)
|
||||||
elif len(self._refline) <= i:
|
elif len(self._refline) <= i:
|
||||||
|
@ -414,7 +432,7 @@ class CCITTG4Parser(BitParser):
|
||||||
'['+str(self._refline[i])+']' +
|
'['+str(self._refline[i])+']' +
|
||||||
''.join(str(b) for b in self._refline[i+1:]))
|
''.join(str(b) for b in self._refline[i+1:]))
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._y = 0
|
self._y = 0
|
||||||
self._curline = array.array('b', [1]*self.width)
|
self._curline = array.array('b', [1]*self.width)
|
||||||
self._reset_line()
|
self._reset_line()
|
||||||
|
@ -422,18 +440,18 @@ class CCITTG4Parser(BitParser):
|
||||||
self._state = self.MODE
|
self._state = self.MODE
|
||||||
return
|
return
|
||||||
|
|
||||||
def output_line(self, y, bits):
|
def output_line(self, y: int, bits: Sequence[int]) -> None:
|
||||||
print(y, ''.join(str(b) for b in bits))
|
print(y, ''.join(str(b) for b in bits))
|
||||||
return
|
return
|
||||||
|
|
||||||
def _reset_line(self):
|
def _reset_line(self) -> None:
|
||||||
self._refline = self._curline
|
self._refline = self._curline
|
||||||
self._curline = array.array('b', [1]*self.width)
|
self._curline = array.array('b', [1]*self.width)
|
||||||
self._curpos = -1
|
self._curpos = -1
|
||||||
self._color = 1
|
self._color = 1
|
||||||
return
|
return
|
||||||
|
|
||||||
def _flush_line(self):
|
def _flush_line(self) -> None:
|
||||||
if self.width <= self._curpos:
|
if self.width <= self._curpos:
|
||||||
self.output_line(self._y, self._curline)
|
self.output_line(self._y, self._curline)
|
||||||
self._y += 1
|
self._y += 1
|
||||||
|
@ -442,7 +460,7 @@ class CCITTG4Parser(BitParser):
|
||||||
raise self.ByteSkip
|
raise self.ByteSkip
|
||||||
return
|
return
|
||||||
|
|
||||||
def _do_vertical(self, dx):
|
def _do_vertical(self, dx: int) -> None:
|
||||||
x1 = self._curpos+1
|
x1 = self._curpos+1
|
||||||
while 1:
|
while 1:
|
||||||
if x1 == 0:
|
if x1 == 0:
|
||||||
|
@ -467,7 +485,7 @@ class CCITTG4Parser(BitParser):
|
||||||
self._color = 1-self._color
|
self._color = 1-self._color
|
||||||
return
|
return
|
||||||
|
|
||||||
def _do_pass(self):
|
def _do_pass(self) -> None:
|
||||||
x1 = self._curpos+1
|
x1 = self._curpos+1
|
||||||
while 1:
|
while 1:
|
||||||
if x1 == 0:
|
if x1 == 0:
|
||||||
|
@ -494,7 +512,7 @@ class CCITTG4Parser(BitParser):
|
||||||
self._curpos = x1
|
self._curpos = x1
|
||||||
return
|
return
|
||||||
|
|
||||||
def _do_horizontal(self, n1, n2):
|
def _do_horizontal(self, n1: int, n2: int) -> None:
|
||||||
if self._curpos < 0:
|
if self._curpos < 0:
|
||||||
self._curpos = 0
|
self._curpos = 0
|
||||||
x = self._curpos
|
x = self._curpos
|
||||||
|
@ -511,7 +529,7 @@ class CCITTG4Parser(BitParser):
|
||||||
self._curpos = x
|
self._curpos = x
|
||||||
return
|
return
|
||||||
|
|
||||||
def _do_uncompressed(self, bits):
|
def _do_uncompressed(self, bits: str) -> None:
|
||||||
for c in bits:
|
for c in bits:
|
||||||
self._curline[self._curpos] = int(c)
|
self._curline[self._curpos] = int(c)
|
||||||
self._curpos += 1
|
self._curpos += 1
|
||||||
|
@ -521,32 +539,33 @@ class CCITTG4Parser(BitParser):
|
||||||
|
|
||||||
class CCITTFaxDecoder(CCITTG4Parser):
|
class CCITTFaxDecoder(CCITTG4Parser):
|
||||||
|
|
||||||
def __init__(self, width, bytealign=False, reversed=False):
|
def __init__(self, width: int, bytealign: bool = False,
|
||||||
|
reversed: bool = False) -> None:
|
||||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||||
self.reversed = reversed
|
self.reversed = reversed
|
||||||
self._buf = b''
|
self._buf = b''
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> bytes:
|
||||||
return self._buf
|
return self._buf
|
||||||
|
|
||||||
def output_line(self, y, bits):
|
def output_line(self, y: int, bits: Sequence[int]) -> None:
|
||||||
bytes = array.array('B', [0]*((len(bits)+7)//8))
|
arr = array.array('B', [0]*((len(bits)+7)//8))
|
||||||
if self.reversed:
|
if self.reversed:
|
||||||
bits = [1-b for b in bits]
|
bits = [1-b for b in bits]
|
||||||
for (i, b) in enumerate(bits):
|
for (i, b) in enumerate(bits):
|
||||||
if b:
|
if b:
|
||||||
bytes[i//8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
|
arr[i//8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
|
||||||
self._buf += bytes.tostring()
|
self._buf += arr.tobytes()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def ccittfaxdecode(data, params):
|
def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
|
||||||
K = params.get('K')
|
K = params.get('K')
|
||||||
cols = params.get('Columns')
|
|
||||||
bytealign = params.get('EncodedByteAlign')
|
|
||||||
reversed = params.get('BlackIs1')
|
|
||||||
if K == -1:
|
if K == -1:
|
||||||
|
cols = cast(int, params.get('Columns'))
|
||||||
|
bytealign = cast(bool, params.get('EncodedByteAlign'))
|
||||||
|
reversed = cast(bool, params.get('BlackIs1'))
|
||||||
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
|
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
|
||||||
else:
|
else:
|
||||||
raise ValueError(K)
|
raise ValueError(K)
|
||||||
|
@ -555,19 +574,20 @@ def ccittfaxdecode(data, params):
|
||||||
|
|
||||||
|
|
||||||
# test
|
# test
|
||||||
def main(argv):
|
def main(argv: List[str]) -> None:
|
||||||
if not argv[1:]:
|
if not argv[1:]:
|
||||||
import unittest
|
import unittest
|
||||||
return unittest.main()
|
unittest.main()
|
||||||
|
return
|
||||||
|
|
||||||
class Parser(CCITTG4Parser):
|
class Parser(CCITTG4Parser):
|
||||||
def __init__(self, width, bytealign=False):
|
def __init__(self, width: int, bytealign: bool = False) -> None:
|
||||||
import pygame
|
import pygame # type: ignore[import]
|
||||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||||
self.img = pygame.Surface((self.width, 1000))
|
self.img = pygame.Surface((self.width, 1000))
|
||||||
return
|
return
|
||||||
|
|
||||||
def output_line(self, y, bits):
|
def output_line(self, y: int, bits: Sequence[int]) -> None:
|
||||||
for (x, b) in enumerate(bits):
|
for (x, b) in enumerate(bits):
|
||||||
if b:
|
if b:
|
||||||
self.img.set_at((x, y), (255, 255, 255))
|
self.img.set_at((x, y), (255, 255, 255))
|
||||||
|
@ -575,7 +595,7 @@ def main(argv):
|
||||||
self.img.set_at((x, y), (0, 0, 0))
|
self.img.set_at((x, y), (0, 0, 0))
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> None:
|
||||||
import pygame
|
import pygame
|
||||||
pygame.image.save(self.img, 'out.bmp')
|
pygame.image.save(self.img, 'out.bmp')
|
||||||
return
|
return
|
||||||
|
@ -587,7 +607,3 @@ def main(argv):
|
||||||
parser.close()
|
parser.close()
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
sys.exit(main(sys.argv))
|
|
||||||
|
|
|
@ -16,9 +16,12 @@ import gzip
|
||||||
import pickle as pickle
|
import pickle as pickle
|
||||||
import struct
|
import struct
|
||||||
import logging
|
import logging
|
||||||
|
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List,
|
||||||
|
MutableMapping, Optional, TextIO, Tuple, Union, cast)
|
||||||
from .psparser import PSStackParser
|
from .psparser import PSStackParser
|
||||||
from .psparser import PSSyntaxError
|
from .psparser import PSSyntaxError
|
||||||
from .psparser import PSEOF
|
from .psparser import PSEOF
|
||||||
|
from .psparser import PSKeyword
|
||||||
from .psparser import PSLiteral
|
from .psparser import PSLiteral
|
||||||
from .psparser import literal_name
|
from .psparser import literal_name
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
|
@ -38,44 +41,48 @@ class CMapBase:
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs: object) -> None:
|
||||||
self.attrs = kwargs.copy()
|
self.attrs: MutableMapping[str, object] = kwargs.copy()
|
||||||
return
|
return
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self) -> bool:
|
||||||
return self.attrs.get('WMode', 0) != 0
|
return self.attrs.get('WMode', 0) != 0
|
||||||
|
|
||||||
def set_attr(self, k, v):
|
def set_attr(self, k: str, v: object) -> None:
|
||||||
self.attrs[k] = v
|
self.attrs[k] = v
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_code2cid(self, code, cid):
|
def add_code2cid(self, code: str, cid: int) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_cid2unichr(self, cid, code):
|
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
|
||||||
|
) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def use_cmap(self, cmap):
|
def use_cmap(self, cmap: "CMapBase") -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def decode(self, code: bytes) -> Iterable[int]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class CMap(CMapBase):
|
class CMap(CMapBase):
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs: Union[str, int]) -> None:
|
||||||
CMapBase.__init__(self, **kwargs)
|
CMapBase.__init__(self, **kwargs)
|
||||||
self.code2cid = {}
|
self.code2cid: Dict[int, object] = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<CMap: %s>' % self.attrs.get('CMapName')
|
return '<CMap: %s>' % self.attrs.get('CMapName')
|
||||||
|
|
||||||
def use_cmap(self, cmap):
|
def use_cmap(self, cmap: CMapBase) -> None:
|
||||||
assert isinstance(cmap, CMap), str(type(cmap))
|
assert isinstance(cmap, CMap), str(type(cmap))
|
||||||
|
|
||||||
def copy(dst, src):
|
def copy(dst: Dict[int, object], src: Dict[int, object]) -> None:
|
||||||
for (k, v) in src.items():
|
for (k, v) in src.items():
|
||||||
if isinstance(v, dict):
|
if isinstance(v, dict):
|
||||||
d = {}
|
d: Dict[int, object] = {}
|
||||||
dst[k] = d
|
dst[k] = d
|
||||||
copy(d, v)
|
copy(d, v)
|
||||||
else:
|
else:
|
||||||
|
@ -83,20 +90,24 @@ class CMap(CMapBase):
|
||||||
copy(self.code2cid, cmap.code2cid)
|
copy(self.code2cid, cmap.code2cid)
|
||||||
return
|
return
|
||||||
|
|
||||||
def decode(self, code):
|
def decode(self, code: bytes) -> Iterator[int]:
|
||||||
log.debug('decode: %r, %r', self, code)
|
log.debug('decode: %r, %r', self, code)
|
||||||
d = self.code2cid
|
d = self.code2cid
|
||||||
for i in iter(code):
|
for i in iter(code):
|
||||||
if i in d:
|
if i in d:
|
||||||
d = d[i]
|
x = d[i]
|
||||||
if isinstance(d, int):
|
if isinstance(x, int):
|
||||||
yield d
|
yield x
|
||||||
d = self.code2cid
|
d = self.code2cid
|
||||||
|
else:
|
||||||
|
d = cast(Dict[int, object], x)
|
||||||
else:
|
else:
|
||||||
d = self.code2cid
|
d = self.code2cid
|
||||||
return
|
return
|
||||||
|
|
||||||
def dump(self, out=sys.stdout, code2cid=None, code=None):
|
def dump(self, out: TextIO = sys.stdout,
|
||||||
|
code2cid: Optional[Dict[int, object]] = None,
|
||||||
|
code: Tuple[int, ...] = ()) -> None:
|
||||||
if code2cid is None:
|
if code2cid is None:
|
||||||
code2cid = self.code2cid
|
code2cid = self.code2cid
|
||||||
code = ()
|
code = ()
|
||||||
|
@ -105,13 +116,13 @@ class CMap(CMapBase):
|
||||||
if isinstance(v, int):
|
if isinstance(v, int):
|
||||||
out.write('code %r = cid %d\n' % (c, v))
|
out.write('code %r = cid %d\n' % (c, v))
|
||||||
else:
|
else:
|
||||||
self.dump(out=out, code2cid=v, code=c)
|
self.dump(out=out, code2cid=cast(Dict[int, object], v), code=c)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class IdentityCMap(CMapBase):
|
class IdentityCMap(CMapBase):
|
||||||
|
|
||||||
def decode(self, code):
|
def decode(self, code: bytes) -> Tuple[int, ...]:
|
||||||
n = len(code)//2
|
n = len(code)//2
|
||||||
if n:
|
if n:
|
||||||
return struct.unpack('>%dH' % n, code)
|
return struct.unpack('>%dH' % n, code)
|
||||||
|
@ -121,7 +132,7 @@ class IdentityCMap(CMapBase):
|
||||||
|
|
||||||
class IdentityCMapByte(IdentityCMap):
|
class IdentityCMapByte(IdentityCMap):
|
||||||
|
|
||||||
def decode(self, code):
|
def decode(self, code: bytes) -> Tuple[int, ...]:
|
||||||
n = len(code)
|
n = len(code)
|
||||||
if n:
|
if n:
|
||||||
return struct.unpack('>%dB' % n, code)
|
return struct.unpack('>%dB' % n, code)
|
||||||
|
@ -131,19 +142,19 @@ class IdentityCMapByte(IdentityCMap):
|
||||||
|
|
||||||
class UnicodeMap(CMapBase):
|
class UnicodeMap(CMapBase):
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs: Union[str, int]) -> None:
|
||||||
CMapBase.__init__(self, **kwargs)
|
CMapBase.__init__(self, **kwargs)
|
||||||
self.cid2unichr = {}
|
self.cid2unichr: Dict[int, str] = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
|
return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
|
||||||
|
|
||||||
def get_unichr(self, cid):
|
def get_unichr(self, cid: int) -> str:
|
||||||
log.debug('get_unichr: %r, %r', self, cid)
|
log.debug('get_unichr: %r, %r', self, cid)
|
||||||
return self.cid2unichr[cid]
|
return self.cid2unichr[cid]
|
||||||
|
|
||||||
def dump(self, out=sys.stdout):
|
def dump(self, out: TextIO = sys.stdout) -> None:
|
||||||
for (k, v) in sorted(self.cid2unichr.items()):
|
for (k, v) in sorted(self.cid2unichr.items()):
|
||||||
out.write('cid %d = unicode %r\n' % (k, v))
|
out.write('cid %d = unicode %r\n' % (k, v))
|
||||||
return
|
return
|
||||||
|
@ -151,29 +162,31 @@ class UnicodeMap(CMapBase):
|
||||||
|
|
||||||
class FileCMap(CMap):
|
class FileCMap(CMap):
|
||||||
|
|
||||||
def add_code2cid(self, code, cid):
|
def add_code2cid(self, code: str, cid: int) -> None:
|
||||||
assert isinstance(code, str) and isinstance(cid, int),\
|
assert isinstance(code, str) and isinstance(cid, int),\
|
||||||
str((type(code), type(cid)))
|
str((type(code), type(cid)))
|
||||||
d = self.code2cid
|
d = self.code2cid
|
||||||
for c in code[:-1]:
|
for c in code[:-1]:
|
||||||
c = ord(c)
|
ci = ord(c)
|
||||||
if c in d:
|
if ci in d:
|
||||||
d = d[c]
|
d = cast(Dict[int, object], d[ci])
|
||||||
else:
|
else:
|
||||||
t = {}
|
t: Dict[int, object] = {}
|
||||||
d[c] = t
|
d[ci] = t
|
||||||
d = t
|
d = t
|
||||||
c = ord(code[-1])
|
ci = ord(code[-1])
|
||||||
d[c] = cid
|
d[ci] = cid
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class FileUnicodeMap(UnicodeMap):
|
class FileUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
def add_cid2unichr(self, cid, code):
|
def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]
|
||||||
|
) -> None:
|
||||||
assert isinstance(cid, int), str(type(cid))
|
assert isinstance(cid, int), str(type(cid))
|
||||||
if isinstance(code, PSLiteral):
|
if isinstance(code, PSLiteral):
|
||||||
# Interpret as an Adobe glyph name.
|
# Interpret as an Adobe glyph name.
|
||||||
|
assert isinstance(code.name, str)
|
||||||
self.cid2unichr[cid] = name2unicode(code.name)
|
self.cid2unichr[cid] = name2unicode(code.name)
|
||||||
elif isinstance(code, bytes):
|
elif isinstance(code, bytes):
|
||||||
# Interpret as UTF-16BE.
|
# Interpret as UTF-16BE.
|
||||||
|
@ -187,8 +200,8 @@ class FileUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
class PyCMap(CMap):
|
class PyCMap(CMap):
|
||||||
|
|
||||||
def __init__(self, name, module):
|
def __init__(self, name: str, module: Any) -> None:
|
||||||
CMap.__init__(self, CMapName=name)
|
super().__init__(CMapName=name)
|
||||||
self.code2cid = module.CODE2CID
|
self.code2cid = module.CODE2CID
|
||||||
if module.IS_VERTICAL:
|
if module.IS_VERTICAL:
|
||||||
self.attrs['WMode'] = 1
|
self.attrs['WMode'] = 1
|
||||||
|
@ -197,8 +210,8 @@ class PyCMap(CMap):
|
||||||
|
|
||||||
class PyUnicodeMap(UnicodeMap):
|
class PyUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
def __init__(self, name, module, vertical):
|
def __init__(self, name: str, module: Any, vertical: bool) -> None:
|
||||||
UnicodeMap.__init__(self, CMapName=name)
|
super().__init__(CMapName=name)
|
||||||
if vertical:
|
if vertical:
|
||||||
self.cid2unichr = module.CID2UNICHR_V
|
self.cid2unichr = module.CID2UNICHR_V
|
||||||
self.attrs['WMode'] = 1
|
self.attrs['WMode'] = 1
|
||||||
|
@ -209,14 +222,14 @@ class PyUnicodeMap(UnicodeMap):
|
||||||
|
|
||||||
class CMapDB:
|
class CMapDB:
|
||||||
|
|
||||||
_cmap_cache = {}
|
_cmap_cache: Dict[str, PyCMap] = {}
|
||||||
_umap_cache = {}
|
_umap_cache: Dict[str, List[PyUnicodeMap]] = {}
|
||||||
|
|
||||||
class CMapNotFound(CMapError):
|
class CMapNotFound(CMapError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _load_data(cls, name):
|
def _load_data(cls, name: str) -> Any:
|
||||||
name = name.replace("\0", "")
|
name = name.replace("\0", "")
|
||||||
filename = '%s.pickle.gz' % name
|
filename = '%s.pickle.gz' % name
|
||||||
log.info('loading: %r', name)
|
log.info('loading: %r', name)
|
||||||
|
@ -234,7 +247,7 @@ class CMapDB:
|
||||||
raise CMapDB.CMapNotFound(name)
|
raise CMapDB.CMapNotFound(name)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_cmap(cls, name):
|
def get_cmap(cls, name: str) -> CMapBase:
|
||||||
if name == 'Identity-H':
|
if name == 'Identity-H':
|
||||||
return IdentityCMap(WMode=0)
|
return IdentityCMap(WMode=0)
|
||||||
elif name == 'Identity-V':
|
elif name == 'Identity-V':
|
||||||
|
@ -252,7 +265,7 @@ class CMapDB:
|
||||||
return cmap
|
return cmap
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_unicode_map(cls, name, vertical=False):
|
def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
|
||||||
try:
|
try:
|
||||||
return cls._umap_cache[name][vertical]
|
return cls._umap_cache[name][vertical]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -263,16 +276,16 @@ class CMapDB:
|
||||||
return cls._umap_cache[name][vertical]
|
return cls._umap_cache[name][vertical]
|
||||||
|
|
||||||
|
|
||||||
class CMapParser(PSStackParser):
|
class CMapParser(PSStackParser[PSKeyword]):
|
||||||
|
|
||||||
def __init__(self, cmap, fp):
|
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
|
||||||
PSStackParser.__init__(self, fp)
|
PSStackParser.__init__(self, fp)
|
||||||
self.cmap = cmap
|
self.cmap = cmap
|
||||||
# some ToUnicode maps don't have "begincmap" keyword.
|
# some ToUnicode maps don't have "begincmap" keyword.
|
||||||
self._in_cmap = True
|
self._in_cmap = True
|
||||||
return
|
return
|
||||||
|
|
||||||
def run(self):
|
def run(self) -> None:
|
||||||
try:
|
try:
|
||||||
self.nextobject()
|
self.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
|
@ -296,7 +309,7 @@ class CMapParser(PSStackParser):
|
||||||
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
|
KEYWORD_BEGINNOTDEFRANGE = KWD(b'beginnotdefrange')
|
||||||
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')
|
KEYWORD_ENDNOTDEFRANGE = KWD(b'endnotdefrange')
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
if token is self.KEYWORD_BEGINCMAP:
|
if token is self.KEYWORD_BEGINCMAP:
|
||||||
self._in_cmap = True
|
self._in_cmap = True
|
||||||
self.popall()
|
self.popall()
|
||||||
|
@ -380,6 +393,7 @@ class CMapParser(PSStackParser):
|
||||||
for i in range(e1-s1+1):
|
for i in range(e1-s1+1):
|
||||||
self.cmap.add_cid2unichr(s1+i, code[i])
|
self.cmap.add_cid2unichr(s1+i, code[i])
|
||||||
else:
|
else:
|
||||||
|
assert isinstance(code, bytes)
|
||||||
var = code[-4:]
|
var = code[-4:]
|
||||||
base = nunpack(var)
|
base = nunpack(var)
|
||||||
prefix = code[:-4]
|
prefix = code[:-4]
|
||||||
|
@ -410,7 +424,7 @@ class CMapParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv: List[str]) -> None:
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
for fname in args:
|
for fname in args:
|
||||||
fp = open(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
|
@ -422,4 +436,4 @@ def main(argv):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
main(sys.argv)
|
||||||
|
|
|
@ -1,13 +1,19 @@
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
from pdfminer.pdfcolor import PDFColorSpace
|
||||||
|
from typing import (BinaryIO, Dict, Generic, List, Optional, Sequence, TextIO,
|
||||||
|
Tuple, TypeVar, Union, cast)
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
|
from .layout import LAParams, LTComponent, TextGroupElement
|
||||||
from .layout import LTChar
|
from .layout import LTChar
|
||||||
from .layout import LTContainer
|
from .layout import LTContainer
|
||||||
from .layout import LTCurve
|
from .layout import LTCurve
|
||||||
from .layout import LTFigure
|
from .layout import LTFigure
|
||||||
from .layout import LTImage
|
from .layout import LTImage
|
||||||
|
from .layout import LTItem
|
||||||
|
from .layout import LTLayoutContainer
|
||||||
from .layout import LTLine
|
from .layout import LTLine
|
||||||
from .layout import LTPage
|
from .layout import LTPage
|
||||||
from .layout import LTRect
|
from .layout import LTRect
|
||||||
|
@ -17,25 +23,38 @@ from .layout import LTTextBoxVertical
|
||||||
from .layout import LTTextGroup
|
from .layout import LTTextGroup
|
||||||
from .layout import LTTextLine
|
from .layout import LTTextLine
|
||||||
from .pdfdevice import PDFTextDevice
|
from .pdfdevice import PDFTextDevice
|
||||||
|
from .pdffont import PDFFont
|
||||||
from .pdffont import PDFUnicodeNotDefined
|
from .pdffont import PDFUnicodeNotDefined
|
||||||
|
from .pdfinterp import PDFGraphicState, PDFResourceManager
|
||||||
|
from .pdfpage import PDFPage
|
||||||
|
from .pdftypes import PDFStream
|
||||||
|
from .utils import AnyIO, Point, Matrix, Rect, PathSegment
|
||||||
from .utils import apply_matrix_pt
|
from .utils import apply_matrix_pt
|
||||||
from .utils import bbox2str
|
from .utils import bbox2str
|
||||||
from .utils import enc
|
from .utils import enc
|
||||||
from .utils import mult_matrix
|
from .utils import mult_matrix
|
||||||
|
from .image import ImageWriter
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PDFLayoutAnalyzer(PDFTextDevice):
|
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
|
cur_item: LTLayoutContainer
|
||||||
|
ctm: Matrix
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: PDFResourceManager,
|
||||||
|
pageno: int = 1,
|
||||||
|
laparams: Optional[LAParams] = None
|
||||||
|
) -> None:
|
||||||
PDFTextDevice.__init__(self, rsrcmgr)
|
PDFTextDevice.__init__(self, rsrcmgr)
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
self.laparams = laparams
|
self.laparams = laparams
|
||||||
self._stack = []
|
self._stack: List[LTLayoutContainer] = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
|
||||||
(x0, y0, x1, y1) = page.mediabox
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
|
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
|
||||||
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
|
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
|
||||||
|
@ -43,7 +62,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
self.cur_item = LTPage(self.pageno, mediabox)
|
self.cur_item = LTPage(self.pageno, mediabox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page: PDFPage) -> None:
|
||||||
assert not self._stack, str(len(self._stack))
|
assert not self._stack, str(len(self._stack))
|
||||||
assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
|
assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
|
||||||
if self.laparams is not None:
|
if self.laparams is not None:
|
||||||
|
@ -52,19 +71,19 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
self.receive_layout(self.cur_item)
|
self.receive_layout(self.cur_item)
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
|
||||||
self._stack.append(self.cur_item)
|
self._stack.append(self.cur_item)
|
||||||
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
|
self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_figure(self, _):
|
def end_figure(self, _: str) -> None:
|
||||||
fig = self.cur_item
|
fig = self.cur_item
|
||||||
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
||||||
self.cur_item = self._stack.pop()
|
self.cur_item = self._stack.pop()
|
||||||
self.cur_item.add(fig)
|
self.cur_item.add(fig)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name: str, stream: PDFStream) -> None:
|
||||||
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
|
||||||
item = LTImage(name, stream,
|
item = LTImage(name, stream,
|
||||||
(self.cur_item.x0, self.cur_item.y0,
|
(self.cur_item.x0, self.cur_item.y0,
|
||||||
|
@ -72,7 +91,14 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return
|
return
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def paint_path(
|
||||||
|
self,
|
||||||
|
gstate: PDFGraphicState,
|
||||||
|
stroke: bool,
|
||||||
|
fill: bool,
|
||||||
|
evenodd: bool,
|
||||||
|
path: Sequence[PathSegment]
|
||||||
|
) -> None:
|
||||||
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
||||||
shape = ''.join(x[0] for x in path)
|
shape = ''.join(x[0] for x in path)
|
||||||
|
|
||||||
|
@ -90,7 +116,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
# And, per Section 4.4's Table 4.9, all other path commands place
|
# And, per Section 4.4's Table 4.9, all other path commands place
|
||||||
# their point-position in their final two arguments. (Any preceding
|
# their point-position in their final two arguments. (Any preceding
|
||||||
# arguments represent control points on Bézier curves.)
|
# arguments represent control points on Bézier curves.)
|
||||||
raw_pts = [p[-2:] if p[0] != 'h' else path[0][-2:] for p in path]
|
raw_pts = [cast(Point, p[-2:] if p[0] != 'h' else path[0][-2:])
|
||||||
|
for p in path]
|
||||||
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
||||||
|
|
||||||
if shape in {'mlh', 'ml'}:
|
if shape in {'mlh', 'ml'}:
|
||||||
|
@ -123,8 +150,17 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
gstate.scolor, gstate.ncolor)
|
gstate.scolor, gstate.ncolor)
|
||||||
self.cur_item.add(curve)
|
self.cur_item.add(curve)
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
def render_char(
|
||||||
graphicstate):
|
self,
|
||||||
|
matrix: Matrix,
|
||||||
|
font: PDFFont,
|
||||||
|
fontsize: float,
|
||||||
|
scaling: float,
|
||||||
|
rise: float,
|
||||||
|
cid: int,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: PDFGraphicState
|
||||||
|
) -> float:
|
||||||
try:
|
try:
|
||||||
text = font.to_unichr(cid)
|
text = font.to_unichr(cid)
|
||||||
assert isinstance(text, str), str(type(text))
|
assert isinstance(text, str), str(type(text))
|
||||||
|
@ -137,40 +173,56 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return item.adv
|
return item.adv
|
||||||
|
|
||||||
def handle_undefined_char(self, font, cid):
|
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
|
||||||
log.info('undefined: %r, %r', font, cid)
|
log.info('undefined: %r, %r', font, cid)
|
||||||
return '(cid:%d)' % cid
|
return '(cid:%d)' % cid
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage: LTPage) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class PDFPageAggregator(PDFLayoutAnalyzer):
|
class PDFPageAggregator(PDFLayoutAnalyzer):
|
||||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: PDFResourceManager,
|
||||||
|
pageno: int = 1,
|
||||||
|
laparams: Optional[LAParams] = None
|
||||||
|
) -> None:
|
||||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
self.result = None
|
self.result: Optional[LTPage] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage: LTPage) -> None:
|
||||||
self.result = ltpage
|
self.result = ltpage
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_result(self):
|
def get_result(self) -> LTPage:
|
||||||
|
assert self.result is not None
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
|
|
||||||
class PDFConverter(PDFLayoutAnalyzer):
|
# Some PDFConverter children support only binary I/O
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
IOType = TypeVar('IOType', TextIO, BinaryIO, AnyIO)
|
||||||
laparams=None):
|
|
||||||
|
|
||||||
|
class PDFConverter(PDFLayoutAnalyzer, Generic[IOType]):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: PDFResourceManager,
|
||||||
|
outfp: IOType,
|
||||||
|
codec: str = 'utf-8',
|
||||||
|
pageno: int = 1,
|
||||||
|
laparams: Optional[LAParams] = None
|
||||||
|
) -> None:
|
||||||
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
self.outfp = outfp
|
self.outfp: IOType = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.outfp_binary = self._is_binary_stream(self.outfp)
|
self.outfp_binary = self._is_binary_stream(self.outfp)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _is_binary_stream(outfp):
|
def _is_binary_stream(outfp: AnyIO) -> bool:
|
||||||
"""Test if an stream is binary or not"""
|
"""Test if an stream is binary or not"""
|
||||||
if 'b' in getattr(outfp, 'mode', ''):
|
if 'b' in getattr(outfp, 'mode', ''):
|
||||||
return True
|
return True
|
||||||
|
@ -187,24 +239,33 @@ class PDFConverter(PDFLayoutAnalyzer):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter[AnyIO]):
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(
|
||||||
showpageno=False, imagewriter=None):
|
self,
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
rsrcmgr: PDFResourceManager,
|
||||||
|
outfp: AnyIO,
|
||||||
|
codec: str = 'utf-8',
|
||||||
|
pageno: int = 1,
|
||||||
|
laparams: Optional[LAParams] = None,
|
||||||
|
showpageno: bool = False,
|
||||||
|
imagewriter: Optional[ImageWriter] = None
|
||||||
|
) -> None:
|
||||||
|
super().__init__(rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.imagewriter = imagewriter
|
self.imagewriter = imagewriter
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text: str) -> None:
|
||||||
text = utils.compatible_encode_method(text, self.codec, 'ignore')
|
text = utils.compatible_encode_method(text, self.codec, 'ignore')
|
||||||
if self.outfp_binary:
|
if self.outfp_binary:
|
||||||
text = text.encode()
|
cast(BinaryIO, self.outfp).write(text.encode())
|
||||||
self.outfp.write(text)
|
else:
|
||||||
|
cast(TextIO, self.outfp).write(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage: LTPage) -> None:
|
||||||
def render(item):
|
def render(item: LTItem) -> None:
|
||||||
if isinstance(item, LTContainer):
|
if isinstance(item, LTContainer):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
|
@ -224,17 +285,24 @@ class TextConverter(PDFConverter):
|
||||||
# Some dummy functions to save memory/CPU when all that is wanted
|
# Some dummy functions to save memory/CPU when all that is wanted
|
||||||
# is text. This stops all the image and drawing output from being
|
# is text. This stops all the image and drawing output from being
|
||||||
# recorded and taking up RAM.
|
# recorded and taking up RAM.
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name: str, stream: PDFStream) -> None:
|
||||||
if self.imagewriter is None:
|
if self.imagewriter is None:
|
||||||
return
|
return
|
||||||
PDFConverter.render_image(self, name, stream)
|
PDFConverter.render_image(self, name, stream)
|
||||||
return
|
return
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def paint_path(
|
||||||
|
self,
|
||||||
|
gstate: PDFGraphicState,
|
||||||
|
stroke: bool,
|
||||||
|
fill: bool,
|
||||||
|
evenodd: bool,
|
||||||
|
path: Sequence[PathSegment]
|
||||||
|
) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter[AnyIO]):
|
||||||
RECT_COLORS = {
|
RECT_COLORS = {
|
||||||
'figure': 'yellow',
|
'figure': 'yellow',
|
||||||
'textline': 'magenta',
|
'textline': 'magenta',
|
||||||
|
@ -249,12 +317,30 @@ class HTMLConverter(PDFConverter):
|
||||||
'char': 'black',
|
'char': 'black',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(
|
||||||
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
|
self,
|
||||||
pagemargin=50, imagewriter=None, debug=0, rect_colors=None,
|
rsrcmgr: PDFResourceManager,
|
||||||
text_colors=None):
|
outfp: AnyIO,
|
||||||
|
codec: str = 'utf-8',
|
||||||
|
pageno: int = 1,
|
||||||
|
laparams: Optional[LAParams] = None,
|
||||||
|
scale: float = 1,
|
||||||
|
fontscale: float = 1.0,
|
||||||
|
layoutmode: str = 'normal',
|
||||||
|
showpageno: bool = True,
|
||||||
|
pagemargin: int = 50,
|
||||||
|
imagewriter: Optional[ImageWriter] = None,
|
||||||
|
debug: int = 0,
|
||||||
|
rect_colors: Optional[Dict[str, str]] = None,
|
||||||
|
text_colors: Optional[Dict[str, str]] = None
|
||||||
|
) -> None:
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
|
|
||||||
|
# write() assumes a codec for binary I/O, or no codec for text I/O.
|
||||||
|
if self.outfp_binary == (not self.codec):
|
||||||
|
raise ValueError("Codec is required for a binary I/O output")
|
||||||
|
|
||||||
if text_colors is None:
|
if text_colors is None:
|
||||||
text_colors = {'char': 'black'}
|
text_colors = {'char': 'black'}
|
||||||
if rect_colors is None:
|
if rect_colors is None:
|
||||||
|
@ -271,19 +357,20 @@ class HTMLConverter(PDFConverter):
|
||||||
if debug:
|
if debug:
|
||||||
self.rect_colors.update(self.RECT_COLORS)
|
self.rect_colors.update(self.RECT_COLORS)
|
||||||
self.text_colors.update(self.TEXT_COLORS)
|
self.text_colors.update(self.TEXT_COLORS)
|
||||||
self._yoffset = self.pagemargin
|
self._yoffset: float = self.pagemargin
|
||||||
self._font = None
|
self._font: Optional[Tuple[str, float]] = None
|
||||||
self._fontstack = []
|
self._fontstack: List[Optional[Tuple[str, float]]] = []
|
||||||
self.write_header()
|
self.write_header()
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text: str) -> None:
|
||||||
if self.codec:
|
if self.codec:
|
||||||
text = text.encode(self.codec)
|
cast(BinaryIO, self.outfp).write(text.encode(self.codec))
|
||||||
self.outfp.write(text)
|
else:
|
||||||
|
cast(TextIO, self.outfp).write(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_header(self):
|
def write_header(self) -> None:
|
||||||
self.write('<html><head>\n')
|
self.write('<html><head>\n')
|
||||||
if self.codec:
|
if self.codec:
|
||||||
s = '<meta http-equiv="Content-Type" content="text/html; ' \
|
s = '<meta http-equiv="Content-Type" content="text/html; ' \
|
||||||
|
@ -294,7 +381,7 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write('</head><body>\n')
|
self.write('</head><body>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_footer(self):
|
def write_footer(self) -> None:
|
||||||
page_links = ['<a href="#{}">{}</a>'.format(i, i)
|
page_links = ['<a href="#{}">{}</a>'.format(i, i)
|
||||||
for i in range(1, self.pageno)]
|
for i in range(1, self.pageno)]
|
||||||
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
|
s = '<div style="position:absolute; top:0px;">Page: %s</div>\n' % \
|
||||||
|
@ -303,28 +390,49 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write('</body></html>\n')
|
self.write('</body></html>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text: str) -> None:
|
||||||
self.write(enc(text))
|
self.write(enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def place_rect(self, color, borderwidth, x, y, w, h):
|
def place_rect(
|
||||||
color = self.rect_colors.get(color)
|
self,
|
||||||
if color is not None:
|
color: str,
|
||||||
|
borderwidth: int,
|
||||||
|
x: float,
|
||||||
|
y: float,
|
||||||
|
w: float,
|
||||||
|
h: float
|
||||||
|
) -> None:
|
||||||
|
color2 = self.rect_colors.get(color)
|
||||||
|
if color2 is not None:
|
||||||
s = '<span style="position:absolute; border: %s %dpx solid; ' \
|
s = '<span style="position:absolute; border: %s %dpx solid; ' \
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % \
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % \
|
||||||
(color, borderwidth, x * self.scale,
|
(color2, borderwidth, x * self.scale,
|
||||||
(self._yoffset - y) * self.scale, w * self.scale,
|
(self._yoffset - y) * self.scale, w * self.scale,
|
||||||
h * self.scale)
|
h * self.scale)
|
||||||
self.write(
|
self.write(
|
||||||
s)
|
s)
|
||||||
return
|
return
|
||||||
|
|
||||||
def place_border(self, color, borderwidth, item):
|
def place_border(
|
||||||
|
self,
|
||||||
|
color: str,
|
||||||
|
borderwidth: int,
|
||||||
|
item: LTComponent
|
||||||
|
) -> None:
|
||||||
self.place_rect(color, borderwidth, item.x0, item.y1, item.width,
|
self.place_rect(color, borderwidth, item.x0, item.y1, item.width,
|
||||||
item.height)
|
item.height)
|
||||||
return
|
return
|
||||||
|
|
||||||
def place_image(self, item, borderwidth, x, y, w, h):
|
def place_image(
|
||||||
|
self,
|
||||||
|
item: LTImage,
|
||||||
|
borderwidth: int,
|
||||||
|
x: float,
|
||||||
|
y: float,
|
||||||
|
w: float,
|
||||||
|
h: float
|
||||||
|
) -> None:
|
||||||
if self.imagewriter is not None:
|
if self.imagewriter is not None:
|
||||||
name = self.imagewriter.export_image(item)
|
name = self.imagewriter.export_image(item)
|
||||||
s = '<img src="%s" border="%d" style="position:absolute; ' \
|
s = '<img src="%s" border="%d" style="position:absolute; ' \
|
||||||
|
@ -335,19 +443,35 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write(s)
|
self.write(s)
|
||||||
return
|
return
|
||||||
|
|
||||||
def place_text(self, color, text, x, y, size):
|
def place_text(
|
||||||
color = self.text_colors.get(color)
|
self,
|
||||||
if color is not None:
|
color: str,
|
||||||
|
text: str,
|
||||||
|
x: float,
|
||||||
|
y: float,
|
||||||
|
size: float
|
||||||
|
) -> None:
|
||||||
|
color2 = self.text_colors.get(color)
|
||||||
|
if color2 is not None:
|
||||||
s = '<span style="position:absolute; color:%s; left:%dpx; ' \
|
s = '<span style="position:absolute; color:%s; left:%dpx; ' \
|
||||||
'top:%dpx; font-size:%dpx;">' % \
|
'top:%dpx; font-size:%dpx;">' % \
|
||||||
(color, x * self.scale, (self._yoffset - y) * self.scale,
|
(color2, x * self.scale, (self._yoffset - y) * self.scale,
|
||||||
size * self.scale * self.fontscale)
|
size * self.scale * self.fontscale)
|
||||||
self.write(s)
|
self.write(s)
|
||||||
self.write_text(text)
|
self.write_text(text)
|
||||||
self.write('</span>\n')
|
self.write('</span>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
|
def begin_div(
|
||||||
|
self,
|
||||||
|
color: str,
|
||||||
|
borderwidth: int,
|
||||||
|
x: float,
|
||||||
|
y: float,
|
||||||
|
w: float,
|
||||||
|
h: float,
|
||||||
|
writing_mode: str = 'False'
|
||||||
|
) -> None:
|
||||||
self._fontstack.append(self._font)
|
self._fontstack.append(self._font)
|
||||||
self._font = None
|
self._font = None
|
||||||
s = '<div style="position:absolute; border: %s %dpx solid; ' \
|
s = '<div style="position:absolute; border: %s %dpx solid; ' \
|
||||||
|
@ -358,14 +482,14 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write(s)
|
self.write(s)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_div(self, color):
|
def end_div(self, color: str) -> None:
|
||||||
if self._font is not None:
|
if self._font is not None:
|
||||||
self.write('</span>')
|
self.write('</span>')
|
||||||
self._font = self._fontstack.pop()
|
self._font = self._fontstack.pop()
|
||||||
self.write('</div>')
|
self.write('</div>')
|
||||||
return
|
return
|
||||||
|
|
||||||
def put_text(self, text, fontname, fontsize):
|
def put_text(self, text: str, fontname: str, fontsize: float) -> None:
|
||||||
font = (fontname, fontsize)
|
font = (fontname, fontsize)
|
||||||
if font != self._font:
|
if font != self._font:
|
||||||
if self._font is not None:
|
if self._font is not None:
|
||||||
|
@ -379,19 +503,20 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write_text(text)
|
self.write_text(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
def put_newline(self):
|
def put_newline(self) -> None:
|
||||||
self.write('<br>')
|
self.write('<br>')
|
||||||
return
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage: LTPage) -> None:
|
||||||
def show_group(item):
|
def show_group(item: Union[LTTextGroup, TextGroupElement]) -> None:
|
||||||
if isinstance(item, LTTextGroup):
|
if isinstance(item, LTTextGroup):
|
||||||
self.place_border('textgroup', 1, item)
|
self.place_border('textgroup', 1, item)
|
||||||
for child in item:
|
for child in item:
|
||||||
show_group(child)
|
show_group(child)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render(item):
|
def render(item: LTItem) -> None:
|
||||||
|
child: LTItem
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self._yoffset += item.y1
|
self._yoffset += item.y1
|
||||||
self.place_border('page', 1, item)
|
self.place_border('page', 1, item)
|
||||||
|
@ -455,31 +580,45 @@ class HTMLConverter(PDFConverter):
|
||||||
self._yoffset += self.pagemargin
|
self._yoffset += self.pagemargin
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> None:
|
||||||
self.write_footer()
|
self.write_footer()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class XMLConverter(PDFConverter):
|
class XMLConverter(PDFConverter[AnyIO]):
|
||||||
|
|
||||||
CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(
|
||||||
imagewriter=None, stripcontrol=False):
|
self,
|
||||||
|
rsrcmgr: PDFResourceManager,
|
||||||
|
outfp: AnyIO,
|
||||||
|
codec: str = 'utf-8',
|
||||||
|
pageno: int = 1,
|
||||||
|
laparams: Optional[LAParams] = None,
|
||||||
|
imagewriter: Optional[ImageWriter] = None,
|
||||||
|
stripcontrol: bool = False
|
||||||
|
) -> None:
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
|
|
||||||
|
# write() assumes a codec for binary I/O, or no codec for text I/O.
|
||||||
|
if self.outfp_binary == (not self.codec):
|
||||||
|
raise ValueError("Codec is required for a binary I/O output")
|
||||||
|
|
||||||
self.imagewriter = imagewriter
|
self.imagewriter = imagewriter
|
||||||
self.stripcontrol = stripcontrol
|
self.stripcontrol = stripcontrol
|
||||||
self.write_header()
|
self.write_header()
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text: str) -> None:
|
||||||
if self.codec:
|
if self.codec:
|
||||||
text = text.encode(self.codec)
|
cast(BinaryIO, self.outfp).write(text.encode(self.codec))
|
||||||
self.outfp.write(text)
|
else:
|
||||||
|
cast(TextIO, self.outfp).write(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_header(self):
|
def write_header(self) -> None:
|
||||||
if self.codec:
|
if self.codec:
|
||||||
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
|
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
|
||||||
else:
|
else:
|
||||||
|
@ -487,18 +626,18 @@ class XMLConverter(PDFConverter):
|
||||||
self.write('<pages>\n')
|
self.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_footer(self):
|
def write_footer(self) -> None:
|
||||||
self.write('</pages>\n')
|
self.write('</pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text: str) -> None:
|
||||||
if self.stripcontrol:
|
if self.stripcontrol:
|
||||||
text = self.CONTROL.sub('', text)
|
text = self.CONTROL.sub('', text)
|
||||||
self.write(enc(text))
|
self.write(enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage: LTPage) -> None:
|
||||||
def show_group(item):
|
def show_group(item: LTItem) -> None:
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write('<textbox id="%d" bbox="%s" />\n' %
|
self.write('<textbox id="%d" bbox="%s" />\n' %
|
||||||
(item.index, bbox2str(item.bbox)))
|
(item.index, bbox2str(item.bbox)))
|
||||||
|
@ -509,7 +648,8 @@ class XMLConverter(PDFConverter):
|
||||||
self.write('</textgroup>\n')
|
self.write('</textgroup>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def render(item):
|
def render(item: LTItem) -> None:
|
||||||
|
child: LTItem
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
s = '<page id="%s" bbox="%s" rotate="%d">\n' % \
|
s = '<page id="%s" bbox="%s" rotate="%d">\n' % \
|
||||||
(item.pageid, bbox2str(item.bbox), item.rotate)
|
(item.pageid, bbox2str(item.bbox), item.rotate)
|
||||||
|
@ -580,6 +720,6 @@ class XMLConverter(PDFConverter):
|
||||||
render(ltpage)
|
render(ltpage)
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> None:
|
||||||
self.write_footer()
|
self.write_footer()
|
||||||
return
|
return
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from typing import Dict, Iterable, Optional, cast
|
||||||
|
|
||||||
from .glyphlist import glyphname2unicode
|
from .glyphlist import glyphname2unicode
|
||||||
from .latin_enc import ENCODING
|
from .latin_enc import ENCODING
|
||||||
|
@ -10,7 +11,7 @@ HEXADECIMAL = re.compile(r'[0-9a-fA-F]+')
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def name2unicode(name):
|
def name2unicode(name: str) -> str:
|
||||||
"""Converts Adobe glyph names to Unicode numbers.
|
"""Converts Adobe glyph names to Unicode numbers.
|
||||||
|
|
||||||
In contrast to the specification, this raises a KeyError instead of return
|
In contrast to the specification, this raises a KeyError instead of return
|
||||||
|
@ -32,7 +33,7 @@ def name2unicode(name):
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if name in glyphname2unicode:
|
if name in glyphname2unicode:
|
||||||
return glyphname2unicode.get(name)
|
return glyphname2unicode[name]
|
||||||
|
|
||||||
elif name.startswith('uni'):
|
elif name.startswith('uni'):
|
||||||
name_without_uni = name.strip('uni')
|
name_without_uni = name.strip('uni')
|
||||||
|
@ -59,7 +60,7 @@ def name2unicode(name):
|
||||||
'it does not match specification' % name)
|
'it does not match specification' % name)
|
||||||
|
|
||||||
|
|
||||||
def raise_key_error_for_invalid_unicode(unicode_digit):
|
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
|
||||||
"""Unicode values should not be in the range D800 through DFFF because
|
"""Unicode values should not be in the range D800 through DFFF because
|
||||||
that is used for surrogate pairs in UTF-16
|
that is used for surrogate pairs in UTF-16
|
||||||
|
|
||||||
|
@ -72,10 +73,10 @@ def raise_key_error_for_invalid_unicode(unicode_digit):
|
||||||
|
|
||||||
class EncodingDB:
|
class EncodingDB:
|
||||||
|
|
||||||
std2unicode = {}
|
std2unicode: Dict[int, str] = {}
|
||||||
mac2unicode = {}
|
mac2unicode: Dict[int, str] = {}
|
||||||
win2unicode = {}
|
win2unicode: Dict[int, str] = {}
|
||||||
pdf2unicode = {}
|
pdf2unicode: Dict[int, str] = {}
|
||||||
for (name, std, mac, win, pdf) in ENCODING:
|
for (name, std, mac, win, pdf) in ENCODING:
|
||||||
c = name2unicode(name)
|
c = name2unicode(name)
|
||||||
if std:
|
if std:
|
||||||
|
@ -95,7 +96,11 @@ class EncodingDB:
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_encoding(cls, name, diff=None):
|
def get_encoding(
|
||||||
|
cls,
|
||||||
|
name: str,
|
||||||
|
diff: Optional[Iterable[object]] = None
|
||||||
|
) -> Dict[int, str]:
|
||||||
cid2unicode = cls.encodings.get(name, cls.std2unicode)
|
cid2unicode = cls.encodings.get(name, cls.std2unicode)
|
||||||
if diff:
|
if diff:
|
||||||
cid2unicode = cid2unicode.copy()
|
cid2unicode = cid2unicode.copy()
|
||||||
|
@ -105,7 +110,7 @@ class EncodingDB:
|
||||||
cid = x
|
cid = x
|
||||||
elif isinstance(x, PSLiteral):
|
elif isinstance(x, PSLiteral):
|
||||||
try:
|
try:
|
||||||
cid2unicode[cid] = name2unicode(x.name)
|
cid2unicode[cid] = name2unicode(cast(str, x.name))
|
||||||
except (KeyError, ValueError) as e:
|
except (KeyError, ValueError) as e:
|
||||||
log.debug(str(e))
|
log.debug(str(e))
|
||||||
cid += 1
|
cid += 1
|
||||||
|
|
|
@ -3,22 +3,36 @@
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
|
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
|
||||||
|
|
||||||
from .converter import XMLConverter, HTMLConverter, TextConverter, \
|
from .converter import XMLConverter, HTMLConverter, TextConverter, \
|
||||||
PDFPageAggregator
|
PDFPageAggregator
|
||||||
from .image import ImageWriter
|
from .image import ImageWriter
|
||||||
from .layout import LAParams
|
from .layout import LAParams, LTPage
|
||||||
from .pdfdevice import TagExtractor
|
from .pdfdevice import PDFDevice, TagExtractor
|
||||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
from .pdfpage import PDFPage
|
from .pdfpage import PDFPage
|
||||||
from .utils import open_filename
|
from .utils import open_filename, FileOrName, AnyIO
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
def extract_text_to_fp(
|
||||||
laparams=None, maxpages=0, page_numbers=None,
|
inf: BinaryIO,
|
||||||
password="", scale=1.0, rotation=0, layoutmode='normal',
|
outfp: AnyIO,
|
||||||
output_dir=None, strip_control=False, debug=False,
|
output_type: str = 'text',
|
||||||
disable_caching=False, **kwargs):
|
codec: str = 'utf-8',
|
||||||
|
laparams: Optional[LAParams] = None,
|
||||||
|
maxpages: int = 0,
|
||||||
|
page_numbers: Optional[Container[int]] = None,
|
||||||
|
password: str = "",
|
||||||
|
scale: float = 1.0,
|
||||||
|
rotation: int = 0,
|
||||||
|
layoutmode: str = 'normal',
|
||||||
|
output_dir: Optional[str] = None,
|
||||||
|
strip_control: bool = False,
|
||||||
|
debug: bool = False,
|
||||||
|
disable_caching: bool = False,
|
||||||
|
**kwargs: Any
|
||||||
|
) -> None:
|
||||||
"""Parses text from inf-file and writes to outfp file-like object.
|
"""Parses text from inf-file and writes to outfp file-like object.
|
||||||
|
|
||||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
Takes loads of optional arguments but the defaults are somewhat sane.
|
||||||
|
@ -56,7 +70,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
imagewriter = ImageWriter(output_dir)
|
imagewriter = ImageWriter(output_dir)
|
||||||
|
|
||||||
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
||||||
device = None
|
device: Optional[PDFDevice] = None
|
||||||
|
|
||||||
if output_type != 'text' and outfp == sys.stdout:
|
if output_type != 'text' and outfp == sys.stdout:
|
||||||
outfp = sys.stdout.buffer
|
outfp = sys.stdout.buffer
|
||||||
|
@ -76,13 +90,15 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter)
|
||||||
|
|
||||||
elif output_type == 'tag':
|
elif output_type == 'tag':
|
||||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
# Binary I/O is required, but we have no good way to test it here.
|
||||||
|
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
msg = f"Output type can be text, html, xml or tag but is " \
|
msg = f"Output type can be text, html, xml or tag but is " \
|
||||||
f"{output_type}"
|
f"{output_type}"
|
||||||
raise ValueError(msg)
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
assert device is not None
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
for page in PDFPage.get_pages(inf,
|
for page in PDFPage.get_pages(inf,
|
||||||
page_numbers,
|
page_numbers,
|
||||||
|
@ -95,8 +111,15 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
device.close()
|
device.close()
|
||||||
|
|
||||||
|
|
||||||
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
def extract_text(
|
||||||
caching=True, codec='utf-8', laparams=None):
|
pdf_file: FileOrName,
|
||||||
|
password: str = '',
|
||||||
|
page_numbers: Optional[Container[int]] = None,
|
||||||
|
maxpages: int = 0,
|
||||||
|
caching: bool = True,
|
||||||
|
codec: str = 'utf-8',
|
||||||
|
laparams: Optional[LAParams] = None
|
||||||
|
) -> str:
|
||||||
"""Parse and return the text contained in a PDF file.
|
"""Parse and return the text contained in a PDF file.
|
||||||
|
|
||||||
:param pdf_file: Either a file path or a file-like object for the PDF file
|
:param pdf_file: Either a file path or a file-like object for the PDF file
|
||||||
|
@ -114,6 +137,7 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
|
|
||||||
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
|
with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
|
||||||
|
fp = cast(BinaryIO, fp) # we opened in binary mode
|
||||||
rsrcmgr = PDFResourceManager(caching=caching)
|
rsrcmgr = PDFResourceManager(caching=caching)
|
||||||
device = TextConverter(rsrcmgr, output_string, codec=codec,
|
device = TextConverter(rsrcmgr, output_string, codec=codec,
|
||||||
laparams=laparams)
|
laparams=laparams)
|
||||||
|
@ -131,8 +155,14 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
return output_string.getvalue()
|
return output_string.getvalue()
|
||||||
|
|
||||||
|
|
||||||
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
|
def extract_pages(
|
||||||
caching=True, laparams=None):
|
pdf_file: FileOrName,
|
||||||
|
password: str = '',
|
||||||
|
page_numbers: Optional[Container[int]] = None,
|
||||||
|
maxpages: int = 0,
|
||||||
|
caching: bool = True,
|
||||||
|
laparams: Optional[LAParams] = None
|
||||||
|
) -> Iterator[LTPage]:
|
||||||
"""Extract and yield LTPage objects
|
"""Extract and yield LTPage objects
|
||||||
|
|
||||||
:param pdf_file: Either a file path or a file-like object for the PDF file
|
:param pdf_file: Either a file path or a file-like object for the PDF file
|
||||||
|
@ -149,6 +179,7 @@ def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
|
|
||||||
with open_filename(pdf_file, "rb") as fp:
|
with open_filename(pdf_file, "rb") as fp:
|
||||||
|
fp = cast(BinaryIO, fp) # we opened in binary mode
|
||||||
resource_manager = PDFResourceManager(caching=caching)
|
resource_manager = PDFResourceManager(caching=caching)
|
||||||
device = PDFPageAggregator(resource_manager, laparams=laparams)
|
device = PDFPageAggregator(resource_manager, laparams=laparams)
|
||||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||||
|
|
|
@ -2,20 +2,28 @@ import os
|
||||||
import os.path
|
import os.path
|
||||||
import struct
|
import struct
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from typing import BinaryIO, Tuple
|
||||||
|
|
||||||
from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
|
from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
|
||||||
|
from .layout import LTImage
|
||||||
from .pdfcolor import LITERAL_DEVICE_CMYK
|
from .pdfcolor import LITERAL_DEVICE_CMYK
|
||||||
from .pdfcolor import LITERAL_DEVICE_GRAY
|
from .pdfcolor import LITERAL_DEVICE_GRAY
|
||||||
from .pdfcolor import LITERAL_DEVICE_RGB
|
from .pdfcolor import LITERAL_DEVICE_RGB
|
||||||
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE
|
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE
|
||||||
|
|
||||||
|
|
||||||
def align32(x):
|
def align32(x: int) -> int:
|
||||||
return ((x+3)//4)*4
|
return ((x+3)//4)*4
|
||||||
|
|
||||||
|
|
||||||
class BMPWriter:
|
class BMPWriter:
|
||||||
def __init__(self, fp, bits, width, height):
|
def __init__(
|
||||||
|
self,
|
||||||
|
fp: BinaryIO,
|
||||||
|
bits: int,
|
||||||
|
width: int,
|
||||||
|
height: int
|
||||||
|
) -> None:
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.bits = bits
|
self.bits = bits
|
||||||
self.width = width
|
self.width = width
|
||||||
|
@ -51,7 +59,7 @@ class BMPWriter:
|
||||||
self.pos1 = self.pos0 + self.datasize
|
self.pos1 = self.pos0 + self.datasize
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_line(self, y, data):
|
def write_line(self, y: int, data: bytes) -> None:
|
||||||
self.fp.seek(self.pos1 - (y+1)*self.linesize)
|
self.fp.seek(self.pos1 - (y+1)*self.linesize)
|
||||||
self.fp.write(data)
|
self.fp.write(data)
|
||||||
return
|
return
|
||||||
|
@ -63,13 +71,13 @@ class ImageWriter:
|
||||||
Supports various image types: JPEG, JBIG2 and bitmaps
|
Supports various image types: JPEG, JBIG2 and bitmaps
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, outdir):
|
def __init__(self, outdir: str) -> None:
|
||||||
self.outdir = outdir
|
self.outdir = outdir
|
||||||
if not os.path.exists(self.outdir):
|
if not os.path.exists(self.outdir):
|
||||||
os.makedirs(self.outdir)
|
os.makedirs(self.outdir)
|
||||||
return
|
return
|
||||||
|
|
||||||
def export_image(self, image):
|
def export_image(self, image: LTImage) -> str:
|
||||||
(width, height) = image.srcsize
|
(width, height) = image.srcsize
|
||||||
|
|
||||||
is_jbig2 = self.is_jbig2_image(image)
|
is_jbig2 = self.is_jbig2_image(image)
|
||||||
|
@ -80,8 +88,9 @@ class ImageWriter:
|
||||||
fp = open(path, 'wb')
|
fp = open(path, 'wb')
|
||||||
if ext == '.jpg':
|
if ext == '.jpg':
|
||||||
raw_data = image.stream.get_rawdata()
|
raw_data = image.stream.get_rawdata()
|
||||||
|
assert raw_data is not None
|
||||||
if LITERAL_DEVICE_CMYK in image.colorspace:
|
if LITERAL_DEVICE_CMYK in image.colorspace:
|
||||||
from PIL import Image
|
from PIL import Image # type: ignore[import]
|
||||||
from PIL import ImageChops
|
from PIL import ImageChops
|
||||||
ifp = BytesIO(raw_data)
|
ifp = BytesIO(raw_data)
|
||||||
i = Image.open(ifp)
|
i = Image.open(ifp)
|
||||||
|
@ -128,7 +137,7 @@ class ImageWriter:
|
||||||
return name
|
return name
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_jbig2_image(image):
|
def is_jbig2_image(image: LTImage) -> bool:
|
||||||
filters = image.stream.get_filters()
|
filters = image.stream.get_filters()
|
||||||
is_jbig2 = False
|
is_jbig2 = False
|
||||||
for filter_name, params in filters:
|
for filter_name, params in filters:
|
||||||
|
@ -138,7 +147,12 @@ class ImageWriter:
|
||||||
return is_jbig2
|
return is_jbig2
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_image_extension(image, width, height, is_jbig2):
|
def _get_image_extension(
|
||||||
|
image: LTImage,
|
||||||
|
width: int,
|
||||||
|
height: int,
|
||||||
|
is_jbig2: bool
|
||||||
|
) -> str:
|
||||||
filters = image.stream.get_filters()
|
filters = image.stream.get_filters()
|
||||||
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
|
||||||
ext = '.jpg'
|
ext = '.jpg'
|
||||||
|
@ -154,7 +168,11 @@ class ImageWriter:
|
||||||
return ext
|
return ext
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _create_unique_image_name(dirname, image_name, ext):
|
def _create_unique_image_name(
|
||||||
|
dirname: str,
|
||||||
|
image_name: str,
|
||||||
|
ext: str
|
||||||
|
) -> Tuple[str, str]:
|
||||||
name = image_name + ext
|
name = image_name + ext
|
||||||
path = os.path.join(dirname, name)
|
path = os.path.join(dirname, name)
|
||||||
img_index = 0
|
img_index = 0
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
from struct import pack, unpack, calcsize
|
from struct import pack, unpack, calcsize
|
||||||
|
from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple, Union, cast
|
||||||
|
|
||||||
# segment structure base
|
# segment structure base
|
||||||
SEG_STRUCT = [
|
SEG_STRUCT = [
|
||||||
|
@ -34,15 +35,15 @@ FILE_HEAD_FLAG_SEQUENTIAL = 0b00000001
|
||||||
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010
|
FILE_HEAD_FLAG_PAGES_UNKNOWN = 0b00000010
|
||||||
|
|
||||||
|
|
||||||
def bit_set(bit_pos, value):
|
def bit_set(bit_pos: int, value: int) -> bool:
|
||||||
return bool((value >> bit_pos) & 1)
|
return bool((value >> bit_pos) & 1)
|
||||||
|
|
||||||
|
|
||||||
def check_flag(flag, value):
|
def check_flag(flag: int, value: int) -> bool:
|
||||||
return bool(flag & value)
|
return bool(flag & value)
|
||||||
|
|
||||||
|
|
||||||
def masked_value(mask, value):
|
def masked_value(mask: int, value: int) -> int:
|
||||||
for bit_pos in range(0, 31):
|
for bit_pos in range(0, 31):
|
||||||
if bit_set(bit_pos, mask):
|
if bit_set(bit_pos, mask):
|
||||||
return (value & mask) >> bit_pos
|
return (value & mask) >> bit_pos
|
||||||
|
@ -50,7 +51,7 @@ def masked_value(mask, value):
|
||||||
raise Exception("Invalid mask or value")
|
raise Exception("Invalid mask or value")
|
||||||
|
|
||||||
|
|
||||||
def mask_value(mask, value):
|
def mask_value(mask: int, value: int) -> int:
|
||||||
for bit_pos in range(0, 31):
|
for bit_pos in range(0, 31):
|
||||||
if bit_set(bit_pos, mask):
|
if bit_set(bit_pos, mask):
|
||||||
return (value & (mask >> bit_pos)) << bit_pos
|
return (value & (mask >> bit_pos)) << bit_pos
|
||||||
|
@ -58,25 +59,34 @@ def mask_value(mask, value):
|
||||||
raise Exception("Invalid mask or value")
|
raise Exception("Invalid mask or value")
|
||||||
|
|
||||||
|
|
||||||
|
def unpack_int(format: str, buffer: bytes) -> int:
|
||||||
|
assert format in {">B", ">I", ">L"}
|
||||||
|
[result] = cast(Tuple[int], unpack(format, buffer))
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
JBIG2SegmentFlags = Dict[str, Union[int, bool]]
|
||||||
|
JBIG2RetentionFlags = Dict[str, Union[int, List[int], List[bool]]]
|
||||||
|
JBIG2Segment = Dict[str, Union[bool, int, bytes, JBIG2SegmentFlags,
|
||||||
|
JBIG2RetentionFlags]]
|
||||||
|
|
||||||
|
|
||||||
class JBIG2StreamReader:
|
class JBIG2StreamReader:
|
||||||
"""Read segments from a JBIG2 byte stream"""
|
"""Read segments from a JBIG2 byte stream"""
|
||||||
|
def __init__(self, stream: BinaryIO) -> None:
|
||||||
def __init__(self, stream):
|
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
|
|
||||||
def get_segments(self):
|
def get_segments(self) -> List[JBIG2Segment]:
|
||||||
segments = []
|
segments: List[JBIG2Segment] = []
|
||||||
while not self.is_eof():
|
while not self.is_eof():
|
||||||
segment = {}
|
segment: JBIG2Segment = {}
|
||||||
for field_format, name in SEG_STRUCT:
|
for field_format, name in SEG_STRUCT:
|
||||||
field_len = calcsize(field_format)
|
field_len = calcsize(field_format)
|
||||||
field = self.stream.read(field_len)
|
field = self.stream.read(field_len)
|
||||||
if len(field) < field_len:
|
if len(field) < field_len:
|
||||||
segment["_error"] = True
|
segment["_error"] = True
|
||||||
break
|
break
|
||||||
value = unpack(field_format, field)
|
value = unpack_int(field_format, field)
|
||||||
if len(value) == 1:
|
|
||||||
[value] = value
|
|
||||||
parser = getattr(self, "parse_%s" % name, None)
|
parser = getattr(self, "parse_%s" % name, None)
|
||||||
if callable(parser):
|
if callable(parser):
|
||||||
value = parser(segment, value, field)
|
value = parser(segment, value, field)
|
||||||
|
@ -86,21 +96,31 @@ class JBIG2StreamReader:
|
||||||
segments.append(segment)
|
segments.append(segment)
|
||||||
return segments
|
return segments
|
||||||
|
|
||||||
def is_eof(self):
|
def is_eof(self) -> bool:
|
||||||
if self.stream.read(1) == b'':
|
if self.stream.read(1) == b'':
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
self.stream.seek(-1, os.SEEK_CUR)
|
self.stream.seek(-1, os.SEEK_CUR)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def parse_flags(self, segment, flags, field):
|
def parse_flags(
|
||||||
|
self,
|
||||||
|
segment: JBIG2Segment,
|
||||||
|
flags: int,
|
||||||
|
field: bytes
|
||||||
|
) -> JBIG2SegmentFlags:
|
||||||
return {
|
return {
|
||||||
"deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
|
"deferred": check_flag(HEADER_FLAG_DEFERRED, flags),
|
||||||
"page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
|
"page_assoc_long": check_flag(HEADER_FLAG_PAGE_ASSOC_LONG, flags),
|
||||||
"type": masked_value(SEG_TYPE_MASK, flags)
|
"type": masked_value(SEG_TYPE_MASK, flags)
|
||||||
}
|
}
|
||||||
|
|
||||||
def parse_retention_flags(self, segment, flags, field):
|
def parse_retention_flags(
|
||||||
|
self,
|
||||||
|
segment: JBIG2Segment,
|
||||||
|
flags: int,
|
||||||
|
field: bytes
|
||||||
|
) -> JBIG2RetentionFlags:
|
||||||
ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
|
ref_count = masked_value(REF_COUNT_SHORT_MASK, flags)
|
||||||
retain_segments = []
|
retain_segments = []
|
||||||
ref_segments = []
|
ref_segments = []
|
||||||
|
@ -110,15 +130,16 @@ class JBIG2StreamReader:
|
||||||
retain_segments.append(bit_set(bit_pos, flags))
|
retain_segments.append(bit_set(bit_pos, flags))
|
||||||
else:
|
else:
|
||||||
field += self.stream.read(3)
|
field += self.stream.read(3)
|
||||||
[ref_count] = unpack(">L", field)
|
ref_count = unpack_int(">L", field)
|
||||||
ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
|
ref_count = masked_value(REF_COUNT_LONG_MASK, ref_count)
|
||||||
ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
|
ret_bytes_count = int(math.ceil((ref_count + 1) / 8))
|
||||||
for ret_byte_index in range(ret_bytes_count):
|
for ret_byte_index in range(ret_bytes_count):
|
||||||
[ret_byte] = unpack(">B", self.stream.read(1))
|
ret_byte = unpack_int(">B", self.stream.read(1))
|
||||||
for bit_pos in range(7):
|
for bit_pos in range(7):
|
||||||
retain_segments.append(bit_set(bit_pos, ret_byte))
|
retain_segments.append(bit_set(bit_pos, ret_byte))
|
||||||
|
|
||||||
seg_num = segment["number"]
|
seg_num = segment["number"]
|
||||||
|
assert isinstance(seg_num, int)
|
||||||
if seg_num <= 256:
|
if seg_num <= 256:
|
||||||
ref_format = ">B"
|
ref_format = ">B"
|
||||||
elif seg_num <= 65536:
|
elif seg_num <= 65536:
|
||||||
|
@ -129,8 +150,8 @@ class JBIG2StreamReader:
|
||||||
ref_size = calcsize(ref_format)
|
ref_size = calcsize(ref_format)
|
||||||
|
|
||||||
for ref_index in range(ref_count):
|
for ref_index in range(ref_count):
|
||||||
ref = self.stream.read(ref_size)
|
ref_data = self.stream.read(ref_size)
|
||||||
[ref] = unpack(ref_format, ref)
|
ref = unpack_int(ref_format, ref_data)
|
||||||
ref_segments.append(ref)
|
ref_segments.append(ref)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -139,15 +160,26 @@ class JBIG2StreamReader:
|
||||||
"ref_segments": ref_segments,
|
"ref_segments": ref_segments,
|
||||||
}
|
}
|
||||||
|
|
||||||
def parse_page_assoc(self, segment, page, field):
|
def parse_page_assoc(
|
||||||
if segment["flags"]["page_assoc_long"]:
|
self,
|
||||||
|
segment: JBIG2Segment,
|
||||||
|
page: int,
|
||||||
|
field: bytes
|
||||||
|
) -> int:
|
||||||
|
if cast(JBIG2SegmentFlags, segment["flags"])["page_assoc_long"]:
|
||||||
field += self.stream.read(3)
|
field += self.stream.read(3)
|
||||||
[page] = unpack(">L", field)
|
page = unpack_int(">L", field)
|
||||||
return page
|
return page
|
||||||
|
|
||||||
def parse_data_length(self, segment, length, field):
|
def parse_data_length(
|
||||||
|
self,
|
||||||
|
segment: JBIG2Segment,
|
||||||
|
length: int,
|
||||||
|
field: bytes
|
||||||
|
) -> int:
|
||||||
if length:
|
if length:
|
||||||
if (segment["flags"]["type"] == SEG_TYPE_IMMEDIATE_GEN_REGION) \
|
if (cast(JBIG2SegmentFlags, segment["flags"])["type"] ==
|
||||||
|
SEG_TYPE_IMMEDIATE_GEN_REGION) \
|
||||||
and (length == DATA_LEN_UNKNOWN):
|
and (length == DATA_LEN_UNKNOWN):
|
||||||
|
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
|
@ -163,25 +195,36 @@ class JBIG2StreamReader:
|
||||||
class JBIG2StreamWriter:
|
class JBIG2StreamWriter:
|
||||||
"""Write JBIG2 segments to a file in JBIG2 format"""
|
"""Write JBIG2 segments to a file in JBIG2 format"""
|
||||||
|
|
||||||
def __init__(self, stream):
|
EMPTY_RETENTION_FLAGS: JBIG2RetentionFlags = {
|
||||||
|
'ref_count': 0,
|
||||||
|
'ref_segments': cast(List[int], []),
|
||||||
|
'retain_segments': cast(List[bool], [])
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, stream: BinaryIO) -> None:
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
|
|
||||||
def write_segments(self, segments, fix_last_page=True):
|
def write_segments(
|
||||||
|
self,
|
||||||
|
segments: Iterable[JBIG2Segment],
|
||||||
|
fix_last_page: bool = True
|
||||||
|
) -> int:
|
||||||
data_len = 0
|
data_len = 0
|
||||||
current_page = None
|
current_page: Optional[int] = None
|
||||||
seg_num = None
|
seg_num: Optional[int] = None
|
||||||
|
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
data = self.encode_segment(segment)
|
data = self.encode_segment(segment)
|
||||||
self.stream.write(data)
|
self.stream.write(data)
|
||||||
data_len += len(data)
|
data_len += len(data)
|
||||||
|
|
||||||
seg_num = segment["number"]
|
seg_num = cast(Optional[int], segment["number"])
|
||||||
|
|
||||||
if fix_last_page:
|
if fix_last_page:
|
||||||
seg_page = segment.get("page_assoc")
|
seg_page = cast(int, segment.get("page_assoc"))
|
||||||
|
|
||||||
if segment["flags"]["type"] == SEG_TYPE_END_OF_PAGE:
|
if cast(JBIG2SegmentFlags, segment["flags"])["type"] == \
|
||||||
|
SEG_TYPE_END_OF_PAGE:
|
||||||
current_page = None
|
current_page = None
|
||||||
elif seg_page:
|
elif seg_page:
|
||||||
current_page = seg_page
|
current_page = seg_page
|
||||||
|
@ -194,7 +237,11 @@ class JBIG2StreamWriter:
|
||||||
|
|
||||||
return data_len
|
return data_len
|
||||||
|
|
||||||
def write_file(self, segments, fix_last_page=True):
|
def write_file(
|
||||||
|
self,
|
||||||
|
segments: Iterable[JBIG2Segment],
|
||||||
|
fix_last_page: bool = True
|
||||||
|
) -> int:
|
||||||
header = FILE_HEADER_ID
|
header = FILE_HEADER_ID
|
||||||
header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
|
header_flags = FILE_HEAD_FLAG_SEQUENTIAL | FILE_HEAD_FLAG_PAGES_UNKNOWN
|
||||||
header += pack(">B", header_flags)
|
header += pack(">B", header_flags)
|
||||||
|
@ -205,7 +252,7 @@ class JBIG2StreamWriter:
|
||||||
|
|
||||||
seg_num = 0
|
seg_num = 0
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
seg_num = segment["number"]
|
seg_num = cast(int, segment["number"])
|
||||||
|
|
||||||
eof_segment = self.get_eof_segment(seg_num + 1)
|
eof_segment = self.get_eof_segment(seg_num + 1)
|
||||||
data = self.encode_segment(eof_segment)
|
data = self.encode_segment(eof_segment)
|
||||||
|
@ -215,7 +262,7 @@ class JBIG2StreamWriter:
|
||||||
|
|
||||||
return data_len
|
return data_len
|
||||||
|
|
||||||
def encode_segment(self, segment):
|
def encode_segment(self, segment: JBIG2Segment) -> bytes:
|
||||||
data = b''
|
data = b''
|
||||||
for field_format, name in SEG_STRUCT:
|
for field_format, name in SEG_STRUCT:
|
||||||
value = segment.get(name)
|
value = segment.get(name)
|
||||||
|
@ -227,7 +274,8 @@ class JBIG2StreamWriter:
|
||||||
data += field
|
data += field
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def encode_flags(self, value, segment):
|
def encode_flags(self, value: JBIG2SegmentFlags, segment: JBIG2Segment
|
||||||
|
) -> bytes:
|
||||||
flags = 0
|
flags = 0
|
||||||
if value.get("deferred"):
|
if value.get("deferred"):
|
||||||
flags |= HEADER_FLAG_DEFERRED
|
flags |= HEADER_FLAG_DEFERRED
|
||||||
|
@ -237,17 +285,22 @@ class JBIG2StreamWriter:
|
||||||
if value["page_assoc_long"] else flags
|
if value["page_assoc_long"] else flags
|
||||||
else:
|
else:
|
||||||
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \
|
flags |= HEADER_FLAG_PAGE_ASSOC_LONG \
|
||||||
if segment.get("page", 0) > 255 else flags
|
if cast(int, segment.get("page", 0)) > 255 else flags
|
||||||
|
|
||||||
flags |= mask_value(SEG_TYPE_MASK, value["type"])
|
flags |= mask_value(SEG_TYPE_MASK, value["type"])
|
||||||
|
|
||||||
return pack(">B", flags)
|
return pack(">B", flags)
|
||||||
|
|
||||||
def encode_retention_flags(self, value, segment):
|
def encode_retention_flags(
|
||||||
|
self,
|
||||||
|
value: JBIG2RetentionFlags,
|
||||||
|
segment: JBIG2Segment
|
||||||
|
) -> bytes:
|
||||||
flags = []
|
flags = []
|
||||||
flags_format = ">B"
|
flags_format = ">B"
|
||||||
ref_count = value["ref_count"]
|
ref_count = value["ref_count"]
|
||||||
retain_segments = value.get("retain_segments", [])
|
assert isinstance(ref_count, int)
|
||||||
|
retain_segments = cast(List[bool], value.get("retain_segments", []))
|
||||||
|
|
||||||
if ref_count <= 4:
|
if ref_count <= 4:
|
||||||
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
|
flags_byte = mask_value(REF_COUNT_SHORT_MASK, ref_count)
|
||||||
|
@ -271,9 +324,9 @@ class JBIG2StreamWriter:
|
||||||
|
|
||||||
flags.append(ret_byte)
|
flags.append(ret_byte)
|
||||||
|
|
||||||
ref_segments = value.get("ref_segments", [])
|
ref_segments = cast(List[int], value.get("ref_segments", []))
|
||||||
|
|
||||||
seg_num = segment["number"]
|
seg_num = cast(int, segment["number"])
|
||||||
if seg_num <= 256:
|
if seg_num <= 256:
|
||||||
ref_format = "B"
|
ref_format = "B"
|
||||||
elif seg_num <= 65536:
|
elif seg_num <= 65536:
|
||||||
|
@ -287,35 +340,31 @@ class JBIG2StreamWriter:
|
||||||
|
|
||||||
return pack(flags_format, *flags)
|
return pack(flags_format, *flags)
|
||||||
|
|
||||||
def encode_data_length(self, value, segment):
|
def encode_data_length(self, value: int, segment: JBIG2Segment) -> bytes:
|
||||||
data = pack(">L", value)
|
data = pack(">L", value)
|
||||||
data += segment["raw_data"]
|
data += cast(bytes, segment["raw_data"])
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_eop_segment(self, seg_number, page_number):
|
def get_eop_segment(
|
||||||
|
self,
|
||||||
|
seg_number: int,
|
||||||
|
page_number: int
|
||||||
|
) -> JBIG2Segment:
|
||||||
return {
|
return {
|
||||||
'data_length': 0,
|
'data_length': 0,
|
||||||
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
|
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_PAGE},
|
||||||
'number': seg_number,
|
'number': seg_number,
|
||||||
'page_assoc': page_number,
|
'page_assoc': page_number,
|
||||||
'raw_data': b'',
|
'raw_data': b'',
|
||||||
'retention_flags': {
|
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS
|
||||||
'ref_count': 0,
|
|
||||||
'ref_segments': [],
|
|
||||||
'retain_segments': []
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_eof_segment(self, seg_number):
|
def get_eof_segment(self, seg_number: int) -> JBIG2Segment:
|
||||||
return {
|
return {
|
||||||
'data_length': 0,
|
'data_length': 0,
|
||||||
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
|
'flags': {'deferred': False, 'type': SEG_TYPE_END_OF_FILE},
|
||||||
'number': seg_number,
|
'number': seg_number,
|
||||||
'page_assoc': 0,
|
'page_assoc': 0,
|
||||||
'raw_data': b'',
|
'raw_data': b'',
|
||||||
'retention_flags': {
|
'retention_flags': JBIG2StreamWriter.EMPTY_RETENTION_FLAGS
|
||||||
'ref_count': 0,
|
|
||||||
'ref_segments': [],
|
|
||||||
'retain_segments': []
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,12 @@ This table is extracted from PDF Reference Manual 1.6, pp.925
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ENCODING = [
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
|
EncodingRow = \
|
||||||
|
Tuple[str, Optional[int], Optional[int], Optional[int], Optional[int]]
|
||||||
|
|
||||||
|
ENCODING: List[EncodingRow] = [
|
||||||
# (name, std, mac, win, pdf)
|
# (name, std, mac, win, pdf)
|
||||||
('A', 65, 65, 65, 65),
|
('A', 65, 65, 65, 65),
|
||||||
('AE', 225, 174, 198, 198),
|
('AE', 225, 174, 198, 198),
|
||||||
|
|
|
@ -1,25 +1,36 @@
|
||||||
import heapq
|
import heapq
|
||||||
import logging
|
import logging
|
||||||
|
from typing import (Dict, Generic, Iterable, Iterator, List, Optional,
|
||||||
|
Sequence, Set, Tuple, TypeVar, Union, cast)
|
||||||
|
|
||||||
from .utils import INF
|
from .utils import INF
|
||||||
|
from .utils import LTComponentT
|
||||||
|
from .utils import Matrix
|
||||||
from .utils import Plane
|
from .utils import Plane
|
||||||
|
from .utils import Point
|
||||||
|
from .utils import Rect
|
||||||
from .utils import apply_matrix_pt
|
from .utils import apply_matrix_pt
|
||||||
from .utils import bbox2str
|
from .utils import bbox2str
|
||||||
from .utils import fsplit
|
from .utils import fsplit
|
||||||
from .utils import get_bound
|
from .utils import get_bound
|
||||||
from .utils import matrix2str
|
from .utils import matrix2str
|
||||||
from .utils import uniq
|
from .utils import uniq
|
||||||
|
from .pdfcolor import PDFColorSpace
|
||||||
|
from .pdftypes import PDFStream
|
||||||
|
from .pdfinterp import Color
|
||||||
|
from .pdfinterp import PDFGraphicState
|
||||||
|
from .pdffont import PDFFont
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class IndexAssigner:
|
class IndexAssigner:
|
||||||
|
|
||||||
def __init__(self, index=0):
|
def __init__(self, index: int = 0) -> None:
|
||||||
self.index = index
|
self.index = index
|
||||||
return
|
return
|
||||||
|
|
||||||
def run(self, obj):
|
def run(self, obj: "LTItem") -> None:
|
||||||
if isinstance(obj, LTTextBox):
|
if isinstance(obj, LTTextBox):
|
||||||
obj.index = self.index
|
obj.index = self.index
|
||||||
self.index += 1
|
self.index += 1
|
||||||
|
@ -57,14 +68,16 @@ class LAParams:
|
||||||
figures.
|
figures.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(
|
||||||
line_overlap=0.5,
|
self,
|
||||||
char_margin=2.0,
|
line_overlap: float = 0.5,
|
||||||
line_margin=0.5,
|
char_margin: float = 2.0,
|
||||||
word_margin=0.1,
|
line_margin: float = 0.5,
|
||||||
boxes_flow=0.5,
|
word_margin: float = 0.1,
|
||||||
detect_vertical=False,
|
boxes_flow: Optional[float] = 0.5,
|
||||||
all_texts=False):
|
detect_vertical: bool = False,
|
||||||
|
all_texts: bool = False
|
||||||
|
) -> None:
|
||||||
self.line_overlap = line_overlap
|
self.line_overlap = line_overlap
|
||||||
self.char_margin = char_margin
|
self.char_margin = char_margin
|
||||||
self.line_margin = line_margin
|
self.line_margin = line_margin
|
||||||
|
@ -76,7 +89,7 @@ class LAParams:
|
||||||
self._validate()
|
self._validate()
|
||||||
return
|
return
|
||||||
|
|
||||||
def _validate(self):
|
def _validate(self) -> None:
|
||||||
if self.boxes_flow is not None:
|
if self.boxes_flow is not None:
|
||||||
boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
|
boxes_flow_err_msg = ("LAParam boxes_flow should be None, or a "
|
||||||
"number between -1 and +1")
|
"number between -1 and +1")
|
||||||
|
@ -86,7 +99,7 @@ class LAParams:
|
||||||
if not -1 <= self.boxes_flow <= 1:
|
if not -1 <= self.boxes_flow <= 1:
|
||||||
raise ValueError(boxes_flow_err_msg)
|
raise ValueError(boxes_flow_err_msg)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
return '<LAParams: char_margin=%.1f, line_margin=%.1f, ' \
|
||||||
'word_margin=%.1f all_texts=%r>' % \
|
'word_margin=%.1f all_texts=%r>' % \
|
||||||
(self.char_margin, self.line_margin, self.word_margin,
|
(self.char_margin, self.line_margin, self.word_margin,
|
||||||
|
@ -96,7 +109,7 @@ class LAParams:
|
||||||
class LTItem:
|
class LTItem:
|
||||||
"""Interface for things that can be analyzed"""
|
"""Interface for things that can be analyzed"""
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
"""Perform the layout analysis."""
|
"""Perform the layout analysis."""
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -104,11 +117,11 @@ class LTItem:
|
||||||
class LTText:
|
class LTText:
|
||||||
"""Interface for things that have text"""
|
"""Interface for things that have text"""
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s %r>' %
|
return ('<%s %r>' %
|
||||||
(self.__class__.__name__, self.get_text()))
|
(self.__class__.__name__, self.get_text()))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self) -> str:
|
||||||
"""Text contained in this object"""
|
"""Text contained in this object"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -116,29 +129,29 @@ class LTText:
|
||||||
class LTComponent(LTItem):
|
class LTComponent(LTItem):
|
||||||
"""Object with a bounding box"""
|
"""Object with a bounding box"""
|
||||||
|
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox: Rect) -> None:
|
||||||
LTItem.__init__(self)
|
LTItem.__init__(self)
|
||||||
self.set_bbox(bbox)
|
self.set_bbox(bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s %s>' %
|
return ('<%s %s>' %
|
||||||
(self.__class__.__name__, bbox2str(self.bbox)))
|
(self.__class__.__name__, bbox2str(self.bbox)))
|
||||||
|
|
||||||
# Disable comparison.
|
# Disable comparison.
|
||||||
def __lt__(self, _):
|
def __lt__(self, _: object) -> bool:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
def __le__(self, _):
|
def __le__(self, _: object) -> bool:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
def __gt__(self, _):
|
def __gt__(self, _: object) -> bool:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
def __ge__(self, _):
|
def __ge__(self, _: object) -> bool:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
def set_bbox(self, bbox):
|
def set_bbox(self, bbox: Rect) -> None:
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
self.x0 = x0
|
self.x0 = x0
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
|
@ -149,39 +162,39 @@ class LTComponent(LTItem):
|
||||||
self.bbox = bbox
|
self.bbox = bbox
|
||||||
return
|
return
|
||||||
|
|
||||||
def is_empty(self):
|
def is_empty(self) -> bool:
|
||||||
return self.width <= 0 or self.height <= 0
|
return self.width <= 0 or self.height <= 0
|
||||||
|
|
||||||
def is_hoverlap(self, obj):
|
def is_hoverlap(self, obj: "LTComponent") -> bool:
|
||||||
assert isinstance(obj, LTComponent), str(type(obj))
|
assert isinstance(obj, LTComponent), str(type(obj))
|
||||||
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
||||||
|
|
||||||
def hdistance(self, obj):
|
def hdistance(self, obj: "LTComponent") -> float:
|
||||||
assert isinstance(obj, LTComponent), str(type(obj))
|
assert isinstance(obj, LTComponent), str(type(obj))
|
||||||
if self.is_hoverlap(obj):
|
if self.is_hoverlap(obj):
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
|
|
||||||
def hoverlap(self, obj):
|
def hoverlap(self, obj: "LTComponent") -> float:
|
||||||
assert isinstance(obj, LTComponent), str(type(obj))
|
assert isinstance(obj, LTComponent), str(type(obj))
|
||||||
if self.is_hoverlap(obj):
|
if self.is_hoverlap(obj):
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def is_voverlap(self, obj):
|
def is_voverlap(self, obj: "LTComponent") -> bool:
|
||||||
assert isinstance(obj, LTComponent), str(type(obj))
|
assert isinstance(obj, LTComponent), str(type(obj))
|
||||||
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
||||||
|
|
||||||
def vdistance(self, obj):
|
def vdistance(self, obj: "LTComponent") -> float:
|
||||||
assert isinstance(obj, LTComponent), str(type(obj))
|
assert isinstance(obj, LTComponent), str(type(obj))
|
||||||
if self.is_voverlap(obj):
|
if self.is_voverlap(obj):
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
|
|
||||||
def voverlap(self, obj):
|
def voverlap(self, obj: "LTComponent") -> float:
|
||||||
assert isinstance(obj, LTComponent), str(type(obj))
|
assert isinstance(obj, LTComponent), str(type(obj))
|
||||||
if self.is_voverlap(obj):
|
if self.is_voverlap(obj):
|
||||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
|
@ -192,8 +205,16 @@ class LTComponent(LTItem):
|
||||||
class LTCurve(LTComponent):
|
class LTCurve(LTComponent):
|
||||||
"""A generic Bezier curve"""
|
"""A generic Bezier curve"""
|
||||||
|
|
||||||
def __init__(self, linewidth, pts, stroke=False, fill=False, evenodd=False,
|
def __init__(
|
||||||
stroking_color=None, non_stroking_color=None):
|
self,
|
||||||
|
linewidth: float,
|
||||||
|
pts: List[Point],
|
||||||
|
stroke: bool = False,
|
||||||
|
fill: bool = False,
|
||||||
|
evenodd: bool = False,
|
||||||
|
stroking_color: Optional[Color] = None,
|
||||||
|
non_stroking_color: Optional[Color] = None
|
||||||
|
) -> None:
|
||||||
LTComponent.__init__(self, get_bound(pts))
|
LTComponent.__init__(self, get_bound(pts))
|
||||||
self.pts = pts
|
self.pts = pts
|
||||||
self.linewidth = linewidth
|
self.linewidth = linewidth
|
||||||
|
@ -204,7 +225,7 @@ class LTCurve(LTComponent):
|
||||||
self.non_stroking_color = non_stroking_color
|
self.non_stroking_color = non_stroking_color
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_pts(self):
|
def get_pts(self) -> str:
|
||||||
return ','.join('%.3f,%.3f' % p for p in self.pts)
|
return ','.join('%.3f,%.3f' % p for p in self.pts)
|
||||||
|
|
||||||
|
|
||||||
|
@ -214,8 +235,17 @@ class LTLine(LTCurve):
|
||||||
Could be used for separating text or figures.
|
Could be used for separating text or figures.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, linewidth, p0, p1, stroke=False, fill=False,
|
def __init__(
|
||||||
evenodd=False, stroking_color=None, non_stroking_color=None):
|
self,
|
||||||
|
linewidth: float,
|
||||||
|
p0: Point,
|
||||||
|
p1: Point,
|
||||||
|
stroke: bool = False,
|
||||||
|
fill: bool = False,
|
||||||
|
evenodd: bool = False,
|
||||||
|
stroking_color: Optional[Color] = None,
|
||||||
|
non_stroking_color: Optional[Color] = None
|
||||||
|
) -> None:
|
||||||
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd,
|
LTCurve.__init__(self, linewidth, [p0, p1], stroke, fill, evenodd,
|
||||||
stroking_color, non_stroking_color)
|
stroking_color, non_stroking_color)
|
||||||
return
|
return
|
||||||
|
@ -227,8 +257,16 @@ class LTRect(LTCurve):
|
||||||
Could be used for framing another pictures or figures.
|
Could be used for framing another pictures or figures.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, linewidth, bbox, stroke=False, fill=False,
|
def __init__(
|
||||||
evenodd=False, stroking_color=None, non_stroking_color=None):
|
self,
|
||||||
|
linewidth: float,
|
||||||
|
bbox: Rect,
|
||||||
|
stroke: bool = False,
|
||||||
|
fill: bool = False,
|
||||||
|
evenodd: bool = False,
|
||||||
|
stroking_color: Optional[Color] = None,
|
||||||
|
non_stroking_color: Optional[Color] = None
|
||||||
|
) -> None:
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
LTCurve.__init__(self, linewidth,
|
LTCurve.__init__(self, linewidth,
|
||||||
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke,
|
[(x0, y0), (x1, y0), (x1, y1), (x0, y1)], stroke,
|
||||||
|
@ -242,7 +280,7 @@ class LTImage(LTComponent):
|
||||||
Embedded images can be in JPEG, Bitmap or JBIG2.
|
Embedded images can be in JPEG, Bitmap or JBIG2.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, name, stream, bbox):
|
def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None:
|
||||||
LTComponent.__init__(self, bbox)
|
LTComponent.__init__(self, bbox)
|
||||||
self.name = name
|
self.name = name
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
|
@ -255,7 +293,7 @@ class LTImage(LTComponent):
|
||||||
self.colorspace = [self.colorspace]
|
self.colorspace = [self.colorspace]
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s(%s) %s %r>' %
|
return ('<%s(%s) %s %r>' %
|
||||||
(self.__class__.__name__, self.name,
|
(self.__class__.__name__, self.name,
|
||||||
bbox2str(self.bbox), self.srcsize))
|
bbox2str(self.bbox), self.srcsize))
|
||||||
|
@ -269,19 +307,30 @@ class LTAnno(LTItem, LTText):
|
||||||
according to the relationship between two characters (e.g. a space).
|
according to the relationship between two characters (e.g. a space).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, text):
|
def __init__(self, text: str) -> None:
|
||||||
self._text = text
|
self._text = text
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self) -> str:
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
|
|
||||||
class LTChar(LTComponent, LTText):
|
class LTChar(LTComponent, LTText):
|
||||||
"""Actual letter in the text as a Unicode string."""
|
"""Actual letter in the text as a Unicode string."""
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, scaling, rise,
|
def __init__(
|
||||||
text, textwidth, textdisp, ncs, graphicstate):
|
self,
|
||||||
|
matrix: Matrix,
|
||||||
|
font: PDFFont,
|
||||||
|
fontsize: float,
|
||||||
|
scaling: float,
|
||||||
|
rise: float,
|
||||||
|
text: str,
|
||||||
|
textwidth: float,
|
||||||
|
textdisp: Union[float, Tuple[Optional[float], float]],
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: PDFGraphicState
|
||||||
|
) -> None:
|
||||||
LTText.__init__(self)
|
LTText.__init__(self)
|
||||||
self._text = text
|
self._text = text
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
|
@ -292,6 +341,7 @@ class LTChar(LTComponent, LTText):
|
||||||
# compute the boundary rectangle.
|
# compute the boundary rectangle.
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
# vertical
|
# vertical
|
||||||
|
assert isinstance(textdisp, tuple)
|
||||||
(vx, vy) = textdisp
|
(vx, vy) = textdisp
|
||||||
if vx is None:
|
if vx is None:
|
||||||
vx = fontsize * 0.5
|
vx = fontsize * 0.5
|
||||||
|
@ -320,114 +370,129 @@ class LTChar(LTComponent, LTText):
|
||||||
self.size = self.height
|
self.size = self.height
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
||||||
(self.__class__.__name__, bbox2str(self.bbox),
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
matrix2str(self.matrix), self.fontname, self.adv,
|
matrix2str(self.matrix), self.fontname, self.adv,
|
||||||
self.get_text()))
|
self.get_text()))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self) -> str:
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
def is_compatible(self, obj):
|
def is_compatible(self, obj: object) -> bool:
|
||||||
"""Returns True if two characters can coexist in the same line."""
|
"""Returns True if two characters can coexist in the same line."""
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
class LTContainer(LTComponent):
|
LTItemT = TypeVar('LTItemT', bound=LTItem)
|
||||||
|
|
||||||
|
|
||||||
|
class LTContainer(LTComponent, Generic[LTItemT]):
|
||||||
"""Object that can be extended and analyzed"""
|
"""Object that can be extended and analyzed"""
|
||||||
|
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox: Rect) -> None:
|
||||||
LTComponent.__init__(self, bbox)
|
LTComponent.__init__(self, bbox)
|
||||||
self._objs = []
|
self._objs: List[LTItemT] = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self) -> Iterator[LTItemT]:
|
||||||
return iter(self._objs)
|
return iter(self._objs)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
return len(self._objs)
|
return len(self._objs)
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj: LTItemT) -> None:
|
||||||
self._objs.append(obj)
|
self._objs.append(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def extend(self, objs):
|
def extend(self, objs: Iterable[LTItemT]) -> None:
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
self.add(obj)
|
self.add(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
for obj in self._objs:
|
for obj in self._objs:
|
||||||
obj.analyze(laparams)
|
obj.analyze(laparams)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTExpandableContainer(LTContainer):
|
class LTExpandableContainer(LTContainer[LTItemT]):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
|
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
|
||||||
return
|
return
|
||||||
|
|
||||||
def add(self, obj):
|
# Incompatible override: we take an LTComponent (with bounding box), but
|
||||||
LTContainer.add(self, obj)
|
# super() LTContainer only considers LTItem (no bounding box).
|
||||||
|
def add(self, obj: LTComponent) -> None: # type: ignore[override]
|
||||||
|
LTContainer.add(self, cast(LTItemT, obj))
|
||||||
self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0),
|
self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0),
|
||||||
max(self.x1, obj.x1), max(self.y1, obj.y1)))
|
max(self.x1, obj.x1), max(self.y1, obj.y1)))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTTextContainer(LTExpandableContainer, LTText):
|
class LTTextContainer(LTExpandableContainer[LTItemT], LTText):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
LTText.__init__(self)
|
LTText.__init__(self)
|
||||||
LTExpandableContainer.__init__(self)
|
LTExpandableContainer.__init__(self)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self) -> str:
|
||||||
return ''.join(obj.get_text() for obj in self
|
return ''.join(cast(LTText, obj).get_text() for obj in self
|
||||||
if isinstance(obj, LTText))
|
if isinstance(obj, LTText))
|
||||||
|
|
||||||
|
|
||||||
class LTTextLine(LTTextContainer):
|
TextLineElement = Union[LTChar, LTAnno]
|
||||||
|
|
||||||
|
|
||||||
|
class LTTextLine(LTTextContainer[TextLineElement]):
|
||||||
"""Contains a list of LTChar objects that represent a single text line.
|
"""Contains a list of LTChar objects that represent a single text line.
|
||||||
|
|
||||||
The characters are aligned either horizontally or vertically, depending on
|
The characters are aligned either horizontally or vertically, depending on
|
||||||
the text's writing mode.
|
the text's writing mode.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, word_margin):
|
def __init__(self, word_margin: float) -> None:
|
||||||
LTTextContainer.__init__(self)
|
super().__init__()
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s %s %r>' %
|
return ('<%s %s %r>' %
|
||||||
(self.__class__.__name__, bbox2str(self.bbox),
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
self.get_text()))
|
self.get_text()))
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
LTTextContainer.analyze(self, laparams)
|
LTTextContainer.analyze(self, laparams)
|
||||||
LTContainer.add(self, LTAnno('\n'))
|
LTContainer.add(self, LTAnno('\n'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane: Plane[LTComponentT], ratio: float
|
||||||
|
) -> List["LTTextLine"]:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineHorizontal(LTTextLine):
|
class LTTextLineHorizontal(LTTextLine):
|
||||||
def __init__(self, word_margin):
|
def __init__(self, word_margin: float) -> None:
|
||||||
LTTextLine.__init__(self, word_margin)
|
LTTextLine.__init__(self, word_margin)
|
||||||
self._x1 = +INF
|
self._x1: float = +INF
|
||||||
return
|
return
|
||||||
|
|
||||||
def add(self, obj):
|
# Incompatible override: we take an LTComponent (with bounding box), but
|
||||||
|
# LTContainer only considers LTItem (no bounding box).
|
||||||
|
def add(self, obj: LTComponent) -> None: # type: ignore[override]
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
if isinstance(obj, LTChar) and self.word_margin:
|
||||||
margin = self.word_margin * max(obj.width, obj.height)
|
margin = self.word_margin * max(obj.width, obj.height)
|
||||||
if self._x1 < obj.x0 - margin:
|
if self._x1 < obj.x0 - margin:
|
||||||
LTContainer.add(self, LTAnno(' '))
|
LTContainer.add(self, LTAnno(' '))
|
||||||
self._x1 = obj.x1
|
self._x1 = obj.x1
|
||||||
LTTextLine.add(self, obj)
|
super().add(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(
|
||||||
|
self,
|
||||||
|
plane: Plane[LTComponentT],
|
||||||
|
ratio: float
|
||||||
|
) -> List[LTTextLine]:
|
||||||
"""
|
"""
|
||||||
Finds neighboring LTTextLineHorizontals in the plane.
|
Finds neighboring LTTextLineHorizontals in the plane.
|
||||||
|
|
||||||
|
@ -445,45 +510,67 @@ class LTTextLineHorizontal(LTTextLine):
|
||||||
self._is_right_aligned_with(obj, tolerance=d) or
|
self._is_right_aligned_with(obj, tolerance=d) or
|
||||||
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||||
|
|
||||||
def _is_left_aligned_with(self, other, tolerance=0):
|
def _is_left_aligned_with(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether the left-hand edge of `other` is within `tolerance`.
|
Whether the left-hand edge of `other` is within `tolerance`.
|
||||||
"""
|
"""
|
||||||
return abs(other.x0 - self.x0) <= tolerance
|
return abs(other.x0 - self.x0) <= tolerance
|
||||||
|
|
||||||
def _is_right_aligned_with(self, other, tolerance=0):
|
def _is_right_aligned_with(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether the right-hand edge of `other` is within `tolerance`.
|
Whether the right-hand edge of `other` is within `tolerance`.
|
||||||
"""
|
"""
|
||||||
return abs(other.x1 - self.x1) <= tolerance
|
return abs(other.x1 - self.x1) <= tolerance
|
||||||
|
|
||||||
def _is_centrally_aligned_with(self, other, tolerance=0):
|
def _is_centrally_aligned_with(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether the horizontal center of `other` is within `tolerance`.
|
Whether the horizontal center of `other` is within `tolerance`.
|
||||||
"""
|
"""
|
||||||
return abs(
|
return abs(
|
||||||
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
|
(other.x0 + other.x1) / 2 - (self.x0 + self.x1) / 2) <= tolerance
|
||||||
|
|
||||||
def _is_same_height_as(self, other, tolerance):
|
def _is_same_height_as(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
return abs(other.height - self.height) <= tolerance
|
return abs(other.height - self.height) <= tolerance
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineVertical(LTTextLine):
|
class LTTextLineVertical(LTTextLine):
|
||||||
def __init__(self, word_margin):
|
def __init__(self, word_margin: float) -> None:
|
||||||
LTTextLine.__init__(self, word_margin)
|
LTTextLine.__init__(self, word_margin)
|
||||||
self._y0 = -INF
|
self._y0: float = -INF
|
||||||
return
|
return
|
||||||
|
|
||||||
def add(self, obj):
|
# Incompatible override: we take an LTComponent (with bounding box), but
|
||||||
|
# LTContainer only considers LTItem (no bounding box).
|
||||||
|
def add(self, obj: LTComponent) -> None: # type: ignore[override]
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
if isinstance(obj, LTChar) and self.word_margin:
|
||||||
margin = self.word_margin * max(obj.width, obj.height)
|
margin = self.word_margin * max(obj.width, obj.height)
|
||||||
if obj.y1 + margin < self._y0:
|
if obj.y1 + margin < self._y0:
|
||||||
LTContainer.add(self, LTAnno(' '))
|
LTContainer.add(self, LTAnno(' '))
|
||||||
self._y0 = obj.y0
|
self._y0 = obj.y0
|
||||||
LTTextLine.add(self, obj)
|
super().add(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(
|
||||||
|
self,
|
||||||
|
plane: Plane[LTComponentT],
|
||||||
|
ratio: float
|
||||||
|
) -> List[LTTextLine]:
|
||||||
"""
|
"""
|
||||||
Finds neighboring LTTextLineVerticals in the plane.
|
Finds neighboring LTTextLineVerticals in the plane.
|
||||||
|
|
||||||
|
@ -501,30 +588,42 @@ class LTTextLineVertical(LTTextLine):
|
||||||
self._is_upper_aligned_with(obj, tolerance=d) or
|
self._is_upper_aligned_with(obj, tolerance=d) or
|
||||||
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
self._is_centrally_aligned_with(obj, tolerance=d)))]
|
||||||
|
|
||||||
def _is_lower_aligned_with(self, other, tolerance=0):
|
def _is_lower_aligned_with(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether the lower edge of `other` is within `tolerance`.
|
Whether the lower edge of `other` is within `tolerance`.
|
||||||
"""
|
"""
|
||||||
return abs(other.y0 - self.y0) <= tolerance
|
return abs(other.y0 - self.y0) <= tolerance
|
||||||
|
|
||||||
def _is_upper_aligned_with(self, other, tolerance=0):
|
def _is_upper_aligned_with(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether the upper edge of `other` is within `tolerance`.
|
Whether the upper edge of `other` is within `tolerance`.
|
||||||
"""
|
"""
|
||||||
return abs(other.y1 - self.y1) <= tolerance
|
return abs(other.y1 - self.y1) <= tolerance
|
||||||
|
|
||||||
def _is_centrally_aligned_with(self, other, tolerance=0):
|
def _is_centrally_aligned_with(
|
||||||
|
self,
|
||||||
|
other: LTComponent,
|
||||||
|
tolerance: float = 0
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Whether the vertical center of `other` is within `tolerance`.
|
Whether the vertical center of `other` is within `tolerance`.
|
||||||
"""
|
"""
|
||||||
return abs(
|
return abs(
|
||||||
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
|
(other.y0 + other.y1) / 2 - (self.y0 + self.y1) / 2) <= tolerance
|
||||||
|
|
||||||
def _is_same_width_as(self, other, tolerance):
|
def _is_same_width_as(self, other: LTComponent, tolerance: float) -> bool:
|
||||||
return abs(other.width - self.width) <= tolerance
|
return abs(other.width - self.width) <= tolerance
|
||||||
|
|
||||||
|
|
||||||
class LTTextBox(LTTextContainer):
|
class LTTextBox(LTTextContainer[LTTextLine]):
|
||||||
"""Represents a group of text chunks in a rectangular area.
|
"""Represents a group of text chunks in a rectangular area.
|
||||||
|
|
||||||
Note that this box is created by geometric analysis and does not
|
Note that this box is created by geometric analysis and does not
|
||||||
|
@ -532,72 +631,86 @@ class LTTextBox(LTTextContainer):
|
||||||
of LTTextLine objects.
|
of LTTextLine objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
LTTextContainer.__init__(self)
|
LTTextContainer.__init__(self)
|
||||||
self.index = -1
|
self.index: int = -1
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s(%s) %s %r>' %
|
return ('<%s(%s) %s %r>' %
|
||||||
(self.__class__.__name__,
|
(self.__class__.__name__,
|
||||||
self.index, bbox2str(self.bbox), self.get_text()))
|
self.index, bbox2str(self.bbox), self.get_text()))
|
||||||
|
|
||||||
|
def get_writing_mode(self) -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class LTTextBoxHorizontal(LTTextBox):
|
class LTTextBoxHorizontal(LTTextBox):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
LTTextBox.analyze(self, laparams)
|
super().analyze(laparams)
|
||||||
self._objs.sort(key=lambda obj: -obj.y1)
|
self._objs.sort(key=lambda obj: -obj.y1)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_writing_mode(self):
|
def get_writing_mode(self) -> str:
|
||||||
return 'lr-tb'
|
return 'lr-tb'
|
||||||
|
|
||||||
|
|
||||||
class LTTextBoxVertical(LTTextBox):
|
class LTTextBoxVertical(LTTextBox):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
LTTextBox.analyze(self, laparams)
|
super().analyze(laparams)
|
||||||
self._objs.sort(key=lambda obj: -obj.x1)
|
self._objs.sort(key=lambda obj: -obj.x1)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_writing_mode(self):
|
def get_writing_mode(self) -> str:
|
||||||
return 'tb-rl'
|
return 'tb-rl'
|
||||||
|
|
||||||
|
|
||||||
class LTTextGroup(LTTextContainer):
|
TextGroupElement = Union[LTTextBox, "LTTextGroup"]
|
||||||
def __init__(self, objs):
|
|
||||||
LTTextContainer.__init__(self)
|
|
||||||
|
class LTTextGroup(LTTextContainer[TextGroupElement]):
|
||||||
|
def __init__(self, objs: Iterable[TextGroupElement]) -> None:
|
||||||
|
super().__init__()
|
||||||
self.extend(objs)
|
self.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTTextGroupLRTB(LTTextGroup):
|
class LTTextGroupLRTB(LTTextGroup):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
LTTextGroup.analyze(self, laparams)
|
super().analyze(laparams)
|
||||||
|
assert laparams.boxes_flow is not None
|
||||||
|
boxes_flow = laparams.boxes_flow
|
||||||
# reorder the objects from top-left to bottom-right.
|
# reorder the objects from top-left to bottom-right.
|
||||||
self._objs.sort(
|
self._objs.sort(
|
||||||
key=lambda obj: (1 - laparams.boxes_flow) * obj.x0
|
key=lambda obj: (1 - boxes_flow) * obj.x0
|
||||||
- (1 + laparams.boxes_flow) * (obj.y0 + obj.y1))
|
- (1 + boxes_flow) * (obj.y0 + obj.y1))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTTextGroupTBRL(LTTextGroup):
|
class LTTextGroupTBRL(LTTextGroup):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
LTTextGroup.analyze(self, laparams)
|
super().analyze(laparams)
|
||||||
|
assert laparams.boxes_flow is not None
|
||||||
|
boxes_flow = laparams.boxes_flow
|
||||||
# reorder the objects from top-right to bottom-left.
|
# reorder the objects from top-right to bottom-left.
|
||||||
self._objs.sort(
|
self._objs.sort(
|
||||||
key=lambda obj: - (1 + laparams.boxes_flow) * (obj.x0 + obj.x1)
|
key=lambda obj: - (1 + boxes_flow) * (obj.x0 + obj.x1)
|
||||||
- (1 - laparams.boxes_flow) * obj.y1)
|
- (1 - boxes_flow) * obj.y1)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class LTLayoutContainer(LTContainer):
|
class LTLayoutContainer(LTContainer[LTComponent]):
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox: Rect) -> None:
|
||||||
LTContainer.__init__(self, bbox)
|
LTContainer.__init__(self, bbox)
|
||||||
self.groups = None
|
self.groups: Optional[List[LTTextGroup]] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
# group_objects: group text object to textlines.
|
# group_objects: group text object to textlines.
|
||||||
def group_objects(self, laparams, objs):
|
def group_objects(
|
||||||
|
self,
|
||||||
|
laparams: LAParams,
|
||||||
|
objs: Iterable[LTComponent]
|
||||||
|
) -> Iterator[LTTextLine]:
|
||||||
obj0 = None
|
obj0 = None
|
||||||
line = None
|
line = None
|
||||||
for obj1 in objs:
|
for obj1 in objs:
|
||||||
|
@ -667,15 +780,20 @@ class LTLayoutContainer(LTContainer):
|
||||||
obj0 = obj1
|
obj0 = obj1
|
||||||
if line is None:
|
if line is None:
|
||||||
line = LTTextLineHorizontal(laparams.word_margin)
|
line = LTTextLineHorizontal(laparams.word_margin)
|
||||||
|
assert obj0 is not None
|
||||||
line.add(obj0)
|
line.add(obj0)
|
||||||
yield line
|
yield line
|
||||||
return
|
return
|
||||||
|
|
||||||
def group_textlines(self, laparams, lines):
|
def group_textlines(
|
||||||
|
self,
|
||||||
|
laparams: LAParams,
|
||||||
|
lines: Iterable[LTTextLine]
|
||||||
|
) -> Iterator[LTTextBox]:
|
||||||
"""Group neighboring lines to textboxes"""
|
"""Group neighboring lines to textboxes"""
|
||||||
plane = Plane(self.bbox)
|
plane: Plane[LTTextLine] = Plane(self.bbox)
|
||||||
plane.extend(lines)
|
plane.extend(lines)
|
||||||
boxes = {}
|
boxes: Dict[LTTextLine, LTTextBox] = {}
|
||||||
for line in lines:
|
for line in lines:
|
||||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||||
members = [line]
|
members = [line]
|
||||||
|
@ -684,7 +802,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
if obj1 in boxes:
|
if obj1 in boxes:
|
||||||
members.extend(boxes.pop(obj1))
|
members.extend(boxes.pop(obj1))
|
||||||
if isinstance(line, LTTextLineHorizontal):
|
if isinstance(line, LTTextLineHorizontal):
|
||||||
box = LTTextBoxHorizontal()
|
box: LTTextBox = LTTextBoxHorizontal()
|
||||||
else:
|
else:
|
||||||
box = LTTextBoxVertical()
|
box = LTTextBoxVertical()
|
||||||
for obj in uniq(members):
|
for obj in uniq(members):
|
||||||
|
@ -702,7 +820,11 @@ class LTLayoutContainer(LTContainer):
|
||||||
yield box
|
yield box
|
||||||
return
|
return
|
||||||
|
|
||||||
def group_textboxes(self, laparams, boxes):
|
def group_textboxes(
|
||||||
|
self,
|
||||||
|
laparams: LAParams,
|
||||||
|
boxes: Sequence[LTTextBox]
|
||||||
|
) -> List[LTTextGroup]:
|
||||||
"""Group textboxes hierarchically.
|
"""Group textboxes hierarchically.
|
||||||
|
|
||||||
Get pair-wise distances, via dist func defined below, and then merge
|
Get pair-wise distances, via dist func defined below, and then merge
|
||||||
|
@ -718,10 +840,13 @@ class LTLayoutContainer(LTContainer):
|
||||||
|
|
||||||
:param laparams: LAParams object.
|
:param laparams: LAParams object.
|
||||||
:param boxes: All textbox objects to be grouped.
|
:param boxes: All textbox objects to be grouped.
|
||||||
:return: a list that has only one element, the final top level textbox.
|
:return: a list that has only one element, the final top level group.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def dist(obj1, obj2):
|
ElementT = Union[LTTextBox, LTTextGroup]
|
||||||
|
plane: Plane[ElementT] = Plane(self.bbox)
|
||||||
|
|
||||||
|
def dist(obj1: LTComponent, obj2: LTComponent) -> float:
|
||||||
"""A distance function between two TextBoxes.
|
"""A distance function between two TextBoxes.
|
||||||
|
|
||||||
Consider the bounding rectangle for obj1 and obj2.
|
Consider the bounding rectangle for obj1 and obj2.
|
||||||
|
@ -740,7 +865,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
return (x1 - x0) * (y1 - y0) \
|
return (x1 - x0) * (y1 - y0) \
|
||||||
- obj1.width*obj1.height - obj2.width*obj2.height
|
- obj1.width*obj1.height - obj2.width*obj2.height
|
||||||
|
|
||||||
def isany(obj1, obj2):
|
def isany(obj1: ElementT, obj2: ElementT) -> Set[ElementT]:
|
||||||
"""Check if there's any other object between obj1 and obj2."""
|
"""Check if there's any other object between obj1 and obj2."""
|
||||||
x0 = min(obj1.x0, obj2.x0)
|
x0 = min(obj1.x0, obj2.x0)
|
||||||
y0 = min(obj1.y0, obj2.y0)
|
y0 = min(obj1.y0, obj2.y0)
|
||||||
|
@ -749,16 +874,15 @@ class LTLayoutContainer(LTContainer):
|
||||||
objs = set(plane.find((x0, y0, x1, y1)))
|
objs = set(plane.find((x0, y0, x1, y1)))
|
||||||
return objs.difference((obj1, obj2))
|
return objs.difference((obj1, obj2))
|
||||||
|
|
||||||
dists = []
|
dists: List[Tuple[bool, float, int, int, ElementT, ElementT]] = []
|
||||||
for i in range(len(boxes)):
|
for i in range(len(boxes)):
|
||||||
obj1 = boxes[i]
|
box1 = boxes[i]
|
||||||
for j in range(i+1, len(boxes)):
|
for j in range(i+1, len(boxes)):
|
||||||
obj2 = boxes[j]
|
box2 = boxes[j]
|
||||||
dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
|
dists.append((False, dist(box1, box2), id(box1), id(box2),
|
||||||
obj1, obj2))
|
box1, box2))
|
||||||
heapq.heapify(dists)
|
heapq.heapify(dists)
|
||||||
|
|
||||||
plane = Plane(self.bbox)
|
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
done = set()
|
done = set()
|
||||||
while len(dists) > 0:
|
while len(dists) > 0:
|
||||||
|
@ -770,7 +894,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
continue
|
continue
|
||||||
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
|
||||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):
|
||||||
group = LTTextGroupTBRL([obj1, obj2])
|
group: LTTextGroup = LTTextGroupTBRL([obj1, obj2])
|
||||||
else:
|
else:
|
||||||
group = LTTextGroupLRTB([obj1, obj2])
|
group = LTTextGroupLRTB([obj1, obj2])
|
||||||
plane.remove(obj1)
|
plane.remove(obj1)
|
||||||
|
@ -781,9 +905,10 @@ class LTLayoutContainer(LTContainer):
|
||||||
heapq.heappush(dists, (False, dist(group, other),
|
heapq.heappush(dists, (False, dist(group, other),
|
||||||
id(group), id(other), group, other))
|
id(group), id(other), group, other))
|
||||||
plane.add(group)
|
plane.add(group)
|
||||||
return list(plane)
|
# By now only groups are in the plane
|
||||||
|
return list(cast(LTTextGroup, g) for g in plane)
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
# textobjs is a list of LTChar objects, i.e.
|
# textobjs is a list of LTChar objects, i.e.
|
||||||
# it has all the individual characters in the page.
|
# it has all the individual characters in the page.
|
||||||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
|
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
|
||||||
|
@ -801,7 +926,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
for textbox in textboxes:
|
for textbox in textboxes:
|
||||||
textbox.analyze(laparams)
|
textbox.analyze(laparams)
|
||||||
|
|
||||||
def getkey(box):
|
def getkey(box: LTTextBox) -> Tuple[int, float, float]:
|
||||||
if isinstance(box, LTTextBoxVertical):
|
if isinstance(box, LTTextBoxVertical):
|
||||||
return (0, -box.x1, -box.y0)
|
return (0, -box.x1, -box.y0)
|
||||||
else:
|
else:
|
||||||
|
@ -814,7 +939,8 @@ class LTLayoutContainer(LTContainer):
|
||||||
group.analyze(laparams)
|
group.analyze(laparams)
|
||||||
assigner.run(group)
|
assigner.run(group)
|
||||||
textboxes.sort(key=lambda box: box.index)
|
textboxes.sort(key=lambda box: box.index)
|
||||||
self._objs = textboxes + otherobjs + empties
|
self._objs = (cast(List[LTComponent], textboxes) + otherobjs
|
||||||
|
+ cast(List[LTComponent], empties))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -826,7 +952,7 @@ class LTFigure(LTLayoutContainer):
|
||||||
recursively.
|
recursively.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, name, bbox, matrix):
|
def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
(x, y, w, h) = bbox
|
(x, y, w, h) = bbox
|
||||||
|
@ -835,12 +961,12 @@ class LTFigure(LTLayoutContainer):
|
||||||
LTLayoutContainer.__init__(self, bbox)
|
LTLayoutContainer.__init__(self, bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s(%s) %s matrix=%s>' %
|
return ('<%s(%s) %s matrix=%s>' %
|
||||||
(self.__class__.__name__, self.name,
|
(self.__class__.__name__, self.name,
|
||||||
bbox2str(self.bbox), matrix2str(self.matrix)))
|
bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams: LAParams) -> None:
|
||||||
if not laparams.all_texts:
|
if not laparams.all_texts:
|
||||||
return
|
return
|
||||||
LTLayoutContainer.analyze(self, laparams)
|
LTLayoutContainer.analyze(self, laparams)
|
||||||
|
@ -854,13 +980,13 @@ class LTPage(LTLayoutContainer):
|
||||||
LTCurve and LTLine.
|
LTCurve and LTLine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, pageid, bbox, rotate=0):
|
def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None:
|
||||||
LTLayoutContainer.__init__(self, bbox)
|
LTLayoutContainer.__init__(self, bbox)
|
||||||
self.pageid = pageid
|
self.pageid = pageid
|
||||||
self.rotate = rotate
|
self.rotate = rotate
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<%s(%r) %s rotate=%r>' %
|
return ('<%s(%r) %s rotate=%r>' %
|
||||||
(self.__class__.__name__, self.pageid,
|
(self.__class__.__name__, self.pageid,
|
||||||
bbox2str(self.bbox), self.rotate))
|
bbox2str(self.bbox), self.rotate))
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import logging
|
import logging
|
||||||
|
from typing import BinaryIO, Iterator, List, Optional, cast
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -11,16 +12,17 @@ class CorruptDataError(Exception):
|
||||||
|
|
||||||
class LZWDecoder:
|
class LZWDecoder:
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp: BinaryIO) -> None:
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.buff = 0
|
self.buff = 0
|
||||||
self.bpos = 8
|
self.bpos = 8
|
||||||
self.nbits = 9
|
self.nbits = 9
|
||||||
self.table = None
|
# NB: self.table stores None only in indices 256 and 257
|
||||||
self.prevbuf = None
|
self.table: Optional[List[Optional[bytes]]] = None
|
||||||
|
self.prevbuf: Optional[bytes] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def readbits(self, bits):
|
def readbits(self, bits: int) -> int:
|
||||||
v = 0
|
v = 0
|
||||||
while 1:
|
while 1:
|
||||||
# the number of remaining bits we can get from the current buffer.
|
# the number of remaining bits we can get from the current buffer.
|
||||||
|
@ -45,7 +47,7 @@ class LZWDecoder:
|
||||||
self.bpos = 0
|
self.bpos = 0
|
||||||
return v
|
return v
|
||||||
|
|
||||||
def feed(self, code):
|
def feed(self, code: int) -> bytes:
|
||||||
x = b''
|
x = b''
|
||||||
if code == 256:
|
if code == 256:
|
||||||
self.table = [bytes((c,)) for c in range(256)] # 0-255
|
self.table = [bytes((c,)) for c in range(256)] # 0-255
|
||||||
|
@ -56,14 +58,16 @@ class LZWDecoder:
|
||||||
elif code == 257:
|
elif code == 257:
|
||||||
pass
|
pass
|
||||||
elif not self.prevbuf:
|
elif not self.prevbuf:
|
||||||
x = self.prevbuf = self.table[code]
|
assert self.table is not None
|
||||||
|
x = self.prevbuf = cast(bytes, self.table[code]) # assume not None
|
||||||
else:
|
else:
|
||||||
|
assert self.table is not None
|
||||||
if code < len(self.table):
|
if code < len(self.table):
|
||||||
x = self.table[code]
|
x = cast(bytes, self.table[code]) # assume not None
|
||||||
self.table.append(self.prevbuf+x[:1])
|
self.table.append(self.prevbuf+x[:1])
|
||||||
elif code == len(self.table):
|
elif code == len(self.table):
|
||||||
self.table.append(self.prevbuf+self.prevbuf[:1])
|
self.table.append(self.prevbuf+self.prevbuf[:1])
|
||||||
x = self.table[code]
|
x = cast(bytes, self.table[code])
|
||||||
else:
|
else:
|
||||||
raise CorruptDataError
|
raise CorruptDataError
|
||||||
table_length = len(self.table)
|
table_length = len(self.table)
|
||||||
|
@ -76,7 +80,7 @@ class LZWDecoder:
|
||||||
self.prevbuf = x
|
self.prevbuf = x
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def run(self):
|
def run(self) -> Iterator[bytes]:
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
code = self.readbits(self.nbits)
|
code = self.readbits(self.nbits)
|
||||||
|
@ -88,12 +92,13 @@ class LZWDecoder:
|
||||||
# just ignore corrupt data and stop yielding there
|
# just ignore corrupt data and stop yielding there
|
||||||
break
|
break
|
||||||
yield x
|
yield x
|
||||||
|
assert self.table is not None
|
||||||
logger.debug('nbits=%d, code=%d, output=%r, table=%r'
|
logger.debug('nbits=%d, code=%d, output=%r, table=%r'
|
||||||
% (self.nbits, code, x, self.table[258:]))
|
% (self.nbits, code, x, self.table[258:]))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def lzwdecode(data):
|
def lzwdecode(data: bytes) -> bytes:
|
||||||
fp = BytesIO(data)
|
fp = BytesIO(data)
|
||||||
s = LZWDecoder(fp).run()
|
s = LZWDecoder(fp).run()
|
||||||
return b''.join(s)
|
return b''.join(s)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import collections
|
import collections
|
||||||
|
from typing import Dict
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,17 +10,17 @@ LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||||
|
|
||||||
class PDFColorSpace:
|
class PDFColorSpace:
|
||||||
|
|
||||||
def __init__(self, name, ncomponents):
|
def __init__(self, name: str, ncomponents: int) -> None:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.ncomponents = ncomponents
|
self.ncomponents = ncomponents
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFColorSpace: %s, ncomponents=%d>' % \
|
return '<PDFColorSpace: %s, ncomponents=%d>' % \
|
||||||
(self.name, self.ncomponents)
|
(self.name, self.ncomponents)
|
||||||
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = collections.OrderedDict()
|
PREDEFINED_COLORSPACE: Dict[str, PDFColorSpace] = collections.OrderedDict()
|
||||||
|
|
||||||
for (name, n) in [
|
for (name, n) in [
|
||||||
('DeviceGray', 1), # default value first
|
('DeviceGray', 1), # default value first
|
||||||
|
|
|
@ -1,66 +1,116 @@
|
||||||
|
from pdfminer.psparser import PSLiteral
|
||||||
|
from typing import (BinaryIO, Iterable, List, Optional, Sequence,
|
||||||
|
TYPE_CHECKING, Union, cast)
|
||||||
from . import utils
|
from . import utils
|
||||||
|
from .utils import Matrix, Point, Rect, PathSegment
|
||||||
|
from .pdfcolor import PDFColorSpace
|
||||||
|
from .pdffont import PDFFont
|
||||||
from .pdffont import PDFUnicodeNotDefined
|
from .pdffont import PDFUnicodeNotDefined
|
||||||
|
from .pdfpage import PDFPage
|
||||||
|
from .pdftypes import PDFStream
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .pdfinterp import PDFGraphicState
|
||||||
|
from .pdfinterp import PDFResourceManager
|
||||||
|
from .pdfinterp import PDFTextState
|
||||||
|
from .pdfinterp import PDFStackT
|
||||||
|
|
||||||
|
|
||||||
|
PDFTextSeq = Iterable[Union[int, float, bytes]]
|
||||||
|
|
||||||
|
|
||||||
class PDFDevice:
|
class PDFDevice:
|
||||||
"""Translate the output of PDFPageInterpreter to the output that is needed
|
"""Translate the output of PDFPageInterpreter to the output that is needed
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rsrcmgr):
|
def __init__(self, rsrcmgr: "PDFResourceManager") -> None:
|
||||||
self.rsrcmgr = rsrcmgr
|
self.rsrcmgr = rsrcmgr
|
||||||
self.ctm = None
|
self.ctm: Optional[Matrix] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFDevice>'
|
return '<PDFDevice>'
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> "PDFDevice":
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: object,
|
||||||
|
exc_val: object,
|
||||||
|
exc_tb: object
|
||||||
|
) -> None:
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_ctm(self, ctm):
|
def set_ctm(self, ctm: Matrix) -> None:
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(
|
||||||
|
self,
|
||||||
|
tag: PSLiteral,
|
||||||
|
props: Optional["PDFStackT"] = None
|
||||||
|
) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_tag(self):
|
def end_tag(self) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
def do_tag(
|
||||||
|
self,
|
||||||
|
tag: PSLiteral,
|
||||||
|
props: Optional["PDFStackT"] = None
|
||||||
|
) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page: PDFPage) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_figure(self, name):
|
def end_figure(self, name: str) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
def paint_path(
|
||||||
|
self,
|
||||||
|
graphicstate: "PDFGraphicState",
|
||||||
|
stroke: bool,
|
||||||
|
fill: bool,
|
||||||
|
evenodd: bool,
|
||||||
|
path: Sequence[PathSegment]
|
||||||
|
) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name: str, stream: PDFStream) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, seq, ncs, graphicstate):
|
def render_string(
|
||||||
|
self,
|
||||||
|
textstate: "PDFTextState",
|
||||||
|
seq: PDFTextSeq,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: "PDFGraphicState"
|
||||||
|
) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
class PDFTextDevice(PDFDevice):
|
class PDFTextDevice(PDFDevice):
|
||||||
|
|
||||||
def render_string(self, textstate, seq, ncs, graphicstate):
|
def render_string(
|
||||||
|
self,
|
||||||
|
textstate: "PDFTextState",
|
||||||
|
seq: PDFTextSeq,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: "PDFGraphicState"
|
||||||
|
) -> None:
|
||||||
|
assert self.ctm is not None
|
||||||
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
|
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
fontsize = textstate.fontsize
|
fontsize = textstate.fontsize
|
||||||
|
@ -68,6 +118,7 @@ class PDFTextDevice(PDFDevice):
|
||||||
charspace = textstate.charspace * scaling
|
charspace = textstate.charspace * scaling
|
||||||
wordspace = textstate.wordspace * scaling
|
wordspace = textstate.wordspace * scaling
|
||||||
rise = textstate.rise
|
rise = textstate.rise
|
||||||
|
assert font is not None
|
||||||
if font.is_multibyte():
|
if font.is_multibyte():
|
||||||
wordspace = 0
|
wordspace = 0
|
||||||
dxscale = .001 * fontsize * scaling
|
dxscale = .001 * fontsize * scaling
|
||||||
|
@ -83,13 +134,25 @@ class PDFTextDevice(PDFDevice):
|
||||||
graphicstate)
|
graphicstate)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string_horizontal(self, seq, matrix, pos,
|
def render_string_horizontal(
|
||||||
font, fontsize, scaling, charspace, wordspace,
|
self,
|
||||||
rise, dxscale, ncs, graphicstate):
|
seq: PDFTextSeq,
|
||||||
|
matrix: Matrix,
|
||||||
|
pos: Point,
|
||||||
|
font: PDFFont,
|
||||||
|
fontsize: float,
|
||||||
|
scaling: float,
|
||||||
|
charspace: float,
|
||||||
|
wordspace: float,
|
||||||
|
rise: float,
|
||||||
|
dxscale: float,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: "PDFGraphicState"
|
||||||
|
) -> Point:
|
||||||
(x, y) = pos
|
(x, y) = pos
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if utils.isnumber(obj):
|
if isinstance(obj, (int, float)):
|
||||||
x -= obj*dxscale
|
x -= obj*dxscale
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
else:
|
else:
|
||||||
|
@ -104,13 +167,25 @@ class PDFTextDevice(PDFDevice):
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_string_vertical(self, seq, matrix, pos,
|
def render_string_vertical(
|
||||||
font, fontsize, scaling, charspace, wordspace,
|
self,
|
||||||
rise, dxscale, ncs, graphicstate):
|
seq: PDFTextSeq,
|
||||||
|
matrix: Matrix,
|
||||||
|
pos: Point,
|
||||||
|
font: PDFFont,
|
||||||
|
fontsize: float,
|
||||||
|
scaling: float,
|
||||||
|
charspace: float,
|
||||||
|
wordspace: float,
|
||||||
|
rise: float,
|
||||||
|
dxscale: float,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: "PDFGraphicState"
|
||||||
|
) -> Point:
|
||||||
(x, y) = pos
|
(x, y) = pos
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if utils.isnumber(obj):
|
if isinstance(obj, (int, float)):
|
||||||
y -= obj*dxscale
|
y -= obj*dxscale
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
else:
|
else:
|
||||||
|
@ -125,23 +200,44 @@ class PDFTextDevice(PDFDevice):
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
return (x, y)
|
return (x, y)
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
def render_char(
|
||||||
graphicstate):
|
self,
|
||||||
|
matrix: Matrix,
|
||||||
|
font: PDFFont,
|
||||||
|
fontsize: float,
|
||||||
|
scaling: float,
|
||||||
|
rise: float,
|
||||||
|
cid: int,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: "PDFGraphicState"
|
||||||
|
) -> float:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
class TagExtractor(PDFDevice):
|
class TagExtractor(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8'):
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: "PDFResourceManager",
|
||||||
|
outfp: BinaryIO,
|
||||||
|
codec: str = 'utf-8'
|
||||||
|
) -> None:
|
||||||
PDFDevice.__init__(self, rsrcmgr)
|
PDFDevice.__init__(self, rsrcmgr)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.pageno = 0
|
self.pageno = 0
|
||||||
self._stack = []
|
self._stack: List[PSLiteral] = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, seq, ncs, graphicstate):
|
def render_string(
|
||||||
|
self,
|
||||||
|
textstate: "PDFTextState",
|
||||||
|
seq: PDFTextSeq,
|
||||||
|
ncs: PDFColorSpace,
|
||||||
|
graphicstate: "PDFGraphicState"
|
||||||
|
) -> None:
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
|
assert font is not None
|
||||||
text = ''
|
text = ''
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if isinstance(obj, str):
|
if isinstance(obj, str):
|
||||||
|
@ -158,40 +254,42 @@ class TagExtractor(PDFDevice):
|
||||||
self._write(utils.enc(text))
|
self._write(utils.enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
|
||||||
output = '<page id="%s" bbox="%s" rotate="%d">' %\
|
output = '<page id="%s" bbox="%s" rotate="%d">' %\
|
||||||
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
||||||
self._write(output)
|
self._write(output)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page: PDFPage) -> None:
|
||||||
self._write('</page>\n')
|
self._write('</page>\n')
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None
|
||||||
|
) -> None:
|
||||||
s = ''
|
s = ''
|
||||||
if isinstance(props, dict):
|
if isinstance(props, dict):
|
||||||
s = ''.join([
|
s = ''.join([
|
||||||
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
|
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
|
||||||
for (k, v) in sorted(props.items())
|
for (k, v) in sorted(props.items())
|
||||||
])
|
])
|
||||||
out_s = '<{}{}>'.format(utils.enc(tag.name), s)
|
out_s = '<{}{}>'.format(utils.enc(cast(str, tag.name)), s)
|
||||||
self._write(out_s)
|
self._write(out_s)
|
||||||
self._stack.append(tag)
|
self._stack.append(tag)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_tag(self):
|
def end_tag(self) -> None:
|
||||||
assert self._stack, str(self.pageno)
|
assert self._stack, str(self.pageno)
|
||||||
tag = self._stack.pop(-1)
|
tag = self._stack.pop(-1)
|
||||||
out_s = '</%s>' % utils.enc(tag.name)
|
out_s = '</%s>' % utils.enc(cast(str, tag.name))
|
||||||
self._write(out_s)
|
self._write(out_s)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None
|
||||||
|
) -> None:
|
||||||
self.begin_tag(tag, props)
|
self.begin_tag(tag, props)
|
||||||
self._stack.pop(-1)
|
self._stack.pop(-1)
|
||||||
return
|
return
|
||||||
|
|
||||||
def _write(self, s: str):
|
def _write(self, s: str) -> None:
|
||||||
self.outfp.write(s.encode(self.codec))
|
self.outfp.write(s.encode(self.codec))
|
||||||
|
|
|
@ -2,16 +2,18 @@ import logging
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
from hashlib import sha256, md5, sha384, sha512
|
from hashlib import sha256, md5, sha384, sha512
|
||||||
|
from typing import (Any, Callable, Dict, Iterable, Iterator, KeysView, List,
|
||||||
|
Optional, Sequence, Tuple, Type, Union, cast)
|
||||||
|
|
||||||
from cryptography.hazmat.backends import default_backend
|
from cryptography.hazmat.backends import default_backend
|
||||||
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||||
|
|
||||||
from . import settings
|
from . import settings
|
||||||
from .arcfour import Arcfour
|
from .arcfour import Arcfour
|
||||||
from .pdfparser import PDFSyntaxError, PDFStreamParser
|
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
|
||||||
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
|
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream,\
|
||||||
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
||||||
dict_value, stream_value
|
uint_value, dict_value, stream_value
|
||||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||||
from .utils import choplist, nunpack, decode_text
|
from .utils import choplist, nunpack, decode_text
|
||||||
|
|
||||||
|
@ -51,7 +53,7 @@ class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||||
|
|
||||||
|
|
||||||
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
|
class PDFTextExtractionNotAllowedError(PDFTextExtractionNotAllowed):
|
||||||
def __init__(self, *args):
|
def __init__(self, *args: object) -> None:
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
warn('PDFTextExtractionNotAllowedError will be removed in the future. '
|
warn('PDFTextExtractionNotAllowedError will be removed in the future. '
|
||||||
'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning)
|
'Use PDFTextExtractionNotAllowed instead.', DeprecationWarning)
|
||||||
|
@ -65,31 +67,33 @@ LITERAL_CATALOG = LIT('Catalog')
|
||||||
|
|
||||||
|
|
||||||
class PDFBaseXRef:
|
class PDFBaseXRef:
|
||||||
|
def get_trailer(self) -> Dict[str, Any]:
|
||||||
def get_trailer(self):
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def get_objids(self):
|
def get_objids(self) -> Iterable[int]:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Must return
|
# Must return
|
||||||
# (strmid, index, genno)
|
# (strmid, index, genno)
|
||||||
# or (None, pos, genno)
|
# or (None, pos, genno)
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
|
||||||
raise KeyError(objid)
|
raise KeyError(objid)
|
||||||
|
|
||||||
|
def load(self, parser: PDFParser) -> None:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class PDFXRef(PDFBaseXRef):
|
class PDFXRef(PDFBaseXRef):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self.offsets = {}
|
self.offsets: Dict[int, Tuple[Optional[int], int, int]] = {}
|
||||||
self.trailer = {}
|
self.trailer: Dict[str, Any] = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
|
return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser: PDFParser) -> None:
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
|
@ -123,15 +127,15 @@ class PDFXRef(PDFBaseXRef):
|
||||||
error_msg = 'Invalid XRef format: {!r}, line={!r}'\
|
error_msg = 'Invalid XRef format: {!r}, line={!r}'\
|
||||||
.format(parser, line)
|
.format(parser, line)
|
||||||
raise PDFNoValidXRef(error_msg)
|
raise PDFNoValidXRef(error_msg)
|
||||||
(pos, genno, use) = f
|
(pos_b, genno_b, use_b) = f
|
||||||
if use != b'n':
|
if use_b != b'n':
|
||||||
continue
|
continue
|
||||||
self.offsets[objid] = (None, int(pos), int(genno))
|
self.offsets[objid] = (None, int(pos_b), int(genno_b))
|
||||||
log.info('xref objects: %r', self.offsets)
|
log.info('xref objects: %r', self.offsets)
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
return
|
return
|
||||||
|
|
||||||
def load_trailer(self, parser):
|
def load_trailer(self, parser: PDFParser) -> None:
|
||||||
try:
|
try:
|
||||||
(_, kwd) = parser.nexttoken()
|
(_, kwd) = parser.nexttoken()
|
||||||
assert kwd is KWD(b'trailer'), str(kwd)
|
assert kwd is KWD(b'trailer'), str(kwd)
|
||||||
|
@ -145,13 +149,13 @@ class PDFXRef(PDFBaseXRef):
|
||||||
log.debug('trailer=%r', self.trailer)
|
log.debug('trailer=%r', self.trailer)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_trailer(self):
|
def get_trailer(self) -> Dict[str, Any]:
|
||||||
return self.trailer
|
return self.trailer
|
||||||
|
|
||||||
def get_objids(self):
|
def get_objids(self) -> KeysView[int]:
|
||||||
return self.offsets.keys()
|
return self.offsets.keys()
|
||||||
|
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
|
||||||
try:
|
try:
|
||||||
return self.offsets[objid]
|
return self.offsets[objid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -160,30 +164,30 @@ class PDFXRef(PDFBaseXRef):
|
||||||
|
|
||||||
class PDFXRefFallback(PDFXRef):
|
class PDFXRefFallback(PDFXRef):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
||||||
|
|
||||||
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser: PDFParser) -> None:
|
||||||
parser.seek(0)
|
parser.seek(0)
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line_bytes) = parser.nextline()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
break
|
||||||
if line.startswith(b'trailer'):
|
if line_bytes.startswith(b'trailer'):
|
||||||
parser.seek(pos)
|
parser.seek(pos)
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
log.info('trailer: %r', self.trailer)
|
log.info('trailer: %r', self.trailer)
|
||||||
break
|
break
|
||||||
line = line.decode('latin-1') # default pdf encoding
|
line = line_bytes.decode('latin-1') # default pdf encoding
|
||||||
m = self.PDFOBJ_CUE.match(line)
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
(objid, genno) = m.groups()
|
(objid_s, genno_s) = m.groups()
|
||||||
objid = int(objid)
|
objid = int(objid_s)
|
||||||
genno = int(genno)
|
genno = int(genno_s)
|
||||||
self.offsets[objid] = (None, pos, genno)
|
self.offsets[objid] = (None, pos, genno)
|
||||||
# expand ObjStm.
|
# expand ObjStm.
|
||||||
parser.seek(pos)
|
parser.seek(pos)
|
||||||
|
@ -198,11 +202,11 @@ class PDFXRefFallback(PDFXRef):
|
||||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||||
n = 0
|
n = 0
|
||||||
parser1 = PDFStreamParser(stream.get_data())
|
parser1 = PDFStreamParser(stream.get_data())
|
||||||
objs = []
|
objs: List[int] = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
(_, obj) = parser1.nextobject()
|
(_, obj) = parser1.nextobject()
|
||||||
objs.append(obj)
|
objs.append(cast(int, obj))
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
pass
|
pass
|
||||||
n = min(n, len(objs)//2)
|
n = min(n, len(objs)//2)
|
||||||
|
@ -214,17 +218,19 @@ class PDFXRefFallback(PDFXRef):
|
||||||
|
|
||||||
class PDFXRefStream(PDFBaseXRef):
|
class PDFXRefStream(PDFBaseXRef):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self.data = None
|
self.data: Optional[bytes] = None
|
||||||
self.entlen = None
|
self.entlen: Optional[int] = None
|
||||||
self.fl1 = self.fl2 = self.fl3 = None
|
self.fl1: Optional[int] = None
|
||||||
self.ranges = []
|
self.fl2: Optional[int] = None
|
||||||
|
self.fl3: Optional[int] = None
|
||||||
|
self.ranges: List[Tuple[int, int]] = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser: PDFParser) -> None:
|
||||||
(_, objid) = parser.nexttoken() # ignored
|
(_, objid) = parser.nexttoken() # ignored
|
||||||
(_, genno) = parser.nexttoken() # ignored
|
(_, genno) = parser.nexttoken() # ignored
|
||||||
(_, kwd) = parser.nexttoken()
|
(_, kwd) = parser.nexttoken()
|
||||||
|
@ -236,8 +242,11 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
index_array = stream.get('Index', (0, size))
|
index_array = stream.get('Index', (0, size))
|
||||||
if len(index_array) % 2 != 0:
|
if len(index_array) % 2 != 0:
|
||||||
raise PDFSyntaxError('Invalid index number')
|
raise PDFSyntaxError('Invalid index number')
|
||||||
self.ranges.extend(choplist(2, index_array))
|
self.ranges.extend(cast(Iterator[Tuple[int, int]],
|
||||||
|
choplist(2, index_array)))
|
||||||
(self.fl1, self.fl2, self.fl3) = stream['W']
|
(self.fl1, self.fl2, self.fl3) = stream['W']
|
||||||
|
assert (self.fl1 is not None and self.fl2 is not None
|
||||||
|
and self.fl3 is not None)
|
||||||
self.data = stream.get_data()
|
self.data = stream.get_data()
|
||||||
self.entlen = self.fl1+self.fl2+self.fl3
|
self.entlen = self.fl1+self.fl2+self.fl3
|
||||||
self.trailer = stream.attrs
|
self.trailer = stream.attrs
|
||||||
|
@ -246,12 +255,14 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
self.fl1, self.fl2, self.fl3)
|
self.fl1, self.fl2, self.fl3)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_trailer(self):
|
def get_trailer(self) -> Dict[str, Any]:
|
||||||
return self.trailer
|
return self.trailer
|
||||||
|
|
||||||
def get_objids(self):
|
def get_objids(self) -> Iterator[int]:
|
||||||
for (start, nobjs) in self.ranges:
|
for (start, nobjs) in self.ranges:
|
||||||
for i in range(nobjs):
|
for i in range(nobjs):
|
||||||
|
assert self.entlen is not None
|
||||||
|
assert self.data is not None
|
||||||
offset = self.entlen * i
|
offset = self.entlen * i
|
||||||
ent = self.data[offset:offset+self.entlen]
|
ent = self.data[offset:offset+self.entlen]
|
||||||
f1 = nunpack(ent[:self.fl1], 1)
|
f1 = nunpack(ent[:self.fl1], 1)
|
||||||
|
@ -259,7 +270,7 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
yield start+i
|
yield start+i
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid: int) -> Tuple[Optional[int], int, int]:
|
||||||
index = 0
|
index = 0
|
||||||
for (start, nobjs) in self.ranges:
|
for (start, nobjs) in self.ranges:
|
||||||
if start <= objid and objid < start+nobjs:
|
if start <= objid and objid < start+nobjs:
|
||||||
|
@ -269,6 +280,10 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
index += nobjs
|
index += nobjs
|
||||||
else:
|
else:
|
||||||
raise KeyError(objid)
|
raise KeyError(objid)
|
||||||
|
assert self.entlen is not None
|
||||||
|
assert self.data is not None
|
||||||
|
assert (self.fl1 is not None and self.fl2 is not None
|
||||||
|
and self.fl3 is not None)
|
||||||
offset = self.entlen * index
|
offset = self.entlen * index
|
||||||
ent = self.data[offset:offset+self.entlen]
|
ent = self.data[offset:offset+self.entlen]
|
||||||
f1 = nunpack(ent[:self.fl1], 1)
|
f1 = nunpack(ent[:self.fl1], 1)
|
||||||
|
@ -287,16 +302,21 @@ class PDFStandardSecurityHandler:
|
||||||
|
|
||||||
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
|
PASSWORD_PADDING = (b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08'
|
||||||
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
|
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
|
||||||
supported_revisions = (2, 3)
|
supported_revisions: Tuple[int, ...] = (2, 3)
|
||||||
|
|
||||||
def __init__(self, docid, param, password=''):
|
def __init__(
|
||||||
|
self,
|
||||||
|
docid: Sequence[bytes],
|
||||||
|
param: Dict[str, Any],
|
||||||
|
password: str = ''
|
||||||
|
) -> None:
|
||||||
self.docid = docid
|
self.docid = docid
|
||||||
self.param = param
|
self.param = param
|
||||||
self.password = password
|
self.password = password
|
||||||
self.init()
|
self.init()
|
||||||
return
|
return
|
||||||
|
|
||||||
def init(self):
|
def init(self) -> None:
|
||||||
self.init_params()
|
self.init_params()
|
||||||
if self.r not in self.supported_revisions:
|
if self.r not in self.supported_revisions:
|
||||||
error_msg = 'Unsupported revision: param=%r' % self.param
|
error_msg = 'Unsupported revision: param=%r' % self.param
|
||||||
|
@ -304,7 +324,7 @@ class PDFStandardSecurityHandler:
|
||||||
self.init_key()
|
self.init_key()
|
||||||
return
|
return
|
||||||
|
|
||||||
def init_params(self):
|
def init_params(self) -> None:
|
||||||
self.v = int_value(self.param.get('V', 0))
|
self.v = int_value(self.param.get('V', 0))
|
||||||
self.r = int_value(self.param['R'])
|
self.r = int_value(self.param['R'])
|
||||||
self.p = uint_value(self.param['P'], 32)
|
self.p = uint_value(self.param['P'], 32)
|
||||||
|
@ -313,22 +333,22 @@ class PDFStandardSecurityHandler:
|
||||||
self.length = int_value(self.param.get('Length', 40))
|
self.length = int_value(self.param.get('Length', 40))
|
||||||
return
|
return
|
||||||
|
|
||||||
def init_key(self):
|
def init_key(self) -> None:
|
||||||
self.key = self.authenticate(self.password)
|
self.key = self.authenticate(self.password)
|
||||||
if self.key is None:
|
if self.key is None:
|
||||||
raise PDFPasswordIncorrect
|
raise PDFPasswordIncorrect
|
||||||
return
|
return
|
||||||
|
|
||||||
def is_printable(self):
|
def is_printable(self) -> bool:
|
||||||
return bool(self.p & 4)
|
return bool(self.p & 4)
|
||||||
|
|
||||||
def is_modifiable(self):
|
def is_modifiable(self) -> bool:
|
||||||
return bool(self.p & 8)
|
return bool(self.p & 8)
|
||||||
|
|
||||||
def is_extractable(self):
|
def is_extractable(self) -> bool:
|
||||||
return bool(self.p & 16)
|
return bool(self.p & 16)
|
||||||
|
|
||||||
def compute_u(self, key):
|
def compute_u(self, key: bytes) -> bytes:
|
||||||
if self.r == 2:
|
if self.r == 2:
|
||||||
# Algorithm 3.4
|
# Algorithm 3.4
|
||||||
return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
|
return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
|
||||||
|
@ -343,7 +363,7 @@ class PDFStandardSecurityHandler:
|
||||||
result += result # 6
|
result += result # 6
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def compute_encryption_key(self, password):
|
def compute_encryption_key(self, password: bytes) -> bytes:
|
||||||
# Algorithm 3.2
|
# Algorithm 3.2
|
||||||
password = (password + self.PASSWORD_PADDING)[:32] # 1
|
password = (password + self.PASSWORD_PADDING)[:32] # 1
|
||||||
hash = md5(password) # 2
|
hash = md5(password) # 2
|
||||||
|
@ -352,7 +372,7 @@ class PDFStandardSecurityHandler:
|
||||||
hash.update(struct.pack('<L', self.p)) # 4
|
hash.update(struct.pack('<L', self.p)) # 4
|
||||||
hash.update(self.docid[0]) # 5
|
hash.update(self.docid[0]) # 5
|
||||||
if self.r >= 4:
|
if self.r >= 4:
|
||||||
if not self.encrypt_metadata:
|
if not cast(PDFStandardSecurityHandlerV4, self).encrypt_metadata:
|
||||||
hash.update(b'\xff\xff\xff\xff')
|
hash.update(b'\xff\xff\xff\xff')
|
||||||
result = hash.digest()
|
result = hash.digest()
|
||||||
n = 5
|
n = 5
|
||||||
|
@ -362,28 +382,28 @@ class PDFStandardSecurityHandler:
|
||||||
result = md5(result[:n]).digest()
|
result = md5(result[:n]).digest()
|
||||||
return result[:n]
|
return result[:n]
|
||||||
|
|
||||||
def authenticate(self, password):
|
def authenticate(self, password: str) -> Optional[bytes]:
|
||||||
password = password.encode("latin1")
|
password_bytes = password.encode("latin1")
|
||||||
key = self.authenticate_user_password(password)
|
key = self.authenticate_user_password(password_bytes)
|
||||||
if key is None:
|
if key is None:
|
||||||
key = self.authenticate_owner_password(password)
|
key = self.authenticate_owner_password(password_bytes)
|
||||||
return key
|
return key
|
||||||
|
|
||||||
def authenticate_user_password(self, password):
|
def authenticate_user_password(self, password: bytes) -> Optional[bytes]:
|
||||||
key = self.compute_encryption_key(password)
|
key = self.compute_encryption_key(password)
|
||||||
if self.verify_encryption_key(key):
|
if self.verify_encryption_key(key):
|
||||||
return key
|
return key
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def verify_encryption_key(self, key):
|
def verify_encryption_key(self, key: bytes) -> bool:
|
||||||
# Algorithm 3.6
|
# Algorithm 3.6
|
||||||
u = self.compute_u(key)
|
u = self.compute_u(key)
|
||||||
if self.r == 2:
|
if self.r == 2:
|
||||||
return u == self.u
|
return u == self.u
|
||||||
return u[:16] == self.u[:16]
|
return u[:16] == self.u[:16]
|
||||||
|
|
||||||
def authenticate_owner_password(self, password):
|
def authenticate_owner_password(self, password: bytes) -> Optional[bytes]:
|
||||||
# Algorithm 3.7
|
# Algorithm 3.7
|
||||||
password = (password + self.PASSWORD_PADDING)[:32]
|
password = (password + self.PASSWORD_PADDING)[:32]
|
||||||
hash = md5(password)
|
hash = md5(password)
|
||||||
|
@ -403,10 +423,17 @@ class PDFStandardSecurityHandler:
|
||||||
user_password = Arcfour(k).decrypt(user_password)
|
user_password = Arcfour(k).decrypt(user_password)
|
||||||
return self.authenticate_user_password(user_password)
|
return self.authenticate_user_password(user_password)
|
||||||
|
|
||||||
def decrypt(self, objid, genno, data, attrs=None):
|
def decrypt(
|
||||||
|
self,
|
||||||
|
objid: int,
|
||||||
|
genno: int,
|
||||||
|
data: bytes,
|
||||||
|
attrs: Optional[Dict[str, Any]] = None
|
||||||
|
) -> bytes:
|
||||||
return self.decrypt_rc4(objid, genno, data)
|
return self.decrypt_rc4(objid, genno, data)
|
||||||
|
|
||||||
def decrypt_rc4(self, objid, genno, data):
|
def decrypt_rc4(self, objid: int, genno: int, data: bytes) -> bytes:
|
||||||
|
assert self.key is not None
|
||||||
key = self.key + struct.pack('<L', objid)[:3] \
|
key = self.key + struct.pack('<L', objid)[:3] \
|
||||||
+ struct.pack('<L', genno)[:2]
|
+ struct.pack('<L', genno)[:2]
|
||||||
hash = md5(key)
|
hash = md5(key)
|
||||||
|
@ -416,9 +443,9 @@ class PDFStandardSecurityHandler:
|
||||||
|
|
||||||
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
|
|
||||||
supported_revisions = (4,)
|
supported_revisions: Tuple[int, ...] = (4,)
|
||||||
|
|
||||||
def init_params(self):
|
def init_params(self) -> None:
|
||||||
super().init_params()
|
super().init_params()
|
||||||
self.length = 128
|
self.length = 128
|
||||||
self.cf = dict_value(self.param.get('CF'))
|
self.cf = dict_value(self.param.get('CF'))
|
||||||
|
@ -442,7 +469,10 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
raise PDFEncryptionError(error_msg)
|
raise PDFEncryptionError(error_msg)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cfm(self, name):
|
def get_cfm(
|
||||||
|
self,
|
||||||
|
name: str
|
||||||
|
) -> Optional[Callable[[int, int, bytes], bytes]]:
|
||||||
if name == 'V2':
|
if name == 'V2':
|
||||||
return self.decrypt_rc4
|
return self.decrypt_rc4
|
||||||
elif name == 'AESV2':
|
elif name == 'AESV2':
|
||||||
|
@ -450,7 +480,14 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def decrypt(self, objid, genno, data, attrs=None, name=None):
|
def decrypt(
|
||||||
|
self,
|
||||||
|
objid: int,
|
||||||
|
genno: int,
|
||||||
|
data: bytes,
|
||||||
|
attrs: Optional[Dict[str, Any]] = None,
|
||||||
|
name: Optional[str] = None
|
||||||
|
) -> bytes:
|
||||||
if not self.encrypt_metadata and attrs is not None:
|
if not self.encrypt_metadata and attrs is not None:
|
||||||
t = attrs.get('Type')
|
t = attrs.get('Type')
|
||||||
if t is not None and literal_name(t) == 'Metadata':
|
if t is not None and literal_name(t) == 'Metadata':
|
||||||
|
@ -459,10 +496,11 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
name = self.strf
|
name = self.strf
|
||||||
return self.cfm[name](objid, genno, data)
|
return self.cfm[name](objid, genno, data)
|
||||||
|
|
||||||
def decrypt_identity(self, objid, genno, data):
|
def decrypt_identity(self, objid: int, genno: int, data: bytes) -> bytes:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def decrypt_aes128(self, objid, genno, data):
|
def decrypt_aes128(self, objid: int, genno: int, data: bytes) -> bytes:
|
||||||
|
assert self.key is not None
|
||||||
key = self.key + struct.pack('<L', objid)[:3] \
|
key = self.key + struct.pack('<L', objid)[:3] \
|
||||||
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
+ struct.pack('<L', genno)[:2] + b'sAlT'
|
||||||
hash = md5(key)
|
hash = md5(key)
|
||||||
|
@ -471,15 +509,15 @@ class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
|
||||||
ciphertext = data[16:]
|
ciphertext = data[16:]
|
||||||
cipher = Cipher(algorithms.AES(key),
|
cipher = Cipher(algorithms.AES(key),
|
||||||
modes.CBC(initialization_vector),
|
modes.CBC(initialization_vector),
|
||||||
backend=default_backend())
|
backend=default_backend()) # type: ignore
|
||||||
return cipher.decryptor().update(ciphertext)
|
return cipher.decryptor().update(ciphertext) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
|
|
||||||
supported_revisions = (5, 6)
|
supported_revisions = (5, 6)
|
||||||
|
|
||||||
def init_params(self):
|
def init_params(self) -> None:
|
||||||
super().init_params()
|
super().init_params()
|
||||||
self.length = 256
|
self.length = 256
|
||||||
self.oe = str_value(self.param['OE'])
|
self.oe = str_value(self.param['OE'])
|
||||||
|
@ -492,31 +530,34 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
self.u_key_salt = self.u[40:]
|
self.u_key_salt = self.u[40:]
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cfm(self, name):
|
def get_cfm(
|
||||||
|
self,
|
||||||
|
name: str
|
||||||
|
) -> Optional[Callable[[int, int, bytes], bytes]]:
|
||||||
if name == 'AESV3':
|
if name == 'AESV3':
|
||||||
return self.decrypt_aes256
|
return self.decrypt_aes256
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def authenticate(self, password):
|
def authenticate(self, password: str) -> Optional[bytes]:
|
||||||
password = self._normalize_password(password)
|
password_b = self._normalize_password(password)
|
||||||
hash = self._password_hash(password, self.o_validation_salt, self.u)
|
hash = self._password_hash(password_b, self.o_validation_salt, self.u)
|
||||||
if hash == self.o_hash:
|
if hash == self.o_hash:
|
||||||
hash = self._password_hash(password, self.o_key_salt, self.u)
|
hash = self._password_hash(password_b, self.o_key_salt, self.u)
|
||||||
cipher = Cipher(algorithms.AES(hash),
|
cipher = Cipher(algorithms.AES(hash),
|
||||||
modes.CBC(b'\0' * 16),
|
modes.CBC(b'\0' * 16),
|
||||||
backend=default_backend())
|
backend=default_backend()) # type: ignore
|
||||||
return cipher.decryptor().update(self.oe)
|
return cipher.decryptor().update(self.oe) # type: ignore
|
||||||
hash = self._password_hash(password, self.u_validation_salt)
|
hash = self._password_hash(password_b, self.u_validation_salt)
|
||||||
if hash == self.u_hash:
|
if hash == self.u_hash:
|
||||||
hash = self._password_hash(password, self.u_key_salt)
|
hash = self._password_hash(password_b, self.u_key_salt)
|
||||||
cipher = Cipher(algorithms.AES(hash),
|
cipher = Cipher(algorithms.AES(hash),
|
||||||
modes.CBC(b'\0' * 16),
|
modes.CBC(b'\0' * 16),
|
||||||
backend=default_backend())
|
backend=default_backend()) # type: ignore
|
||||||
return cipher.decryptor().update(self.ue)
|
return cipher.decryptor().update(self.ue) # type: ignore
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _normalize_password(self, password):
|
def _normalize_password(self, password: str) -> bytes:
|
||||||
if self.r == 6:
|
if self.r == 6:
|
||||||
# saslprep expects non-empty strings, apparently
|
# saslprep expects non-empty strings, apparently
|
||||||
if not password:
|
if not password:
|
||||||
|
@ -525,7 +566,12 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
password = saslprep(password)
|
password = saslprep(password)
|
||||||
return password.encode('utf-8')[:127]
|
return password.encode('utf-8')[:127]
|
||||||
|
|
||||||
def _password_hash(self, password, salt, vector=None):
|
def _password_hash(
|
||||||
|
self,
|
||||||
|
password: bytes,
|
||||||
|
salt: bytes,
|
||||||
|
vector: Optional[bytes] = None
|
||||||
|
) -> bytes:
|
||||||
"""
|
"""
|
||||||
Compute password hash depending on revision number
|
Compute password hash depending on revision number
|
||||||
"""
|
"""
|
||||||
|
@ -533,7 +579,12 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
return self._r5_password(password, salt, vector)
|
return self._r5_password(password, salt, vector)
|
||||||
return self._r6_password(password, salt[0:8], vector)
|
return self._r6_password(password, salt[0:8], vector)
|
||||||
|
|
||||||
def _r5_password(self, password, salt, vector):
|
def _r5_password(
|
||||||
|
self,
|
||||||
|
password: bytes,
|
||||||
|
salt: bytes,
|
||||||
|
vector: Optional[bytes] = None
|
||||||
|
) -> bytes:
|
||||||
"""
|
"""
|
||||||
Compute the password for revision 5
|
Compute the password for revision 5
|
||||||
"""
|
"""
|
||||||
|
@ -543,7 +594,12 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
hash.update(vector)
|
hash.update(vector)
|
||||||
return hash.digest()
|
return hash.digest()
|
||||||
|
|
||||||
def _r6_password(self, password, salt, vector):
|
def _r6_password(
|
||||||
|
self,
|
||||||
|
password: bytes,
|
||||||
|
salt: bytes,
|
||||||
|
vector: Optional[bytes] = None
|
||||||
|
) -> bytes:
|
||||||
"""
|
"""
|
||||||
Compute the password for revision 6
|
Compute the password for revision 6
|
||||||
"""
|
"""
|
||||||
|
@ -568,22 +624,28 @@ class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
|
||||||
return k[:32]
|
return k[:32]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _bytes_mod_3(input_bytes):
|
def _bytes_mod_3(input_bytes: bytes) -> int:
|
||||||
# 256 is 1 mod 3, so we can just sum 'em
|
# 256 is 1 mod 3, so we can just sum 'em
|
||||||
return sum(b % 3 for b in input_bytes) % 3
|
return sum(b % 3 for b in input_bytes) % 3
|
||||||
|
|
||||||
def _aes_cbc_encrypt(self, key, iv, data):
|
def _aes_cbc_encrypt(
|
||||||
|
self,
|
||||||
|
key: bytes,
|
||||||
|
iv: bytes,
|
||||||
|
data: bytes
|
||||||
|
) -> bytes:
|
||||||
cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
|
cipher = Cipher(algorithms.AES(key), modes.CBC(iv))
|
||||||
encryptor = cipher.encryptor()
|
encryptor = cipher.encryptor() # type: ignore
|
||||||
return encryptor.update(data) + encryptor.finalize()
|
return encryptor.update(data) + encryptor.finalize() # type: ignore
|
||||||
|
|
||||||
def decrypt_aes256(self, objid, genno, data):
|
def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
|
||||||
initialization_vector = data[:16]
|
initialization_vector = data[:16]
|
||||||
ciphertext = data[16:]
|
ciphertext = data[16:]
|
||||||
|
assert self.key is not None
|
||||||
cipher = Cipher(algorithms.AES(self.key),
|
cipher = Cipher(algorithms.AES(self.key),
|
||||||
modes.CBC(initialization_vector),
|
modes.CBC(initialization_vector),
|
||||||
backend=default_backend())
|
backend=default_backend()) # type: ignore
|
||||||
return cipher.decryptor().update(ciphertext)
|
return cipher.decryptor().update(ciphertext) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
class PDFDocument:
|
class PDFDocument:
|
||||||
|
@ -599,24 +661,30 @@ class PDFDocument:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
security_handler_registry = {
|
security_handler_registry: Dict[int, Type[PDFStandardSecurityHandler]] = {
|
||||||
1: PDFStandardSecurityHandler,
|
1: PDFStandardSecurityHandler,
|
||||||
2: PDFStandardSecurityHandler,
|
2: PDFStandardSecurityHandler,
|
||||||
4: PDFStandardSecurityHandlerV4,
|
4: PDFStandardSecurityHandlerV4,
|
||||||
5: PDFStandardSecurityHandlerV5,
|
5: PDFStandardSecurityHandlerV5,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, parser, password='', caching=True, fallback=True):
|
def __init__(
|
||||||
|
self,
|
||||||
|
parser: PDFParser,
|
||||||
|
password: str = '',
|
||||||
|
caching: bool = True,
|
||||||
|
fallback: bool = True
|
||||||
|
) -> None:
|
||||||
"Set the document to use a given PDFParser object."
|
"Set the document to use a given PDFParser object."
|
||||||
self.caching = caching
|
self.caching = caching
|
||||||
self.xrefs = []
|
self.xrefs: List[PDFBaseXRef] = []
|
||||||
self.info = []
|
self.info = []
|
||||||
self.catalog = None
|
self.catalog: Dict[str, Any] = {}
|
||||||
self.encryption = None
|
self.encryption: Optional[Tuple[Any, Any]] = None
|
||||||
self.decipher = None
|
self.decipher: Optional[DecipherCallable] = None
|
||||||
self._parser = None
|
self._parser = None
|
||||||
self._cached_objs = {}
|
self._cached_objs: Dict[int, Tuple[object, int]] = {}
|
||||||
self._parsed_objs = {}
|
self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
|
||||||
self._parser = parser
|
self._parser = parser
|
||||||
self._parser.set_document(self)
|
self._parser.set_document(self)
|
||||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
|
@ -629,9 +697,9 @@ class PDFDocument:
|
||||||
pass # fallback = True
|
pass # fallback = True
|
||||||
if fallback:
|
if fallback:
|
||||||
parser.fallback = True
|
parser.fallback = True
|
||||||
xref = PDFXRefFallback()
|
newxref = PDFXRefFallback()
|
||||||
xref.load(parser)
|
newxref.load(parser)
|
||||||
self.xrefs.append(xref)
|
self.xrefs.append(newxref)
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.get_trailer()
|
trailer = xref.get_trailer()
|
||||||
if not trailer:
|
if not trailer:
|
||||||
|
@ -665,7 +733,8 @@ class PDFDocument:
|
||||||
|
|
||||||
# _initialize_password(password=b'')
|
# _initialize_password(password=b'')
|
||||||
# Perform the initialization with a given password.
|
# Perform the initialization with a given password.
|
||||||
def _initialize_password(self, password=''):
|
def _initialize_password(self, password: str = '') -> None:
|
||||||
|
assert self.encryption is not None
|
||||||
(docid, param) = self.encryption
|
(docid, param) = self.encryption
|
||||||
if literal_name(param.get('Filter')) != 'Standard':
|
if literal_name(param.get('Filter')) != 'Standard':
|
||||||
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
||||||
|
@ -678,15 +747,22 @@ class PDFDocument:
|
||||||
self.is_printable = handler.is_printable()
|
self.is_printable = handler.is_printable()
|
||||||
self.is_modifiable = handler.is_modifiable()
|
self.is_modifiable = handler.is_modifiable()
|
||||||
self.is_extractable = handler.is_extractable()
|
self.is_extractable = handler.is_extractable()
|
||||||
|
assert self._parser is not None
|
||||||
self._parser.fallback = False # need to read streams with exact length
|
self._parser.fallback = False # need to read streams with exact length
|
||||||
return
|
return
|
||||||
|
|
||||||
def _getobj_objstm(self, stream, index, objid):
|
def _getobj_objstm(
|
||||||
|
self,
|
||||||
|
stream: PDFStream,
|
||||||
|
index: int,
|
||||||
|
objid: int
|
||||||
|
) -> object:
|
||||||
if stream.objid in self._parsed_objs:
|
if stream.objid in self._parsed_objs:
|
||||||
(objs, n) = self._parsed_objs[stream.objid]
|
(objs, n) = self._parsed_objs[stream.objid]
|
||||||
else:
|
else:
|
||||||
(objs, n) = self._get_objects(stream)
|
(objs, n) = self._get_objects(stream)
|
||||||
if self.caching:
|
if self.caching:
|
||||||
|
assert stream.objid is not None
|
||||||
self._parsed_objs[stream.objid] = (objs, n)
|
self._parsed_objs[stream.objid] = (objs, n)
|
||||||
i = n*2+index
|
i = n*2+index
|
||||||
try:
|
try:
|
||||||
|
@ -695,19 +771,19 @@ class PDFDocument:
|
||||||
raise PDFSyntaxError('index too big: %r' % index)
|
raise PDFSyntaxError('index too big: %r' % index)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
def _get_objects(self, stream):
|
def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]:
|
||||||
if stream.get('Type') is not LITERAL_OBJSTM:
|
if stream.get('Type') is not LITERAL_OBJSTM:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||||
try:
|
try:
|
||||||
n = stream['N']
|
n = cast(int, stream['N'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||||
n = 0
|
n = 0
|
||||||
parser = PDFStreamParser(stream.get_data())
|
parser = PDFStreamParser(stream.get_data())
|
||||||
parser.set_document(self)
|
parser.set_document(self)
|
||||||
objs = []
|
objs: List[object] = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
(_, obj) = parser.nextobject()
|
(_, obj) = parser.nextobject()
|
||||||
|
@ -716,7 +792,8 @@ class PDFDocument:
|
||||||
pass
|
pass
|
||||||
return (objs, n)
|
return (objs, n)
|
||||||
|
|
||||||
def _getobj_parse(self, pos, objid):
|
def _getobj_parse(self, pos: int, objid: int) -> object:
|
||||||
|
assert self._parser is not None
|
||||||
self._parser.seek(pos)
|
self._parser.seek(pos)
|
||||||
(_, objid1) = self._parser.nexttoken() # objid
|
(_, objid1) = self._parser.nexttoken() # objid
|
||||||
(_, genno) = self._parser.nexttoken() # genno
|
(_, genno) = self._parser.nexttoken() # genno
|
||||||
|
@ -744,7 +821,7 @@ class PDFDocument:
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
# can raise PDFObjectNotFound
|
# can raise PDFObjectNotFound
|
||||||
def getobj(self, objid):
|
def getobj(self, objid: int) -> object:
|
||||||
"""Get object from PDF
|
"""Get object from PDF
|
||||||
|
|
||||||
:raises PDFException if PDFDocument is not initialized
|
:raises PDFException if PDFDocument is not initialized
|
||||||
|
@ -783,11 +860,14 @@ class PDFDocument:
|
||||||
self._cached_objs[objid] = (obj, genno)
|
self._cached_objs[objid] = (obj, genno)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
def get_outlines(self):
|
OutlineType = Tuple[Any, Any, Any, Any, Any]
|
||||||
|
|
||||||
|
def get_outlines(self) -> Iterator[OutlineType]:
|
||||||
if 'Outlines' not in self.catalog:
|
if 'Outlines' not in self.catalog:
|
||||||
raise PDFNoOutlines
|
raise PDFNoOutlines
|
||||||
|
|
||||||
def search(entry, level):
|
def search(entry: object, level: int
|
||||||
|
) -> Iterator[PDFDocument.OutlineType]:
|
||||||
entry = dict_value(entry)
|
entry = dict_value(entry)
|
||||||
if 'Title' in entry:
|
if 'Title' in entry:
|
||||||
if 'A' in entry or 'Dest' in entry:
|
if 'A' in entry or 'Dest' in entry:
|
||||||
|
@ -803,7 +883,11 @@ class PDFDocument:
|
||||||
return
|
return
|
||||||
return search(self.catalog['Outlines'], 0)
|
return search(self.catalog['Outlines'], 0)
|
||||||
|
|
||||||
def lookup_name(self, cat, key):
|
def lookup_name(
|
||||||
|
self,
|
||||||
|
cat: str,
|
||||||
|
key: Union[str, bytes]
|
||||||
|
) -> Any:
|
||||||
try:
|
try:
|
||||||
names = dict_value(self.catalog['Names'])
|
names = dict_value(self.catalog['Names'])
|
||||||
except (PDFTypeError, KeyError):
|
except (PDFTypeError, KeyError):
|
||||||
|
@ -811,14 +895,15 @@ class PDFDocument:
|
||||||
# may raise KeyError
|
# may raise KeyError
|
||||||
d0 = dict_value(names[cat])
|
d0 = dict_value(names[cat])
|
||||||
|
|
||||||
def lookup(d):
|
def lookup(d: Dict[str, Any]) -> Any:
|
||||||
if 'Limits' in d:
|
if 'Limits' in d:
|
||||||
(k1, k2) = list_value(d['Limits'])
|
(k1, k2) = list_value(d['Limits'])
|
||||||
if key < k1 or k2 < key:
|
if key < k1 or k2 < key:
|
||||||
return None
|
return None
|
||||||
if 'Names' in d:
|
if 'Names' in d:
|
||||||
objs = list_value(d['Names'])
|
objs = list_value(d['Names'])
|
||||||
names = dict(choplist(2, objs))
|
names = dict(cast(Iterator[Tuple[Union[str, bytes], Any]],
|
||||||
|
choplist(2, objs)))
|
||||||
return names[key]
|
return names[key]
|
||||||
if 'Kids' in d:
|
if 'Kids' in d:
|
||||||
for c in list_value(d['Kids']):
|
for c in list_value(d['Kids']):
|
||||||
|
@ -828,7 +913,7 @@ class PDFDocument:
|
||||||
raise KeyError((cat, key))
|
raise KeyError((cat, key))
|
||||||
return lookup(d0)
|
return lookup(d0)
|
||||||
|
|
||||||
def get_dest(self, name):
|
def get_dest(self, name: Union[str, bytes]) -> Any:
|
||||||
try:
|
try:
|
||||||
# PDF-1.2 or later
|
# PDF-1.2 or later
|
||||||
obj = self.lookup_name('Dests', name)
|
obj = self.lookup_name('Dests', name)
|
||||||
|
@ -843,7 +928,7 @@ class PDFDocument:
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
# find_xref
|
# find_xref
|
||||||
def find_xref(self, parser):
|
def find_xref(self, parser: PDFParser) -> int:
|
||||||
"""Internal function used to locate the first XRef."""
|
"""Internal function used to locate the first XRef."""
|
||||||
# search the last xref table by scanning the file backwards.
|
# search the last xref table by scanning the file backwards.
|
||||||
prev = None
|
prev = None
|
||||||
|
@ -857,10 +942,16 @@ class PDFDocument:
|
||||||
else:
|
else:
|
||||||
raise PDFNoValidXRef('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
log.info('xref found: pos=%r', prev)
|
log.info('xref found: pos=%r', prev)
|
||||||
|
assert prev is not None
|
||||||
return int(prev)
|
return int(prev)
|
||||||
|
|
||||||
# read xref table
|
# read xref table
|
||||||
def read_xref_from(self, parser, start, xrefs):
|
def read_xref_from(
|
||||||
|
self,
|
||||||
|
parser: PDFParser,
|
||||||
|
start: int,
|
||||||
|
xrefs: List[PDFBaseXRef]
|
||||||
|
) -> None:
|
||||||
"""Reads XRefs from the given location."""
|
"""Reads XRefs from the given location."""
|
||||||
parser.seek(start)
|
parser.seek(start)
|
||||||
parser.reset()
|
parser.reset()
|
||||||
|
@ -873,7 +964,7 @@ class PDFDocument:
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
parser.seek(pos)
|
parser.seek(pos)
|
||||||
parser.reset()
|
parser.reset()
|
||||||
xref = PDFXRefStream()
|
xref: PDFBaseXRef = PDFXRefStream()
|
||||||
xref.load(parser)
|
xref.load(parser)
|
||||||
else:
|
else:
|
||||||
if token is parser.KEYWORD_XREF:
|
if token is parser.KEYWORD_XREF:
|
||||||
|
|
|
@ -2,11 +2,15 @@ import logging
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from typing import (Any, BinaryIO, Dict, Iterable, Iterator, List, Mapping,
|
||||||
|
Optional, Tuple, Union, cast, TYPE_CHECKING)
|
||||||
|
|
||||||
from . import settings
|
from . import settings
|
||||||
from .cmapdb import CMap
|
from .cmapdb import CMap
|
||||||
|
from .cmapdb import CMapBase
|
||||||
from .cmapdb import CMapDB
|
from .cmapdb import CMapDB
|
||||||
from .cmapdb import CMapParser
|
from .cmapdb import CMapParser
|
||||||
|
from .cmapdb import UnicodeMap
|
||||||
from .cmapdb import FileUnicodeMap
|
from .cmapdb import FileUnicodeMap
|
||||||
from .encodingdb import EncodingDB
|
from .encodingdb import EncodingDB
|
||||||
from .encodingdb import name2unicode
|
from .encodingdb import name2unicode
|
||||||
|
@ -22,52 +26,59 @@ from .pdftypes import stream_value
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .psparser import PSEOF
|
from .psparser import PSEOF
|
||||||
|
from .psparser import PSKeyword
|
||||||
from .psparser import PSLiteral
|
from .psparser import PSLiteral
|
||||||
from .psparser import PSStackParser
|
from .psparser import PSStackParser
|
||||||
from .psparser import literal_name
|
from .psparser import literal_name
|
||||||
|
from .utils import Matrix, Point
|
||||||
|
from .utils import Rect
|
||||||
from .utils import apply_matrix_norm
|
from .utils import apply_matrix_norm
|
||||||
from .utils import choplist
|
from .utils import choplist
|
||||||
from .utils import isnumber
|
|
||||||
from .utils import nunpack
|
from .utils import nunpack
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .pdfinterp import PDFResourceManager
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def get_widths(seq):
|
def get_widths(seq: Iterable[object]) -> Dict[int, float]:
|
||||||
widths = {}
|
"""Build a mapping of character widths for horizontal writing."""
|
||||||
r = []
|
widths: Dict[int, float] = {}
|
||||||
|
r: List[float] = []
|
||||||
for v in seq:
|
for v in seq:
|
||||||
if isinstance(v, list):
|
if isinstance(v, list):
|
||||||
if r:
|
if r:
|
||||||
char1 = r[-1]
|
char1 = r[-1]
|
||||||
for (i, w) in enumerate(v):
|
for (i, w) in enumerate(v):
|
||||||
widths[char1+i] = w
|
widths[cast(int, char1) + i] = w
|
||||||
r = []
|
r = []
|
||||||
elif isnumber(v):
|
elif isinstance(v, (int, float)): # == utils.isnumber(v)
|
||||||
r.append(v)
|
r.append(v)
|
||||||
if len(r) == 3:
|
if len(r) == 3:
|
||||||
(char1, char2, w) = r
|
(char1, char2, w) = r
|
||||||
for i in range(char1, char2+1):
|
for i in range(cast(int, char1), cast(int, char2) + 1):
|
||||||
widths[i] = w
|
widths[i] = w
|
||||||
r = []
|
r = []
|
||||||
return widths
|
return widths
|
||||||
|
|
||||||
|
|
||||||
def get_widths2(seq):
|
def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]:
|
||||||
widths = {}
|
"""Build a mapping of character widths for vertical writing."""
|
||||||
r = []
|
widths: Dict[int, Tuple[float, Point]] = {}
|
||||||
|
r: List[float] = []
|
||||||
for v in seq:
|
for v in seq:
|
||||||
if isinstance(v, list):
|
if isinstance(v, list):
|
||||||
if r:
|
if r:
|
||||||
char1 = r[-1]
|
char1 = r[-1]
|
||||||
for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
|
for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
|
||||||
widths[char1+i] = (w, (vx, vy))
|
widths[cast(int, char1) + i] = (w, (vx, vy))
|
||||||
r = []
|
r = []
|
||||||
elif isnumber(v):
|
elif isinstance(v, (int, float)): # == utils.isnumber(v)
|
||||||
r.append(v)
|
r.append(v)
|
||||||
if len(r) == 5:
|
if len(r) == 5:
|
||||||
(char1, char2, w, vx, vy) = r
|
(char1, char2, w, vx, vy) = r
|
||||||
for i in range(char1, char2+1):
|
for i in range(cast(int, char1), cast(int, char2) + 1):
|
||||||
widths[i] = (w, (vx, vy))
|
widths[i] = (w, (vx, vy))
|
||||||
r = []
|
r = []
|
||||||
return widths
|
return widths
|
||||||
|
@ -76,11 +87,13 @@ def get_widths2(seq):
|
||||||
class FontMetricsDB:
|
class FontMetricsDB:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_metrics(cls, fontname):
|
def get_metrics(cls, fontname: str
|
||||||
|
) -> Tuple[Dict[str, object], Dict[str, int]]:
|
||||||
return FONT_METRICS[fontname]
|
return FONT_METRICS[fontname]
|
||||||
|
|
||||||
|
|
||||||
class Type1FontHeaderParser(PSStackParser):
|
# int here means that we're not extending PSStackParser with additional types.
|
||||||
|
class Type1FontHeaderParser(PSStackParser[int]):
|
||||||
|
|
||||||
KEYWORD_BEGIN = KWD(b'begin')
|
KEYWORD_BEGIN = KWD(b'begin')
|
||||||
KEYWORD_END = KWD(b'end')
|
KEYWORD_END = KWD(b'end')
|
||||||
|
@ -91,12 +104,12 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
KEYWORD_READONLY = KWD(b'readonly')
|
KEYWORD_READONLY = KWD(b'readonly')
|
||||||
KEYWORD_FOR = KWD(b'for')
|
KEYWORD_FOR = KWD(b'for')
|
||||||
|
|
||||||
def __init__(self, data):
|
def __init__(self, data: BinaryIO) -> None:
|
||||||
PSStackParser.__init__(self, data)
|
PSStackParser.__init__(self, data)
|
||||||
self._cid2unicode = {}
|
self._cid2unicode: Dict[int, str] = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_encoding(self):
|
def get_encoding(self) -> Dict[int, str]:
|
||||||
"""Parse the font encoding.
|
"""Parse the font encoding.
|
||||||
|
|
||||||
The Type1 font encoding maps character codes to character names. These
|
The Type1 font encoding maps character codes to character names. These
|
||||||
|
@ -116,12 +129,12 @@ class Type1FontHeaderParser(PSStackParser):
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
break
|
||||||
try:
|
try:
|
||||||
self._cid2unicode[cid] = name2unicode(name)
|
self._cid2unicode[cid] = name2unicode(cast(str, name))
|
||||||
except KeyError as e:
|
except KeyError as e:
|
||||||
log.debug(str(e))
|
log.debug(str(e))
|
||||||
return self._cid2unicode
|
return self._cid2unicode
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
if token is self.KEYWORD_PUT:
|
if token is self.KEYWORD_PUT:
|
||||||
((_, key), (_, value)) = self.pop(2)
|
((_, key), (_, value)) = self.pop(2)
|
||||||
if (isinstance(key, int) and isinstance(value, PSLiteral)):
|
if (isinstance(key, int) and isinstance(value, PSLiteral)):
|
||||||
|
@ -140,10 +153,10 @@ IDENTITY_ENCODER = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def getdict(data):
|
def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]:
|
||||||
d = {}
|
d: Dict[int, List[Union[float, int]]] = {}
|
||||||
fp = BytesIO(data)
|
fp = BytesIO(data)
|
||||||
stack = []
|
stack: List[Union[float, int]] = []
|
||||||
while 1:
|
while 1:
|
||||||
c = fp.read(1)
|
c = fp.read(1)
|
||||||
if not c:
|
if not c:
|
||||||
|
@ -162,7 +175,9 @@ def getdict(data):
|
||||||
if n == 15:
|
if n == 15:
|
||||||
loop = False
|
loop = False
|
||||||
else:
|
else:
|
||||||
s += NIBBLES[n]
|
nibble = NIBBLES[n]
|
||||||
|
assert nibble is not None
|
||||||
|
s += nibble
|
||||||
value = float(s)
|
value = float(s)
|
||||||
elif 32 <= b0 and b0 <= 246:
|
elif 32 <= b0 and b0 <= 246:
|
||||||
value = b0-139
|
value = b0-139
|
||||||
|
@ -270,9 +285,9 @@ class CFFFont:
|
||||||
|
|
||||||
class INDEX:
|
class INDEX:
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp: BinaryIO) -> None:
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.offsets = []
|
self.offsets: List[int] = []
|
||||||
(count, offsize) = struct.unpack('>HB', self.fp.read(3))
|
(count, offsize) = struct.unpack('>HB', self.fp.read(3))
|
||||||
for i in range(count+1):
|
for i in range(count+1):
|
||||||
self.offsets.append(nunpack(self.fp.read(offsize)))
|
self.offsets.append(nunpack(self.fp.read(offsize)))
|
||||||
|
@ -280,20 +295,20 @@ class CFFFont:
|
||||||
self.fp.seek(self.base+self.offsets[-1])
|
self.fp.seek(self.base+self.offsets[-1])
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<INDEX: size=%d>' % len(self)
|
return '<INDEX: size=%d>' % len(self)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
return len(self.offsets)-1
|
return len(self.offsets)-1
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i: int) -> bytes:
|
||||||
self.fp.seek(self.base+self.offsets[i])
|
self.fp.seek(self.base+self.offsets[i])
|
||||||
return self.fp.read(self.offsets[i+1]-self.offsets[i])
|
return self.fp.read(self.offsets[i+1]-self.offsets[i])
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self) -> Iterator[bytes]:
|
||||||
return iter(self[i] for i in range(len(self)))
|
return iter(self[i] for i in range(len(self)))
|
||||||
|
|
||||||
def __init__(self, name, fp):
|
def __init__(self, name: str, fp: BinaryIO) -> None:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
# Header
|
# Header
|
||||||
|
@ -314,13 +329,13 @@ class CFFFont:
|
||||||
(encoding_pos,) = self.top_dict.get(16, [0])
|
(encoding_pos,) = self.top_dict.get(16, [0])
|
||||||
(charstring_pos,) = self.top_dict.get(17, [0])
|
(charstring_pos,) = self.top_dict.get(17, [0])
|
||||||
# CharStrings
|
# CharStrings
|
||||||
self.fp.seek(charstring_pos)
|
self.fp.seek(cast(int, charstring_pos))
|
||||||
self.charstring = self.INDEX(self.fp)
|
self.charstring = self.INDEX(self.fp)
|
||||||
self.nglyphs = len(self.charstring)
|
self.nglyphs = len(self.charstring)
|
||||||
# Encodings
|
# Encodings
|
||||||
self.code2gid = {}
|
self.code2gid = {}
|
||||||
self.gid2code = {}
|
self.gid2code = {}
|
||||||
self.fp.seek(encoding_pos)
|
self.fp.seek(cast(int, encoding_pos))
|
||||||
format = self.fp.read(1)
|
format = self.fp.read(1)
|
||||||
if format == b'\x00':
|
if format == b'\x00':
|
||||||
# Format 0
|
# Format 0
|
||||||
|
@ -344,17 +359,18 @@ class CFFFont:
|
||||||
# Charsets
|
# Charsets
|
||||||
self.name2gid = {}
|
self.name2gid = {}
|
||||||
self.gid2name = {}
|
self.gid2name = {}
|
||||||
self.fp.seek(charset_pos)
|
self.fp.seek(cast(int, charset_pos))
|
||||||
format = self.fp.read(1)
|
format = self.fp.read(1)
|
||||||
if format == b'\x00':
|
if format == b'\x00':
|
||||||
# Format 0
|
# Format 0
|
||||||
n = self.nglyphs-1
|
n = self.nglyphs-1
|
||||||
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n,
|
for (gid, sid) in enumerate(
|
||||||
self.fp.read(2*n))):
|
cast(Tuple[int, ...],
|
||||||
|
struct.unpack('>' + 'H' * n, self.fp.read(2 * n)))):
|
||||||
gid += 1
|
gid += 1
|
||||||
name = self.getstr(sid)
|
sidname = self.getstr(sid)
|
||||||
self.name2gid[name] = gid
|
self.name2gid[sidname] = gid
|
||||||
self.gid2name[gid] = name
|
self.gid2name[gid] = sidname
|
||||||
elif format == b'\x01':
|
elif format == b'\x01':
|
||||||
# Format 1
|
# Format 1
|
||||||
(n,) = struct.unpack('B', self.fp.read(1))
|
(n,) = struct.unpack('B', self.fp.read(1))
|
||||||
|
@ -362,9 +378,9 @@ class CFFFont:
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
(first, nleft) = struct.unpack('BB', self.fp.read(2))
|
(first, nleft) = struct.unpack('BB', self.fp.read(2))
|
||||||
for gid in range(first, first+nleft+1):
|
for gid in range(first, first+nleft+1):
|
||||||
name = self.getstr(sid)
|
sidname = self.getstr(sid)
|
||||||
self.name2gid[name] = gid
|
self.name2gid[sidname] = gid
|
||||||
self.gid2name[gid] = name
|
self.gid2name[gid] = sidname
|
||||||
sid += 1
|
sid += 1
|
||||||
elif format == b'\x02':
|
elif format == b'\x02':
|
||||||
# Format 2
|
# Format 2
|
||||||
|
@ -373,7 +389,9 @@ class CFFFont:
|
||||||
raise ValueError('unsupported charset format: %r' % format)
|
raise ValueError('unsupported charset format: %r' % format)
|
||||||
return
|
return
|
||||||
|
|
||||||
def getstr(self, sid):
|
def getstr(self, sid: int) -> Union[str, bytes]:
|
||||||
|
# This returns str for one of the STANDARD_STRINGS but bytes otherwise,
|
||||||
|
# and appears to be a needless source of type complexity.
|
||||||
if sid < len(self.STANDARD_STRINGS):
|
if sid < len(self.STANDARD_STRINGS):
|
||||||
return self.STANDARD_STRINGS[sid]
|
return self.STANDARD_STRINGS[sid]
|
||||||
return self.string_index[sid-len(self.STANDARD_STRINGS)]
|
return self.string_index[sid-len(self.STANDARD_STRINGS)]
|
||||||
|
@ -384,17 +402,19 @@ class TrueTypeFont:
|
||||||
class CMapNotFound(Exception):
|
class CMapNotFound(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def __init__(self, name, fp):
|
def __init__(self, name: str, fp: BinaryIO) -> None:
|
||||||
self.name = name
|
self.name = name
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.tables = {}
|
self.tables: Dict[bytes, Tuple[int, int]] = {}
|
||||||
self.fonttype = fp.read(4)
|
self.fonttype = fp.read(4)
|
||||||
try:
|
try:
|
||||||
(ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
|
(ntables, _1, _2, _3) = cast(Tuple[int, int, int, int],
|
||||||
|
struct.unpack('>HHHH', fp.read(8)))
|
||||||
for _ in range(ntables):
|
for _ in range(ntables):
|
||||||
(name, tsum, offset, length) = struct.unpack('>4sLLL',
|
(name_bytes, tsum, offset, length) = \
|
||||||
fp.read(16))
|
cast(Tuple[bytes, int, int, int],
|
||||||
self.tables[name] = (offset, length)
|
struct.unpack('>4sLLL', fp.read(16)))
|
||||||
|
self.tables[name_bytes] = (offset, length)
|
||||||
except struct.error:
|
except struct.error:
|
||||||
# Do not fail if there are not enough bytes to read. Even for
|
# Do not fail if there are not enough bytes to read. Even for
|
||||||
# corrupted PDFs we would like to get as much information as
|
# corrupted PDFs we would like to get as much information as
|
||||||
|
@ -402,34 +422,40 @@ class TrueTypeFont:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
def create_unicode_map(self):
|
def create_unicode_map(self) -> FileUnicodeMap:
|
||||||
if b'cmap' not in self.tables:
|
if b'cmap' not in self.tables:
|
||||||
raise TrueTypeFont.CMapNotFound
|
raise TrueTypeFont.CMapNotFound
|
||||||
(base_offset, length) = self.tables[b'cmap']
|
(base_offset, length) = self.tables[b'cmap']
|
||||||
fp = self.fp
|
fp = self.fp
|
||||||
fp.seek(base_offset)
|
fp.seek(base_offset)
|
||||||
(version, nsubtables) = struct.unpack('>HH', fp.read(4))
|
(version, nsubtables) = \
|
||||||
subtables = []
|
cast(Tuple[int, int], struct.unpack('>HH', fp.read(4)))
|
||||||
|
subtables: List[Tuple[int, int, int]] = []
|
||||||
for i in range(nsubtables):
|
for i in range(nsubtables):
|
||||||
subtables.append(struct.unpack('>HHL', fp.read(8)))
|
subtables.append(
|
||||||
char2gid = {}
|
cast(Tuple[int, int, int], struct.unpack('>HHL', fp.read(8))))
|
||||||
|
char2gid: Dict[int, int] = {}
|
||||||
# Only supports subtable type 0, 2 and 4.
|
# Only supports subtable type 0, 2 and 4.
|
||||||
for (_1, _2, st_offset) in subtables:
|
for (_1, _2, st_offset) in subtables:
|
||||||
fp.seek(base_offset+st_offset)
|
fp.seek(base_offset+st_offset)
|
||||||
(fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
|
(fmttype, fmtlen, fmtlang) = \
|
||||||
|
cast(Tuple[int, int, int], struct.unpack('>HHH', fp.read(6)))
|
||||||
if fmttype == 0:
|
if fmttype == 0:
|
||||||
char2gid.update(enumerate(struct.unpack('>256B',
|
char2gid.update(enumerate(
|
||||||
fp.read(256))))
|
cast(Tuple[int, ...],
|
||||||
|
struct.unpack('>256B', fp.read(256)))))
|
||||||
elif fmttype == 2:
|
elif fmttype == 2:
|
||||||
subheaderkeys = struct.unpack('>256H', fp.read(512))
|
subheaderkeys = cast(Tuple[int, ...],
|
||||||
|
struct.unpack('>256H', fp.read(512)))
|
||||||
firstbytes = [0]*8192
|
firstbytes = [0]*8192
|
||||||
for (i, k) in enumerate(subheaderkeys):
|
for (i, k) in enumerate(subheaderkeys):
|
||||||
firstbytes[k//8] = i
|
firstbytes[k//8] = i
|
||||||
nhdrs = max(subheaderkeys)//8 + 1
|
nhdrs = max(subheaderkeys)//8 + 1
|
||||||
hdrs = []
|
hdrs: List[Tuple[int, int, int, int, int]] = []
|
||||||
for i in range(nhdrs):
|
for i in range(nhdrs):
|
||||||
(firstcode, entcount, delta, offset) = \
|
(firstcode, entcount, delta, offset) = \
|
||||||
struct.unpack('>HHhH', fp.read(8))
|
cast(Tuple[int, int, int, int],
|
||||||
|
struct.unpack('>HHhH', fp.read(8)))
|
||||||
hdrs.append((i, firstcode, entcount, delta,
|
hdrs.append((i, firstcode, entcount, delta,
|
||||||
fp.tell()-2+offset))
|
fp.tell()-2+offset))
|
||||||
for (i, firstcode, entcount, delta, pos) in hdrs:
|
for (i, firstcode, entcount, delta, pos) in hdrs:
|
||||||
|
@ -438,24 +464,36 @@ class TrueTypeFont:
|
||||||
first = firstcode + (firstbytes[i] << 8)
|
first = firstcode + (firstbytes[i] << 8)
|
||||||
fp.seek(pos)
|
fp.seek(pos)
|
||||||
for c in range(entcount):
|
for c in range(entcount):
|
||||||
gid = struct.unpack('>H', fp.read(2))
|
gid = cast(Tuple[int],
|
||||||
|
struct.unpack('>H', fp.read(2)))[0]
|
||||||
if gid:
|
if gid:
|
||||||
gid += delta
|
gid += delta
|
||||||
char2gid[first+c] = gid
|
char2gid[first+c] = gid
|
||||||
elif fmttype == 4:
|
elif fmttype == 4:
|
||||||
(segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
|
(segcount, _1, _2, _3) = \
|
||||||
|
cast(Tuple[int, int, int, int],
|
||||||
|
struct.unpack('>HHHH', fp.read(8)))
|
||||||
segcount //= 2
|
segcount //= 2
|
||||||
ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
|
ecs = cast(Tuple[int, ...],
|
||||||
|
struct.unpack('>%dH' % segcount,
|
||||||
|
fp.read(2*segcount)))
|
||||||
fp.read(2)
|
fp.read(2)
|
||||||
scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
|
scs = cast(Tuple[int, ...],
|
||||||
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
|
struct.unpack('>%dH' % segcount,
|
||||||
|
fp.read(2*segcount)))
|
||||||
|
idds = cast(Tuple[int, ...],
|
||||||
|
struct.unpack('>%dh' % segcount,
|
||||||
|
fp.read(2*segcount)))
|
||||||
pos = fp.tell()
|
pos = fp.tell()
|
||||||
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
|
idrs = cast(Tuple[int, ...],
|
||||||
|
struct.unpack('>%dH' % segcount,
|
||||||
|
fp.read(2*segcount)))
|
||||||
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
|
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
|
||||||
if idr:
|
if idr:
|
||||||
fp.seek(pos+idr)
|
fp.seek(pos+idr)
|
||||||
for c in range(sc, ec+1):
|
for c in range(sc, ec+1):
|
||||||
b = struct.unpack('>H', fp.read(2))[0]
|
b = cast(Tuple[int],
|
||||||
|
struct.unpack('>H', fp.read(2)))[0]
|
||||||
char2gid[c] = (b + idd) & 0xffff
|
char2gid[c] = (b + idd) & 0xffff
|
||||||
else:
|
else:
|
||||||
for c in range(sc, ec+1):
|
for c in range(sc, ec+1):
|
||||||
|
@ -480,12 +518,21 @@ class PDFUnicodeNotDefined(PDFFontError):
|
||||||
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||||
LITERAL_TYPE1C = LIT('Type1C')
|
LITERAL_TYPE1C = LIT('Type1C')
|
||||||
|
|
||||||
|
# Font widths are maintained in a dict type that maps from *either* unicode
|
||||||
|
# chars or integer character IDs.
|
||||||
|
FontWidthDict = Union[Dict[int, float], Dict[str, float]]
|
||||||
|
|
||||||
|
|
||||||
class PDFFont:
|
class PDFFont:
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, default_width=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
descriptor: Mapping[str, Any],
|
||||||
|
widths: FontWidthDict,
|
||||||
|
default_width: Optional[float] = None
|
||||||
|
) -> None:
|
||||||
self.descriptor = descriptor
|
self.descriptor = descriptor
|
||||||
self.widths = resolve_all(widths)
|
self.widths: FontWidthDict = resolve_all(widths)
|
||||||
self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
|
self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
|
||||||
if isinstance(self.fontname, PSLiteral):
|
if isinstance(self.fontname, PSLiteral):
|
||||||
self.fontname = literal_name(self.fontname)
|
self.fontname = literal_name(self.fontname)
|
||||||
|
@ -498,8 +545,8 @@ class PDFFont:
|
||||||
else:
|
else:
|
||||||
self.default_width = default_width
|
self.default_width = default_width
|
||||||
self.leading = num_value(descriptor.get('Leading', 0))
|
self.leading = num_value(descriptor.get('Leading', 0))
|
||||||
self.bbox = list_value(resolve_all(descriptor.get('FontBBox',
|
self.bbox = cast(Rect, list_value(
|
||||||
(0, 0, 0, 0))))
|
resolve_all(descriptor.get('FontBBox', (0, 0, 0, 0)))))
|
||||||
self.hscale = self.vscale = .001
|
self.hscale = self.vscale = .001
|
||||||
|
|
||||||
# PDF RM 9.8.1 specifies /Descent should always be a negative number.
|
# PDF RM 9.8.1 specifies /Descent should always be a negative number.
|
||||||
|
@ -510,57 +557,72 @@ class PDFFont:
|
||||||
self.descent = -self.descent
|
self.descent = -self.descent
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFFont>'
|
return '<PDFFont>'
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def is_multibyte(self):
|
def is_multibyte(self) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def decode(self, bytes):
|
def decode(self, bytes: bytes) -> Iterable[int]:
|
||||||
return bytearray(bytes) # map(ord, bytes)
|
return bytearray(bytes) # map(ord, bytes)
|
||||||
|
|
||||||
def get_ascent(self):
|
def get_ascent(self) -> float:
|
||||||
"""Ascent above the baseline, in text space units"""
|
"""Ascent above the baseline, in text space units"""
|
||||||
return self.ascent * self.vscale
|
return self.ascent * self.vscale
|
||||||
|
|
||||||
def get_descent(self):
|
def get_descent(self) -> float:
|
||||||
"""Descent below the baseline, in text space units; always negative"""
|
"""Descent below the baseline, in text space units; always negative"""
|
||||||
return self.descent * self.vscale
|
return self.descent * self.vscale
|
||||||
|
|
||||||
def get_width(self):
|
def get_width(self) -> float:
|
||||||
w = self.bbox[2]-self.bbox[0]
|
w = self.bbox[2]-self.bbox[0]
|
||||||
if w == 0:
|
if w == 0:
|
||||||
w = -self.default_width
|
w = -self.default_width
|
||||||
return w * self.hscale
|
return w * self.hscale
|
||||||
|
|
||||||
def get_height(self):
|
def get_height(self) -> float:
|
||||||
h = self.bbox[3]-self.bbox[1]
|
h = self.bbox[3]-self.bbox[1]
|
||||||
if h == 0:
|
if h == 0:
|
||||||
h = self.ascent - self.descent
|
h = self.ascent - self.descent
|
||||||
return h * self.vscale
|
return h * self.vscale
|
||||||
|
|
||||||
def char_width(self, cid):
|
def char_width(self, cid: int) -> float:
|
||||||
|
# Because character widths may be mapping either IDs or strings,
|
||||||
|
# we try to lookup the character ID first, then its str equivalent.
|
||||||
try:
|
try:
|
||||||
return self.widths[cid] * self.hscale
|
return cast(Dict[int, float], self.widths)[cid] * self.hscale
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
str_widths = cast(Dict[str, float], self.widths)
|
||||||
try:
|
try:
|
||||||
return self.widths[self.to_unichr(cid)] * self.hscale
|
return str_widths[self.to_unichr(cid)] * self.hscale
|
||||||
except (KeyError, PDFUnicodeNotDefined):
|
except (KeyError, PDFUnicodeNotDefined):
|
||||||
return self.default_width * self.hscale
|
return self.default_width * self.hscale
|
||||||
|
|
||||||
def char_disp(self, cid):
|
def char_disp(
|
||||||
|
self,
|
||||||
|
cid: int
|
||||||
|
) -> Union[float, Tuple[Optional[float], float]]:
|
||||||
|
"Returns an integer for horizontal fonts, a tuple for vertical fonts."
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def string_width(self, s):
|
def string_width(self, s: bytes) -> float:
|
||||||
return sum(self.char_width(cid) for cid in self.decode(s))
|
return sum(self.char_width(cid) for cid in self.decode(s))
|
||||||
|
|
||||||
|
def to_unichr(self, cid: int) -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class PDFSimpleFont(PDFFont):
|
class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, spec):
|
def __init__(
|
||||||
|
self,
|
||||||
|
descriptor: Mapping[str, Any],
|
||||||
|
widths: FontWidthDict,
|
||||||
|
spec: Mapping[str, Any]
|
||||||
|
) -> None:
|
||||||
# Font encoding is specified either by a name of
|
# Font encoding is specified either by a name of
|
||||||
# built-in encoding or a dictionary that describes
|
# built-in encoding or a dictionary that describes
|
||||||
# the differences.
|
# the differences.
|
||||||
|
@ -575,7 +637,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
self.cid2unicode = EncodingDB.get_encoding(name, diff)
|
self.cid2unicode = EncodingDB.get_encoding(name, diff)
|
||||||
else:
|
else:
|
||||||
self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
|
self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
|
||||||
self.unicode_map = None
|
self.unicode_map: Optional[UnicodeMap] = None
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.unicode_map = FileUnicodeMap()
|
self.unicode_map = FileUnicodeMap()
|
||||||
|
@ -583,7 +645,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
PDFFont.__init__(self, descriptor, widths)
|
PDFFont.__init__(self, descriptor, widths)
|
||||||
return
|
return
|
||||||
|
|
||||||
def to_unichr(self, cid):
|
def to_unichr(self, cid: int) -> str:
|
||||||
if self.unicode_map:
|
if self.unicode_map:
|
||||||
try:
|
try:
|
||||||
return self.unicode_map.get_unichr(cid)
|
return self.unicode_map.get_unichr(cid)
|
||||||
|
@ -597,21 +659,28 @@ class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
class PDFType1Font(PDFSimpleFont):
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, spec):
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: "PDFResourceManager",
|
||||||
|
spec: Mapping[str, Any]
|
||||||
|
) -> None:
|
||||||
try:
|
try:
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFFontError('BaseFont is missing')
|
raise PDFFontError('BaseFont is missing')
|
||||||
self.basefont = 'unknown'
|
self.basefont = 'unknown'
|
||||||
|
|
||||||
|
widths: FontWidthDict
|
||||||
try:
|
try:
|
||||||
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
(descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
|
||||||
|
widths = cast(Dict[str, float], int_widths) # implicit int->float
|
||||||
except KeyError:
|
except KeyError:
|
||||||
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
# lastchar = int_value(spec.get('LastChar', 255))
|
# lastchar = int_value(spec.get('LastChar', 255))
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
width_list = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = {i+firstchar: w for (i, w) in enumerate(widths)}
|
widths = {i+firstchar: w for (i, w) in enumerate(width_list)}
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
if 'Encoding' not in spec and 'FontFile' in descriptor:
|
if 'Encoding' not in spec and 'FontFile' in descriptor:
|
||||||
# try to recover the missing encoding info from the font file.
|
# try to recover the missing encoding info from the font file.
|
||||||
|
@ -622,41 +691,51 @@ class PDFType1Font(PDFSimpleFont):
|
||||||
self.cid2unicode = parser.get_encoding()
|
self.cid2unicode = parser.get_encoding()
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
|
||||||
class PDFTrueTypeFont(PDFType1Font):
|
class PDFTrueTypeFont(PDFType1Font):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
|
||||||
class PDFType3Font(PDFSimpleFont):
|
class PDFType3Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, spec):
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: "PDFResourceManager",
|
||||||
|
spec: Mapping[str, Any]
|
||||||
|
) -> None:
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
# lastchar = int_value(spec.get('LastChar', 0))
|
# lastchar = int_value(spec.get('LastChar', 0))
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
width_list = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = {i+firstchar: w for (i, w) in enumerate(widths)}
|
widths = {i+firstchar: w for (i, w) in enumerate(width_list)}
|
||||||
if 'FontDescriptor' in spec:
|
if 'FontDescriptor' in spec:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
else:
|
else:
|
||||||
descriptor = {'Ascent': 0, 'Descent': 0,
|
descriptor = {'Ascent': 0, 'Descent': 0,
|
||||||
'FontBBox': spec['FontBBox']}
|
'FontBBox': spec['FontBBox']}
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
self.matrix = cast(Matrix, tuple(list_value(spec.get('FontMatrix'))))
|
||||||
(_, self.descent, _, self.ascent) = self.bbox
|
(_, self.descent, _, self.ascent) = self.bbox
|
||||||
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
|
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFType3Font>'
|
return '<PDFType3Font>'
|
||||||
|
|
||||||
|
|
||||||
class PDFCIDFont(PDFFont):
|
class PDFCIDFont(PDFFont):
|
||||||
|
default_disp: Union[float, Tuple[Optional[float], float]]
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
|
def __init__(
|
||||||
|
self,
|
||||||
|
rsrcmgr: "PDFResourceManager",
|
||||||
|
spec: Mapping[str, Any],
|
||||||
|
strict: bool = settings.STRICT
|
||||||
|
) -> None:
|
||||||
try:
|
try:
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -669,7 +748,7 @@ class PDFCIDFont(PDFFont):
|
||||||
cid_ordering = resolve1(
|
cid_ordering = resolve1(
|
||||||
self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
|
self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")
|
||||||
self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering)
|
self.cidcoding = '{}-{}'.format(cid_registry, cid_ordering)
|
||||||
self.cmap = self.get_cmap_from_spec(spec, strict)
|
self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
|
@ -682,7 +761,7 @@ class PDFCIDFont(PDFFont):
|
||||||
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
||||||
ttf = TrueTypeFont(self.basefont,
|
ttf = TrueTypeFont(self.basefont,
|
||||||
BytesIO(self.fontfile.get_data()))
|
BytesIO(self.fontfile.get_data()))
|
||||||
self.unicode_map = None
|
self.unicode_map: Optional[UnicodeMap] = None
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.unicode_map = FileUnicodeMap()
|
self.unicode_map = FileUnicodeMap()
|
||||||
|
@ -703,12 +782,12 @@ class PDFCIDFont(PDFFont):
|
||||||
self.vertical = self.cmap.is_vertical()
|
self.vertical = self.cmap.is_vertical()
|
||||||
if self.vertical:
|
if self.vertical:
|
||||||
# writing mode: vertical
|
# writing mode: vertical
|
||||||
widths = get_widths2(list_value(spec.get('W2', [])))
|
widths2 = get_widths2(list_value(spec.get('W2', [])))
|
||||||
self.disps = {cid: (vx, vy)
|
self.disps = {cid: (vx, vy)
|
||||||
for (cid, (_, (vx, vy))) in widths.items()}
|
for (cid, (_, (vx, vy))) in widths2.items()}
|
||||||
(vy, w) = resolve1(spec.get('DW2', [880, -1000]))
|
(vy, w) = resolve1(spec.get('DW2', [880, -1000]))
|
||||||
self.default_disp = (None, vy)
|
self.default_disp = (None, vy)
|
||||||
widths = {cid: w for (cid, (w, _)) in widths.items()}
|
widths = {cid: w for (cid, (w, _)) in widths2.items()}
|
||||||
default_width = w
|
default_width = w
|
||||||
else:
|
else:
|
||||||
# writing mode: horizontal
|
# writing mode: horizontal
|
||||||
|
@ -719,7 +798,11 @@ class PDFCIDFont(PDFFont):
|
||||||
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
|
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cmap_from_spec(self, spec, strict):
|
def get_cmap_from_spec(
|
||||||
|
self,
|
||||||
|
spec: Mapping[str, Any],
|
||||||
|
strict: bool
|
||||||
|
) -> CMapBase:
|
||||||
"""Get cmap from font specification
|
"""Get cmap from font specification
|
||||||
|
|
||||||
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
||||||
|
@ -738,7 +821,7 @@ class PDFCIDFont(PDFFont):
|
||||||
return CMap()
|
return CMap()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_cmap_name(spec, strict):
|
def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
|
||||||
"""Get cmap name from font specification"""
|
"""Get cmap name from font specification"""
|
||||||
cmap_name = 'unknown' # default value
|
cmap_name = 'unknown' # default value
|
||||||
|
|
||||||
|
@ -752,34 +835,37 @@ class PDFCIDFont(PDFFont):
|
||||||
if strict:
|
if strict:
|
||||||
raise PDFFontError('Encoding is unspecified')
|
raise PDFFontError('Encoding is unspecified')
|
||||||
|
|
||||||
if type(cmap_name) is PDFStream:
|
if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap]
|
||||||
if 'CMapName' in cmap_name:
|
cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
|
||||||
cmap_name = cmap_name.get('CMapName').name
|
if 'CMapName' in cmap_name_stream:
|
||||||
|
cmap_name = cmap_name_stream.get('CMapName').name
|
||||||
else:
|
else:
|
||||||
if strict:
|
if strict:
|
||||||
raise PDFFontError('CMapName unspecified for encoding')
|
raise PDFFontError('CMapName unspecified for encoding')
|
||||||
|
|
||||||
cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
|
return IDENTITY_ENCODER.get(cmap_name, cmap_name)
|
||||||
return cmap_name
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
|
return '<PDFCIDFont: basefont={!r}, cidcoding={!r}>'\
|
||||||
.format(self.basefont, self.cidcoding)
|
.format(self.basefont, self.cidcoding)
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self) -> bool:
|
||||||
return self.vertical
|
return self.vertical
|
||||||
|
|
||||||
def is_multibyte(self):
|
def is_multibyte(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def decode(self, bytes):
|
def decode(self, bytes: bytes) -> Iterable[int]:
|
||||||
return self.cmap.decode(bytes)
|
return self.cmap.decode(bytes)
|
||||||
|
|
||||||
def char_disp(self, cid):
|
def char_disp(
|
||||||
|
self,
|
||||||
|
cid: int
|
||||||
|
) -> Union[float, Tuple[Optional[float], float]]:
|
||||||
"Returns an integer for horizontal fonts, a tuple for vertical fonts."
|
"Returns an integer for horizontal fonts, a tuple for vertical fonts."
|
||||||
return self.disps.get(cid, self.default_disp)
|
return self.disps.get(cid, self.default_disp)
|
||||||
|
|
||||||
def to_unichr(self, cid):
|
def to_unichr(self, cid: int) -> str:
|
||||||
try:
|
try:
|
||||||
if not self.unicode_map:
|
if not self.unicode_map:
|
||||||
raise KeyError(cid)
|
raise KeyError(cid)
|
||||||
|
@ -788,7 +874,7 @@ class PDFCIDFont(PDFFont):
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv: List[str]) -> None:
|
||||||
for fname in argv[1:]:
|
for fname in argv[1:]:
|
||||||
fp = open(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
font = CFFFont(fname, fp)
|
font = CFFFont(fname, fp)
|
||||||
|
@ -798,4 +884,4 @@ def main(argv):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
main(sys.argv)
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from .cmapdb import CMapDB
|
from .cmapdb import CMapDB
|
||||||
from .cmapdb import CMap
|
from .cmapdb import CMap
|
||||||
from .psparser import PSTypeError
|
from .cmapdb import CMapBase
|
||||||
|
from .psparser import PSLiteral, PSTypeError
|
||||||
|
from .psparser import PSStackType
|
||||||
from .psparser import PSEOF
|
from .psparser import PSEOF
|
||||||
from .psparser import PSKeyword
|
from .psparser import PSKeyword
|
||||||
from .psparser import literal_name
|
from .psparser import literal_name
|
||||||
|
@ -12,6 +15,9 @@ from .psparser import PSStackParser
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
from . import settings
|
from . import settings
|
||||||
|
from .pdfdevice import PDFDevice
|
||||||
|
from .pdfdevice import PDFTextSeq
|
||||||
|
from .pdfpage import PDFPage
|
||||||
from .pdftypes import PDFException
|
from .pdftypes import PDFException
|
||||||
from .pdftypes import PDFStream
|
from .pdftypes import PDFStream
|
||||||
from .pdftypes import PDFObjRef
|
from .pdftypes import PDFObjRef
|
||||||
|
@ -19,6 +25,7 @@ from .pdftypes import resolve1
|
||||||
from .pdftypes import list_value
|
from .pdftypes import list_value
|
||||||
from .pdftypes import dict_value
|
from .pdftypes import dict_value
|
||||||
from .pdftypes import stream_value
|
from .pdftypes import stream_value
|
||||||
|
from .pdffont import PDFFont
|
||||||
from .pdffont import PDFFontError
|
from .pdffont import PDFFontError
|
||||||
from .pdffont import PDFType1Font
|
from .pdffont import PDFType1Font
|
||||||
from .pdffont import PDFTrueTypeFont
|
from .pdffont import PDFTrueTypeFont
|
||||||
|
@ -26,6 +33,7 @@ from .pdffont import PDFType3Font
|
||||||
from .pdffont import PDFCIDFont
|
from .pdffont import PDFCIDFont
|
||||||
from .pdfcolor import PDFColorSpace
|
from .pdfcolor import PDFColorSpace
|
||||||
from .pdfcolor import PREDEFINED_COLORSPACE
|
from .pdfcolor import PREDEFINED_COLORSPACE
|
||||||
|
from .utils import Matrix, Point, PathSegment, Rect
|
||||||
from .utils import choplist
|
from .utils import choplist
|
||||||
from .utils import mult_matrix
|
from .utils import mult_matrix
|
||||||
from .utils import MATRIX_IDENTITY
|
from .utils import MATRIX_IDENTITY
|
||||||
|
@ -50,22 +58,24 @@ LITERAL_IMAGE = LIT('Image')
|
||||||
|
|
||||||
|
|
||||||
class PDFTextState:
|
class PDFTextState:
|
||||||
|
matrix: Matrix
|
||||||
|
linematrix: Point
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self.font = None
|
self.font: Optional[PDFFont] = None
|
||||||
self.fontsize = 0
|
self.fontsize: float = 0
|
||||||
self.charspace = 0
|
self.charspace: float = 0
|
||||||
self.wordspace = 0
|
self.wordspace: float = 0
|
||||||
self.scaling = 100
|
self.scaling: float = 100
|
||||||
self.leading = 0
|
self.leading: float = 0
|
||||||
self.render = 0
|
self.render: int = 0
|
||||||
self.rise = 0
|
self.rise: float = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
# self.matrix is set
|
# self.matrix is set
|
||||||
# self.linematrix is set
|
# self.linematrix is set
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \
|
return '<PDFTextState: font=%r, fontsize=%r, charspace=%r, ' \
|
||||||
'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \
|
'wordspace=%r, scaling=%r, leading=%r, render=%r, rise=%r, ' \
|
||||||
'matrix=%r, linematrix=%r>' \
|
'matrix=%r, linematrix=%r>' \
|
||||||
|
@ -73,7 +83,7 @@ class PDFTextState:
|
||||||
self.scaling, self.leading, self.render, self.rise,
|
self.scaling, self.leading, self.render, self.rise,
|
||||||
self.matrix, self.linematrix)
|
self.matrix, self.linematrix)
|
||||||
|
|
||||||
def copy(self):
|
def copy(self) -> "PDFTextState":
|
||||||
obj = PDFTextState()
|
obj = PDFTextState()
|
||||||
obj.font = self.font
|
obj.font = self.font
|
||||||
obj.fontsize = self.fontsize
|
obj.fontsize = self.fontsize
|
||||||
|
@ -87,31 +97,37 @@ class PDFTextState:
|
||||||
obj.linematrix = self.linematrix
|
obj.linematrix = self.linematrix
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self.matrix = MATRIX_IDENTITY
|
self.matrix = MATRIX_IDENTITY
|
||||||
self.linematrix = (0, 0)
|
self.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
Color = Union[
|
||||||
|
float, # Greyscale
|
||||||
|
Tuple[float, float, float], # R, G, B
|
||||||
|
Tuple[float, float, float, float]] # C, M, Y, K
|
||||||
|
|
||||||
|
|
||||||
class PDFGraphicState:
|
class PDFGraphicState:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self.linewidth = 0
|
self.linewidth: float = 0
|
||||||
self.linecap = None
|
self.linecap: Optional[object] = None
|
||||||
self.linejoin = None
|
self.linejoin: Optional[object] = None
|
||||||
self.miterlimit = None
|
self.miterlimit: Optional[object] = None
|
||||||
self.dash = None
|
self.dash: Optional[Tuple[object, object]] = None
|
||||||
self.intent = None
|
self.intent: Optional[object] = None
|
||||||
self.flatness = None
|
self.flatness: Optional[object] = None
|
||||||
|
|
||||||
# stroking color
|
# stroking color
|
||||||
self.scolor = None
|
self.scolor: Optional[Color] = None
|
||||||
|
|
||||||
# non stroking color
|
# non stroking color
|
||||||
self.ncolor = None
|
self.ncolor: Optional[Color] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def copy(self):
|
def copy(self) -> "PDFGraphicState":
|
||||||
obj = PDFGraphicState()
|
obj = PDFGraphicState()
|
||||||
obj.linewidth = self.linewidth
|
obj.linewidth = self.linewidth
|
||||||
obj.linecap = self.linecap
|
obj.linecap = self.linecap
|
||||||
|
@ -124,7 +140,7 @@ class PDFGraphicState:
|
||||||
obj.ncolor = self.ncolor
|
obj.ncolor = self.ncolor
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
|
return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
|
||||||
' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
|
' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
|
||||||
' stroking color=%r, non stroking color=%r>' %
|
' stroking color=%r, non stroking color=%r>' %
|
||||||
|
@ -141,12 +157,12 @@ class PDFResourceManager:
|
||||||
allocated multiple times.
|
allocated multiple times.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, caching=True):
|
def __init__(self, caching: bool = True) -> None:
|
||||||
self.caching = caching
|
self.caching = caching
|
||||||
self._cached_fonts = {}
|
self._cached_fonts: Dict[object, PDFFont] = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_procset(self, procs):
|
def get_procset(self, procs: Sequence[object]) -> None:
|
||||||
for proc in procs:
|
for proc in procs:
|
||||||
if proc is LITERAL_PDF:
|
if proc is LITERAL_PDF:
|
||||||
pass
|
pass
|
||||||
|
@ -156,7 +172,7 @@ class PDFResourceManager:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cmap(self, cmapname, strict=False):
|
def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase:
|
||||||
try:
|
try:
|
||||||
return CMapDB.get_cmap(cmapname)
|
return CMapDB.get_cmap(cmapname)
|
||||||
except CMapDB.CMapNotFound:
|
except CMapDB.CMapNotFound:
|
||||||
|
@ -164,7 +180,7 @@ class PDFResourceManager:
|
||||||
raise
|
raise
|
||||||
return CMap()
|
return CMap()
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
|
||||||
if objid and objid in self._cached_fonts:
|
if objid and objid in self._cached_fonts:
|
||||||
font = self._cached_fonts[objid]
|
font = self._cached_fonts[objid]
|
||||||
else:
|
else:
|
||||||
|
@ -209,15 +225,18 @@ class PDFResourceManager:
|
||||||
return font
|
return font
|
||||||
|
|
||||||
|
|
||||||
class PDFContentParser(PSStackParser):
|
class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
|
||||||
|
|
||||||
def __init__(self, streams):
|
def __init__(self, streams: Sequence[object]) -> None:
|
||||||
self.streams = streams
|
self.streams = streams
|
||||||
self.istream = 0
|
self.istream = 0
|
||||||
PSStackParser.__init__(self, None)
|
# PSStackParser.__init__(fp=None) is safe only because we've overloaded
|
||||||
|
# all the methods that would attempt to access self.fp without first
|
||||||
|
# calling self.fillfp().
|
||||||
|
PSStackParser.__init__(self, None) # type: ignore[arg-type]
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillfp(self):
|
def fillfp(self) -> None:
|
||||||
if not self.fp:
|
if not self.fp:
|
||||||
if self.istream < len(self.streams):
|
if self.istream < len(self.streams):
|
||||||
strm = stream_value(self.streams[self.istream])
|
strm = stream_value(self.streams[self.istream])
|
||||||
|
@ -227,12 +246,12 @@ class PDFContentParser(PSStackParser):
|
||||||
self.fp = BytesIO(strm.get_data())
|
self.fp = BytesIO(strm.get_data())
|
||||||
return
|
return
|
||||||
|
|
||||||
def seek(self, pos):
|
def seek(self, pos: int) -> None:
|
||||||
self.fillfp()
|
self.fillfp()
|
||||||
PSStackParser.seek(self, pos)
|
PSStackParser.seek(self, pos)
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillbuf(self):
|
def fillbuf(self) -> None:
|
||||||
if self.charpos < len(self.buf):
|
if self.charpos < len(self.buf):
|
||||||
return
|
return
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -241,19 +260,23 @@ class PDFContentParser(PSStackParser):
|
||||||
self.buf = self.fp.read(self.BUFSIZ)
|
self.buf = self.fp.read(self.BUFSIZ)
|
||||||
if self.buf:
|
if self.buf:
|
||||||
break
|
break
|
||||||
self.fp = None
|
self.fp = None # type: ignore[assignment]
|
||||||
self.charpos = 0
|
self.charpos = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_inline_data(self, pos, target=b'EI'):
|
def get_inline_data(
|
||||||
|
self,
|
||||||
|
pos: int,
|
||||||
|
target: bytes = b'EI'
|
||||||
|
) -> Tuple[int, bytes]:
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
i = 0
|
i = 0
|
||||||
data = b''
|
data = b''
|
||||||
while i <= len(target):
|
while i <= len(target):
|
||||||
self.fillbuf()
|
self.fillbuf()
|
||||||
if i:
|
if i:
|
||||||
c = self.buf[self.charpos]
|
ci = self.buf[self.charpos]
|
||||||
c = bytes((c,))
|
c = bytes((ci,))
|
||||||
data += c
|
data += c
|
||||||
self.charpos += 1
|
self.charpos += 1
|
||||||
if len(target) <= i and c.isspace():
|
if len(target) <= i and c.isspace():
|
||||||
|
@ -275,7 +298,7 @@ class PDFContentParser(PSStackParser):
|
||||||
data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
|
data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
|
||||||
return (pos, data)
|
return (pos, data)
|
||||||
|
|
||||||
def flush(self):
|
def flush(self) -> None:
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -283,7 +306,7 @@ class PDFContentParser(PSStackParser):
|
||||||
KEYWORD_ID = KWD(b'ID')
|
KEYWORD_ID = KWD(b'ID')
|
||||||
KEYWORD_EI = KWD(b'EI')
|
KEYWORD_EI = KWD(b'EI')
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
if token is self.KEYWORD_BI:
|
if token is self.KEYWORD_BI:
|
||||||
# inline image within a content stream
|
# inline image within a content stream
|
||||||
self.start_type(pos, 'inline')
|
self.start_type(pos, 'inline')
|
||||||
|
@ -307,30 +330,34 @@ class PDFContentParser(PSStackParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
PDFStackT = PSStackType[PDFStream]
|
||||||
|
"""Types that may appear on the PDF argument stack."""
|
||||||
|
|
||||||
|
|
||||||
class PDFPageInterpreter:
|
class PDFPageInterpreter:
|
||||||
"""Processor for the content of a PDF page
|
"""Processor for the content of a PDF page
|
||||||
|
|
||||||
Reference: PDF Reference, Appendix A, Operator Summary
|
Reference: PDF Reference, Appendix A, Operator Summary
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, device):
|
def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice) -> None:
|
||||||
self.rsrcmgr = rsrcmgr
|
self.rsrcmgr = rsrcmgr
|
||||||
self.device = device
|
self.device = device
|
||||||
return
|
return
|
||||||
|
|
||||||
def dup(self):
|
def dup(self) -> "PDFPageInterpreter":
|
||||||
return self.__class__(self.rsrcmgr, self.device)
|
return self.__class__(self.rsrcmgr, self.device)
|
||||||
|
|
||||||
def init_resources(self, resources):
|
def init_resources(self, resources: Dict[object, object]) -> None:
|
||||||
"""Prepare the fonts and XObjects listed in the Resource attribute."""
|
"""Prepare the fonts and XObjects listed in the Resource attribute."""
|
||||||
self.resources = resources
|
self.resources = resources
|
||||||
self.fontmap = {}
|
self.fontmap: Dict[object, PDFFont] = {}
|
||||||
self.xobjmap = {}
|
self.xobjmap = {}
|
||||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
|
||||||
if not resources:
|
if not resources:
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_colorspace(spec):
|
def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
|
||||||
if isinstance(spec, list):
|
if isinstance(spec, list):
|
||||||
name = literal_name(spec[0])
|
name = literal_name(spec[0])
|
||||||
else:
|
else:
|
||||||
|
@ -343,6 +370,7 @@ class PDFPageInterpreter:
|
||||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||||
else:
|
else:
|
||||||
return PREDEFINED_COLORSPACE.get(name)
|
return PREDEFINED_COLORSPACE.get(name)
|
||||||
|
|
||||||
for (k, v) in dict_value(resources).items():
|
for (k, v) in dict_value(resources).items():
|
||||||
log.debug('Resource: %r: %r', k, v)
|
log.debug('Resource: %r: %r', k, v)
|
||||||
if k == 'Font':
|
if k == 'Font':
|
||||||
|
@ -354,7 +382,9 @@ class PDFPageInterpreter:
|
||||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||||
elif k == 'ColorSpace':
|
elif k == 'ColorSpace':
|
||||||
for (csid, spec) in dict_value(v).items():
|
for (csid, spec) in dict_value(v).items():
|
||||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
colorspace = get_colorspace(resolve1(spec))
|
||||||
|
if colorspace is not None:
|
||||||
|
self.csmap[csid] = colorspace
|
||||||
elif k == 'ProcSet':
|
elif k == 'ProcSet':
|
||||||
self.rsrcmgr.get_procset(list_value(v))
|
self.rsrcmgr.get_procset(list_value(v))
|
||||||
elif k == 'XObject':
|
elif k == 'XObject':
|
||||||
|
@ -362,130 +392,180 @@ class PDFPageInterpreter:
|
||||||
self.xobjmap[xobjid] = xobjstrm
|
self.xobjmap[xobjid] = xobjstrm
|
||||||
return
|
return
|
||||||
|
|
||||||
def init_state(self, ctm):
|
def init_state(self, ctm: Matrix) -> None:
|
||||||
"""Initialize the text and graphic states for rendering a page."""
|
"""Initialize the text and graphic states for rendering a page."""
|
||||||
self.gstack = [] # stack for graphical states.
|
# gstack: stack for graphical states.
|
||||||
|
self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = []
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
self.device.set_ctm(self.ctm)
|
self.device.set_ctm(self.ctm)
|
||||||
self.textstate = PDFTextState()
|
self.textstate = PDFTextState()
|
||||||
self.graphicstate = PDFGraphicState()
|
self.graphicstate = PDFGraphicState()
|
||||||
self.curpath = []
|
self.curpath: List[PathSegment] = []
|
||||||
# argstack: stack for command arguments.
|
# argstack: stack for command arguments.
|
||||||
self.argstack = []
|
self.argstack: List[PDFStackT] = []
|
||||||
# set some global states.
|
# set some global states.
|
||||||
self.scs = self.ncs = None
|
self.scs: Optional[PDFColorSpace] = None
|
||||||
|
self.ncs: Optional[PDFColorSpace] = None
|
||||||
if self.csmap:
|
if self.csmap:
|
||||||
self.scs = self.ncs = next(iter(self.csmap.values()))
|
self.scs = self.ncs = next(iter(self.csmap.values()))
|
||||||
return
|
return
|
||||||
|
|
||||||
def push(self, obj):
|
def push(self, obj: PDFStackT) -> None:
|
||||||
self.argstack.append(obj)
|
self.argstack.append(obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def pop(self, n):
|
def pop(self, n: int) -> List[PDFStackT]:
|
||||||
if n == 0:
|
if n == 0:
|
||||||
return []
|
return []
|
||||||
x = self.argstack[-n:]
|
x = self.argstack[-n:]
|
||||||
self.argstack = self.argstack[:-n]
|
self.argstack = self.argstack[:-n]
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def get_current_state(self):
|
def get_current_state(
|
||||||
|
self
|
||||||
|
) -> Tuple[Matrix, PDFTextState, PDFGraphicState]:
|
||||||
return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
|
return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
|
||||||
|
|
||||||
def set_current_state(self, state):
|
def set_current_state(
|
||||||
|
self,
|
||||||
|
state: Tuple[Matrix, PDFTextState, PDFGraphicState]
|
||||||
|
) -> None:
|
||||||
(self.ctm, self.textstate, self.graphicstate) = state
|
(self.ctm, self.textstate, self.graphicstate) = state
|
||||||
self.device.set_ctm(self.ctm)
|
self.device.set_ctm(self.ctm)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_q(self):
|
def do_q(self) -> None:
|
||||||
"""Save graphics state"""
|
"""Save graphics state"""
|
||||||
self.gstack.append(self.get_current_state())
|
self.gstack.append(self.get_current_state())
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Q(self):
|
def do_Q(self) -> None:
|
||||||
"""Restore graphics state"""
|
"""Restore graphics state"""
|
||||||
if self.gstack:
|
if self.gstack:
|
||||||
self.set_current_state(self.gstack.pop())
|
self.set_current_state(self.gstack.pop())
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_cm(self, a1, b1, c1, d1, e1, f1):
|
def do_cm(
|
||||||
|
self,
|
||||||
|
a1: PDFStackT,
|
||||||
|
b1: PDFStackT,
|
||||||
|
c1: PDFStackT,
|
||||||
|
d1: PDFStackT,
|
||||||
|
e1: PDFStackT,
|
||||||
|
f1: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Concatenate matrix to current transformation matrix"""
|
"""Concatenate matrix to current transformation matrix"""
|
||||||
self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
|
self.ctm = \
|
||||||
|
mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm)
|
||||||
self.device.set_ctm(self.ctm)
|
self.device.set_ctm(self.ctm)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_w(self, linewidth):
|
def do_w(self, linewidth: PDFStackT) -> None:
|
||||||
"""Set line width"""
|
"""Set line width"""
|
||||||
self.graphicstate.linewidth = linewidth
|
self.graphicstate.linewidth = cast(float, linewidth)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_J(self, linecap):
|
def do_J(self, linecap: PDFStackT) -> None:
|
||||||
"""Set line cap style"""
|
"""Set line cap style"""
|
||||||
self.graphicstate.linecap = linecap
|
self.graphicstate.linecap = linecap
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_j(self, linejoin):
|
def do_j(self, linejoin: PDFStackT) -> None:
|
||||||
"""Set line join style"""
|
"""Set line join style"""
|
||||||
self.graphicstate.linejoin = linejoin
|
self.graphicstate.linejoin = linejoin
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_M(self, miterlimit):
|
def do_M(self, miterlimit: PDFStackT) -> None:
|
||||||
"""Set miter limit"""
|
"""Set miter limit"""
|
||||||
self.graphicstate.miterlimit = miterlimit
|
self.graphicstate.miterlimit = miterlimit
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_d(self, dash, phase):
|
def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None:
|
||||||
"""Set line dash pattern"""
|
"""Set line dash pattern"""
|
||||||
self.graphicstate.dash = (dash, phase)
|
self.graphicstate.dash = (dash, phase)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_ri(self, intent):
|
def do_ri(self, intent: PDFStackT) -> None:
|
||||||
"""Set color rendering intent"""
|
"""Set color rendering intent"""
|
||||||
self.graphicstate.intent = intent
|
self.graphicstate.intent = intent
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_i(self, flatness):
|
def do_i(self, flatness: PDFStackT) -> None:
|
||||||
"""Set flatness tolerance"""
|
"""Set flatness tolerance"""
|
||||||
self.graphicstate.flatness = flatness
|
self.graphicstate.flatness = flatness
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_gs(self, name):
|
def do_gs(self, name: PDFStackT) -> None:
|
||||||
"""Set parameters from graphics state parameter dictionary"""
|
"""Set parameters from graphics state parameter dictionary"""
|
||||||
# todo
|
# todo
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_m(self, x, y):
|
def do_m(self, x: PDFStackT, y: PDFStackT) -> None:
|
||||||
"""Begin new subpath"""
|
"""Begin new subpath"""
|
||||||
self.curpath.append(('m', x, y))
|
self.curpath.append(('m', cast(float, x), cast(float, y)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_l(self, x, y):
|
def do_l(self, x: PDFStackT, y: PDFStackT) -> None:
|
||||||
"""Append straight line segment to path"""
|
"""Append straight line segment to path"""
|
||||||
self.curpath.append(('l', x, y))
|
self.curpath.append(('l', cast(float, x), cast(float, y)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_c(self, x1, y1, x2, y2, x3, y3):
|
def do_c(
|
||||||
|
self,
|
||||||
|
x1: PDFStackT,
|
||||||
|
y1: PDFStackT,
|
||||||
|
x2: PDFStackT,
|
||||||
|
y2: PDFStackT,
|
||||||
|
x3: PDFStackT,
|
||||||
|
y3: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Append curved segment to path (three control points)"""
|
"""Append curved segment to path (three control points)"""
|
||||||
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
|
self.curpath.append(('c', cast(float, x1), cast(float, y1),
|
||||||
|
cast(float, x2), cast(float, y2),
|
||||||
|
cast(float, x3), cast(float, y3)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_v(self, x2, y2, x3, y3):
|
def do_v(
|
||||||
|
self,
|
||||||
|
x2: PDFStackT,
|
||||||
|
y2: PDFStackT,
|
||||||
|
x3: PDFStackT,
|
||||||
|
y3: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Append curved segment to path (initial point replicated)"""
|
"""Append curved segment to path (initial point replicated)"""
|
||||||
self.curpath.append(('v', x2, y2, x3, y3))
|
self.curpath.append(('v', cast(float, x2), cast(float, y2),
|
||||||
|
cast(float, x3), cast(float, y3)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_y(self, x1, y1, x3, y3):
|
def do_y(
|
||||||
|
self,
|
||||||
|
x1: PDFStackT,
|
||||||
|
y1: PDFStackT,
|
||||||
|
x3: PDFStackT,
|
||||||
|
y3: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Append curved segment to path (final point replicated)"""
|
"""Append curved segment to path (final point replicated)"""
|
||||||
self.curpath.append(('y', x1, y1, x3, y3))
|
self.curpath.append(('y', cast(float, x1), cast(float, y1),
|
||||||
|
cast(float, x3), cast(float, y3)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_h(self):
|
def do_h(self) -> None:
|
||||||
"""Close subpath"""
|
"""Close subpath"""
|
||||||
self.curpath.append(('h',))
|
self.curpath.append(('h',))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_re(self, x, y, w, h):
|
def do_re(
|
||||||
|
self,
|
||||||
|
x: PDFStackT,
|
||||||
|
y: PDFStackT,
|
||||||
|
w: PDFStackT,
|
||||||
|
h: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Append rectangle to path"""
|
"""Append rectangle to path"""
|
||||||
|
x = cast(float, x)
|
||||||
|
y = cast(float, y)
|
||||||
|
w = cast(float, w)
|
||||||
|
h = cast(float, h)
|
||||||
self.curpath.append(('m', x, y))
|
self.curpath.append(('m', x, y))
|
||||||
self.curpath.append(('l', x+w, y))
|
self.curpath.append(('l', x+w, y))
|
||||||
self.curpath.append(('l', x+w, y+h))
|
self.curpath.append(('l', x+w, y+h))
|
||||||
|
@ -493,77 +573,77 @@ class PDFPageInterpreter:
|
||||||
self.curpath.append(('h',))
|
self.curpath.append(('h',))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_S(self):
|
def do_S(self) -> None:
|
||||||
"""Stroke path"""
|
"""Stroke path"""
|
||||||
self.device.paint_path(self.graphicstate, True, False, False,
|
self.device.paint_path(self.graphicstate, True, False, False,
|
||||||
self.curpath)
|
self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_s(self):
|
def do_s(self) -> None:
|
||||||
"""Close and stroke path"""
|
"""Close and stroke path"""
|
||||||
self.do_h()
|
self.do_h()
|
||||||
self.do_S()
|
self.do_S()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_f(self):
|
def do_f(self) -> None:
|
||||||
"""Fill path using nonzero winding number rule"""
|
"""Fill path using nonzero winding number rule"""
|
||||||
self.device.paint_path(self.graphicstate, False, True, False,
|
self.device.paint_path(self.graphicstate, False, True, False,
|
||||||
self.curpath)
|
self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_F(self):
|
def do_F(self) -> None:
|
||||||
"""Fill path using nonzero winding number rule (obsolete)"""
|
"""Fill path using nonzero winding number rule (obsolete)"""
|
||||||
return self.do_f()
|
return self.do_f()
|
||||||
|
|
||||||
def do_f_a(self):
|
def do_f_a(self) -> None:
|
||||||
"""Fill path using even-odd rule"""
|
"""Fill path using even-odd rule"""
|
||||||
self.device.paint_path(self.graphicstate, False, True, True,
|
self.device.paint_path(self.graphicstate, False, True, True,
|
||||||
self.curpath)
|
self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_B(self):
|
def do_B(self) -> None:
|
||||||
"""Fill and stroke path using nonzero winding number rule"""
|
"""Fill and stroke path using nonzero winding number rule"""
|
||||||
self.device.paint_path(self.graphicstate, True, True, False,
|
self.device.paint_path(self.graphicstate, True, True, False,
|
||||||
self.curpath)
|
self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_B_a(self):
|
def do_B_a(self) -> None:
|
||||||
"""Fill and stroke path using even-odd rule"""
|
"""Fill and stroke path using even-odd rule"""
|
||||||
self.device.paint_path(self.graphicstate, True, True, True,
|
self.device.paint_path(self.graphicstate, True, True, True,
|
||||||
self.curpath)
|
self.curpath)
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_b(self):
|
def do_b(self) -> None:
|
||||||
"""Close, fill, and stroke path using nonzero winding number rule"""
|
"""Close, fill, and stroke path using nonzero winding number rule"""
|
||||||
self.do_h()
|
self.do_h()
|
||||||
self.do_B()
|
self.do_B()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_b_a(self):
|
def do_b_a(self) -> None:
|
||||||
"""Close, fill, and stroke path using even-odd rule"""
|
"""Close, fill, and stroke path using even-odd rule"""
|
||||||
self.do_h()
|
self.do_h()
|
||||||
self.do_B_a()
|
self.do_B_a()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_n(self):
|
def do_n(self) -> None:
|
||||||
"""End path without filling or stroking"""
|
"""End path without filling or stroking"""
|
||||||
self.curpath = []
|
self.curpath = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_W(self):
|
def do_W(self) -> None:
|
||||||
"""Set clipping path using nonzero winding number rule"""
|
"""Set clipping path using nonzero winding number rule"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_W_a(self):
|
def do_W_a(self) -> None:
|
||||||
"""Set clipping path using even-odd rule"""
|
"""Set clipping path using even-odd rule"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_CS(self, name):
|
def do_CS(self, name: PDFStackT) -> None:
|
||||||
"""Set color space for stroking operations
|
"""Set color space for stroking operations
|
||||||
|
|
||||||
Introduced in PDF 1.1
|
Introduced in PDF 1.1
|
||||||
|
@ -575,7 +655,7 @@ class PDFPageInterpreter:
|
||||||
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_cs(self, name):
|
def do_cs(self, name: PDFStackT) -> None:
|
||||||
"""Set color space for nonstroking operations"""
|
"""Set color space for nonstroking operations"""
|
||||||
try:
|
try:
|
||||||
self.ncs = self.csmap[literal_name(name)]
|
self.ncs = self.csmap[literal_name(name)]
|
||||||
|
@ -584,37 +664,53 @@ class PDFPageInterpreter:
|
||||||
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_G(self, gray):
|
def do_G(self, gray: PDFStackT) -> None:
|
||||||
"""Set gray level for stroking operations"""
|
"""Set gray level for stroking operations"""
|
||||||
self.graphicstate.scolor = gray
|
self.graphicstate.scolor = cast(float, gray)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_g(self, gray):
|
def do_g(self, gray: PDFStackT) -> None:
|
||||||
"""Set gray level for nonstroking operations"""
|
"""Set gray level for nonstroking operations"""
|
||||||
self.graphicstate.ncolor = gray
|
self.graphicstate.ncolor = cast(float, gray)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_RG(self, r, g, b):
|
def do_RG(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
|
||||||
"""Set RGB color for stroking operations"""
|
"""Set RGB color for stroking operations"""
|
||||||
self.graphicstate.scolor = (r, g, b)
|
self.graphicstate.scolor = \
|
||||||
|
(cast(float, r), cast(float, g), cast(float, b))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_rg(self, r, g, b):
|
def do_rg(self, r: PDFStackT, g: PDFStackT, b: PDFStackT) -> None:
|
||||||
"""Set RGB color for nonstroking operations"""
|
"""Set RGB color for nonstroking operations"""
|
||||||
self.graphicstate.ncolor = (r, g, b)
|
self.graphicstate.ncolor = \
|
||||||
|
(cast(float, r), cast(float, g), cast(float, b))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_K(self, c, m, y, k):
|
def do_K(
|
||||||
|
self,
|
||||||
|
c: PDFStackT,
|
||||||
|
m: PDFStackT,
|
||||||
|
y: PDFStackT,
|
||||||
|
k: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Set CMYK color for stroking operations"""
|
"""Set CMYK color for stroking operations"""
|
||||||
self.graphicstate.scolor = (c, m, y, k)
|
self.graphicstate.scolor = \
|
||||||
|
(cast(float, c), cast(float, m), cast(float, y), cast(float, k))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_k(self, c, m, y, k):
|
def do_k(
|
||||||
|
self,
|
||||||
|
c: PDFStackT,
|
||||||
|
m: PDFStackT,
|
||||||
|
y: PDFStackT,
|
||||||
|
k: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Set CMYK color for nonstroking operations"""
|
"""Set CMYK color for nonstroking operations"""
|
||||||
self.graphicstate.ncolor = (c, m, y, k)
|
self.graphicstate.ncolor = \
|
||||||
|
(cast(float, c), cast(float, m), cast(float, y), cast(float, k))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_SCN(self):
|
def do_SCN(self) -> None:
|
||||||
"""Set color for stroking operations."""
|
"""Set color for stroking operations."""
|
||||||
if self.scs:
|
if self.scs:
|
||||||
n = self.scs.ncomponents
|
n = self.scs.ncomponents
|
||||||
|
@ -622,10 +718,10 @@ class PDFPageInterpreter:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFInterpreterError('No colorspace specified!')
|
raise PDFInterpreterError('No colorspace specified!')
|
||||||
n = 1
|
n = 1
|
||||||
self.graphicstate.scolor = self.pop(n)
|
self.graphicstate.scolor = cast(Color, self.pop(n))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_scn(self):
|
def do_scn(self) -> None:
|
||||||
"""Set color for nonstroking operations"""
|
"""Set color for nonstroking operations"""
|
||||||
if self.ncs:
|
if self.ncs:
|
||||||
n = self.ncs.ncomponents
|
n = self.ncs.ncomponents
|
||||||
|
@ -633,24 +729,24 @@ class PDFPageInterpreter:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFInterpreterError('No colorspace specified!')
|
raise PDFInterpreterError('No colorspace specified!')
|
||||||
n = 1
|
n = 1
|
||||||
self.graphicstate.ncolor = self.pop(n)
|
self.graphicstate.ncolor = cast(Color, self.pop(n))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_SC(self):
|
def do_SC(self) -> None:
|
||||||
"""Set color for stroking operations"""
|
"""Set color for stroking operations"""
|
||||||
self.do_SCN()
|
self.do_SCN()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_sc(self):
|
def do_sc(self) -> None:
|
||||||
"""Set color for nonstroking operations"""
|
"""Set color for nonstroking operations"""
|
||||||
self.do_scn()
|
self.do_scn()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_sh(self, name):
|
def do_sh(self, name: object) -> None:
|
||||||
"""Paint area defined by shading pattern"""
|
"""Paint area defined by shading pattern"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BT(self):
|
def do_BT(self) -> None:
|
||||||
"""Begin text object
|
"""Begin text object
|
||||||
|
|
||||||
Initializing the text matrix, Tm, and the text line matrix, Tlm, to
|
Initializing the text matrix, Tm, and the text line matrix, Tlm, to
|
||||||
|
@ -660,82 +756,82 @@ class PDFPageInterpreter:
|
||||||
self.textstate.reset()
|
self.textstate.reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_ET(self):
|
def do_ET(self) -> None:
|
||||||
"""End a text object"""
|
"""End a text object"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BX(self):
|
def do_BX(self) -> None:
|
||||||
"""Begin compatibility section"""
|
"""Begin compatibility section"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_EX(self):
|
def do_EX(self) -> None:
|
||||||
"""End compatibility section"""
|
"""End compatibility section"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_MP(self, tag):
|
def do_MP(self, tag: PDFStackT) -> None:
|
||||||
"""Define marked-content point"""
|
"""Define marked-content point"""
|
||||||
self.device.do_tag(tag)
|
self.device.do_tag(cast(PSLiteral, tag))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_DP(self, tag, props):
|
def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None:
|
||||||
"""Define marked-content point with property list"""
|
"""Define marked-content point with property list"""
|
||||||
self.device.do_tag(tag, props)
|
self.device.do_tag(cast(PSLiteral, tag), props)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BMC(self, tag):
|
def do_BMC(self, tag: PDFStackT) -> None:
|
||||||
"""Begin marked-content sequence"""
|
"""Begin marked-content sequence"""
|
||||||
self.device.begin_tag(tag)
|
self.device.begin_tag(cast(PSLiteral, tag))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BDC(self, tag, props):
|
def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None:
|
||||||
"""Begin marked-content sequence with property list"""
|
"""Begin marked-content sequence with property list"""
|
||||||
self.device.begin_tag(tag, props)
|
self.device.begin_tag(cast(PSLiteral, tag), props)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_EMC(self):
|
def do_EMC(self) -> None:
|
||||||
"""End marked-content sequence"""
|
"""End marked-content sequence"""
|
||||||
self.device.end_tag()
|
self.device.end_tag()
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tc(self, space):
|
def do_Tc(self, space: PDFStackT) -> None:
|
||||||
"""Set character spacing.
|
"""Set character spacing.
|
||||||
|
|
||||||
Character spacing is used by the Tj, TJ, and ' operators.
|
Character spacing is used by the Tj, TJ, and ' operators.
|
||||||
|
|
||||||
:param space: a number expressed in unscaled text space units.
|
:param space: a number expressed in unscaled text space units.
|
||||||
"""
|
"""
|
||||||
self.textstate.charspace = space
|
self.textstate.charspace = cast(float, space)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tw(self, space):
|
def do_Tw(self, space: PDFStackT) -> None:
|
||||||
"""Set the word spacing.
|
"""Set the word spacing.
|
||||||
|
|
||||||
Word spacing is used by the Tj, TJ, and ' operators.
|
Word spacing is used by the Tj, TJ, and ' operators.
|
||||||
|
|
||||||
:param space: a number expressed in unscaled text space units
|
:param space: a number expressed in unscaled text space units
|
||||||
"""
|
"""
|
||||||
self.textstate.wordspace = space
|
self.textstate.wordspace = cast(float, space)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tz(self, scale):
|
def do_Tz(self, scale: PDFStackT) -> None:
|
||||||
"""Set the horizontal scaling.
|
"""Set the horizontal scaling.
|
||||||
|
|
||||||
:param scale: is a number specifying the percentage of the normal width
|
:param scale: is a number specifying the percentage of the normal width
|
||||||
"""
|
"""
|
||||||
self.textstate.scaling = scale
|
self.textstate.scaling = cast(float, scale)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_TL(self, leading):
|
def do_TL(self, leading: PDFStackT) -> None:
|
||||||
"""Set the text leading.
|
"""Set the text leading.
|
||||||
|
|
||||||
Text leading is used only by the T*, ', and " operators.
|
Text leading is used only by the T*, ', and " operators.
|
||||||
|
|
||||||
:param leading: a number expressed in unscaled text space units
|
:param leading: a number expressed in unscaled text space units
|
||||||
"""
|
"""
|
||||||
self.textstate.leading = -leading
|
self.textstate.leading = -cast(float, leading)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tf(self, fontid, fontsize):
|
def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None:
|
||||||
"""Set the text font
|
"""Set the text font
|
||||||
|
|
||||||
:param fontid: the name of a font resource in the Font subdictionary
|
:param fontid: the name of a font resource in the Font subdictionary
|
||||||
|
@ -748,44 +844,56 @@ class PDFPageInterpreter:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
raise PDFInterpreterError('Undefined Font id: %r' % fontid)
|
||||||
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||||
self.textstate.fontsize = fontsize
|
self.textstate.fontsize = cast(float, fontsize)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tr(self, render):
|
def do_Tr(self, render: PDFStackT) -> None:
|
||||||
"""Set the text rendering mode"""
|
"""Set the text rendering mode"""
|
||||||
self.textstate.render = render
|
self.textstate.render = cast(int, render)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Ts(self, rise):
|
def do_Ts(self, rise: PDFStackT) -> None:
|
||||||
"""Set the text rise
|
"""Set the text rise
|
||||||
|
|
||||||
:param rise: a number expressed in unscaled text space units
|
:param rise: a number expressed in unscaled text space units
|
||||||
"""
|
"""
|
||||||
self.textstate.rise = rise
|
self.textstate.rise = cast(float, rise)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Td(self, tx, ty):
|
def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
|
||||||
"""Move text position"""
|
"""Move text position"""
|
||||||
|
tx = cast(float, tx)
|
||||||
|
ty = cast(float, ty)
|
||||||
(a, b, c, d, e, f) = self.textstate.matrix
|
(a, b, c, d, e, f) = self.textstate.matrix
|
||||||
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_TD(self, tx, ty):
|
def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
|
||||||
"""Move text position and set leading"""
|
"""Move text position and set leading"""
|
||||||
|
tx = cast(float, tx)
|
||||||
|
ty = cast(float, ty)
|
||||||
(a, b, c, d, e, f) = self.textstate.matrix
|
(a, b, c, d, e, f) = self.textstate.matrix
|
||||||
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
|
||||||
self.textstate.leading = ty
|
self.textstate.leading = ty
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tm(self, a, b, c, d, e, f):
|
def do_Tm(
|
||||||
|
self,
|
||||||
|
a: PDFStackT,
|
||||||
|
b: PDFStackT,
|
||||||
|
c: PDFStackT,
|
||||||
|
d: PDFStackT,
|
||||||
|
e: PDFStackT,
|
||||||
|
f: PDFStackT
|
||||||
|
) -> None:
|
||||||
"""Set text matrix and text line matrix"""
|
"""Set text matrix and text line matrix"""
|
||||||
self.textstate.matrix = (a, b, c, d, e, f)
|
self.textstate.matrix = cast(Matrix, (a, b, c, d, e, f))
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_T_a(self):
|
def do_T_a(self) -> None:
|
||||||
"""Move to start of next text line"""
|
"""Move to start of next text line"""
|
||||||
(a, b, c, d, e, f) = self.textstate.matrix
|
(a, b, c, d, e, f) = self.textstate.matrix
|
||||||
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e,
|
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e,
|
||||||
|
@ -793,22 +901,23 @@ class PDFPageInterpreter:
|
||||||
self.textstate.linematrix = (0, 0)
|
self.textstate.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_TJ(self, seq):
|
def do_TJ(self, seq: PDFStackT) -> None:
|
||||||
"""Show text, allowing individual glyph positioning"""
|
"""Show text, allowing individual glyph positioning"""
|
||||||
if self.textstate.font is None:
|
if self.textstate.font is None:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFInterpreterError('No font specified!')
|
raise PDFInterpreterError('No font specified!')
|
||||||
return
|
return
|
||||||
self.device.render_string(self.textstate, seq, self.ncs,
|
assert self.ncs is not None
|
||||||
self.graphicstate.copy())
|
self.device.render_string(self.textstate, cast(PDFTextSeq, seq),
|
||||||
|
self.ncs, self.graphicstate.copy())
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Tj(self, s):
|
def do_Tj(self, s: PDFStackT) -> None:
|
||||||
"""Show text"""
|
"""Show text"""
|
||||||
self.do_TJ([s])
|
self.do_TJ([s])
|
||||||
return
|
return
|
||||||
|
|
||||||
def do__q(self, s):
|
def do__q(self, s: PDFStackT) -> None:
|
||||||
"""Move to next line and show text
|
"""Move to next line and show text
|
||||||
|
|
||||||
The ' (single quote) operator.
|
The ' (single quote) operator.
|
||||||
|
@ -817,7 +926,7 @@ class PDFPageInterpreter:
|
||||||
self.do_TJ([s])
|
self.do_TJ([s])
|
||||||
return
|
return
|
||||||
|
|
||||||
def do__w(self, aw, ac, s):
|
def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None:
|
||||||
"""Set word and character spacing, move to next line, and show text
|
"""Set word and character spacing, move to next line, and show text
|
||||||
|
|
||||||
The " (double quote) operator.
|
The " (double quote) operator.
|
||||||
|
@ -827,15 +936,15 @@ class PDFPageInterpreter:
|
||||||
self.do_TJ([s])
|
self.do_TJ([s])
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_BI(self):
|
def do_BI(self) -> None:
|
||||||
"""Begin inline image object"""
|
"""Begin inline image object"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_ID(self):
|
def do_ID(self) -> None:
|
||||||
"""Begin inline image data"""
|
"""Begin inline image data"""
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_EI(self, obj):
|
def do_EI(self, obj: PDFStackT) -> None:
|
||||||
"""End inline image object"""
|
"""End inline image object"""
|
||||||
if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj:
|
if isinstance(obj, PDFStream) and 'W' in obj and 'H' in obj:
|
||||||
iobjid = str(id(obj))
|
iobjid = str(id(obj))
|
||||||
|
@ -844,9 +953,9 @@ class PDFPageInterpreter:
|
||||||
self.device.end_figure(iobjid)
|
self.device.end_figure(iobjid)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_Do(self, xobjid):
|
def do_Do(self, xobjid_arg: PDFStackT) -> None:
|
||||||
"""Invoke named XObject"""
|
"""Invoke named XObject"""
|
||||||
xobjid = literal_name(xobjid)
|
xobjid = cast(str, literal_name(xobjid_arg))
|
||||||
try:
|
try:
|
||||||
xobj = stream_value(self.xobjmap[xobjid])
|
xobj = stream_value(self.xobjmap[xobjid])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -857,8 +966,9 @@ class PDFPageInterpreter:
|
||||||
subtype = xobj.get('Subtype')
|
subtype = xobj.get('Subtype')
|
||||||
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
||||||
interpreter = self.dup()
|
interpreter = self.dup()
|
||||||
bbox = list_value(xobj['BBox'])
|
bbox = cast(Rect, list_value(xobj['BBox']))
|
||||||
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
|
matrix = cast(Matrix, list_value(
|
||||||
|
xobj.get('Matrix', MATRIX_IDENTITY)))
|
||||||
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
||||||
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
||||||
# instead of having their own Resources entry.
|
# instead of having their own Resources entry.
|
||||||
|
@ -880,7 +990,7 @@ class PDFPageInterpreter:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
def process_page(self, page):
|
def process_page(self, page: PDFPage) -> None:
|
||||||
log.info('Processing page: %r', page)
|
log.info('Processing page: %r', page)
|
||||||
(x0, y0, x1, y1) = page.mediabox
|
(x0, y0, x1, y1) = page.mediabox
|
||||||
if page.rotate == 90:
|
if page.rotate == 90:
|
||||||
|
@ -896,7 +1006,12 @@ class PDFPageInterpreter:
|
||||||
self.device.end_page(page)
|
self.device.end_page(page)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
|
def render_contents(
|
||||||
|
self,
|
||||||
|
resources: Dict[object, object],
|
||||||
|
streams: Sequence[object],
|
||||||
|
ctm: Matrix = MATRIX_IDENTITY
|
||||||
|
) -> None:
|
||||||
"""Render the content streams.
|
"""Render the content streams.
|
||||||
|
|
||||||
This method may be called recursively.
|
This method may be called recursively.
|
||||||
|
@ -908,7 +1023,7 @@ class PDFPageInterpreter:
|
||||||
self.execute(list_value(streams))
|
self.execute(list_value(streams))
|
||||||
return
|
return
|
||||||
|
|
||||||
def execute(self, streams):
|
def execute(self, streams: Sequence[object]) -> None:
|
||||||
try:
|
try:
|
||||||
parser = PDFContentParser(streams)
|
parser = PDFContentParser(streams)
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
import logging
|
import logging
|
||||||
|
from pdfminer.utils import Rect
|
||||||
|
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
|
||||||
import warnings
|
import warnings
|
||||||
from . import settings
|
from . import settings
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
|
@ -32,7 +34,7 @@ class PDFPage:
|
||||||
attrs: a dictionary of page attributes.
|
attrs: a dictionary of page attributes.
|
||||||
contents: a list of PDFStream objects that represents the page content.
|
contents: a list of PDFStream objects that represents the page content.
|
||||||
lastmod: the last modified time of the page.
|
lastmod: the last modified time of the page.
|
||||||
resources: a list of resources used by the page.
|
resources: a dictionary of resources used by the page.
|
||||||
mediabox: the physical size of the page.
|
mediabox: the physical size of the page.
|
||||||
cropbox: the crop rectangle of the page.
|
cropbox: the crop rectangle of the page.
|
||||||
rotate: the page rotation (in degree).
|
rotate: the page rotation (in degree).
|
||||||
|
@ -40,7 +42,12 @@ class PDFPage:
|
||||||
beads: a chain that represents natural reading order.
|
beads: a chain that represents natural reading order.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, doc, pageid, attrs):
|
def __init__(
|
||||||
|
self,
|
||||||
|
doc: PDFDocument,
|
||||||
|
pageid: object,
|
||||||
|
attrs: object
|
||||||
|
) -> None:
|
||||||
"""Initialize a page object.
|
"""Initialize a page object.
|
||||||
|
|
||||||
doc: a PDFDocument object.
|
doc: a PDFDocument object.
|
||||||
|
@ -51,10 +58,11 @@ class PDFPage:
|
||||||
self.pageid = pageid
|
self.pageid = pageid
|
||||||
self.attrs = dict_value(attrs)
|
self.attrs = dict_value(attrs)
|
||||||
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
||||||
self.resources = resolve1(self.attrs.get('Resources', dict()))
|
self.resources: Dict[object, object] = \
|
||||||
self.mediabox = resolve1(self.attrs['MediaBox'])
|
resolve1(self.attrs.get('Resources', dict()))
|
||||||
|
self.mediabox: Rect = resolve1(self.attrs['MediaBox'])
|
||||||
if 'CropBox' in self.attrs:
|
if 'CropBox' in self.attrs:
|
||||||
self.cropbox = resolve1(self.attrs['CropBox'])
|
self.cropbox: Rect = resolve1(self.attrs['CropBox'])
|
||||||
else:
|
else:
|
||||||
self.cropbox = self.mediabox
|
self.cropbox = self.mediabox
|
||||||
self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
|
self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
|
||||||
|
@ -66,23 +74,28 @@ class PDFPage:
|
||||||
contents = []
|
contents = []
|
||||||
if not isinstance(contents, list):
|
if not isinstance(contents, list):
|
||||||
contents = [contents]
|
contents = [contents]
|
||||||
self.contents = contents
|
self.contents: List[object] = contents
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
|
return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
|
||||||
.format(self.resources, self.mediabox)
|
.format(self.resources, self.mediabox)
|
||||||
|
|
||||||
INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}
|
INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_pages(cls, document):
|
def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
|
||||||
def search(obj, parent):
|
def search(
|
||||||
|
obj: object,
|
||||||
|
parent: Dict[str, object]
|
||||||
|
) -> Iterator[Tuple[int, Dict[object, Dict[object, object]]]]:
|
||||||
if isinstance(obj, int):
|
if isinstance(obj, int):
|
||||||
objid = obj
|
objid = obj
|
||||||
tree = dict_value(document.getobj(objid)).copy()
|
tree = dict_value(document.getobj(objid)).copy()
|
||||||
else:
|
else:
|
||||||
objid = obj.objid
|
# This looks broken. obj.objid means obj could be either
|
||||||
|
# PDFObjRef or PDFStream, but neither is valid for dict_value.
|
||||||
|
objid = obj.objid # type: ignore[attr-defined]
|
||||||
tree = dict_value(obj).copy()
|
tree = dict_value(obj).copy()
|
||||||
for (k, v) in parent.items():
|
for (k, v) in parent.items():
|
||||||
if k in cls.INHERITABLE_ATTRS and k not in tree:
|
if k in cls.INHERITABLE_ATTRS and k not in tree:
|
||||||
|
@ -119,9 +132,15 @@ class PDFPage:
|
||||||
return
|
return
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_pages(cls, fp,
|
def get_pages(
|
||||||
pagenos=None, maxpages=0, password='',
|
cls,
|
||||||
caching=True, check_extractable=False):
|
fp: BinaryIO,
|
||||||
|
pagenos: Optional[Container[int]] = None,
|
||||||
|
maxpages: int = 0,
|
||||||
|
password: str = '',
|
||||||
|
caching: bool = True,
|
||||||
|
check_extractable: bool = False
|
||||||
|
) -> Iterator["PDFPage"]:
|
||||||
# Create a PDF parser object associated with the file object.
|
# Create a PDF parser object associated with the file object.
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
# Create a PDF document object that stores the document structure.
|
# Create a PDF document object that stores the document structure.
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from typing import BinaryIO, TYPE_CHECKING, Optional, Union
|
||||||
from .psparser import PSStackParser
|
from .psparser import PSStackParser
|
||||||
|
from .psparser import PSKeyword
|
||||||
from .psparser import PSSyntaxError
|
from .psparser import PSSyntaxError
|
||||||
from .psparser import PSEOF
|
from .psparser import PSEOF
|
||||||
from .psparser import KWD
|
from .psparser import KWD
|
||||||
|
@ -11,6 +13,9 @@ from .pdftypes import PDFObjRef
|
||||||
from .pdftypes import int_value
|
from .pdftypes import int_value
|
||||||
from .pdftypes import dict_value
|
from .pdftypes import dict_value
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .pdfdocument import PDFDocument
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,7 +23,8 @@ class PDFSyntaxError(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PDFParser(PSStackParser):
|
# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
|
||||||
|
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
|
||||||
"""
|
"""
|
||||||
PDFParser fetch PDF objects from a file stream.
|
PDFParser fetch PDF objects from a file stream.
|
||||||
It can handle indirect references by referring to
|
It can handle indirect references by referring to
|
||||||
|
@ -35,13 +41,13 @@ class PDFParser(PSStackParser):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp: BinaryIO) -> None:
|
||||||
PSStackParser.__init__(self, fp)
|
PSStackParser.__init__(self, fp)
|
||||||
self.doc = None
|
self.doc: Optional["PDFDocument"] = None
|
||||||
self.fallback = False
|
self.fallback = False
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_document(self, doc):
|
def set_document(self, doc: "PDFDocument") -> None:
|
||||||
"""Associates the parser with a PDFDocument object."""
|
"""Associates the parser with a PDFDocument object."""
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
return
|
return
|
||||||
|
@ -53,7 +59,7 @@ class PDFParser(PSStackParser):
|
||||||
KEYWORD_XREF = KWD(b'xref')
|
KEYWORD_XREF = KWD(b'xref')
|
||||||
KEYWORD_STARTXREF = KWD(b'startxref')
|
KEYWORD_STARTXREF = KWD(b'startxref')
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
"""Handles PDF-related keywords."""
|
"""Handles PDF-related keywords."""
|
||||||
|
|
||||||
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
|
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
|
||||||
|
@ -71,7 +77,9 @@ class PDFParser(PSStackParser):
|
||||||
if len(self.curstack) >= 2:
|
if len(self.curstack) >= 2:
|
||||||
try:
|
try:
|
||||||
((_, objid), (_, genno)) = self.pop(2)
|
((_, objid), (_, genno)) = self.pop(2)
|
||||||
(objid, genno) = (int(objid), int(genno))
|
(objid, genno) = (
|
||||||
|
int(objid), int(genno)) # type: ignore[arg-type]
|
||||||
|
assert self.doc is not None
|
||||||
obj = PDFObjRef(self.doc, objid, genno)
|
obj = PDFObjRef(self.doc, objid, genno)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
|
@ -114,13 +122,13 @@ class PDFParser(PSStackParser):
|
||||||
objlen += len(line)
|
objlen += len(line)
|
||||||
if self.fallback:
|
if self.fallback:
|
||||||
data += line
|
data += line
|
||||||
data = bytes(data)
|
|
||||||
self.seek(pos+objlen)
|
self.seek(pos+objlen)
|
||||||
# XXX limit objlen not to exceed object boundary
|
# XXX limit objlen not to exceed object boundary
|
||||||
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
|
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
|
||||||
objlen, dic, data[:10])
|
objlen, dic, data[:10])
|
||||||
obj = PDFStream(dic, data, self.doc.decipher)
|
assert self.doc is not None
|
||||||
self.push((pos, obj))
|
stream = PDFStream(dic, bytes(data), self.doc.decipher)
|
||||||
|
self.push((pos, stream))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# others
|
# others
|
||||||
|
@ -138,22 +146,23 @@ class PDFStreamParser(PDFParser):
|
||||||
indirect references to other objects in the same document.
|
indirect references to other objects in the same document.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, data):
|
def __init__(self, data: bytes) -> None:
|
||||||
PDFParser.__init__(self, BytesIO(data))
|
PDFParser.__init__(self, BytesIO(data))
|
||||||
return
|
return
|
||||||
|
|
||||||
def flush(self):
|
def flush(self) -> None:
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
return
|
return
|
||||||
|
|
||||||
KEYWORD_OBJ = KWD(b'obj')
|
KEYWORD_OBJ = KWD(b'obj')
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
if token is self.KEYWORD_R:
|
if token is self.KEYWORD_R:
|
||||||
# reference to indirect object
|
# reference to indirect object
|
||||||
try:
|
try:
|
||||||
((_, objid), (_, genno)) = self.pop(2)
|
((_, objid), (_, genno)) = self.pop(2)
|
||||||
(objid, genno) = (int(objid), int(genno))
|
(objid, genno) = (
|
||||||
|
int(objid), int(genno)) # type: ignore[arg-type]
|
||||||
obj = PDFObjRef(self.doc, objid, genno)
|
obj = PDFObjRef(self.doc, objid, genno)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
import zlib
|
import zlib
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
|
from typing import (TYPE_CHECKING, Any, Dict, Iterable, Optional, Union, List,
|
||||||
|
Tuple, cast)
|
||||||
from .lzw import lzwdecode
|
from .lzw import lzwdecode
|
||||||
from .ascii85 import ascii85decode
|
from .ascii85 import ascii85decode
|
||||||
from .ascii85 import asciihexdecode
|
from .ascii85 import asciihexdecode
|
||||||
|
@ -10,7 +13,9 @@ from .psparser import PSObject
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
from . import settings
|
from . import settings
|
||||||
from .utils import apply_png_predictor
|
from .utils import apply_png_predictor
|
||||||
from .utils import isnumber
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .pdfdocument import PDFDocument
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
@ -28,6 +33,21 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||||
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
|
LITERALS_JBIG2_DECODE = (LIT('JBIG2Decode'),)
|
||||||
|
|
||||||
|
|
||||||
|
if sys.version_info >= (3, 8):
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
class DecipherCallable(Protocol):
|
||||||
|
"""Fully typed a decipher callback, with optional parameter."""
|
||||||
|
def __call__(self, objid: int, genno: int, data: bytes,
|
||||||
|
attrs: Optional[Dict[str, Any]] = None) -> bytes:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
else: # Fallback for older Python
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
DecipherCallable = Callable[..., bytes]
|
||||||
|
|
||||||
|
|
||||||
class PDFObject(PSObject):
|
class PDFObject(PSObject):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -54,7 +74,12 @@ class PDFNotImplementedError(PDFException):
|
||||||
|
|
||||||
class PDFObjRef(PDFObject):
|
class PDFObjRef(PDFObject):
|
||||||
|
|
||||||
def __init__(self, doc, objid, _):
|
def __init__(
|
||||||
|
self,
|
||||||
|
doc: Optional["PDFDocument"],
|
||||||
|
objid: int,
|
||||||
|
_: object
|
||||||
|
) -> None:
|
||||||
if objid == 0:
|
if objid == 0:
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFValueError('PDF object id cannot be 0.')
|
raise PDFValueError('PDF object id cannot be 0.')
|
||||||
|
@ -62,17 +87,18 @@ class PDFObjRef(PDFObject):
|
||||||
self.objid = objid
|
self.objid = objid
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<PDFObjRef:%d>' % (self.objid)
|
return '<PDFObjRef:%d>' % (self.objid)
|
||||||
|
|
||||||
def resolve(self, default=None):
|
def resolve(self, default: object = None) -> Any:
|
||||||
|
assert self.doc is not None
|
||||||
try:
|
try:
|
||||||
return self.doc.getobj(self.objid)
|
return self.doc.getobj(self.objid)
|
||||||
except PDFObjectNotFound:
|
except PDFObjectNotFound:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
def resolve1(x, default=None):
|
def resolve1(x: object, default: object = None) -> Any:
|
||||||
"""Resolves an object.
|
"""Resolves an object.
|
||||||
|
|
||||||
If this is an array or dictionary, it may still contains
|
If this is an array or dictionary, it may still contains
|
||||||
|
@ -83,7 +109,7 @@ def resolve1(x, default=None):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def resolve_all(x, default=None):
|
def resolve_all(x: object, default: object = None) -> Any:
|
||||||
"""Recursively resolves the given object and all the internals.
|
"""Recursively resolves the given object and all the internals.
|
||||||
|
|
||||||
Make sure there is no indirect reference within the nested object.
|
Make sure there is no indirect reference within the nested object.
|
||||||
|
@ -99,7 +125,12 @@ def resolve_all(x, default=None):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def decipher_all(decipher, objid, genno, x):
|
def decipher_all(
|
||||||
|
decipher: DecipherCallable,
|
||||||
|
objid: int,
|
||||||
|
genno: int,
|
||||||
|
x: object
|
||||||
|
) -> Any:
|
||||||
"""Recursively deciphers the given object.
|
"""Recursively deciphers the given object.
|
||||||
"""
|
"""
|
||||||
if isinstance(x, bytes):
|
if isinstance(x, bytes):
|
||||||
|
@ -112,7 +143,7 @@ def decipher_all(decipher, objid, genno, x):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def int_value(x):
|
def int_value(x: object) -> int:
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, int):
|
if not isinstance(x, int):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
|
@ -121,7 +152,7 @@ def int_value(x):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def float_value(x):
|
def float_value(x: object) -> float:
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, float):
|
if not isinstance(x, float):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
|
@ -130,34 +161,34 @@ def float_value(x):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def num_value(x):
|
def num_value(x: object) -> float:
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isnumber(x):
|
if not isinstance(x, (int, float)): # == utils.isnumber(x)
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFTypeError('Int or Float required: %r' % x)
|
raise PDFTypeError('Int or Float required: %r' % x)
|
||||||
return 0
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def uint_value(x, n_bits):
|
def uint_value(x: object, n_bits: int) -> int:
|
||||||
"""Resolve number and interpret it as a two's-complement unsigned number"""
|
"""Resolve number and interpret it as a two's-complement unsigned number"""
|
||||||
x = int_value(x)
|
xi = int_value(x)
|
||||||
if x > 0:
|
if xi > 0:
|
||||||
return x
|
return xi
|
||||||
else:
|
else:
|
||||||
return x + 2**n_bits
|
return xi + cast(int, 2**n_bits)
|
||||||
|
|
||||||
|
|
||||||
def str_value(x):
|
def str_value(x: object) -> bytes:
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, bytes):
|
if not isinstance(x, bytes):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PDFTypeError('String required: %r' % x)
|
raise PDFTypeError('String required: %r' % x)
|
||||||
return ''
|
return b''
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def list_value(x):
|
def list_value(x: object) -> Union[List[Any], Tuple[Any, ...]]:
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, (list, tuple)):
|
if not isinstance(x, (list, tuple)):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
|
@ -166,7 +197,7 @@ def list_value(x):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def dict_value(x):
|
def dict_value(x: object) -> Dict[Any, Any]:
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, dict):
|
if not isinstance(x, dict):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
|
@ -176,7 +207,7 @@ def dict_value(x):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def stream_value(x):
|
def stream_value(x: object) -> "PDFStream":
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, PDFStream):
|
if not isinstance(x, PDFStream):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
|
@ -187,22 +218,27 @@ def stream_value(x):
|
||||||
|
|
||||||
class PDFStream(PDFObject):
|
class PDFStream(PDFObject):
|
||||||
|
|
||||||
def __init__(self, attrs, rawdata, decipher=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
attrs: Dict[str, Any],
|
||||||
|
rawdata: bytes,
|
||||||
|
decipher: Optional[DecipherCallable] = None
|
||||||
|
) -> None:
|
||||||
assert isinstance(attrs, dict), str(type(attrs))
|
assert isinstance(attrs, dict), str(type(attrs))
|
||||||
self.attrs = attrs
|
self.attrs = attrs
|
||||||
self.rawdata = rawdata
|
self.rawdata: Optional[bytes] = rawdata
|
||||||
self.decipher = decipher
|
self.decipher = decipher
|
||||||
self.data = None
|
self.data: Optional[bytes] = None
|
||||||
self.objid = None
|
self.objid: Optional[int] = None
|
||||||
self.genno = None
|
self.genno: Optional[int] = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_objid(self, objid, genno):
|
def set_objid(self, objid: int, genno: int) -> None:
|
||||||
self.objid = objid
|
self.objid = objid
|
||||||
self.genno = genno
|
self.genno = genno
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
if self.data is None:
|
if self.data is None:
|
||||||
assert self.rawdata is not None
|
assert self.rawdata is not None
|
||||||
return '<PDFStream(%r): raw=%d, %r>' % \
|
return '<PDFStream(%r): raw=%d, %r>' % \
|
||||||
|
@ -212,22 +248,22 @@ class PDFStream(PDFObject):
|
||||||
return '<PDFStream(%r): len=%d, %r>' % \
|
return '<PDFStream(%r): len=%d, %r>' % \
|
||||||
(self.objid, len(self.data), self.attrs)
|
(self.objid, len(self.data), self.attrs)
|
||||||
|
|
||||||
def __contains__(self, name):
|
def __contains__(self, name: object) -> bool:
|
||||||
return name in self.attrs
|
return name in self.attrs
|
||||||
|
|
||||||
def __getitem__(self, name):
|
def __getitem__(self, name: str) -> Any:
|
||||||
return self.attrs[name]
|
return self.attrs[name]
|
||||||
|
|
||||||
def get(self, name, default=None):
|
def get(self, name: str, default: object = None) -> Any:
|
||||||
return self.attrs.get(name, default)
|
return self.attrs.get(name, default)
|
||||||
|
|
||||||
def get_any(self, names, default=None):
|
def get_any(self, names: Iterable[str], default: object = None) -> Any:
|
||||||
for name in names:
|
for name in names:
|
||||||
if name in self.attrs:
|
if name in self.attrs:
|
||||||
return self.attrs[name]
|
return self.attrs[name]
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def get_filters(self):
|
def get_filters(self) -> List[Tuple[Any, Any]]:
|
||||||
filters = self.get_any(('F', 'Filter'))
|
filters = self.get_any(('F', 'Filter'))
|
||||||
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
||||||
if not filters:
|
if not filters:
|
||||||
|
@ -248,12 +284,14 @@ class PDFStream(PDFObject):
|
||||||
# return list solves https://github.com/pdfminer/pdfminer.six/issues/15
|
# return list solves https://github.com/pdfminer/pdfminer.six/issues/15
|
||||||
return list(zip(_filters, params))
|
return list(zip(_filters, params))
|
||||||
|
|
||||||
def decode(self):
|
def decode(self) -> None:
|
||||||
assert self.data is None \
|
assert self.data is None \
|
||||||
and self.rawdata is not None, str((self.data, self.rawdata))
|
and self.rawdata is not None, str((self.data, self.rawdata))
|
||||||
data = self.rawdata
|
data = self.rawdata
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
# Handle encryption
|
# Handle encryption
|
||||||
|
assert self.objid is not None
|
||||||
|
assert self.genno is not None
|
||||||
data = self.decipher(self.objid, self.genno, data, self.attrs)
|
data = self.decipher(self.objid, self.genno, data, self.attrs)
|
||||||
filters = self.get_filters()
|
filters = self.get_filters()
|
||||||
if not filters:
|
if not filters:
|
||||||
|
@ -314,10 +352,11 @@ class PDFStream(PDFObject):
|
||||||
self.rawdata = None
|
self.rawdata = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_data(self):
|
def get_data(self) -> bytes:
|
||||||
if self.data is None:
|
if self.data is None:
|
||||||
self.decode()
|
self.decode()
|
||||||
|
assert self.data is not None
|
||||||
return self.data
|
return self.data
|
||||||
|
|
||||||
def get_rawdata(self):
|
def get_rawdata(self) -> Optional[bytes]:
|
||||||
return self.rawdata
|
return self.rawdata
|
||||||
|
|
|
@ -4,7 +4,8 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
from typing import (Any, BinaryIO, Dict, Generic, Iterator, List,
|
||||||
|
Optional, Tuple, Type, TypeVar, Union)
|
||||||
|
|
||||||
from . import settings
|
from . import settings
|
||||||
from .utils import choplist
|
from .utils import choplist
|
||||||
|
@ -51,10 +52,12 @@ class PSLiteral(PSObject):
|
||||||
Always use PSLiteralTable.intern().
|
Always use PSLiteralTable.intern().
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, name):
|
NameType = Union[str, bytes]
|
||||||
|
|
||||||
|
def __init__(self, name: NameType) -> None:
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
name = self.name
|
name = self.name
|
||||||
return '/%r' % name
|
return '/%r' % name
|
||||||
|
|
||||||
|
@ -71,31 +74,36 @@ class PSKeyword(PSObject):
|
||||||
Always use PSKeywordTable.intern().
|
Always use PSKeywordTable.intern().
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name: bytes) -> None:
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
name = self.name
|
name = self.name
|
||||||
return '/%r' % name
|
return '/%r' % name
|
||||||
|
|
||||||
|
|
||||||
class PSSymbolTable:
|
_SymbolT = TypeVar('_SymbolT', PSLiteral, PSKeyword)
|
||||||
|
|
||||||
|
|
||||||
|
class PSSymbolTable(Generic[_SymbolT]):
|
||||||
"""A utility class for storing PSLiteral/PSKeyword objects.
|
"""A utility class for storing PSLiteral/PSKeyword objects.
|
||||||
|
|
||||||
Interned objects can be checked its identity with "is" operator.
|
Interned objects can be checked its identity with "is" operator.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, klass):
|
def __init__(self, klass: Type[_SymbolT]) -> None:
|
||||||
self.dict = {}
|
self.dict: Dict[PSLiteral.NameType, _SymbolT] = {}
|
||||||
self.klass = klass
|
self.klass: Type[_SymbolT] = klass
|
||||||
return
|
return
|
||||||
|
|
||||||
def intern(self, name):
|
def intern(self, name: PSLiteral.NameType) -> _SymbolT:
|
||||||
if name in self.dict:
|
if name in self.dict:
|
||||||
lit = self.dict[name]
|
lit = self.dict[name]
|
||||||
else:
|
else:
|
||||||
lit = self.klass(name)
|
# Type confusion issue: PSKeyword always takes bytes as name
|
||||||
|
# PSLiteral uses either str or bytes
|
||||||
|
lit = self.klass(name) # type: ignore[arg-type]
|
||||||
self.dict[name] = lit
|
self.dict[name] = lit
|
||||||
return lit
|
return lit
|
||||||
|
|
||||||
|
@ -112,7 +120,7 @@ KEYWORD_DICT_BEGIN = KWD(b'<<')
|
||||||
KEYWORD_DICT_END = KWD(b'>>')
|
KEYWORD_DICT_END = KWD(b'>>')
|
||||||
|
|
||||||
|
|
||||||
def literal_name(x):
|
def literal_name(x: object) -> Any:
|
||||||
if not isinstance(x, PSLiteral):
|
if not isinstance(x, PSLiteral):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PSTypeError('Literal required: {!r}'.format(x))
|
raise PSTypeError('Literal required: {!r}'.format(x))
|
||||||
|
@ -120,6 +128,7 @@ def literal_name(x):
|
||||||
name = x
|
name = x
|
||||||
else:
|
else:
|
||||||
name = x.name
|
name = x.name
|
||||||
|
if not isinstance(name, str):
|
||||||
try:
|
try:
|
||||||
name = str(name, 'utf-8')
|
name = str(name, 'utf-8')
|
||||||
except Exception:
|
except Exception:
|
||||||
|
@ -127,7 +136,7 @@ def literal_name(x):
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
def keyword_name(x):
|
def keyword_name(x: object) -> Any:
|
||||||
if not isinstance(x, PSKeyword):
|
if not isinstance(x, PSKeyword):
|
||||||
if settings.STRICT:
|
if settings.STRICT:
|
||||||
raise PSTypeError('Keyword required: %r' % x)
|
raise PSTypeError('Keyword required: %r' % x)
|
||||||
|
@ -161,32 +170,35 @@ ESC_STRING = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
|
||||||
|
|
||||||
|
|
||||||
class PSBaseParser:
|
class PSBaseParser:
|
||||||
|
|
||||||
"""Most basic PostScript parser that performs only tokenization.
|
"""Most basic PostScript parser that performs only tokenization.
|
||||||
"""
|
"""
|
||||||
BUFSIZ = 4096
|
BUFSIZ = 4096
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp: BinaryIO) -> None:
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.seek(0)
|
self.seek(0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
|
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp,
|
||||||
self.bufpos)
|
self.bufpos)
|
||||||
|
|
||||||
def flush(self):
|
def flush(self) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> None:
|
||||||
self.flush()
|
self.flush()
|
||||||
return
|
return
|
||||||
|
|
||||||
def tell(self):
|
def tell(self) -> int:
|
||||||
return self.bufpos+self.charpos
|
return self.bufpos+self.charpos
|
||||||
|
|
||||||
def poll(self, pos=None, n=80):
|
def poll(self, pos: Optional[int] = None, n: int = 80) -> None:
|
||||||
pos0 = self.fp.tell()
|
pos0 = self.fp.tell()
|
||||||
if not pos:
|
if not pos:
|
||||||
pos = self.bufpos+self.charpos
|
pos = self.bufpos+self.charpos
|
||||||
|
@ -195,7 +207,7 @@ class PSBaseParser:
|
||||||
self.fp.seek(pos0)
|
self.fp.seek(pos0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def seek(self, pos):
|
def seek(self, pos: int) -> None:
|
||||||
"""Seeks the parser to the given position.
|
"""Seeks the parser to the given position.
|
||||||
"""
|
"""
|
||||||
log.debug('seek: %r', pos)
|
log.debug('seek: %r', pos)
|
||||||
|
@ -208,10 +220,10 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
self._curtoken = b''
|
self._curtoken = b''
|
||||||
self._curtokenpos = 0
|
self._curtokenpos = 0
|
||||||
self._tokens = []
|
self._tokens: List[Tuple[int, PSBaseParserToken]] = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillbuf(self):
|
def fillbuf(self) -> None:
|
||||||
if self.charpos < len(self.buf):
|
if self.charpos < len(self.buf):
|
||||||
return
|
return
|
||||||
# fetch next chunk.
|
# fetch next chunk.
|
||||||
|
@ -222,7 +234,7 @@ class PSBaseParser:
|
||||||
self.charpos = 0
|
self.charpos = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
def nextline(self):
|
def nextline(self) -> Tuple[int, bytes]:
|
||||||
"""Fetches a next line that ends either with \\r or \\n.
|
"""Fetches a next line that ends either with \\r or \\n.
|
||||||
"""
|
"""
|
||||||
linebuf = b''
|
linebuf = b''
|
||||||
|
@ -252,7 +264,7 @@ class PSBaseParser:
|
||||||
|
|
||||||
return (linepos, linebuf)
|
return (linepos, linebuf)
|
||||||
|
|
||||||
def revreadlines(self):
|
def revreadlines(self) -> Iterator[bytes]:
|
||||||
"""Fetches a next line backword.
|
"""Fetches a next line backword.
|
||||||
|
|
||||||
This is used to locate the trailers at the end of a file.
|
This is used to locate the trailers at the end of a file.
|
||||||
|
@ -277,7 +289,7 @@ class PSBaseParser:
|
||||||
buf = b''
|
buf = b''
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_main(self, s, i):
|
def _parse_main(self, s: bytes, i: int) -> int:
|
||||||
m = NONSPC.search(s, i)
|
m = NONSPC.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
return len(s)
|
return len(s)
|
||||||
|
@ -321,11 +333,11 @@ class PSBaseParser:
|
||||||
self._add_token(KWD(c))
|
self._add_token(KWD(c))
|
||||||
return j+1
|
return j+1
|
||||||
|
|
||||||
def _add_token(self, obj):
|
def _add_token(self, obj: PSBaseParserToken) -> None:
|
||||||
self._tokens.append((self._curtokenpos, obj))
|
self._tokens.append((self._curtokenpos, obj))
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_comment(self, s, i):
|
def _parse_comment(self, s: bytes, i: int) -> int:
|
||||||
m = EOL.search(s, i)
|
m = EOL.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -337,7 +349,7 @@ class PSBaseParser:
|
||||||
# self._tokens.append(self._curtoken)
|
# self._tokens.append(self._curtoken)
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def _parse_literal(self, s, i):
|
def _parse_literal(self, s: bytes, i: int) -> int:
|
||||||
m = END_LITERAL.search(s, i)
|
m = END_LITERAL.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -350,14 +362,14 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_literal_hex
|
self._parse1 = self._parse_literal_hex
|
||||||
return j+1
|
return j+1
|
||||||
try:
|
try:
|
||||||
self._curtoken = str(self._curtoken, 'utf-8')
|
name: Union[str, bytes] = str(self._curtoken, 'utf-8')
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
name = self._curtoken
|
||||||
self._add_token(LIT(self._curtoken))
|
self._add_token(LIT(name))
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def _parse_literal_hex(self, s, i):
|
def _parse_literal_hex(self, s: bytes, i: int) -> int:
|
||||||
c = s[i:i+1]
|
c = s[i:i+1]
|
||||||
if HEX.match(c) and len(self.hex) < 2:
|
if HEX.match(c) and len(self.hex) < 2:
|
||||||
self.hex += c
|
self.hex += c
|
||||||
|
@ -367,7 +379,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_literal
|
self._parse1 = self._parse_literal
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def _parse_number(self, s, i):
|
def _parse_number(self, s: bytes, i: int) -> int:
|
||||||
m = END_NUMBER.search(s, i)
|
m = END_NUMBER.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -386,7 +398,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def _parse_float(self, s, i):
|
def _parse_float(self, s: bytes, i: int) -> int:
|
||||||
m = END_NUMBER.search(s, i)
|
m = END_NUMBER.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -400,7 +412,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def _parse_keyword(self, s, i):
|
def _parse_keyword(self, s: bytes, i: int) -> int:
|
||||||
m = END_KEYWORD.search(s, i)
|
m = END_KEYWORD.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -408,7 +420,7 @@ class PSBaseParser:
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += s[i:j]
|
||||||
if self._curtoken == b'true':
|
if self._curtoken == b'true':
|
||||||
token = True
|
token: Union[bool, PSKeyword] = True
|
||||||
elif self._curtoken == b'false':
|
elif self._curtoken == b'false':
|
||||||
token = False
|
token = False
|
||||||
else:
|
else:
|
||||||
|
@ -417,7 +429,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def _parse_string(self, s, i):
|
def _parse_string(self, s: bytes, i: int) -> int:
|
||||||
m = END_STRING.search(s, i)
|
m = END_STRING.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -443,7 +455,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j+1
|
return j+1
|
||||||
|
|
||||||
def _parse_string_1(self, s, i):
|
def _parse_string_1(self, s: bytes, i: int) -> int:
|
||||||
"""Parse literal strings
|
"""Parse literal strings
|
||||||
|
|
||||||
PDF Reference 3.2.3
|
PDF Reference 3.2.3
|
||||||
|
@ -470,7 +482,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_string
|
self._parse1 = self._parse_string
|
||||||
return i+1
|
return i+1
|
||||||
|
|
||||||
def _parse_wopen(self, s, i):
|
def _parse_wopen(self, s: bytes, i: int) -> int:
|
||||||
c = s[i:i+1]
|
c = s[i:i+1]
|
||||||
if c == b'<':
|
if c == b'<':
|
||||||
self._add_token(KEYWORD_DICT_BEGIN)
|
self._add_token(KEYWORD_DICT_BEGIN)
|
||||||
|
@ -480,7 +492,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_hexstring
|
self._parse1 = self._parse_hexstring
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def _parse_wclose(self, s, i):
|
def _parse_wclose(self, s: bytes, i: int) -> int:
|
||||||
c = s[i:i+1]
|
c = s[i:i+1]
|
||||||
if c == b'>':
|
if c == b'>':
|
||||||
self._add_token(KEYWORD_DICT_END)
|
self._add_token(KEYWORD_DICT_END)
|
||||||
|
@ -488,7 +500,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def _parse_hexstring(self, s, i):
|
def _parse_hexstring(self, s: bytes, i: int) -> int:
|
||||||
m = END_HEX_STRING.search(s, i)
|
m = END_HEX_STRING.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += s[i:]
|
||||||
|
@ -501,7 +513,7 @@ class PSBaseParser:
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def nexttoken(self):
|
def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
|
||||||
while not self._tokens:
|
while not self._tokens:
|
||||||
self.fillbuf()
|
self.fillbuf()
|
||||||
self.charpos = self._parse1(self.buf, self.charpos)
|
self.charpos = self._parse1(self.buf, self.charpos)
|
||||||
|
@ -510,39 +522,51 @@ class PSBaseParser:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
|
|
||||||
class PSStackParser(PSBaseParser):
|
# Stack slots may by occupied by any of:
|
||||||
def __init__(self, fp):
|
# * the PSBaseParserToken types
|
||||||
|
# * list (via KEYWORD_ARRAY)
|
||||||
|
# * dict (via KEYWORD_DICT)
|
||||||
|
# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT
|
||||||
|
ExtraT = TypeVar("ExtraT")
|
||||||
|
PSStackType = Union[float, bool, PSLiteral, bytes, List, Dict, ExtraT]
|
||||||
|
PSStackEntry = Tuple[int, PSStackType[ExtraT]]
|
||||||
|
|
||||||
|
|
||||||
|
class PSStackParser(PSBaseParser, Generic[ExtraT]):
|
||||||
|
|
||||||
|
def __init__(self, fp: BinaryIO) -> None:
|
||||||
PSBaseParser.__init__(self, fp)
|
PSBaseParser.__init__(self, fp)
|
||||||
self.reset()
|
self.reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self.context = []
|
self.context: List[Tuple[int, Optional[str],
|
||||||
self.curtype = None
|
List[PSStackEntry[ExtraT]]]] = []
|
||||||
self.curstack = []
|
self.curtype: Optional[str] = None
|
||||||
self.results = []
|
self.curstack: List[PSStackEntry[ExtraT]] = []
|
||||||
|
self.results: List[PSStackEntry[ExtraT]] = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def seek(self, pos):
|
def seek(self, pos: int) -> None:
|
||||||
PSBaseParser.seek(self, pos)
|
PSBaseParser.seek(self, pos)
|
||||||
self.reset()
|
self.reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
def push(self, *objs):
|
def push(self, *objs: PSStackEntry[ExtraT]) -> None:
|
||||||
self.curstack.extend(objs)
|
self.curstack.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
def pop(self, n):
|
def pop(self, n: int) -> List[PSStackEntry[ExtraT]]:
|
||||||
objs = self.curstack[-n:]
|
objs = self.curstack[-n:]
|
||||||
self.curstack[-n:] = []
|
self.curstack[-n:] = []
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
def popall(self):
|
def popall(self) -> List[PSStackEntry[ExtraT]]:
|
||||||
objs = self.curstack
|
objs = self.curstack
|
||||||
self.curstack = []
|
self.curstack = []
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
def add_results(self, *objs):
|
def add_results(self, *objs: PSStackEntry[ExtraT]) -> None:
|
||||||
try:
|
try:
|
||||||
log.debug('add_results: %r', objs)
|
log.debug('add_results: %r', objs)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
@ -550,13 +574,13 @@ class PSStackParser(PSBaseParser):
|
||||||
self.results.extend(objs)
|
self.results.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
def start_type(self, pos, type):
|
def start_type(self, pos: int, type: str) -> None:
|
||||||
self.context.append((pos, self.curtype, self.curstack))
|
self.context.append((pos, self.curtype, self.curstack))
|
||||||
(self.curtype, self.curstack) = (type, [])
|
(self.curtype, self.curstack) = (type, [])
|
||||||
log.debug('start_type: pos=%r, type=%r', pos, type)
|
log.debug('start_type: pos=%r, type=%r', pos, type)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_type(self, type):
|
def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]:
|
||||||
if self.curtype != type:
|
if self.curtype != type:
|
||||||
raise PSTypeError('Type mismatch: {!r} != {!r}'
|
raise PSTypeError('Type mismatch: {!r} != {!r}'
|
||||||
.format(self.curtype, type))
|
.format(self.curtype, type))
|
||||||
|
@ -565,10 +589,10 @@ class PSStackParser(PSBaseParser):
|
||||||
log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
|
log.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
|
||||||
return (pos, objs)
|
return (pos, objs)
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
||||||
return
|
return
|
||||||
|
|
||||||
def nextobject(self):
|
def nextobject(self) -> PSStackEntry[ExtraT]:
|
||||||
"""Yields a list of objects.
|
"""Yields a list of objects.
|
||||||
|
|
||||||
Arrays and dictionaries are represented as Python lists and
|
Arrays and dictionaries are represented as Python lists and
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
def rldecode(data):
|
def rldecode(data: bytes) -> bytes:
|
||||||
"""
|
"""
|
||||||
RunLength decoder (Adobe version) implementation based on PDF Reference
|
RunLength decoder (Adobe version) implementation based on PDF Reference
|
||||||
version 1.4 section 3.3.4:
|
version 1.4 section 3.3.4:
|
||||||
|
|
|
@ -4,8 +4,15 @@ Miscellaneous Routines.
|
||||||
import io
|
import io
|
||||||
import pathlib
|
import pathlib
|
||||||
import struct
|
import struct
|
||||||
|
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
|
||||||
|
List, Optional, Set, TextIO, Tuple, TypeVar, Union,
|
||||||
|
TYPE_CHECKING, cast)
|
||||||
|
from typing_extensions import Literal
|
||||||
from html import escape
|
from html import escape
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .layout import LTComponent
|
||||||
|
|
||||||
import chardet # For str encoding detection
|
import chardet # For str encoding detection
|
||||||
|
|
||||||
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
|
# from sys import maxint as INF doesn't work anymore under Python3, but PDF
|
||||||
|
@ -13,40 +20,54 @@ import chardet # For str encoding detection
|
||||||
INF = (1 << 31) - 1
|
INF = (1 << 31) - 1
|
||||||
|
|
||||||
|
|
||||||
|
FileOrName = Union[pathlib.PurePath, str, io.IOBase]
|
||||||
|
AnyIO = Union[TextIO, BinaryIO]
|
||||||
|
|
||||||
|
|
||||||
class open_filename(object):
|
class open_filename(object):
|
||||||
"""
|
"""
|
||||||
Context manager that allows opening a filename
|
Context manager that allows opening a filename
|
||||||
(str or pathlib.PurePath type is supported) and closes it on exit,
|
(str or pathlib.PurePath type is supported) and closes it on exit,
|
||||||
(just like `open`), but does nothing for file-like objects.
|
(just like `open`), but does nothing for file-like objects.
|
||||||
"""
|
"""
|
||||||
def __init__(self, filename, *args, **kwargs):
|
def __init__(
|
||||||
|
self,
|
||||||
|
filename: FileOrName,
|
||||||
|
*args: Any,
|
||||||
|
**kwargs: Any
|
||||||
|
) -> None:
|
||||||
if isinstance(filename, pathlib.PurePath):
|
if isinstance(filename, pathlib.PurePath):
|
||||||
filename = str(filename)
|
filename = str(filename)
|
||||||
if isinstance(filename, str):
|
if isinstance(filename, str):
|
||||||
self.file_handler = open(filename, *args, **kwargs)
|
self.file_handler: AnyIO = open(filename, *args, **kwargs)
|
||||||
self.closing = True
|
self.closing = True
|
||||||
elif isinstance(filename, io.IOBase):
|
elif isinstance(filename, io.IOBase):
|
||||||
self.file_handler = filename
|
self.file_handler = cast(AnyIO, filename)
|
||||||
self.closing = False
|
self.closing = False
|
||||||
else:
|
else:
|
||||||
raise TypeError('Unsupported input type: %s' % type(filename))
|
raise TypeError('Unsupported input type: %s' % type(filename))
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self) -> AnyIO:
|
||||||
return self.file_handler
|
return self.file_handler
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
def __exit__(
|
||||||
|
self,
|
||||||
|
exc_type: object,
|
||||||
|
exc_val: object,
|
||||||
|
exc_tb: object
|
||||||
|
) -> Literal[False]:
|
||||||
if self.closing:
|
if self.closing:
|
||||||
self.file_handler.close()
|
self.file_handler.close()
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def make_compat_bytes(in_str):
|
def make_compat_bytes(in_str: str) -> bytes:
|
||||||
"Converts to bytes, encoding to unicode."
|
"Converts to bytes, encoding to unicode."
|
||||||
assert isinstance(in_str, str), str(type(in_str))
|
assert isinstance(in_str, str), str(type(in_str))
|
||||||
return in_str.encode()
|
return in_str.encode()
|
||||||
|
|
||||||
|
|
||||||
def make_compat_str(o):
|
def make_compat_str(o: object) -> str:
|
||||||
"""Converts everything to string, if bytes guessing the encoding."""
|
"""Converts everything to string, if bytes guessing the encoding."""
|
||||||
if isinstance(o, bytes):
|
if isinstance(o, bytes):
|
||||||
enc = chardet.detect(o)
|
enc = chardet.detect(o)
|
||||||
|
@ -55,7 +76,7 @@ def make_compat_str(o):
|
||||||
return str(o)
|
return str(o)
|
||||||
|
|
||||||
|
|
||||||
def shorten_str(s, size):
|
def shorten_str(s: str, size: int) -> str:
|
||||||
if size < 7:
|
if size < 7:
|
||||||
return s[:size]
|
return s[:size]
|
||||||
if len(s) > size:
|
if len(s) > size:
|
||||||
|
@ -65,8 +86,11 @@ def shorten_str(s, size):
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
def compatible_encode_method(bytesorstring, encoding='utf-8',
|
def compatible_encode_method(
|
||||||
erraction='ignore'):
|
bytesorstring: Union[bytes, str],
|
||||||
|
encoding: str = 'utf-8',
|
||||||
|
erraction: str = 'ignore'
|
||||||
|
) -> str:
|
||||||
"""When Py2 str.encode is called, it often means bytes.encode in Py3.
|
"""When Py2 str.encode is called, it often means bytes.encode in Py3.
|
||||||
|
|
||||||
This does either.
|
This does either.
|
||||||
|
@ -77,7 +101,7 @@ def compatible_encode_method(bytesorstring, encoding='utf-8',
|
||||||
return bytesorstring.decode(encoding, erraction)
|
return bytesorstring.decode(encoding, erraction)
|
||||||
|
|
||||||
|
|
||||||
def paeth_predictor(left, above, upper_left):
|
def paeth_predictor(left: int, above: int, upper_left: int) -> int:
|
||||||
# From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
|
# From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
|
||||||
# Initial estimate
|
# Initial estimate
|
||||||
p = left + above - upper_left
|
p = left + above - upper_left
|
||||||
|
@ -95,7 +119,13 @@ def paeth_predictor(left, above, upper_left):
|
||||||
return upper_left
|
return upper_left
|
||||||
|
|
||||||
|
|
||||||
def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
def apply_png_predictor(
|
||||||
|
pred: int,
|
||||||
|
colors: int,
|
||||||
|
columns: int,
|
||||||
|
bitspercomponent: int,
|
||||||
|
data: bytes
|
||||||
|
) -> bytes:
|
||||||
"""Reverse the effect of the PNG predictor
|
"""Reverse the effect of the PNG predictor
|
||||||
|
|
||||||
Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
|
Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
|
||||||
|
@ -190,11 +220,20 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
|
|
||||||
|
Point = Tuple[float, float]
|
||||||
|
Rect = Tuple[float, float, float, float]
|
||||||
|
Matrix = Tuple[float, float, float, float, float, float]
|
||||||
|
PathSegment = Union[
|
||||||
|
Tuple[str], # Literal['h']
|
||||||
|
Tuple[str, float, float], # Literal['m', 'l']
|
||||||
|
Tuple[str, float, float, float, float], # Literal['v', 'y']
|
||||||
|
Tuple[str, float, float, float, float, float, float]] # Literal['c']
|
||||||
|
|
||||||
# Matrix operations
|
# Matrix operations
|
||||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
def mult_matrix(m1, m0):
|
def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
|
||||||
(a1, b1, c1, d1, e1, f1) = m1
|
(a1, b1, c1, d1, e1, f1) = m1
|
||||||
(a0, b0, c0, d0, e0, f0) = m0
|
(a0, b0, c0, d0, e0, f0) = m0
|
||||||
"""Returns the multiplication of two matrices."""
|
"""Returns the multiplication of two matrices."""
|
||||||
|
@ -203,21 +242,21 @@ def mult_matrix(m1, m0):
|
||||||
a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
|
a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
|
||||||
|
|
||||||
|
|
||||||
def translate_matrix(m, v):
|
def translate_matrix(m: Matrix, v: Point) -> Matrix:
|
||||||
"""Translates a matrix by (x, y)."""
|
"""Translates a matrix by (x, y)."""
|
||||||
(a, b, c, d, e, f) = m
|
(a, b, c, d, e, f) = m
|
||||||
(x, y) = v
|
(x, y) = v
|
||||||
return a, b, c, d, x * a + y * c + e, x * b + y * d + f
|
return a, b, c, d, x * a + y * c + e, x * b + y * d + f
|
||||||
|
|
||||||
|
|
||||||
def apply_matrix_pt(m, v):
|
def apply_matrix_pt(m: Matrix, v: Point) -> Point:
|
||||||
(a, b, c, d, e, f) = m
|
(a, b, c, d, e, f) = m
|
||||||
(x, y) = v
|
(x, y) = v
|
||||||
"""Applies a matrix to a point."""
|
"""Applies a matrix to a point."""
|
||||||
return a * x + c * y + e, b * x + d * y + f
|
return a * x + c * y + e, b * x + d * y + f
|
||||||
|
|
||||||
|
|
||||||
def apply_matrix_norm(m, v):
|
def apply_matrix_norm(m: Matrix, v: Point) -> Point:
|
||||||
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
|
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
|
||||||
(a, b, c, d, e, f) = m
|
(a, b, c, d, e, f) = m
|
||||||
(p, q) = v
|
(p, q) = v
|
||||||
|
@ -226,11 +265,14 @@ def apply_matrix_norm(m, v):
|
||||||
|
|
||||||
# Utility functions
|
# Utility functions
|
||||||
|
|
||||||
def isnumber(x):
|
def isnumber(x: object) -> bool:
|
||||||
return isinstance(x, (int, float))
|
return isinstance(x, (int, float))
|
||||||
|
|
||||||
|
|
||||||
def uniq(objs):
|
_T = TypeVar('_T')
|
||||||
|
|
||||||
|
|
||||||
|
def uniq(objs: Iterable[_T]) -> Iterator[_T]:
|
||||||
"""Eliminates duplicated elements."""
|
"""Eliminates duplicated elements."""
|
||||||
done = set()
|
done = set()
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
|
@ -241,7 +283,10 @@ def uniq(objs):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def fsplit(pred, objs):
|
def fsplit(
|
||||||
|
pred: Callable[[_T], bool],
|
||||||
|
objs: Iterable[_T]
|
||||||
|
) -> Tuple[List[_T], List[_T]]:
|
||||||
"""Split a list into two classes according to the predicate."""
|
"""Split a list into two classes according to the predicate."""
|
||||||
t = []
|
t = []
|
||||||
f = []
|
f = []
|
||||||
|
@ -253,14 +298,15 @@ def fsplit(pred, objs):
|
||||||
return t, f
|
return t, f
|
||||||
|
|
||||||
|
|
||||||
def drange(v0, v1, d):
|
def drange(v0: float, v1: float, d: int) -> range:
|
||||||
"""Returns a discrete range."""
|
"""Returns a discrete range."""
|
||||||
return range(int(v0) // d, int(v1 + d) // d)
|
return range(int(v0) // d, int(v1 + d) // d)
|
||||||
|
|
||||||
|
|
||||||
def get_bound(pts):
|
def get_bound(pts: Iterable[Point]) -> Rect:
|
||||||
"""Compute a minimal rectangle that covers all the points."""
|
"""Compute a minimal rectangle that covers all the points."""
|
||||||
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
limit: Rect = (INF, INF, -INF, -INF)
|
||||||
|
(x0, y0, x1, y1) = limit
|
||||||
for (x, y) in pts:
|
for (x, y) in pts:
|
||||||
x0 = min(x0, x)
|
x0 = min(x0, x)
|
||||||
y0 = min(y0, y)
|
y0 = min(y0, y)
|
||||||
|
@ -269,7 +315,11 @@ def get_bound(pts):
|
||||||
return x0, y0, x1, y1
|
return x0, y0, x1, y1
|
||||||
|
|
||||||
|
|
||||||
def pick(seq, func, maxobj=None):
|
def pick(
|
||||||
|
seq: Iterable[_T],
|
||||||
|
func: Callable[[_T], float],
|
||||||
|
maxobj: Optional[_T] = None
|
||||||
|
) -> Optional[_T]:
|
||||||
"""Picks the object obj where func(obj) has the highest value."""
|
"""Picks the object obj where func(obj) has the highest value."""
|
||||||
maxscore = None
|
maxscore = None
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
|
@ -279,7 +329,7 @@ def pick(seq, func, maxobj=None):
|
||||||
return maxobj
|
return maxobj
|
||||||
|
|
||||||
|
|
||||||
def choplist(n, seq):
|
def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
|
||||||
"""Groups every n elements of the list."""
|
"""Groups every n elements of the list."""
|
||||||
r = []
|
r = []
|
||||||
for x in seq:
|
for x in seq:
|
||||||
|
@ -290,7 +340,7 @@ def choplist(n, seq):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def nunpack(s, default=0):
|
def nunpack(s: bytes, default: int = 0) -> int:
|
||||||
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
|
"""Unpacks 1 to 4 or 8 byte integers (big endian)."""
|
||||||
length = len(s)
|
length = len(s)
|
||||||
if not length:
|
if not length:
|
||||||
|
@ -298,13 +348,13 @@ def nunpack(s, default=0):
|
||||||
elif length == 1:
|
elif length == 1:
|
||||||
return ord(s)
|
return ord(s)
|
||||||
elif length == 2:
|
elif length == 2:
|
||||||
return struct.unpack('>H', s)[0]
|
return cast(int, struct.unpack('>H', s)[0])
|
||||||
elif length == 3:
|
elif length == 3:
|
||||||
return struct.unpack('>L', b'\x00' + s)[0]
|
return cast(int, struct.unpack('>L', b'\x00' + s)[0])
|
||||||
elif length == 4:
|
elif length == 4:
|
||||||
return struct.unpack('>L', s)[0]
|
return cast(int, struct.unpack('>L', s)[0])
|
||||||
elif length == 8:
|
elif length == 8:
|
||||||
return struct.unpack('>Q', s)[0]
|
return cast(int, struct.unpack('>Q', s)[0])
|
||||||
else:
|
else:
|
||||||
raise TypeError('invalid length: %d' % length)
|
raise TypeError('invalid length: %d' % length)
|
||||||
|
|
||||||
|
@ -345,7 +395,7 @@ PDFDocEncoding = ''.join(chr(x) for x in (
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
def decode_text(s):
|
def decode_text(s: bytes) -> str:
|
||||||
"""Decodes a PDFDocEncoding string to Unicode."""
|
"""Decodes a PDFDocEncoding string to Unicode."""
|
||||||
if s.startswith(b'\xfe\xff'):
|
if s.startswith(b'\xfe\xff'):
|
||||||
return str(s[2:], 'utf-16be', 'ignore')
|
return str(s[2:], 'utf-16be', 'ignore')
|
||||||
|
@ -353,25 +403,25 @@ def decode_text(s):
|
||||||
return ''.join(PDFDocEncoding[c] for c in s)
|
return ''.join(PDFDocEncoding[c] for c in s)
|
||||||
|
|
||||||
|
|
||||||
def enc(x):
|
def enc(x: str) -> str:
|
||||||
"""Encodes a string for SGML/XML/HTML"""
|
"""Encodes a string for SGML/XML/HTML"""
|
||||||
if isinstance(x, bytes):
|
if isinstance(x, bytes):
|
||||||
return ''
|
return ''
|
||||||
return escape(x)
|
return escape(x)
|
||||||
|
|
||||||
|
|
||||||
def bbox2str(bbox):
|
def bbox2str(bbox: Rect) -> str:
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
return '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x0, y0, x1, y1)
|
return '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x0, y0, x1, y1)
|
||||||
|
|
||||||
|
|
||||||
def matrix2str(m):
|
def matrix2str(m: Matrix) -> str:
|
||||||
(a, b, c, d, e, f) = m
|
(a, b, c, d, e, f) = m
|
||||||
return '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'\
|
return '[{:.2f},{:.2f},{:.2f},{:.2f}, ({:.2f},{:.2f})]'\
|
||||||
.format(a, b, c, d, e, f)
|
.format(a, b, c, d, e, f)
|
||||||
|
|
||||||
|
|
||||||
def vecBetweenBoxes(obj1, obj2):
|
def vecBetweenBoxes(obj1: "LTComponent", obj2: "LTComponent") -> Point:
|
||||||
"""A distance function between two TextBoxes.
|
"""A distance function between two TextBoxes.
|
||||||
|
|
||||||
Consider the bounding rectangle for obj1 and obj2.
|
Consider the bounding rectangle for obj1 and obj2.
|
||||||
|
@ -397,7 +447,10 @@ def vecBetweenBoxes(obj1, obj2):
|
||||||
return max(0, iw), max(0, ih)
|
return max(0, iw), max(0, ih)
|
||||||
|
|
||||||
|
|
||||||
class Plane:
|
LTComponentT = TypeVar('LTComponentT', bound='LTComponent')
|
||||||
|
|
||||||
|
|
||||||
|
class Plane(Generic[LTComponentT]):
|
||||||
"""A set-like data structure for objects placed on a plane.
|
"""A set-like data structure for objects placed on a plane.
|
||||||
|
|
||||||
Can efficiently find objects in a certain rectangular area.
|
Can efficiently find objects in a certain rectangular area.
|
||||||
|
@ -405,26 +458,26 @@ class Plane:
|
||||||
which is sorted by its x or y coordinate.
|
which is sorted by its x or y coordinate.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, bbox, gridsize=50):
|
def __init__(self, bbox: Rect, gridsize: int = 50) -> None:
|
||||||
self._seq = [] # preserve the object order.
|
self._seq: List[LTComponentT] = [] # preserve the object order.
|
||||||
self._objs = set()
|
self._objs: Set[LTComponentT] = set()
|
||||||
self._grid = {}
|
self._grid: Dict[Point, List[LTComponentT]] = {}
|
||||||
self.gridsize = gridsize
|
self.gridsize = gridsize
|
||||||
(self.x0, self.y0, self.x1, self.y1) = bbox
|
(self.x0, self.y0, self.x1, self.y1) = bbox
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
return '<Plane objs=%r>' % list(self)
|
return '<Plane objs=%r>' % list(self)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self) -> Iterator[LTComponentT]:
|
||||||
return (obj for obj in self._seq if obj in self._objs)
|
return (obj for obj in self._seq if obj in self._objs)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
return len(self._objs)
|
return len(self._objs)
|
||||||
|
|
||||||
def __contains__(self, obj):
|
def __contains__(self, obj: object) -> bool:
|
||||||
return obj in self._objs
|
return obj in self._objs
|
||||||
|
|
||||||
def _getrange(self, bbox):
|
def _getrange(self, bbox: Rect) -> Iterator[Point]:
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
|
if x1 <= self.x0 or self.x1 <= x0 or y1 <= self.y0 or self.y1 <= y0:
|
||||||
return
|
return
|
||||||
|
@ -436,15 +489,15 @@ class Plane:
|
||||||
for grid_x in drange(x0, x1, self.gridsize):
|
for grid_x in drange(x0, x1, self.gridsize):
|
||||||
yield (grid_x, grid_y)
|
yield (grid_x, grid_y)
|
||||||
|
|
||||||
def extend(self, objs):
|
def extend(self, objs: Iterable[LTComponentT]) -> None:
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
self.add(obj)
|
self.add(obj)
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj: LTComponentT) -> None:
|
||||||
"""place an object."""
|
"""place an object."""
|
||||||
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
||||||
if k not in self._grid:
|
if k not in self._grid:
|
||||||
r = []
|
r: List[LTComponentT] = []
|
||||||
self._grid[k] = r
|
self._grid[k] = r
|
||||||
else:
|
else:
|
||||||
r = self._grid[k]
|
r = self._grid[k]
|
||||||
|
@ -452,7 +505,7 @@ class Plane:
|
||||||
self._seq.append(obj)
|
self._seq.append(obj)
|
||||||
self._objs.add(obj)
|
self._objs.add(obj)
|
||||||
|
|
||||||
def remove(self, obj):
|
def remove(self, obj: LTComponentT) -> None:
|
||||||
"""displace an object."""
|
"""displace an object."""
|
||||||
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
|
||||||
try:
|
try:
|
||||||
|
@ -461,7 +514,7 @@ class Plane:
|
||||||
pass
|
pass
|
||||||
self._objs.remove(obj)
|
self._objs.remove(obj)
|
||||||
|
|
||||||
def find(self, bbox):
|
def find(self, bbox: Rect) -> Iterator[LTComponentT]:
|
||||||
"""finds objects that are in a certain area."""
|
"""finds objects that are in a certain area."""
|
||||||
(x0, y0, x1, y1) = bbox
|
(x0, y0, x1, y1) = bbox
|
||||||
done = set()
|
done = set()
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -17,7 +17,7 @@ setup(
|
||||||
'cryptography',
|
'cryptography',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
"dev": ["nose", "tox"],
|
"dev": ["nose", "tox", "mypy == 0.910"],
|
||||||
"docs": ["sphinx", "sphinx-argparse"],
|
"docs": ["sphinx", "sphinx-argparse"],
|
||||||
},
|
},
|
||||||
description='PDF parser and analyzer',
|
description='PDF parser and analyzer',
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import warnings
|
import warnings
|
||||||
|
from nose.tools import raises
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
from tempfilepath import TemporaryFilePath
|
from tempfilepath import TemporaryFilePath
|
||||||
from pdfminer.pdfdocument import PDFNoValidXRefWarning
|
from pdfminer.pdfdocument import PDFNoValidXRefWarning
|
||||||
|
@ -51,3 +51,13 @@ class TestDumpPDF():
|
||||||
|
|
||||||
def test_6(self):
|
def test_6(self):
|
||||||
run('nonfree/naacl06-shinyama.pdf', '-t -a')
|
run('nonfree/naacl06-shinyama.pdf', '-t -a')
|
||||||
|
|
||||||
|
@raises(TypeError)
|
||||||
|
def test_simple1_raw(self):
|
||||||
|
"""Known issue: crash in dumpxml writing binary to text stream."""
|
||||||
|
run('simple1.pdf', '-r -a')
|
||||||
|
|
||||||
|
@raises(TypeError)
|
||||||
|
def test_simple1_binary(self):
|
||||||
|
"""Known issue: crash in dumpxml writing binary to text stream."""
|
||||||
|
run('simple1.pdf', '-b -a')
|
||||||
|
|
|
@ -42,4 +42,4 @@ def main(argv):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
|
||||||
|
|
|
@ -199,4 +199,4 @@ def main(argv):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
|
||||||
|
|
|
@ -24,4 +24,4 @@ def main(argv):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv))
|
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
|
||||||
|
|
|
@ -4,6 +4,8 @@ import logging
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \
|
||||||
|
Union, cast
|
||||||
import warnings
|
import warnings
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
@ -22,13 +24,15 @@ logging.basicConfig()
|
||||||
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
||||||
|
|
||||||
|
|
||||||
def escape(s):
|
def escape(s: Union[str, bytes]) -> str:
|
||||||
if isinstance(s, bytes):
|
if isinstance(s, bytes):
|
||||||
s = str(s, 'latin-1')
|
us = str(s, 'latin-1')
|
||||||
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)
|
else:
|
||||||
|
us = s
|
||||||
|
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), us)
|
||||||
|
|
||||||
|
|
||||||
def dumpxml(out, obj, codec=None):
|
def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
|
||||||
if obj is None:
|
if obj is None:
|
||||||
out.write('<null />')
|
out.write('<null />')
|
||||||
return
|
return
|
||||||
|
@ -51,15 +55,17 @@ def dumpxml(out, obj, codec=None):
|
||||||
out.write('</list>')
|
out.write('</list>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, ((str,), bytes)):
|
if isinstance(obj, (str, bytes)):
|
||||||
out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
|
out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
if codec == 'raw':
|
if codec == 'raw':
|
||||||
out.write(obj.get_rawdata())
|
# Bug: writing bytes to text I/O. This will raise TypeError.
|
||||||
|
out.write(obj.get_rawdata()) # type: ignore [arg-type]
|
||||||
elif codec == 'binary':
|
elif codec == 'binary':
|
||||||
out.write(obj.get_data())
|
# Bug: writing bytes to text I/O. This will raise TypeError.
|
||||||
|
out.write(obj.get_data()) # type: ignore [arg-type]
|
||||||
else:
|
else:
|
||||||
out.write('<stream>\n<props>\n')
|
out.write('<stream>\n<props>\n')
|
||||||
dumpxml(out, obj.attrs)
|
dumpxml(out, obj.attrs)
|
||||||
|
@ -76,11 +82,15 @@ def dumpxml(out, obj, codec=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PSKeyword):
|
if isinstance(obj, PSKeyword):
|
||||||
out.write('<keyword>%s</keyword>' % obj.name)
|
# Likely bug: obj.name is bytes, not str
|
||||||
|
out.write('<keyword>%s</keyword>'
|
||||||
|
% obj.name) # type: ignore [str-bytes-safe]
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PSLiteral):
|
if isinstance(obj, PSLiteral):
|
||||||
out.write('<literal>%s</literal>' % obj.name)
|
# Likely bug: obj.name may be bytes, not str
|
||||||
|
out.write('<literal>%s</literal>'
|
||||||
|
% obj.name) # type: ignore [str-bytes-safe]
|
||||||
return
|
return
|
||||||
|
|
||||||
if isnumber(obj):
|
if isnumber(obj):
|
||||||
|
@ -90,11 +100,15 @@ def dumpxml(out, obj, codec=None):
|
||||||
raise TypeError(obj)
|
raise TypeError(obj)
|
||||||
|
|
||||||
|
|
||||||
def dumptrailers(out, doc, show_fallback_xref=False):
|
def dumptrailers(
|
||||||
|
out: TextIO,
|
||||||
|
doc: PDFDocument,
|
||||||
|
show_fallback_xref: bool = False
|
||||||
|
) -> None:
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
|
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
|
||||||
out.write('<trailer>\n')
|
out.write('<trailer>\n')
|
||||||
dumpxml(out, xref.trailer)
|
dumpxml(out, xref.get_trailer())
|
||||||
out.write('\n</trailer>\n\n')
|
out.write('\n</trailer>\n\n')
|
||||||
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
|
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
|
||||||
if no_xrefs and not show_fallback_xref:
|
if no_xrefs and not show_fallback_xref:
|
||||||
|
@ -105,7 +119,12 @@ def dumptrailers(out, doc, show_fallback_xref=False):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
|
def dumpallobjs(
|
||||||
|
out: TextIO,
|
||||||
|
doc: PDFDocument,
|
||||||
|
codec: Optional[str] = None,
|
||||||
|
show_fallback_xref: bool = False
|
||||||
|
) -> None:
|
||||||
visited = set()
|
visited = set()
|
||||||
out.write('<pdf>')
|
out.write('<pdf>')
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
|
@ -127,15 +146,23 @@ def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
def dumpoutline(
|
||||||
dumpall=False, codec=None, extractdir=None):
|
outfp: TextIO,
|
||||||
|
fname: str,
|
||||||
|
objids: Any,
|
||||||
|
pagenos: Container[int],
|
||||||
|
password: str = '',
|
||||||
|
dumpall: bool = False,
|
||||||
|
codec: Optional[str] = None,
|
||||||
|
extractdir: Optional[str] = None
|
||||||
|
) -> None:
|
||||||
fp = open(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
pages = {page.pageid: pageno for (pageno, page)
|
pages = {page.pageid: pageno for (pageno, page)
|
||||||
in enumerate(PDFPage.create_pages(doc), 1)}
|
in enumerate(PDFPage.create_pages(doc), 1)}
|
||||||
|
|
||||||
def resolve_dest(dest):
|
def resolve_dest(dest: object) -> Any:
|
||||||
if isinstance(dest, (str, bytes)):
|
if isinstance(dest, (str, bytes)):
|
||||||
dest = resolve1(doc.get_dest(dest))
|
dest = resolve1(doc.get_dest(dest))
|
||||||
elif isinstance(dest, PSLiteral):
|
elif isinstance(dest, PSLiteral):
|
||||||
|
@ -183,10 +210,10 @@ LITERAL_FILESPEC = LIT('Filespec')
|
||||||
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
|
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
|
||||||
|
|
||||||
|
|
||||||
def extractembedded(outfp, fname, objids, pagenos, password='',
|
def extractembedded(fname: str, password: str, extractdir: str) -> None:
|
||||||
dumpall=False, codec=None, extractdir=None):
|
def extract1(objid: int, obj: Dict[str, Any]) -> None:
|
||||||
def extract1(objid, obj):
|
filename = os.path.basename(obj.get('UF') or
|
||||||
filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
|
cast(bytes, obj.get('F')).decode())
|
||||||
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
|
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
|
||||||
fileobj = doc.getobj(fileref.objid)
|
fileobj = doc.getobj(fileref.objid)
|
||||||
if not isinstance(fileobj, PDFStream):
|
if not isinstance(fileobj, PDFStream):
|
||||||
|
@ -221,8 +248,17 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
|
def dumppdf(
|
||||||
codec=None, extractdir=None, show_fallback_xref=False):
|
outfp: TextIO,
|
||||||
|
fname: str,
|
||||||
|
objids: Iterable[int],
|
||||||
|
pagenos: Container[int],
|
||||||
|
password: str = '',
|
||||||
|
dumpall: bool = False,
|
||||||
|
codec: Optional[str] = None,
|
||||||
|
extractdir: Optional[str] = None,
|
||||||
|
show_fallback_xref: bool = False
|
||||||
|
) -> None:
|
||||||
fp = open(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
|
@ -249,7 +285,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def create_parser():
|
def create_parser() -> ArgumentParser:
|
||||||
parser = ArgumentParser(description=__doc__, add_help=True)
|
parser = ArgumentParser(description=__doc__, add_help=True)
|
||||||
parser.add_argument('files', type=str, default=None, nargs='+',
|
parser.add_argument('files', type=str, default=None, nargs='+',
|
||||||
help='One or more paths to PDF files.')
|
help='One or more paths to PDF files.')
|
||||||
|
@ -313,7 +349,7 @@ def create_parser():
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv: Optional[List[str]] = None) -> None:
|
||||||
parser = create_parser()
|
parser = create_parser()
|
||||||
args = parser.parse_args(args=argv)
|
args = parser.parse_args(args=argv)
|
||||||
|
|
||||||
|
@ -340,7 +376,7 @@ def main(argv=None):
|
||||||
password = args.password
|
password = args.password
|
||||||
|
|
||||||
if args.raw_stream:
|
if args.raw_stream:
|
||||||
codec = 'raw'
|
codec: Optional[str] = 'raw'
|
||||||
elif args.binary_stream:
|
elif args.binary_stream:
|
||||||
codec = 'binary'
|
codec = 'binary'
|
||||||
elif args.text_stream:
|
elif args.text_stream:
|
||||||
|
@ -356,8 +392,7 @@ def main(argv=None):
|
||||||
)
|
)
|
||||||
elif args.extract_embedded:
|
elif args.extract_embedded:
|
||||||
extractembedded(
|
extractembedded(
|
||||||
outfp, fname, objids, pagenos, password=password,
|
fname, password=password, extractdir=args.extract_embedded
|
||||||
dumpall=args.all, codec=codec, extractdir=args.extract_embedded
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
dumppdf(
|
dumppdf(
|
||||||
|
@ -370,4 +405,4 @@ def main(argv=None):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
main()
|
||||||
|
|
|
@ -4,9 +4,12 @@ output it to plain text, html, xml or tags."""
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Any, Container, Iterable, List, Optional, Union
|
||||||
|
from typing_extensions import Literal
|
||||||
|
|
||||||
import pdfminer.high_level
|
import pdfminer.high_level
|
||||||
import pdfminer.layout
|
from pdfminer.layout import LAParams
|
||||||
|
from pdfminer.utils import AnyIO
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
|
|
||||||
|
@ -15,24 +18,42 @@ OUTPUT_TYPES = ((".htm", "html"),
|
||||||
(".xml", "xml"),
|
(".xml", "xml"),
|
||||||
(".tag", "tag"))
|
(".tag", "tag"))
|
||||||
|
|
||||||
|
FloatOrDisabled = Union[float, Literal["disabled"]]
|
||||||
|
|
||||||
def float_or_disabled(x):
|
|
||||||
|
def float_or_disabled(x: str) -> FloatOrDisabled:
|
||||||
if x.lower().strip() == "disabled":
|
if x.lower().strip() == "disabled":
|
||||||
return x
|
return "disabled"
|
||||||
try:
|
try:
|
||||||
x = float(x)
|
return float(x)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise argparse.ArgumentTypeError("invalid float value: {}".format(x))
|
raise argparse.ArgumentTypeError("invalid float value: {}".format(x))
|
||||||
|
|
||||||
|
|
||||||
def extract_text(files=[], outfile='-',
|
def extract_text(
|
||||||
no_laparams=False, all_texts=None, detect_vertical=None,
|
files: Iterable[str] = [],
|
||||||
word_margin=None, char_margin=None, line_margin=None,
|
outfile: str = '-',
|
||||||
boxes_flow=None, output_type='text', codec='utf-8',
|
no_laparams: bool = False,
|
||||||
strip_control=False, maxpages=0, page_numbers=None,
|
all_texts: Optional[bool] = None,
|
||||||
password="", scale=1.0, rotation=0, layoutmode='normal',
|
detect_vertical: Optional[bool] = None,
|
||||||
output_dir=None, debug=False, disable_caching=False,
|
word_margin: Optional[float] = None,
|
||||||
**kwargs):
|
char_margin: Optional[float] = None,
|
||||||
|
line_margin: Optional[float] = None,
|
||||||
|
boxes_flow: Optional[FloatOrDisabled] = None,
|
||||||
|
output_type: str = 'text',
|
||||||
|
codec: str = 'utf-8',
|
||||||
|
strip_control: bool = False,
|
||||||
|
maxpages: int = 0,
|
||||||
|
page_numbers: Optional[Container[int]] = None,
|
||||||
|
password: str = "",
|
||||||
|
scale: float = 1.0,
|
||||||
|
rotation: int = 0,
|
||||||
|
layoutmode: str = 'normal',
|
||||||
|
output_dir: Optional[str] = None,
|
||||||
|
debug: bool = False,
|
||||||
|
disable_caching: bool = False,
|
||||||
|
**kwargs: Any
|
||||||
|
) -> AnyIO:
|
||||||
if not files:
|
if not files:
|
||||||
raise ValueError("Must provide files to work upon!")
|
raise ValueError("Must provide files to work upon!")
|
||||||
|
|
||||||
|
@ -40,7 +61,7 @@ def extract_text(files=[], outfile='-',
|
||||||
# create an LAParams object and
|
# create an LAParams object and
|
||||||
# populate with given args. Otherwise, set it to None.
|
# populate with given args. Otherwise, set it to None.
|
||||||
if not no_laparams:
|
if not no_laparams:
|
||||||
laparams = pdfminer.layout.LAParams()
|
laparams: Optional[LAParams] = LAParams()
|
||||||
for param in ("all_texts", "detect_vertical", "word_margin",
|
for param in ("all_texts", "detect_vertical", "word_margin",
|
||||||
"char_margin", "line_margin", "boxes_flow"):
|
"char_margin", "line_margin", "boxes_flow"):
|
||||||
paramv = locals().get(param, None)
|
paramv = locals().get(param, None)
|
||||||
|
@ -55,8 +76,8 @@ def extract_text(files=[], outfile='-',
|
||||||
output_type = alttype
|
output_type = alttype
|
||||||
|
|
||||||
if outfile == "-":
|
if outfile == "-":
|
||||||
outfp = sys.stdout
|
outfp: AnyIO = sys.stdout
|
||||||
if outfp.encoding is not None:
|
if sys.stdout.encoding is not None:
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
else:
|
else:
|
||||||
outfp = open(outfile, "wb")
|
outfp = open(outfile, "wb")
|
||||||
|
@ -67,7 +88,7 @@ def extract_text(files=[], outfile='-',
|
||||||
return outfp
|
return outfp
|
||||||
|
|
||||||
|
|
||||||
def maketheparser():
|
def maketheparser() -> argparse.ArgumentParser:
|
||||||
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"files", type=str, default=None, nargs="+",
|
"files", type=str, default=None, nargs="+",
|
||||||
|
@ -180,7 +201,7 @@ def maketheparser():
|
||||||
# main
|
# main
|
||||||
|
|
||||||
|
|
||||||
def main(args=None):
|
def main(args: Optional[List[str]] = None) -> int:
|
||||||
|
|
||||||
P = maketheparser()
|
P = maketheparser()
|
||||||
A = P.parse_args(args=args)
|
A = P.parse_args(args=args)
|
||||||
|
|
|
@ -6,6 +6,7 @@ compares two pdf files.
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Any, Iterable, List, Optional
|
||||||
|
|
||||||
import pdfminer.settings
|
import pdfminer.settings
|
||||||
from pdfminer import high_level, layout
|
from pdfminer import high_level, layout
|
||||||
|
@ -16,7 +17,7 @@ pdfminer.settings.STRICT = False
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
|
|
||||||
|
|
||||||
def compare(file1, file2, **kwargs):
|
def compare(file1: str, file2: str, **kwargs: Any) -> Iterable[str]:
|
||||||
# If any LAParams group arguments were passed,
|
# If any LAParams group arguments were passed,
|
||||||
# create an LAParams object and
|
# create an LAParams object and
|
||||||
# populate with given args. Otherwise, set it to None.
|
# populate with given args. Otherwise, set it to None.
|
||||||
|
@ -26,7 +27,7 @@ def compare(file1, file2, **kwargs):
|
||||||
"char_margin", "line_margin", "boxes_flow"):
|
"char_margin", "line_margin", "boxes_flow"):
|
||||||
paramv = kwargs.get(param, None)
|
paramv = kwargs.get(param, None)
|
||||||
if paramv is not None:
|
if paramv is not None:
|
||||||
laparams[param] = paramv
|
setattr(laparams, param, paramv)
|
||||||
kwargs['laparams'] = laparams
|
kwargs['laparams'] = laparams
|
||||||
|
|
||||||
s1 = io.StringIO()
|
s1 = io.StringIO()
|
||||||
|
@ -40,20 +41,20 @@ def compare(file1, file2, **kwargs):
|
||||||
import difflib
|
import difflib
|
||||||
s1.seek(0)
|
s1.seek(0)
|
||||||
s2.seek(0)
|
s2.seek(0)
|
||||||
s1, s2 = s1.readlines(), s2.readlines()
|
s1_lines, s2_lines = s1.readlines(), s2.readlines()
|
||||||
|
|
||||||
import os.path
|
import os.path
|
||||||
try:
|
try:
|
||||||
extension = os.path.splitext(kwargs['outfile'])[1][1:4]
|
extension = os.path.splitext(kwargs['outfile'])[1][1:4]
|
||||||
if extension.lower() == 'htm':
|
if extension.lower() == 'htm':
|
||||||
return difflib.HtmlDiff().make_file(s1, s2)
|
return difflib.HtmlDiff().make_file(s1_lines, s2_lines)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
return difflib.unified_diff(s1, s2, n=kwargs['context_lines'])
|
return difflib.unified_diff(s1_lines, s2_lines, n=kwargs['context_lines'])
|
||||||
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(args=None):
|
def main(args: Optional[List[str]] = None) -> int:
|
||||||
import argparse
|
import argparse
|
||||||
P = argparse.ArgumentParser(description=__doc__)
|
P = argparse.ArgumentParser(description=__doc__)
|
||||||
P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
|
P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
|
||||||
|
|
|
@ -7,10 +7,11 @@
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import collections
|
import collections
|
||||||
|
from typing import Any, Counter, Iterator, List
|
||||||
|
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
|
||||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
from pdfminer.pdfpage import PDFPage
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
from pdfminer.layout import LAParams, LTContainer
|
from pdfminer.layout import LAParams, LTContainer
|
||||||
|
@ -19,18 +20,18 @@ from pdfminer.layout import LAParams, LTContainer
|
||||||
_, SCRIPT = os.path.split(__file__)
|
_, SCRIPT = os.path.split(__file__)
|
||||||
|
|
||||||
|
|
||||||
def msg(*args, **kwargs):
|
def msg(*args: object, **kwargs: Any) -> None:
|
||||||
print(' '.join(map(str, args)), **kwargs) # noqa E999
|
print(' '.join(map(str, args)), **kwargs) # noqa E999
|
||||||
|
|
||||||
|
|
||||||
def flat_iter(obj):
|
def flat_iter(obj: object) -> Iterator[object]:
|
||||||
yield obj
|
yield obj
|
||||||
if isinstance(obj, LTContainer):
|
if isinstance(obj, LTContainer):
|
||||||
for ob in obj:
|
for ob in obj:
|
||||||
yield from flat_iter(ob)
|
yield from flat_iter(ob)
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args: List[str]) -> int:
|
||||||
msg(SCRIPT, args)
|
msg(SCRIPT, args)
|
||||||
|
|
||||||
if len(args) != 1:
|
if len(args) != 1:
|
||||||
|
@ -40,7 +41,7 @@ def main(args):
|
||||||
|
|
||||||
infilename, = args
|
infilename, = args
|
||||||
|
|
||||||
lt_types = collections.Counter()
|
lt_types: Counter[str] = collections.Counter()
|
||||||
|
|
||||||
with open(infilename, 'rb') as pdf_file:
|
with open(infilename, 'rb') as pdf_file:
|
||||||
|
|
||||||
|
@ -77,6 +78,8 @@ def main(args):
|
||||||
msg('page_count', page_count)
|
msg('page_count', page_count)
|
||||||
msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
|
msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main(sys.argv[1:]))
|
sys.exit(main(sys.argv[1:]))
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import sys
|
import sys
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
def prof_main(argv):
|
def prof_main(argv: List[str]) -> int:
|
||||||
import hotshot.stats
|
import hotshot.stats # type: ignore[import]
|
||||||
|
|
||||||
def usage():
|
def usage() -> int:
|
||||||
print('usage: %s module.function [args ...]' % argv[0])
|
print('usage: %s module.function [args ...]' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
|
@ -15,19 +16,24 @@ def prof_main(argv):
|
||||||
prof = name+'.prof'
|
prof = name+'.prof'
|
||||||
i = name.rindex('.')
|
i = name.rindex('.')
|
||||||
(modname, funcname) = (name[:i], name[i+1:])
|
(modname, funcname) = (name[:i], name[i+1:])
|
||||||
module = __import__(modname, fromlist=1)
|
|
||||||
|
# Type error: fromlist expects sequence of strings; presumably the intent
|
||||||
|
# is to retrieve the named module rather than a top-level package (as in
|
||||||
|
# "when a non-empty fromlist argument is given...").
|
||||||
|
module = __import__(modname, fromlist=1) # type: ignore[arg-type]
|
||||||
|
|
||||||
func = getattr(module, funcname)
|
func = getattr(module, funcname)
|
||||||
if args:
|
if args:
|
||||||
args.insert(0, argv[0])
|
args.insert(0, argv[0])
|
||||||
prof = hotshot.Profile(prof)
|
profile = hotshot.Profile(prof)
|
||||||
prof.runcall(lambda: func(args))
|
profile.runcall(lambda: func(args))
|
||||||
prof.close()
|
profile.close()
|
||||||
else:
|
else:
|
||||||
stats = hotshot.stats.load(prof)
|
stats = hotshot.stats.load(prof)
|
||||||
stats.strip_dirs()
|
stats.strip_dirs()
|
||||||
stats.sort_stats('time', 'calls')
|
stats.sort_stats('time', 'calls')
|
||||||
stats.print_stats(1000)
|
stats.print_stats(1000)
|
||||||
return
|
return 0
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
1
tox.ini
1
tox.ini
|
@ -9,6 +9,7 @@ whitelist_externals =
|
||||||
flake8
|
flake8
|
||||||
commands =
|
commands =
|
||||||
flake8 pdfminer/ tools/ tests/ --count --statistics
|
flake8 pdfminer/ tools/ tests/ --count --statistics
|
||||||
|
mypy --install-types --non-interactive --show-error-codes .
|
||||||
nosetests --nologcapture
|
nosetests --nologcapture
|
||||||
python -m sphinx -b html docs/source docs/build/html
|
python -m sphinx -b html docs/source docs/build/html
|
||||||
python -m sphinx -b doctest docs/source docs/build/doctest
|
python -m sphinx -b doctest docs/source docs/build/doctest
|
||||||
|
|
Loading…
Reference in New Issue