Added feature: page labels (#680)
* port page label code from pdfannots * add tests and clean up * more cleanup; harden against non-conforming input * one more test * update CHANGELOG * cleanup & respond to review feedback (incomplete) * Refactor implementation of get_page_labels() into a NumberTree and PageLabels class. * PageLabels *is* a NumberTree and should always behave like one. This justifies inheriting its data and behavior. And it simplifies the code a bit more. * fix type errors and cleanup slightly * fix mypy errors (including tweaking code to avoid problematic dynamic types) * hoist dict_value from NumberTree (where it may not be a dict) to PageLabels (where it must be) * avoid repeated warnings by calling _parse() recursively, and checking sortedness only at the end Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>pull/704/head
parent
b19f9e7270
commit
1d1602e0c5
|
@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
### Added
|
||||
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
|
||||
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
||||
- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
|
||||
|
||||
### Fixed
|
||||
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
import functools
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from pdfminer import settings
|
||||
from pdfminer.pdfparser import PDFSyntaxError
|
||||
from pdfminer.pdftypes import list_value, int_value, dict_value
|
||||
from pdfminer.utils import choplist
|
||||
|
||||
|
||||
class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.

    The node object is resolved with ``dict_value``; its optional ``Nums``,
    ``Kids`` and ``Limits`` entries are each resolved with ``list_value``
    and left as ``None`` when absent. ``limits`` is stored but not consulted
    by this class.
    """

    def __init__(self, obj: Any):
        """Wrap one node (root, intermediate or leaf) of a number tree.

        :param obj: the node dictionary, or a reference that ``dict_value``
            can resolve to one.
        """
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None
        # Per-instance memo for the ``values`` property. Kept on the instance
        # (rather than in functools.lru_cache) so that the cache dies with the
        # tree instead of pinning every NumberTree in a module-level cache,
        # and so that the class also works on interpreters where the bare
        # ``@functools.lru_cache`` decorator form is not accepted.
        self._values: Optional[List[Tuple[int, Any]]] = None

        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Collect the ``(key, value)`` pairs of this node and its subtree.

        Pairs are returned in document order; no ordering check or sort is
        performed here (that happens once, in ``values``).
        """
        items: List[Tuple[int, Any]] = []

        if self.nums:  # Leaf node
            # /Nums is a flat array: key1 value1 key2 value2 ...
            for k, v in choplist(2, self.nums):
                items.append((int_value(k), v))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                items += NumberTree(child_ref)._parse()

        return items

    @property
    def values(self) -> List[Tuple[int, Any]]:
        """All ``(key, value)`` pairs of the tree, ordered by key.

        The list is computed on first access and the same list object is
        returned on every later access.

        :raises PDFSyntaxError: in strict mode, when the keys are not in
            non-decreasing order. Outside strict mode the pairs are sorted
            by key instead.
        """
        if self._values is None:
            values = self._parse()

            if settings.STRICT:
                if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
                    raise PDFSyntaxError(
                        'Number tree elements are out of order')
            else:
                values.sort(key=lambda t: t[0])

            # Only memoize after validation so a strict-mode failure is
            # raised again on the next access rather than being hidden.
            self._values = values

        return self._values
|
|
@ -1,3 +1,4 @@
|
|||
import itertools
|
||||
import logging
|
||||
import re
|
||||
import struct
|
||||
|
@ -10,12 +11,14 @@ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
|||
|
||||
from . import settings
|
||||
from .arcfour import Arcfour
|
||||
from .data_structures import NumberTree
|
||||
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
|
||||
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
|
||||
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
||||
uint_value, dict_value, stream_value
|
||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||
from .utils import choplist, nunpack, decode_text
|
||||
from .utils import choplist, decode_text, nunpack, format_int_roman, \
|
||||
format_int_alpha
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
|
|||
pass
|
||||
|
||||
|
||||
class PDFNoPageLabels(PDFException):
    """Raised by PDFDocument.get_page_labels() when the document catalog
    has no usable /PageLabels entry."""
|
||||
|
||||
|
||||
class PDFDestinationNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
@ -890,6 +897,24 @@ class PDFDocument:
|
|||
return
|
||||
return search(self.catalog['Outlines'], 0)
|
||||
|
||||
def get_page_labels(self) -> Iterator[str]:
    """Return an iterator over the document's page label strings.

    Labels are produced one per page, in page order. The iterator does
    not stop by itself (the last label range continues indefinitely), so
    callers must bound the iteration, e.g. with itertools.islice.

    :raises PDFNoPageLabels: when the catalog has no /PageLabels entry,
        or the entry cannot be interpreted as a number tree.
    """
    assert self.catalog is not None

    try:
        label_tree = PageLabels(self.catalog['PageLabels'])
    except (PDFTypeError, KeyError):
        raise PDFNoPageLabels
    else:
        return label_tree.labels
|
||||
|
||||
def lookup_name(
|
||||
self,
|
||||
cat: str,
|
||||
|
@ -989,3 +1014,61 @@ class PDFDocument:
|
|||
pos = int_value(trailer['Prev'])
|
||||
self.read_xref_from(parser, pos, xrefs)
|
||||
return
|
||||
|
||||
|
||||
class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.

    The number tree maps a starting page index to a label dictionary with
    the optional entries /S (numbering style), /P (prefix) and /St (first
    numeric value, default 1).
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate one label string per page, in page order.

        The last range in the tree has no end, so the iteration is
        unbounded. Because this is a generator, errors are raised when
        the iterator is first advanced, not when the property is read.

        :raises PDFSyntaxError: in strict mode, when the tree does not
            start at page index 0.
        """
        # Work on a copy: ``self.values`` hands out its cached list, and the
        # non-strict fix-up below must not leak a synthetic entry into it.
        ranges = list(self.values)

        # The tree must begin with page index 0
        if len(ranges) == 0 or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            else:
                # Try to cope, by assuming empty labels for the initial pages
                ranges.insert(0, (0, {}))

        # ``next_index`` is the position of the range following the current
        # one (enumerate starts at 1).
        for (next_index, (start, label_dict_unchecked)) \
                in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))

            if next_index == len(ranges):
                # This is the last specified range. It continues until the
                # end of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                end, _ = ranges[next_index]
                range_length = end - start
                values = range(first_value, first_value + range_length)

            for value in values:
                label = self._format_page_label(value, style)
                yield prefix + label

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.

        :param value: the numeric value of the page within its range.
        :param style: the /S entry of the label dictionary, or None when
            the range has no numeric portion.
        :return: the numeric portion of the label; an empty string when
            ``style`` is None or not recognised (the latter is logged).
        """
        if style is None:
            label = ''
        elif style is LIT('D'):  # Decimal arabic numerals
            label = str(value)
        elif style is LIT('R'):  # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT('r'):  # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT('a'):  # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning('Unknown page label style: %r', style)
            label = ''
        return label
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import itertools
|
||||
import logging
|
||||
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
|
||||
|
||||
from pdfminer.utils import Rect
|
||||
from . import settings
|
||||
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
|
||||
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
|
||||
PDFNoPageLabels
|
||||
from .pdfparser import PDFParser
|
||||
from .pdftypes import PDFObjectNotFound
|
||||
from .pdftypes import dict_value
|
||||
|
@ -38,23 +40,27 @@ class PDFPage:
|
|||
rotate: the page rotation (in degree).
|
||||
annots: the page annotations.
|
||||
beads: a chain that represents natural reading order.
|
||||
label: the page's label (typically, the logical page number).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
doc: PDFDocument,
|
||||
pageid: object,
|
||||
attrs: object
|
||||
attrs: object,
|
||||
label: Optional[str]
|
||||
) -> None:
|
||||
"""Initialize a page object.
|
||||
|
||||
doc: a PDFDocument object.
|
||||
pageid: any Python object that can uniquely identify the page.
|
||||
attrs: a dictionary of page attributes.
|
||||
label: page label string.
|
||||
"""
|
||||
self.doc = doc
|
||||
self.pageid = pageid
|
||||
self.attrs = dict_value(attrs)
|
||||
self.label = label
|
||||
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
||||
self.resources: Dict[object, object] = \
|
||||
resolve1(self.attrs.get('Resources', dict()))
|
||||
|
@ -109,11 +115,17 @@ class PDFPage:
|
|||
elif tree_type is LITERAL_PAGE:
|
||||
log.info('Page: %r', tree)
|
||||
yield (objid, tree)
|
||||
|
||||
try:
|
||||
page_labels: Iterator[Optional[str]] = document.get_page_labels()
|
||||
except PDFNoPageLabels:
|
||||
page_labels = itertools.repeat(None)
|
||||
|
||||
pages = False
|
||||
if 'Pages' in document.catalog:
|
||||
objects = search(document.catalog['Pages'], document.catalog)
|
||||
for (objid, tree) in objects:
|
||||
yield cls(document, objid, tree)
|
||||
yield cls(document, objid, tree, next(page_labels))
|
||||
pages = True
|
||||
if not pages:
|
||||
# fallback when /Pages is missing.
|
||||
|
@ -123,7 +135,7 @@ class PDFPage:
|
|||
obj = document.getobj(objid)
|
||||
if isinstance(obj, dict) \
|
||||
and obj.get('Type') is LITERAL_PAGE:
|
||||
yield cls(document, objid, obj)
|
||||
yield cls(document, objid, obj, next(page_labels))
|
||||
except PDFObjectNotFound:
|
||||
pass
|
||||
return
|
||||
|
|
|
@ -3,6 +3,7 @@ Miscellaneous Routines.
|
|||
"""
|
||||
import io
|
||||
import pathlib
|
||||
import string
|
||||
import struct
|
||||
from html import escape
|
||||
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
|
||||
|
@ -527,3 +528,47 @@ class Plane(Generic[LTComponentT]):
|
|||
or y1 <= obj.y0:
|
||||
continue
|
||||
yield obj
|
||||
|
||||
|
||||
# Roman symbols for 1, 10, 100, 1000 and for 5, 50, 500, indexed by decimal
# place (0 = units).
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    Only values from 1 to 3999 inclusive are accepted (checked with an
    assertion). Each decimal digit is translated on its own, starting
    from the units, using subtractive notation for 4 and 9.
    """
    assert 0 < value < 4000

    groups: List[str] = []
    place = 0
    remaining = value

    while remaining != 0:
        remaining, digit = divmod(remaining, 10)
        unit = ROMAN_ONES[place]
        if digit == 9:
            # One unit before the next power of ten, e.g. 'ix', 'xc'.
            group = unit + ROMAN_ONES[place + 1]
        elif digit == 4:
            # One unit before the "five" symbol, e.g. 'iv', 'xl'.
            group = unit + ROMAN_FIVES[place]
        elif digit >= 5:
            group = ROMAN_FIVES[place] + unit * (digit - 5)
        else:
            group = unit * digit
        groups.append(group)
        place += 1

    # Groups were collected least-significant first.
    return ''.join(reversed(groups))
|
||||
|
||||
|
||||
def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    This is a bijective base-26 encoding: 1 -> 'a', 26 -> 'z', 27 -> 'aa'.
    The value must be positive (checked with an assertion).
    """
    assert value > 0

    alphabet = string.ascii_lowercase
    base = len(alphabet)
    label = ''
    remaining = value

    while remaining != 0:
        # Subtracting one first shifts the digits from 1..26 to 0..25.
        remaining, offset = divmod(remaining - 1, base)
        # Digits come out least-significant first, so prepend each one.
        label = alphabet[offset] + label

    return label
|
||||
|
|
Binary file not shown.
|
@ -1,9 +1,11 @@
|
|||
import itertools
|
||||
|
||||
from nose.tools import assert_equal, raises
|
||||
|
||||
from helpers import absolute_sample_path
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdftypes import PDFObjectNotFound
|
||||
from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
|
||||
|
||||
|
||||
class TestPdfDocument(object):
|
||||
|
@ -25,3 +27,21 @@ class TestPdfDocument(object):
|
|||
doc = PDFDocument(parser)
|
||||
assert_equal(doc.info,
|
||||
[{'Producer': b'European Patent Office'}])
|
||||
|
||||
def test_page_labels(self):
    """The first /Count labels of the sample document match expectations."""
    expected = ['iii', 'iv', '1', '2', '1']
    sample = absolute_sample_path('contrib/pagelabels.pdf')

    with open(sample, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        pages_dict = dict_value(doc.catalog['Pages'])
        page_count = int_value(pages_dict['Count'])
        # get_page_labels() never terminates on its own, so cap it at the
        # number of pages.
        labels = list(itertools.islice(doc.get_page_labels(), page_count))
        assert_equal(labels, expected)
|
||||
|
||||
@raises(PDFNoPageLabels)
def test_no_page_labels(self):
    """A document without /PageLabels makes get_page_labels() raise."""
    sample = absolute_sample_path('simple1.pdf')
    with open(sample, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        document.get_page_labels()
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
from nose.tools import assert_equal
|
||||
|
||||
from helpers import absolute_sample_path
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
|
||||
|
||||
class TestPdfPage(object):
    """Tests for PDFPage."""

    def test_page_labels(self):
        """Every page created from the sample carries its expected label."""
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Compare the whole list at once: this also fails when too few
            # (or no) pages are produced, and reports extra pages as an
            # assertion failure rather than an IndexError.
            labels = [page.label for page in PDFPage.create_pages(doc)]
            assert_equal(labels, expected_labels)
|
|
@ -3,7 +3,8 @@ import pathlib
|
|||
|
||||
from helpers import absolute_sample_path
|
||||
from pdfminer.layout import LTComponent
|
||||
from pdfminer.utils import open_filename, Plane, shorten_str
|
||||
from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
|
||||
Plane, shorten_str)
|
||||
|
||||
|
||||
class TestOpenFilename:
|
||||
|
@ -76,3 +77,34 @@ class TestFunctions(object):
|
|||
|
||||
def test_shorten_to_really_short(self):
|
||||
assert_equal('Hello', shorten_str('Hello World', 5))
|
||||
|
||||
def test_format_int_alpha(self):
    """format_int_alpha yields a..z, then aa..zz, then aaa, ..."""
    cases = [
        ('a', 1),
        ('b', 2),
        ('z', 26),
        ('aa', 27),
        ('ab', 28),
        ('az', 26*2),
        ('ba', 26*2 + 1),
        ('zz', 26*27),
        ('aaa', 26*27 + 1),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_alpha(number))
|
||||
|
||||
def test_format_int_roman(self):
    """format_int_roman produces lowercase numerals, incl. 4/9 forms."""
    cases = [
        ('i', 1),
        ('ii', 2),
        ('iii', 3),
        ('iv', 4),
        ('v', 5),
        ('vi', 6),
        ('vii', 7),
        ('viii', 8),
        ('ix', 9),
        ('x', 10),
        ('xi', 11),
        ('xx', 20),
        ('xl', 40),
        ('xlv', 45),
        ('l', 50),
        ('xc', 90),
        ('xci', 91),
        ('c', 100),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_roman(number))
|
||||
|
|
Loading…
Reference in New Issue