Added feature: page labels (#680)

* port page label code from pdfannots

* add tests and clean up

* more cleanup; harden against non-conforming input

* one more test

* update CHANGELOG

* cleanup & respond to review feedback (incomplete)

* Refactor implementation of get_page_labels() into a NumberTree and PageLabels class.

* PageLabels *is* a NumberTree and should always behave like one. This justifies inheriting its data and behavior. And it simplifies the code a bit more.

* fix type errors and cleanup slightly

 * fix mypy errors (including tweaking code to avoid problematic dynamic types)
 * hoist dict_value from NumberTree (where it may not be a dict) to PageLabels (where it must be)
 * avoid repeated warnings by calling _parse() recursively, and checking sortedness only at the end

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/704/head
Andrew Baumann 2022-02-01 01:08:05 -08:00 committed by GitHub
parent b19f9e7270
commit 1d1602e0c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 272 additions and 8 deletions

View File

@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))

View File

@ -0,0 +1,53 @@
import functools
from typing import Any, Dict, Iterable, List, Optional, Tuple
from pdfminer import settings
from pdfminer.pdfparser import PDFSyntaxError
from pdfminer.pdftypes import list_value, int_value, dict_value
from pdfminer.utils import choplist
class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.

    The optional 'Nums', 'Kids' and 'Limits' entries of the node are exposed
    as the ``nums``, ``kids`` and ``limits`` attributes (``None`` when the
    entry is absent).
    """

    def __init__(self, obj: Any):
        """Wrap a number tree node.

        obj: the node object; it is converted with dict_value().
        """
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None

        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Collect the (key, value) pairs of this node and all of its kids.

        The pairs are returned in the order they are encountered; no
        ordering check or sort is performed here, so that it only has to
        happen once, on the complete result (see ``values``).
        """
        items: List[Tuple[int, Any]] = []
        if self.nums:  # Leaf node
            for key, value in choplist(2, self.nums):
                items.append((int_value(key), value))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                items += NumberTree(child_ref)._parse()

        return items

    # cached_property stores the result on the instance itself, so the cache
    # lives and dies with the object instead of pinning every instance in a
    # module-level lru_cache.
    @functools.cached_property
    def values(self) -> List[Tuple[int, Any]]:
        """All (key, value) pairs of the tree, ordered by key.

        In strict mode, raises PDFSyntaxError if the parsed pairs are not
        already in ascending key order. Otherwise the pairs are sorted by
        key. The result is computed once per instance.
        """
        values = self._parse()

        if settings.STRICT:
            if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
                raise PDFSyntaxError('Number tree elements are out of order')
        else:
            values.sort(key=lambda t: t[0])

        return values

View File

@ -1,3 +1,4 @@
import itertools
import logging
import re
import struct
@ -10,12 +11,14 @@ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from . import settings
from .arcfour import Arcfour
from .data_structures import NumberTree
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
uint_value, dict_value, stream_value
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text
from .utils import choplist, decode_text, nunpack, format_int_roman, \
format_int_alpha
log = logging.getLogger(__name__)
@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
pass
class PDFNoPageLabels(PDFException):
    """Raised by get_page_labels() when the catalog has no 'PageLabels'
    entry, or when building PageLabels from it raises PDFTypeError."""
    pass
# NOTE(review): presumably raised when a destination lookup fails; the raise
# sites are not visible here -- confirm before relying on this description.
class PDFDestinationNotFound(PDFException):
    pass
@ -890,6 +897,24 @@ class PDFDocument:
return
return search(self.catalog['Outlines'], 0)
def get_page_labels(self) -> Iterator[str]:
    """
    Return an iterator over the page label strings of the PDF document.

    When the document defines page labels, the iterator produces one
    string per page; otherwise PDFNoPageLabels is raised.

    The iterator never terminates on its own, so callers must bound it.
    """
    assert self.catalog is not None

    try:
        label_tree = PageLabels(self.catalog['PageLabels'])
    except (PDFTypeError, KeyError):
        # Missing entry, or an entry of the wrong type: no page labels.
        raise PDFNoPageLabels
    else:
        return label_tree.labels
def lookup_name(
self,
cat: str,
@ -989,3 +1014,61 @@ class PDFDocument:
pos = int_value(trailer['Prev'])
self.read_xref_from(parser, pos, xrefs)
return
class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate page label strings, one per page, without end.

        Each entry of the number tree maps a starting page index to a label
        dictionary with optional 'S' (style), 'P' (prefix) and 'St' (first
        value, default 1) entries. The last range is continued indefinitely.

        In strict mode, raises PDFSyntaxError when the tree does not start
        at page index 0; otherwise the initial pages get empty labels.
        """
        # Work on a copy: self.values is cached, and the fallback insert
        # below must not leak into the tree's stored values.
        ranges = list(self.values)

        # The tree must begin with page index 0
        if len(ranges) == 0 or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            else:
                # Try to cope, by assuming empty labels for the initial pages
                ranges.insert(0, (0, {}))

        # next_index is the position in `ranges` of the range that follows
        # the current one (enumerate starts at 1).
        for next_index, (start, label_dict_unchecked) in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))

            if next_index == len(ranges):
                # This is the last specified range. It continues until the end
                # of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                end, _ = ranges[next_index]
                range_length = end - start
                values = range(first_value, first_value + range_length)

            for value in values:
                label = self._format_page_label(value, style)
                yield prefix + label

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.

        value: the numeric part of the label.
        style: the 'S' entry of the label dictionary, or None.

        Returns an empty string when no style is given, and also (after
        logging a warning) when the style is not recognised.
        """
        if style is None:
            label = ''
        elif style is LIT('D'):  # Decimal arabic numerals
            label = str(value)
        elif style is LIT('R'):  # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT('r'):  # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT('a'):  # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning('Unknown page label style: %r', style)
            label = ''
        return label

View File

@ -1,9 +1,11 @@
import itertools
import logging
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
from pdfminer.utils import Rect
from . import settings
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
PDFNoPageLabels
from .pdfparser import PDFParser
from .pdftypes import PDFObjectNotFound
from .pdftypes import dict_value
@ -38,23 +40,27 @@ class PDFPage:
rotate: the page rotation (in degree).
annots: the page annotations.
beads: a chain that represents natural reading order.
label: the page's label (typically, the logical page number).
"""
def __init__(
self,
doc: PDFDocument,
pageid: object,
attrs: object
attrs: object,
label: Optional[str]
) -> None:
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
label: page label string.
"""
self.doc = doc
self.pageid = pageid
self.attrs = dict_value(attrs)
self.label = label
self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources: Dict[object, object] = \
resolve1(self.attrs.get('Resources', dict()))
@ -109,11 +115,17 @@ class PDFPage:
elif tree_type is LITERAL_PAGE:
log.info('Page: %r', tree)
yield (objid, tree)
try:
page_labels: Iterator[Optional[str]] = document.get_page_labels()
except PDFNoPageLabels:
page_labels = itertools.repeat(None)
pages = False
if 'Pages' in document.catalog:
objects = search(document.catalog['Pages'], document.catalog)
for (objid, tree) in objects:
yield cls(document, objid, tree)
yield cls(document, objid, tree, next(page_labels))
pages = True
if not pages:
# fallback when /Pages is missing.
@ -123,7 +135,7 @@ class PDFPage:
obj = document.getobj(objid)
if isinstance(obj, dict) \
and obj.get('Type') is LITERAL_PAGE:
yield cls(document, objid, obj)
yield cls(document, objid, obj, next(page_labels))
except PDFObjectNotFound:
pass
return

View File

@ -3,6 +3,7 @@ Miscellaneous Routines.
"""
import io
import pathlib
import string
import struct
from html import escape
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
@ -527,3 +528,47 @@ class Plane(Generic[LTComponentT]):
or y1 <= obj.y0:
continue
yield obj
# Roman symbols for 1, 10, 100, 1000 and for 5, 50, 500, indexed by decimal
# place (0 = units).
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    value: the number to format; asserted to satisfy 0 < value < 4000.
    """
    assert 0 < value < 4000

    # One string per decimal digit, least significant digit first.
    pieces: List[str] = []
    place = 0
    while value != 0:
        value, digit = divmod(value, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            # Subtractive form against the next power of ten, e.g. 'ix'.
            piece = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            # Subtractive form against the five symbol, e.g. 'iv'.
            piece = one + ROMAN_FIVES[place]
        elif digit >= 5:
            piece = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            piece = one * digit
        pieces.append(piece)
        place += 1

    return ''.join(reversed(pieces))
def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    value: the number to format; asserted to be greater than zero.
    """
    assert value > 0

    alphabet = string.ascii_lowercase
    label = ''
    while value != 0:
        # Subtract one first: there is no "zero" letter in this numbering.
        value, offset = divmod(value - 1, len(alphabet))
        # Digits come out least significant first, so prepend each letter.
        label = alphabet[offset] + label
    return label

Binary file not shown.

View File

@ -1,9 +1,11 @@
import itertools
from nose.tools import assert_equal, raises
from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound
from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
class TestPdfDocument(object):
@ -25,3 +27,21 @@ class TestPdfDocument(object):
doc = PDFDocument(parser)
assert_equal(doc.info,
[{'Producer': b'European Patent Office'}])
def test_page_labels(self):
    """The first Count labels of pagelabels.pdf match the expected list."""
    expected = ['iii', 'iv', '1', '2', '1']
    path = absolute_sample_path('contrib/pagelabels.pdf')
    with open(path, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        pages_dict = dict_value(doc.catalog['Pages'])
        total_pages = int_value(pages_dict['Count'])
        # get_page_labels() is unbounded, so cut it off at the page count.
        labels = itertools.islice(doc.get_page_labels(), total_pages)
        assert_equal(list(labels), expected)
@raises(PDFNoPageLabels)
def test_no_page_labels(self):
    """get_page_labels() on simple1.pdf must raise PDFNoPageLabels."""
    with open(absolute_sample_path('simple1.pdf'), 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        doc.get_page_labels()

18
tests/test_pdfpage.py Normal file
View File

@ -0,0 +1,18 @@
from nose.tools import assert_equal
from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
class TestPdfPage(object):
    """Tests for PDFPage."""

    def test_page_labels(self):
        """Every page created from pagelabels.pdf carries its page label."""
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Materialise inside the with-block, while the file is open.
            actual_labels = [
                page.label for page in PDFPage.create_pages(doc)
            ]

        # Compare whole lists so that missing or extra pages fail the test
        # instead of being silently skipped.
        assert_equal(actual_labels, expected_labels)

View File

@ -3,7 +3,8 @@ import pathlib
from helpers import absolute_sample_path
from pdfminer.layout import LTComponent
from pdfminer.utils import open_filename, Plane, shorten_str
from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
Plane, shorten_str)
class TestOpenFilename:
@ -76,3 +77,34 @@ class TestFunctions(object):
def test_shorten_to_really_short(self):
    """shorten_str('Hello World', 5) is expected to return 'Hello'."""
    assert_equal('Hello', shorten_str('Hello World', 5))
def test_format_int_alpha(self):
    """format_int_alpha maps 1.. to a-z, then aa-zz, then aaa, ..."""
    cases = [
        ('a', 1),
        ('b', 2),
        ('z', 26),
        ('aa', 27),
        ('ab', 28),
        ('az', 26*2),
        ('ba', 26*2 + 1),
        ('zz', 26*27),
        ('aaa', 26*27 + 1),
    ]
    for expected, value in cases:
        assert_equal(expected, format_int_alpha(value))
def test_format_int_roman(self):
assert_equal('i', format_int_roman(1))
assert_equal('ii', format_int_roman(2))
assert_equal('iii', format_int_roman(3))
assert_equal('iv', format_int_roman(4))
assert_equal('v', format_int_roman(5))
assert_equal('vi', format_int_roman(6))
assert_equal('vii', format_int_roman(7))
assert_equal('viii', format_int_roman(8))
assert_equal('ix', format_int_roman(9))
assert_equal('x', format_int_roman(10))
assert_equal('xi', format_int_roman(11))
assert_equal('xx', format_int_roman(20))
assert_equal('xl', format_int_roman(40))
assert_equal('xlv', format_int_roman(45))
assert_equal('l', format_int_roman(50))
assert_equal('xc', format_int_roman(90))
assert_equal('xci', format_int_roman(91))
assert_equal('c', format_int_roman(100))