Added feature: page labels (#680)

* port page label code from pdfannots

* add tests and clean up

* more cleanup; harden against non-conforming input

* one more test

* update CHANGELOG

* cleanup & respond to review feedback (incomplete)

* Refactor implementation of get_page_labels() into a NumberTree and PageLabels class.

* PageLabels *is* a NumberTree and should always behave like one. This justifies inheriting its data and behavior. And it simplifies the code a bit more.

* fix type errors and cleanup slightly

 * fix mypy errors (including tweaking code to avoid problematic dynamic types)
 * hoist dict_value from NumberTree (where it may not be a dict) to PageLabels (where it must be)
 * avoid repeated warnings by calling _parse() recursively, and checking sortedness only at the end

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/704/head
Andrew Baumann 2022-02-01 01:08:05 -08:00 committed by GitHub
parent b19f9e7270
commit 1d1602e0c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 272 additions and 8 deletions

View File

@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added ### Added
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679)) - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626)) - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
### Fixed ### Fixed
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637)) - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))

View File

@ -0,0 +1,53 @@
import functools
from typing import Any, Dict, Iterable, List, Optional, Tuple
from pdfminer import settings
from pdfminer.pdfparser import PDFSyntaxError
from pdfminer.pdftypes import list_value, int_value, dict_value
from pdfminer.utils import choplist
class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.

    A node may carry 'Nums' (flat key/value pairs), 'Kids' (child nodes)
    and 'Limits'; whichever of these entries are present are exposed as
    the ``nums``, ``kids`` and ``limits`` attributes (``None`` otherwise).
    """

    def __init__(self, obj: Any):
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None
        # Per-instance memo for the ``values`` property. Caching on the
        # instance (rather than with functools.lru_cache on the method)
        # avoids keeping every tree alive in a module-level cache and works
        # on Python versions that reject the bare ``@lru_cache`` form.
        self._values: Optional[List[Tuple[int, Any]]] = None

        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Collect the (key, value) pairs of this node and all descendants.

        Pairs are returned in document order; no ordering check or sorting
        happens here, so that it is done only once, in ``values``.
        """
        items: List[Tuple[int, Any]] = []
        if self.nums:  # Leaf node
            for k, v in choplist(2, self.nums):
                items.append((int_value(k), v))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                items += NumberTree(child_ref)._parse()

        return items

    @property
    def values(self) -> List[Tuple[int, Any]]:
        """All (key, value) pairs of the tree, ordered by key.

        In strict mode, raises PDFSyntaxError if the keys found in the
        document are out of order; otherwise the pairs are sorted by key.
        The list is computed on first access and the same list object is
        returned on later accesses. A failed strict check is not cached.
        """
        if self._values is None:
            values = self._parse()

            if settings.STRICT:
                if not all(a[0] <= b[0]
                           for a, b in zip(values, values[1:])):
                    raise PDFSyntaxError(
                        'Number tree elements are out of order')
            else:
                values.sort(key=lambda t: t[0])

            self._values = values

        return self._values

View File

@ -1,3 +1,4 @@
import itertools
import logging import logging
import re import re
import struct import struct
@ -10,12 +11,14 @@ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from . import settings from . import settings
from .arcfour import Arcfour from .arcfour import Arcfour
from .data_structures import NumberTree
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \ from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \ PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
uint_value, dict_value, stream_value uint_value, dict_value, stream_value
from .psparser import PSEOF, literal_name, LIT, KWD from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text from .utils import choplist, decode_text, nunpack, format_int_roman, \
format_int_alpha
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
pass pass
class PDFNoPageLabels(PDFException):
pass
class PDFDestinationNotFound(PDFException): class PDFDestinationNotFound(PDFException):
pass pass
@ -890,6 +897,24 @@ class PDFDocument:
return return
return search(self.catalog['Outlines'], 0) return search(self.catalog['Outlines'], 0)
def get_page_labels(self) -> Iterator[str]:
    """
    Generate page label strings for the PDF document.

    When the catalog has a usable 'PageLabels' entry, the returned
    iterator produces one label string per page and never terminates on
    its own, so callers must bound it themselves (e.g. by page count).

    Raises PDFNoPageLabels when the catalog has no 'PageLabels' entry or
    the entry cannot be used to build the label tree.
    """
    assert self.catalog is not None

    try:
        label_tree = PageLabels(self.catalog['PageLabels'])
    except (PDFTypeError, KeyError):
        raise PDFNoPageLabels
    else:
        return label_tree.labels
def lookup_name( def lookup_name(
self, self,
cat: str, cat: str,
@ -989,3 +1014,61 @@ class PDFDocument:
pos = int_value(trailer['Prev']) pos = int_value(trailer['Prev'])
self.read_xref_from(parser, pos, xrefs) self.read_xref_from(parser, pos, xrefs)
return return
class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate the label string of each page, starting at page index 0.

        Each entry of the number tree starts a labelling range at its page
        index; the last range continues indefinitely, so the iteration is
        unbounded. Because this is a generator, any PDFSyntaxError (strict
        mode only) is raised when iteration starts, not on attribute access.
        """
        ranges = self.values

        # The tree must begin with page index 0
        if len(ranges) == 0 or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            else:
                # Try to cope, by assuming empty labels for the initial pages.
                # Build a new list instead of inserting in place: the list
                # returned by self.values is cached and must keep reflecting
                # what is actually in the document.
                ranges = [(0, {}), *ranges]

        for (next_index, (start, label_dict_unchecked)) \
                in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))

            if next_index == len(ranges):
                # This is the last specified range. It continues until the end
                # of the document.
                numbers: Iterable[int] = itertools.count(first_value)
            else:
                end, _ = ranges[next_index]
                range_length = end - start
                numbers = range(first_value, first_value + range_length)

            for value in numbers:
                label = self._format_page_label(value, style)
                yield prefix + label

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.

        A missing style (None) yields an empty string, so that only the
        prefix is shown. An unrecognised style is logged as a warning and
        also yields an empty string.
        """
        if style is None:
            label = ''
        elif style is LIT('D'):   # Decimal arabic numerals
            label = str(value)
        elif style is LIT('R'):   # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT('r'):   # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT('A'):   # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT('a'):   # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning('Unknown page label style: %r', style)
            label = ''
        return label

View File

@ -1,9 +1,11 @@
import itertools
import logging import logging
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
from pdfminer.utils import Rect from pdfminer.utils import Rect
from . import settings from . import settings
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
PDFNoPageLabels
from .pdfparser import PDFParser from .pdfparser import PDFParser
from .pdftypes import PDFObjectNotFound from .pdftypes import PDFObjectNotFound
from .pdftypes import dict_value from .pdftypes import dict_value
@ -38,23 +40,27 @@ class PDFPage:
rotate: the page rotation (in degree). rotate: the page rotation (in degree).
annots: the page annotations. annots: the page annotations.
beads: a chain that represents natural reading order. beads: a chain that represents natural reading order.
label: the page's label (typically, the logical page number).
""" """
def __init__( def __init__(
self, self,
doc: PDFDocument, doc: PDFDocument,
pageid: object, pageid: object,
attrs: object attrs: object,
label: Optional[str]
) -> None: ) -> None:
"""Initialize a page object. """Initialize a page object.
doc: a PDFDocument object. doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page. pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes. attrs: a dictionary of page attributes.
label: page label string.
""" """
self.doc = doc self.doc = doc
self.pageid = pageid self.pageid = pageid
self.attrs = dict_value(attrs) self.attrs = dict_value(attrs)
self.label = label
self.lastmod = resolve1(self.attrs.get('LastModified')) self.lastmod = resolve1(self.attrs.get('LastModified'))
self.resources: Dict[object, object] = \ self.resources: Dict[object, object] = \
resolve1(self.attrs.get('Resources', dict())) resolve1(self.attrs.get('Resources', dict()))
@ -109,11 +115,17 @@ class PDFPage:
elif tree_type is LITERAL_PAGE: elif tree_type is LITERAL_PAGE:
log.info('Page: %r', tree) log.info('Page: %r', tree)
yield (objid, tree) yield (objid, tree)
try:
page_labels: Iterator[Optional[str]] = document.get_page_labels()
except PDFNoPageLabels:
page_labels = itertools.repeat(None)
pages = False pages = False
if 'Pages' in document.catalog: if 'Pages' in document.catalog:
objects = search(document.catalog['Pages'], document.catalog) objects = search(document.catalog['Pages'], document.catalog)
for (objid, tree) in objects: for (objid, tree) in objects:
yield cls(document, objid, tree) yield cls(document, objid, tree, next(page_labels))
pages = True pages = True
if not pages: if not pages:
# fallback when /Pages is missing. # fallback when /Pages is missing.
@ -123,7 +135,7 @@ class PDFPage:
obj = document.getobj(objid) obj = document.getobj(objid)
if isinstance(obj, dict) \ if isinstance(obj, dict) \
and obj.get('Type') is LITERAL_PAGE: and obj.get('Type') is LITERAL_PAGE:
yield cls(document, objid, obj) yield cls(document, objid, obj, next(page_labels))
except PDFObjectNotFound: except PDFObjectNotFound:
pass pass
return return

View File

@ -3,6 +3,7 @@ Miscellaneous Routines.
""" """
import io import io
import pathlib import pathlib
import string
import struct import struct
from html import escape from html import escape
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator, from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
@ -527,3 +528,47 @@ class Plane(Generic[LTComponentT]):
or y1 <= obj.y0: or y1 <= obj.y0:
continue continue
yield obj yield obj
# Numeral letters per decimal place (units, tens, hundreds, thousands):
# ROMAN_ONES holds the "1" symbol of each place, ROMAN_FIVES the "5" symbol.
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    Only values from 1 to 3999 are accepted (checked with an assertion).
    """
    assert 0 < value < 4000
    groups: List[str] = []
    place = 0
    remaining = value

    # Walk the decimal digits from least to most significant, producing one
    # group of letters per digit.
    while remaining != 0:
        remaining, digit = divmod(remaining, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            # Subtractive form using the "1" symbol of the next place.
            group = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            # Subtractive form using this place's "5" symbol.
            group = one + ROMAN_FIVES[place]
        elif digit >= 5:
            group = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            group = one * digit
        groups.append(group)
        place += 1

    # Groups were collected least-significant first.
    return ''.join(reversed(groups))
def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    The value must be positive (checked with an assertion); 1 maps to 'a',
    26 to 'z', 27 to 'aa', and so on.
    """
    assert value > 0
    alphabet = string.ascii_lowercase
    base = len(alphabet)
    letters: List[str] = []
    remaining = value

    # There is no zero digit in this numbering, hence the "- 1" before
    # each division.
    while remaining != 0:
        remaining, offset = divmod(remaining - 1, base)
        letters.append(alphabet[offset])

    # Letters were produced least-significant first.
    return ''.join(reversed(letters))

Binary file not shown.

View File

@ -1,9 +1,11 @@
import itertools
from nose.tools import assert_equal, raises from nose.tools import assert_equal, raises
from helpers import absolute_sample_path from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjectNotFound from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
class TestPdfDocument(object): class TestPdfDocument(object):
@ -25,3 +27,21 @@ class TestPdfDocument(object):
doc = PDFDocument(parser) doc = PDFDocument(parser)
assert_equal(doc.info, assert_equal(doc.info,
[{'Producer': b'European Patent Office'}]) [{'Producer': b'European Patent Office'}])
def test_page_labels(self):
    """The sample document yields the expected label for each page."""
    sample = absolute_sample_path('contrib/pagelabels.pdf')
    with open(sample, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        pages_root = dict_value(doc.catalog['Pages'])
        total_pages = int_value(pages_root['Count'])
        # get_page_labels() is unbounded, so cut it off at the page count.
        labels = itertools.islice(doc.get_page_labels(), total_pages)
        assert_equal(list(labels), ['iii', 'iv', '1', '2', '1'])
@raises(PDFNoPageLabels)
def test_no_page_labels(self):
    """Asking for labels of a document without any raises PDFNoPageLabels."""
    sample = absolute_sample_path('simple1.pdf')
    with open(sample, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        doc.get_page_labels()

18
tests/test_pdfpage.py Normal file
View File

@ -0,0 +1,18 @@
from nose.tools import assert_equal
from helpers import absolute_sample_path
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
class TestPdfPage(object):
    """Tests for PDFPage."""

    def test_page_labels(self):
        """Each page created from the sample document carries its label."""
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Compare the complete list, so that the test also fails when
            # fewer (or no) pages are produced instead of passing vacuously.
            actual_labels = [page.label
                             for page in PDFPage.create_pages(doc)]
            assert_equal(actual_labels, expected_labels)

View File

@ -3,7 +3,8 @@ import pathlib
from helpers import absolute_sample_path from helpers import absolute_sample_path
from pdfminer.layout import LTComponent from pdfminer.layout import LTComponent
from pdfminer.utils import open_filename, Plane, shorten_str from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
Plane, shorten_str)
class TestOpenFilename: class TestOpenFilename:
@ -76,3 +77,34 @@ class TestFunctions(object):
def test_shorten_to_really_short(self): def test_shorten_to_really_short(self):
assert_equal('Hello', shorten_str('Hello World', 5)) assert_equal('Hello', shorten_str('Hello World', 5))
def test_format_int_alpha(self):
    """format_int_alpha maps 1.. onto a-z, aa-zz, aaa, ..."""
    cases = [
        ('a', 1),
        ('b', 2),
        ('z', 26),
        ('aa', 27),
        ('ab', 28),
        ('az', 26*2),
        ('ba', 26*2 + 1),
        ('zz', 26*27),
        ('aaa', 26*27 + 1),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_alpha(number))
def test_format_int_roman(self):
assert_equal('i', format_int_roman(1))
assert_equal('ii', format_int_roman(2))
assert_equal('iii', format_int_roman(3))
assert_equal('iv', format_int_roman(4))
assert_equal('v', format_int_roman(5))
assert_equal('vi', format_int_roman(6))
assert_equal('vii', format_int_roman(7))
assert_equal('viii', format_int_roman(8))
assert_equal('ix', format_int_roman(9))
assert_equal('x', format_int_roman(10))
assert_equal('xi', format_int_roman(11))
assert_equal('xx', format_int_roman(20))
assert_equal('xl', format_int_roman(40))
assert_equal('xlv', format_int_roman(45))
assert_equal('l', format_int_roman(50))
assert_equal('xc', format_int_roman(90))
assert_equal('xci', format_int_roman(91))
assert_equal('c', format_int_roman(100))