Added feature: page labels (#680)

* port page label code from pdfannots * add tests and clean up * more cleanup; harden against non-conforming input * one more test * update CHANGELOG * cleanup & respond to review feedback (incomplete) * Refactor implementation of get_page_labels() into a NumberTree and PageLabels class. * PageLabels *is* a NumberTree and should always behave like one. This justifies inheriting its data and behavior. And it simplifies the code a bit more. * fix type errors and cleanup slightly * fix mypy errors (including tweaking code to avoid problematic dynamic types) * hoist dict_value from NumberTree (where it may not be a dict) to PageLabels (where it must be) * avoid repeated warnings by calling _parse() recursively, and checking sortedness only at the end Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2022-02-01 01:08:05 -08:00 · 2022-02-01 01:08:05 -08:00 · 1d1602e0c5
parent b19f9e7270
commit 1d1602e0c5
9 changed files with 272 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
 - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
+- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))

 ### Fixed
 - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
--- a/pdfminer/data_structures.py
+++ b/pdfminer/data_structures.py
@ -0,0 +1,53 @@
+import functools
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from pdfminer import settings
+from pdfminer.pdfparser import PDFSyntaxError
+from pdfminer.pdftypes import list_value, int_value, dict_value
+from pdfminer.utils import choplist
+
+
+class NumberTree:
+    """A PDF number tree.
+
+    A node may list its entries directly under ``Nums`` (read pairwise as
+    key, value) and/or refer to child nodes under ``Kids``. The ``values``
+    property flattens the whole tree into a list of (key, value) pairs.
+
+    See Section 3.8.6 of the PDF Reference.
+    """
+
+    def __init__(self, obj: Any):
+        """Read the ``Nums``, ``Kids`` and ``Limits`` entries of a node.
+
+        obj: the node dictionary (passed through dict_value()).
+        """
+        self._obj = dict_value(obj)
+        self.nums: Optional[Iterable[Any]] = None
+        self.kids: Optional[Iterable[Any]] = None
+        self.limits: Optional[Iterable[Any]] = None
+        # Per-instance cache backing the ``values`` property. Keeping it on
+        # the instance means the cache dies together with the tree.
+        self._values: Optional[List[Tuple[int, Any]]] = None
+
+        if 'Nums' in self._obj:
+            self.nums = list_value(self._obj['Nums'])
+        if 'Kids' in self._obj:
+            self.kids = list_value(self._obj['Kids'])
+        if 'Limits' in self._obj:
+            self.limits = list_value(self._obj['Limits'])
+
+    def _parse(self) -> List[Tuple[int, Any]]:
+        """Collect the (key, value) pairs of this node and its descendants.
+
+        Pairs are returned in the order they are encountered; no sorting
+        or order validation happens here (see ``values``).
+        """
+        items: List[Tuple[int, Any]] = []
+        if self.nums:  # Leaf node
+            for key, value in choplist(2, self.nums):
+                items.append((int_value(key), value))
+
+        if self.kids:  # Root or intermediate node
+            for child_ref in self.kids:
+                items += NumberTree(child_ref)._parse()
+
+        return items
+
+    @property
+    def values(self) -> List[Tuple[int, Any]]:
+        """The (key, value) pairs of the whole tree, ordered by key.
+
+        The tree is parsed on first access only; afterwards the same list
+        object is returned on every access.
+
+        If settings.STRICT is set, keys that are out of order raise
+        PDFSyntaxError (and nothing is cached). Otherwise the pairs are
+        sorted by key.
+        """
+        if self._values is None:
+            values = self._parse()
+
+            if settings.STRICT:
+                in_order = all(
+                    a[0] <= b[0] for a, b in zip(values, values[1:]))
+                if not in_order:
+                    raise PDFSyntaxError(
+                        'Number tree elements are out of order')
+            else:
+                values.sort(key=lambda t: t[0])
+
+            # Only a successfully checked/sorted result is remembered.
+            self._values = values
+
+        return self._values
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 import struct
@ -10,12 +11,14 @@ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

 from . import settings
 from .arcfour import Arcfour
+from .data_structures import NumberTree
 from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
 from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
    PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
    uint_value, dict_value, stream_value
 from .psparser import PSEOF, literal_name, LIT, KWD
-from .utils import choplist, nunpack, decode_text
+from .utils import choplist, decode_text, nunpack, format_int_roman, \
+    format_int_alpha

 log = logging.getLogger(__name__)

@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
    pass


+class PDFNoPageLabels(PDFException):
+    """Raised by PDFDocument.get_page_labels() when no labels are found."""
+
+
 class PDFDestinationNotFound(PDFException):
    pass

@ -890,6 +897,24 @@ class PDFDocument:
            return
        return search(self.catalog['Outlines'], 0)

+    def get_page_labels(self) -> Iterator[str]:
+        """
+        Generate page label strings for the PDF document.
+
+        If the document includes page labels, generates strings, one per page.
+        If not, raises PDFNoPageLabels.
+
+        The resulting iteration is unbounded: the last label range keeps
+        counting, so callers must stop consuming it themselves (for
+        example by pairing it with the pages or using itertools.islice).
+
+        The labels are produced lazily. Problems inside the label data
+        itself are therefore only reported while iterating, not by this
+        call.
+        """
+        assert self.catalog is not None
+
+        try:
+            page_labels = PageLabels(self.catalog['PageLabels'])
+        except (PDFTypeError, KeyError) as err:
+            # Keep the original error attached as the explicit cause.
+            raise PDFNoPageLabels from err
+
+        return page_labels.labels
+
    def lookup_name(
        self,
        cat: str,
@ -989,3 +1014,61 @@ class PDFDocument:
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs)
        return
+
+
+class PageLabels(NumberTree):
+    """PageLabels from the document catalog.
+
+    The number tree maps the index of the first page of a range to a
+    label dictionary with the optional entries ``S`` (numbering style),
+    ``P`` (label prefix) and ``St`` (first numeric value, default 1).
+
+    See Section 8.3.1 in the PDF Reference.
+    """
+
+    @property
+    def labels(self) -> Iterator[str]:
+        """Generate one label string per page, starting at page index 0.
+
+        This is a generator, so nothing is evaluated (or raised) before
+        the first item is requested. The iteration never ends on its own:
+        the last range continues counting indefinitely.
+
+        Raises PDFSyntaxError, if settings.STRICT is set and the tree has
+        no entry for page index 0.
+        """
+        ranges = self.values
+
+        # The tree must begin with page index 0
+        if not ranges or ranges[0][0] != 0:
+            if settings.STRICT:
+                raise PDFSyntaxError('PageLabels is missing page index 0')
+            # Try to cope, by assuming empty labels for the initial pages.
+            # A new list is built on purpose: the list handed out by
+            # ``values`` is shared, and must not gain a made-up entry.
+            ranges = [(0, {}), *ranges]
+
+        last_index = len(ranges) - 1
+        for index, (start, label_dict_unchecked) in enumerate(ranges):
+            label_dict = dict_value(label_dict_unchecked)
+            style = label_dict.get('S')
+            prefix = decode_text(str_value(label_dict.get('P', b'')))
+            first_value = int_value(label_dict.get('St', 1))
+
+            if index == last_index:
+                # This is the last specified range. It continues until the end
+                # of the document.
+                values: Iterable[int] = itertools.count(first_value)
+            else:
+                # The range ends where the next one starts.
+                end, _ = ranges[index + 1]
+                range_length = end - start
+                values = range(first_value, first_value + range_length)
+
+            for value in values:
+                yield prefix + self._format_page_label(value, style)
+
+    @staticmethod
+    def _format_page_label(value: int, style: Any) -> str:
+        """Format page label value in a specific style.
+
+        value: the numeric part of the label.
+        style: the ``S`` entry of the label dictionary, or None.
+
+        Returns an empty string when no style is given. An unknown style
+        is logged as a warning and also gives an empty string.
+        """
+        if style is None:
+            return ''
+        if style is LIT('D'):  # Decimal arabic numerals
+            return str(value)
+        if style is LIT('R'):  # Uppercase roman numerals
+            return format_int_roman(value).upper()
+        if style is LIT('r'):  # Lowercase roman numerals
+            return format_int_roman(value)
+        if style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
+            return format_int_alpha(value).upper()
+        if style is LIT('a'):  # Lowercase letters a-z, aa-zz...
+            return format_int_alpha(value)
+
+        log.warning('Unknown page label style: %r', style)
+        return ''
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@ -1,9 +1,11 @@
+import itertools
 import logging
 from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple

 from pdfminer.utils import Rect
 from . import settings
-from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
+from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
+    PDFNoPageLabels
 from .pdfparser import PDFParser
 from .pdftypes import PDFObjectNotFound
 from .pdftypes import dict_value
@ -38,23 +40,27 @@ class PDFPage:
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
+      label: the page's label (typically, the logical page number).
    """

    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
-        attrs: object
+        attrs: object,
+        label: Optional[str]
    ) -> None:
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
+        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
+        self.label = label
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources: Dict[object, object] = \
            resolve1(self.attrs.get('Resources', dict()))
@ -109,11 +115,17 @@ class PDFPage:
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)
+
+        try:
+            page_labels: Iterator[Optional[str]] = document.get_page_labels()
+        except PDFNoPageLabels:
+            page_labels = itertools.repeat(None)
+
        pages = False
        if 'Pages' in document.catalog:
            objects = search(document.catalog['Pages'], document.catalog)
            for (objid, tree) in objects:
-                yield cls(document, objid, tree)
+                yield cls(document, objid, tree, next(page_labels))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
@ -123,7 +135,7 @@ class PDFPage:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) \
                                and obj.get('Type') is LITERAL_PAGE:
-                            yield cls(document, objid, obj)
+                            yield cls(document, objid, obj, next(page_labels))
                    except PDFObjectNotFound:
                        pass
        return
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -3,6 +3,7 @@ Miscellaneous Routines.
 """
 import io
 import pathlib
+import string
 import struct
 from html import escape
 from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
@ -527,3 +528,47 @@ class Plane(Generic[LTComponentT]):
                        or y1 <= obj.y0:
                    continue
                yield obj
+
+
+# Roman symbols for 1, 10, 100, 1000 and for 5, 50, 500, indexed by
+# decimal place (0 = units, 1 = tens, ...).
+ROMAN_ONES = ['i', 'x', 'c', 'm']
+ROMAN_FIVES = ['v', 'l', 'd']
+
+
+def format_int_roman(value: int) -> str:
+    """Format a number as lowercase Roman numerals.
+
+    value: the number to format; it must satisfy 0 < value < 4000.
+
+    Returns the numeral as a string, for example 'xiv' for 14.
+    Raises ValueError if value is outside the supported range.
+    """
+    # An explicit check instead of an assert: it also holds when Python
+    # runs with -O.
+    if not 0 < value < 4000:
+        raise ValueError(
+            'Roman numerals require 0 < value < 4000, got %r' % (value,))
+
+    pieces: List[str] = []
+    place = 0
+
+    # Work through the decimal digits, least significant first.
+    while value != 0:
+        value, digit = divmod(value, 10)
+        one = ROMAN_ONES[place]
+        if digit == 9:
+            # One unit before the next decimal place, e.g. 'ix'.
+            piece = one + ROMAN_ONES[place + 1]
+        elif digit == 4:
+            # One unit before the five symbol, e.g. 'iv'.
+            piece = one + ROMAN_FIVES[place]
+        elif digit >= 5:
+            piece = ROMAN_FIVES[place] + one * (digit - 5)
+        else:
+            piece = one * digit
+        pieces.append(piece)
+        place += 1
+
+    # The pieces were collected units-first; the numeral reads the other way.
+    return ''.join(reversed(pieces))
+
+
+def format_int_alpha(value: int) -> str:
+    """Format a number as lowercase letters a-z, aa-zz, etc.
+
+    The sequence runs a, b, ..., z, aa, ab, ..., az, ba, ..., zz, aaa, ...
+
+    value: the number to format; it must be greater than zero.
+
+    Returns the letters as a string, for example 'ab' for 28.
+    Raises ValueError if value is not positive.
+    """
+    # NOTE(review): the PDF reference words the letter styles as "AA to ZZ
+    # for the next 26" pages, which may mean repeated letters (aa, bb, ...)
+    # rather than this sequence - confirm before relying on it for labels.
+
+    # An explicit check instead of an assert: it also holds when Python
+    # runs with -O, where a negative value would never reach zero below.
+    if value <= 0:
+        raise ValueError(
+            'Alphabetic labels require value > 0, got %r' % (value,))
+
+    alphabet = string.ascii_lowercase
+    letters: List[str] = []
+
+    # There is no zero digit in this numbering, hence the "value - 1".
+    while value != 0:
+        value, remainder = divmod(value - 1, len(alphabet))
+        letters.append(alphabet[remainder])
+
+    # Letters were collected last-first.
+    return ''.join(reversed(letters))
--- a/samples/contrib/pagelabels.pdf
+++ b/samples/contrib/pagelabels.pdf
--- a/tests/test_pdfdocument.py
+++ b/tests/test_pdfdocument.py
@ -1,9 +1,11 @@
+import itertools
+
 from nose.tools import assert_equal, raises

 from helpers import absolute_sample_path
-from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
 from pdfminer.pdfparser import PDFParser
-from pdfminer.pdftypes import PDFObjectNotFound
+from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value


 class TestPdfDocument(object):
@ -25,3 +27,21 @@ class TestPdfDocument(object):
            doc = PDFDocument(parser)
            assert_equal(doc.info,
                         [{'Producer': b'European Patent Office'}])
+
+    def test_page_labels(self):
+        """The sample document yields the expected label for every page."""
+        expected = ['iii', 'iv', '1', '2', '1']
+        path = absolute_sample_path('contrib/pagelabels.pdf')
+        with open(path, 'rb') as fp:
+            doc = PDFDocument(PDFParser(fp))
+            pages_dict = dict_value(doc.catalog['Pages'])
+            total_pages = int_value(pages_dict['Count'])
+            # The label iterator does not end on its own, so take exactly
+            # one label per page.
+            labels = itertools.islice(doc.get_page_labels(), total_pages)
+            assert_equal(list(labels), expected)
+
+    @raises(PDFNoPageLabels)
+    def test_no_page_labels(self):
+        """get_page_labels() raises PDFNoPageLabels for simple1.pdf."""
+        sample = absolute_sample_path('simple1.pdf')
+        with open(sample, 'rb') as fp:
+            doc = PDFDocument(PDFParser(fp))
+            doc.get_page_labels()
--- a/tests/test_pdfpage.py
+++ b/tests/test_pdfpage.py
@ -0,0 +1,18 @@
+from nose.tools import assert_equal
+
+from helpers import absolute_sample_path
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfpage import PDFPage
+
+
+class TestPdfPage(object):
+    """Tests for PDFPage."""
+
+    def test_page_labels(self):
+        """Every page created from the sample carries the expected label."""
+        path = absolute_sample_path('contrib/pagelabels.pdf')
+        expected_labels = ['iii', 'iv', '1', '2', '1']
+
+        with open(path, 'rb') as fp:
+            parser = PDFParser(fp)
+            doc = PDFDocument(parser)
+            # Compare complete lists: a per-page loop would pass without
+            # checking anything if fewer pages than expected were created.
+            labels = [page.label for page in PDFPage.create_pages(doc)]
+            assert_equal(labels, expected_labels)
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@ -3,7 +3,8 @@ import pathlib

 from helpers import absolute_sample_path
 from pdfminer.layout import LTComponent
-from pdfminer.utils import open_filename, Plane, shorten_str
+from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
+                            Plane, shorten_str)


 class TestOpenFilename:
@ -76,3 +77,34 @@ class TestFunctions(object):

    def test_shorten_to_really_short(self):
        assert_equal('Hello', shorten_str('Hello World', 5))
+
+    def test_format_int_alpha(self):
+        """Check format_int_alpha() around the one/two/three letter edges."""
+        cases = [
+            ('a', 1),
+            ('b', 2),
+            ('z', 26),
+            ('aa', 27),
+            ('ab', 28),
+            ('az', 26*2),
+            ('ba', 26*2 + 1),
+            ('zz', 26*27),
+            ('aaa', 26*27 + 1),
+        ]
+        for expected, value in cases:
+            assert_equal(expected, format_int_alpha(value))
+
+    def test_format_int_roman(self):
+        """Check format_int_roman() for a range of values from 1 to 100."""
+        cases = [
+            ('i', 1),
+            ('ii', 2),
+            ('iii', 3),
+            ('iv', 4),
+            ('v', 5),
+            ('vi', 6),
+            ('vii', 7),
+            ('viii', 8),
+            ('ix', 9),
+            ('x', 10),
+            ('xi', 11),
+            ('xx', 20),
+            ('xl', 40),
+            ('xlv', 45),
+            ('l', 50),
+            ('xc', 90),
+            ('xci', 91),
+            ('c', 100),
+        ]
+        for expected, value in cases:
+            assert_equal(expected, format_int_roman(value))