Added feature: page labels (#680)

* port page label code from pdfannots * add tests and clean up * more cleanup; harden against non-conforming input * one more test * update CHANGELOG * cleanup & respond to review feedback (incomplete) * Refactor implementation of get_page_labels() into a NumberTree and PageLabels class. * PageLabels *is* a NumberTree and should always behave like one. This justifies inheriting its data and behavior. And it simplifies the code a bit more. * fix type errors and cleanup slightly * fix mypy errors (including tweaking code to avoid problematic dynamic types) * hoist dict_value from NumberTree (where it may not be a dict) to PageLabels (where it must be) * avoid repeated warnings by calling _parse() recursively, and checking sortedness only at the end Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2022-02-01 01:08:05 -08:00 · 2022-02-01 01:08:05 -08:00 · 1d1602e0c5
parent b19f9e7270
commit 1d1602e0c5
9 changed files with 272 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
 - Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
 - Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
 ### Fixed
 - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
--- a/pdfminer/data_structures.py
+++ b/pdfminer/data_structures.py
@ -0,0 +1,53 @@
 import functools
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 from pdfminer import settings
 from pdfminer.pdfparser import PDFSyntaxError
 from pdfminer.pdftypes import list_value, int_value, dict_value
 from pdfminer.utils import choplist
 class NumberTree:
    """A PDF number tree.
    See Section 3.8.6 of the PDF Reference.
    The node dictionary may hold 'Nums' (a flat list that is consumed two
    items at a time as key, value), 'Kids' (child nodes) and 'Limits'; each
    is read through list_value() when present.  The (key, value) pairs of
    the node and all of its descendants are exposed through ``values``.
    """
    def __init__(self, obj: Any):
        """Wrap a number tree node.
        obj: the node object; it is passed through dict_value().
        """
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None
        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])
    def _parse(self) -> List[Tuple[int, Any]]:
        """Collect the (key, value) pairs of this node and its descendants.
        Pairs are returned in the order they are encountered.  No sorting or
        order checking happens here; that is done once, for the whole tree,
        in ``values``.
        """
        pairs: List[Tuple[int, Any]] = []
        if self.nums:  # Leaf node
            for k, v in choplist(2, self.nums):
                pairs.append((int_value(k), v))
        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                pairs += NumberTree(child_ref)._parse()
        return pairs
    # cached_property stores the result on the instance itself.  Stacking
    # @property on @functools.lru_cache instead would key a module-global
    # cache on ``self`` and keep every tree alive for as long as it stays in
    # that cache.
    @functools.cached_property
    def values(self) -> List[Tuple[int, Any]]:
        """The (key, value) pairs of the whole tree, ordered by key.
        The list is computed on first access and reused afterwards, so
        callers should treat it as shared and avoid mutating it.
        Raises PDFSyntaxError if settings.STRICT is set and the keys are not
        in ascending order; when STRICT is not set the pairs are sorted by
        key instead.
        """
        values = self._parse()
        if settings.STRICT:
            if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
                raise PDFSyntaxError('Number tree elements are out of order')
        else:
            values.sort(key=lambda t: t[0])
        return values
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@ -1,3 +1,4 @@
 import itertools
 import logging
 import re
 import struct
@ -10,12 +11,14 @@ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
 from . import settings
 from .arcfour import Arcfour
 from .data_structures import NumberTree
 from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
 from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
    PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
    uint_value, dict_value, stream_value
 from .psparser import PSEOF, literal_name, LIT, KWD
-from .utils import choplist, nunpack, decode_text
+from .utils import choplist, decode_text, nunpack, format_int_roman, \
    format_int_alpha
 log = logging.getLogger(__name__)
@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
    pass
 class PDFNoPageLabels(PDFException):
    """Raised by get_page_labels() when no page labels can be read."""
 class PDFDestinationNotFound(PDFException):
    pass
@ -890,6 +897,24 @@ class PDFDocument:
            return
        return search(self.catalog['Outlines'], 0)
    def get_page_labels(self) -> Iterator[str]:
        """
        Return an iterator over the document's page label strings.
        The labels come from the 'PageLabels' entry of the catalog, one
        string per page.  The iterator never terminates on its own, so the
        caller has to stop consuming it (e.g. with itertools.islice).
        Raises PDFNoPageLabels if the catalog has no 'PageLabels' entry or
        building the label tree from it raises PDFTypeError.
        """
        assert self.catalog is not None
        try:
            label_tree = PageLabels(self.catalog['PageLabels'])
        except (PDFTypeError, KeyError):
            raise PDFNoPageLabels
        else:
            return label_tree.labels
    def lookup_name(
        self,
        cat: str,
@ -989,3 +1014,61 @@ class PDFDocument:
            pos = int_value(trailer['Prev'])
            self.read_xref_from(parser, pos, xrefs)
        return
 class PageLabels(NumberTree):
    """PageLabels from the document catalog.
    See Section 8.3.1 in the PDF Reference.
    Each entry of the number tree maps the index of the first page of a
    range to a label dictionary with optional 'S' (style), 'P' (prefix) and
    'St' (first value) entries.
    """
    @property
    def labels(self) -> Iterator[str]:
        """Generate the label string of every page, in page order.
        The last range is open-ended, so the iteration is unbounded.
        Raises PDFSyntaxError (once iteration starts) if settings.STRICT is
        set and the tree does not begin with page index 0.
        """
        ranges = self.values
        # The tree must begin with page index 0
        if len(ranges) == 0 or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            # Try to cope, by assuming empty labels for the initial pages.
            # Build a new list rather than inserting in place: self.values
            # is cached and shared, and must not be modified from here.
            ranges = [(0, {}), *ranges]
        for (next_index, (start, label_dict_unchecked)) \
                in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))
            if next_index == len(ranges):
                # This is the last specified range. It continues until the end
                # of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                # The range stops where the following one starts.
                end, _ = ranges[next_index]
                range_length = end - start
                values = range(first_value, first_value + range_length)
            for value in values:
                label = self._format_page_label(value, style)
                yield prefix + label
    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.
        value: the numeric value of the page within its range.
        style: the 'S' entry of the label dictionary, or None.
        Returns the formatted number; an empty string when style is None or
        not recognised (the latter is also logged as a warning).
        """
        if style is None:
            label = ''
        elif style is LIT('D'):  # Decimal arabic numerals
            label = str(value)
        elif style is LIT('R'):  # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT('r'):  # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT('a'):  # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning('Unknown page label style: %r', style)
            label = ''
        return label
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@ -1,9 +1,11 @@
 import itertools
 import logging
 from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
 from pdfminer.utils import Rect
 from . import settings
-from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
+from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
    PDFNoPageLabels
 from .pdfparser import PDFParser
 from .pdftypes import PDFObjectNotFound
 from .pdftypes import dict_value
@ -38,23 +40,27 @@ class PDFPage:
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
      label: the page's label (typically, the logical page number).
    """
    def __init__(
        self,
        doc: PDFDocument,
        pageid: object,
-        attrs: object
+        attrs: object,
        label: Optional[str]
    ) -> None:
        """Initialize a page object.
        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        label: page label string.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.label = label
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources: Dict[object, object] = \
            resolve1(self.attrs.get('Resources', dict()))
@ -109,11 +115,17 @@ class PDFPage:
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)
        try:
            page_labels: Iterator[Optional[str]] = document.get_page_labels()
        except PDFNoPageLabels:
            page_labels = itertools.repeat(None)
        pages = False
        if 'Pages' in document.catalog:
            objects = search(document.catalog['Pages'], document.catalog)
            for (objid, tree) in objects:
-                yield cls(document, objid, tree)
+                yield cls(document, objid, tree, next(page_labels))
                pages = True
        if not pages:
            # fallback when /Pages is missing.
@ -123,7 +135,7 @@ class PDFPage:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) \
                                and obj.get('Type') is LITERAL_PAGE:
-                            yield cls(document, objid, obj)
+                            yield cls(document, objid, obj, next(page_labels))
                    except PDFObjectNotFound:
                        pass
        return
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -3,6 +3,7 @@ Miscellaneous Routines.
 """
 import io
 import pathlib
 import string
 import struct
 from html import escape
 from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
@ -527,3 +528,47 @@ class Plane(Generic[LTComponentT]):
                        or y1 <= obj.y0:
                    continue
                yield obj
 # Roman symbols for 1, 10, 100, 1000 and for 5, 50, 500, indexed by
 # decimal place (0 = units).
 ROMAN_ONES = ['i', 'x', 'c', 'm']
 ROMAN_FIVES = ['v', 'l', 'd']
 def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.
    value must satisfy 0 < value < 4000 (checked with an assertion).
    """
    assert 0 < value < 4000
    # One numeral group per decimal digit, least significant digit first.
    groups: List[str] = []
    place = 0
    while value != 0:
        value, digit = divmod(value, 10)
        if digit == 9:
            # One unit below the next power of ten, e.g. 'ix', 'xc', 'cm'.
            group = ROMAN_ONES[place] + ROMAN_ONES[place + 1]
        elif digit == 4:
            # One unit below the five symbol, e.g. 'iv', 'xl', 'cd'.
            group = ROMAN_ONES[place] + ROMAN_FIVES[place]
        elif digit >= 5:
            group = ROMAN_FIVES[place] + ROMAN_ONES[place] * (digit - 5)
        else:
            group = ROMAN_ONES[place] * digit
        groups.append(group)
        place += 1
    return ''.join(reversed(groups))
 def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.
    value must be positive (checked with an assertion).
    """
    assert value > 0
    alphabet = string.ascii_lowercase
    base = len(alphabet)
    letters = ''
    while value != 0:
        # Subtracting 1 first makes the digits run a..z (1..26) with no
        # zero digit.
        value, offset = divmod(value - 1, base)
        letters = alphabet[offset] + letters
    return letters
--- a/samples/contrib/pagelabels.pdf
+++ b/samples/contrib/pagelabels.pdf
--- a/tests/test_pdfdocument.py
+++ b/tests/test_pdfdocument.py
@ -1,9 +1,11 @@
 import itertools
 from nose.tools import assert_equal, raises
 from helpers import absolute_sample_path
-from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
 from pdfminer.pdfparser import PDFParser
-from pdfminer.pdftypes import PDFObjectNotFound
+from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
 class TestPdfDocument(object):
@ -25,3 +27,21 @@ class TestPdfDocument(object):
            doc = PDFDocument(parser)
            assert_equal(doc.info,
                         [{'Producer': b'European Patent Office'}])
    def test_page_labels(self):
        """Labels of the sample document match the expected sequence."""
        sample = absolute_sample_path('contrib/pagelabels.pdf')
        with open(sample, 'rb') as stream:
            document = PDFDocument(PDFParser(stream))
            pages_dict = dict_value(document.catalog['Pages'])
            page_count = int_value(pages_dict['Count'])
            # The label iterator is unbounded, so cut it at the page count.
            labels = itertools.islice(document.get_page_labels(), page_count)
            assert_equal(list(labels), ['iii', 'iv', '1', '2', '1'])
    @raises(PDFNoPageLabels)
    def test_no_page_labels(self):
        """get_page_labels() raises PDFNoPageLabels for simple1.pdf."""
        sample = absolute_sample_path('simple1.pdf')
        with open(sample, 'rb') as stream:
            document = PDFDocument(PDFParser(stream))
            document.get_page_labels()
--- a/tests/test_pdfpage.py
+++ b/tests/test_pdfpage.py
@ -0,0 +1,18 @@
 from nose.tools import assert_equal
 from helpers import absolute_sample_path
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfpage import PDFPage
 class TestPdfPage(object):
    """Tests for PDFPage."""
    def test_page_labels(self):
        """Each page created from the sample carries the expected label."""
        expected_labels = ['iii', 'iv', '1', '2', '1']
        sample = absolute_sample_path('contrib/pagelabels.pdf')
        with open(sample, 'rb') as stream:
            document = PDFDocument(PDFParser(stream))
            pages = PDFPage.create_pages(document)
            for (index, page) in enumerate(pages):
                assert_equal(page.label, expected_labels[index])
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@ -3,7 +3,8 @@ import pathlib
 from helpers import absolute_sample_path
 from pdfminer.layout import LTComponent
-from pdfminer.utils import open_filename, Plane, shorten_str
+from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
                            Plane, shorten_str)
 class TestOpenFilename:
@ -76,3 +77,34 @@ class TestFunctions(object):
    def test_shorten_to_really_short(self):
        assert_equal('Hello', shorten_str('Hello World', 5))
    def test_format_int_alpha(self):
        """format_int_alpha() yields a..z, aa..zz, aaa.. at the boundaries."""
        cases = [
            ('a', 1),
            ('b', 2),
            ('z', 26),
            ('aa', 27),
            ('ab', 28),
            ('az', 26*2),
            ('ba', 26*2 + 1),
            ('zz', 26*27),
            ('aaa', 26*27 + 1),
        ]
        for (expected, number) in cases:
            assert_equal(expected, format_int_alpha(number))
    def test_format_int_roman(self):
        assert_equal('i', format_int_roman(1))
        assert_equal('ii', format_int_roman(2))
        assert_equal('iii', format_int_roman(3))
        assert_equal('iv', format_int_roman(4))
        assert_equal('v', format_int_roman(5))
        assert_equal('vi', format_int_roman(6))
        assert_equal('vii', format_int_roman(7))
        assert_equal('viii', format_int_roman(8))
        assert_equal('ix', format_int_roman(9))
        assert_equal('x', format_int_roman(10))
        assert_equal('xi', format_int_roman(11))
        assert_equal('xx', format_int_roman(20))
        assert_equal('xl', format_int_roman(40))
        assert_equal('xlv', format_int_roman(45))
        assert_equal('l', format_int_roman(50))
        assert_equal('xc', format_int_roman(90))
        assert_equal('xci', format_int_roman(91))
        assert_equal('c', format_int_roman(100))