class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.

    A node may carry entries directly under ``Nums`` (a flat array of
    alternating integer keys and values) and/or point to child nodes under
    ``Kids``.  ``Limits`` is read and stored, but it is not consulted when
    the entries are collected.
    """

    def __init__(self, obj: Any):
        """Wrap ``obj``, the dictionary (or reference to one) of a node."""
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None
        # Per-instance memo for ``values``.  functools.lru_cache is avoided
        # on purpose: on a method it stores ``self`` in a cache shared by all
        # instances (keeping them alive), and its parenthesis-free form is
        # only accepted from Python 3.8 on.
        self._cached_values: Optional[List[Tuple[int, Any]]] = None

        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Collect the (key, value) pairs of this node and its descendants.

        Pairs are returned in document order; no sorting or validation is
        done here.
        """
        entries: List[Tuple[int, Any]] = []

        if self.nums:  # Leaf node
            for key, value in choplist(2, self.nums):
                entries.append((int_value(key), value))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                entries.extend(NumberTree(child_ref)._parse())

        return entries

    @property
    def values(self) -> List[Tuple[int, Any]]:
        """All (key, value) pairs of the tree, ordered by key.

        The tree is parsed on first access and the resulting list is reused
        on later accesses of the same instance.

        With ``settings.STRICT`` set, keys that are not in ascending order
        raise PDFSyntaxError; otherwise the pairs are sorted by key.
        """
        cached = self._cached_values
        if cached is None:
            cached = self._parse()

            if settings.STRICT:
                keys = [key for key, _ in cached]
                if any(a > b for a, b in zip(keys, keys[1:])):
                    raise PDFSyntaxError(
                        'Number tree elements are out of order')
            else:
                cached.sort(key=lambda pair: pair[0])

            # Only remember a result that passed validation.
            self._cached_values = cached

        return cached
class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.

    Every entry of the number tree maps the index of the first page of a
    labelling range to a dictionary read for ``S`` (numbering style), ``P``
    (label prefix) and ``St`` (numeric value of the first page in the range,
    1 when absent).
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate one label string per page, in page order.

        The last range has no upper bound, so the iteration never ends;
        callers take only as many labels as the document has pages.

        Because this is a generator, errors surface on the first ``next()``:
        with ``settings.STRICT`` set, PDFSyntaxError is raised when no range
        starts at page index 0.
        """
        # Copy before patching: ``values`` hands out the same cached list on
        # every access, so inserting into it directly would leak the
        # synthetic entry below to every other reader of the tree.
        ranges = list(self.values)

        # The tree must begin with page index 0
        if not ranges or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            # Try to cope, by assuming empty labels for the initial pages
            ranges.insert(0, (0, {}))

        # ``following`` is the position in ``ranges`` of the range that comes
        # after the current one (named so as not to shadow builtin ``next``).
        for following, (start, label_dict_unchecked) in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))

            if following == len(ranges):
                # This is the last specified range. It continues until the
                # end of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                end, _ = ranges[following]
                range_length = end - start
                values = range(first_value, first_value + range_length)

            for value in values:
                yield prefix + self._format_page_label(value, style)

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.

        Returns the empty string when ``style`` is None, when the style is
        not recognised, or when ``value`` cannot be expressed in the
        requested style; a warning is logged in the latter two cases.
        """
        if style is None:
            return ''

        if style is LIT('D'):  # Decimal arabic numerals
            return str(value)

        if style is LIT('R') or style is LIT('r'):  # Roman numerals
            # format_int_roman() only covers 1..3999.  A bad /St or a very
            # long range must not abort iteration over the pages.
            if not 0 < value < 4000:
                log.warning(
                    'Page label value %r cannot be written in roman numerals',
                    value)
                return ''
            roman = format_int_roman(value)
            return roman.upper() if style is LIT('R') else roman

        if style is LIT('A') or style is LIT('a'):  # Letters A-Z, AA-ZZ...
            # format_int_alpha() is only defined for positive values.
            if value < 1:
                log.warning(
                    'Page label value %r cannot be written in letters',
                    value)
                return ''
            alpha = format_int_alpha(value)
            return alpha.upper() if style is LIT('A') else alpha

        log.warning('Unknown page label style: %r', style)
        return ''
import settings -from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed +from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \ + PDFNoPageLabels from .pdfparser import PDFParser from .pdftypes import PDFObjectNotFound from .pdftypes import dict_value @@ -38,23 +40,27 @@ class PDFPage: rotate: the page rotation (in degree). annots: the page annotations. beads: a chain that represents natural reading order. + label: the page's label (typically, the logical page number). """ def __init__( self, doc: PDFDocument, pageid: object, - attrs: object + attrs: object, + label: Optional[str] ) -> None: """Initialize a page object. doc: a PDFDocument object. pageid: any Python object that can uniquely identify the page. attrs: a dictionary of page attributes. + label: page label string. """ self.doc = doc self.pageid = pageid self.attrs = dict_value(attrs) + self.label = label self.lastmod = resolve1(self.attrs.get('LastModified')) self.resources: Dict[object, object] = \ resolve1(self.attrs.get('Resources', dict())) @@ -109,11 +115,17 @@ class PDFPage: elif tree_type is LITERAL_PAGE: log.info('Page: %r', tree) yield (objid, tree) + + try: + page_labels: Iterator[Optional[str]] = document.get_page_labels() + except PDFNoPageLabels: + page_labels = itertools.repeat(None) + pages = False if 'Pages' in document.catalog: objects = search(document.catalog['Pages'], document.catalog) for (objid, tree) in objects: - yield cls(document, objid, tree) + yield cls(document, objid, tree, next(page_labels)) pages = True if not pages: # fallback when /Pages is missing. 
# Numerals for 1, 10, 100, 1000 and for 5, 50, 500, indexed by decimal place.
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    :param value: integer in the range 1..3999.
    :return: the numeral, e.g. ``'xlv'`` for 45.
    :raises ValueError: if ``value`` is outside 1..3999.  This is checked
        explicitly rather than with ``assert`` so that it still holds when
        Python runs with ``-O``.
    """
    if not 0 < value < 4000:
        raise ValueError(
            'Roman numerals are only supported for 1..3999, got %r' % value)

    # One numeral group per decimal digit, least significant digit first.
    groups: List[str] = []
    place = 0

    while value != 0:
        value, digit = divmod(value, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            # Subtractive form using the next power of ten: ix, xc, cm
            group = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            # Subtractive form using the "five" of this place: iv, xl, cd
            group = one + ROMAN_FIVES[place]
        elif digit >= 5:
            group = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            group = one * digit
        groups.append(group)
        place += 1

    groups.reverse()
    return ''.join(groups)


def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    The numbering is bijective base 26: 1 -> ``'a'``, 26 -> ``'z'``,
    27 -> ``'aa'``, 28 -> ``'ab'``, 702 -> ``'zz'``, 703 -> ``'aaa'``.

    NOTE(review): the PDF page-label styles ``A``/``a`` are described as
    "A to Z for the first 26 pages, AA to ZZ for the next 26", which some
    readers render as repeated letters (28 -> ``'bb'``).  Confirm which
    scheme is intended before relying on this for labels past 26.

    :param value: integer greater than zero.
    :raises ValueError: if ``value`` is not positive.  Without this check a
        non-positive value would never reach the loop's exit condition.
    """
    if value <= 0:
        raise ValueError(
            'Alphabetic numbering needs a positive value, got %r' % value)

    letters = string.ascii_lowercase
    result: List[str] = []

    while value != 0:
        # Shift to 0-based before dividing so that 26 maps to 'z', not 'a?'.
        value, remainder = divmod(value - 1, len(letters))
        result.append(letters[remainder])

    result.reverse()
    return ''.join(result)
class TestPdfPage(object):
    def test_page_labels(self):
        """Every page of the sample carries the expected label.

        The labels are collected into a list and compared as a whole, so the
        test also fails when too few or too many pages are produced instead
        of passing vacuously or stopping with an IndexError.
        """
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # create_pages() is lazy; consume it while the file is open.
            labels = [page.label for page in PDFPage.create_pages(doc)]

        assert_equal(labels, expected_labels)
def test_format_int_alpha(self):
    """format_int_alpha yields a-z, then aa, ab, ... zz, then aaa."""
    cases = [
        (1, 'a'),
        (2, 'b'),
        (26, 'z'),
        (27, 'aa'),
        (28, 'ab'),
        (26*2, 'az'),
        (26*2 + 1, 'ba'),
        (26*27, 'zz'),
        (26*27 + 1, 'aaa'),
    ]
    for number, expected in cases:
        assert_equal(expected, format_int_alpha(number))


def test_format_int_roman(self):
    """format_int_roman yields lowercase numerals, subtractive forms too."""
    cases = [
        (1, 'i'),
        (2, 'ii'),
        (3, 'iii'),
        (4, 'iv'),
        (5, 'v'),
        (6, 'vi'),
        (7, 'vii'),
        (8, 'viii'),
        (9, 'ix'),
        (10, 'x'),
        (11, 'xi'),
        (20, 'xx'),
        (40, 'xl'),
        (45, 'xlv'),
        (50, 'l'),
        (90, 'xc'),
        (91, 'xci'),
        (100, 'c'),
    ]
    for number, expected in cases:
        assert_equal(expected, format_int_roman(number))