Added feature: page labels (#680)
* port page label code from pdfannots * add tests and clean up * more cleanup; harden against non-conforming input * one more test * update CHANGELOG * cleanup & respond to review feedback (incomplete) * Refactor implementation of get_page_labels() into a NumberTree and PageLabels class. * PageLabels *is* a NumberTree and should always behave like one. This justifies inheriting its data and behavior. And it simplifies the code a bit more. * fix type errors and cleanup slightly * fix mypy errors (including tweaking code to avoid problematic dynamic types) * hoist dict_value from NumberTree (where it may not be a dict) to PageLabels (where it must be) * avoid repeated warnings by calling _parse() recursively, and checking sortedness only at the end Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>pull/704/head
parent
b19f9e7270
commit
1d1602e0c5
|
@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
### Added
|
### Added
|
||||||
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
|
- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
|
||||||
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
- Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
|
||||||
|
- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
import functools
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
|
from pdfminer import settings
|
||||||
|
from pdfminer.pdfparser import PDFSyntaxError
|
||||||
|
from pdfminer.pdftypes import list_value, int_value, dict_value
|
||||||
|
from pdfminer.utils import choplist
|
||||||
|
|
||||||
|
|
||||||
|
class NumberTree:
    """A PDF number tree.

    See Section 3.8.6 of the PDF Reference.

    A node dictionary may contain 'Nums' (read here as a flat sequence of
    alternating keys and values), 'Kids' (child nodes, each parsed as
    another NumberTree) and 'Limits'.  The flattened (key, value) pairs of
    the whole subtree are exposed through the ``values`` property.
    """

    def __init__(self, obj: Any):
        """Wrap the number tree node ``obj``.

        ``obj`` is passed through dict_value(); the optional 'Nums', 'Kids'
        and 'Limits' entries are passed through list_value().
        """
        self._obj = dict_value(obj)
        self.nums: Optional[Iterable[Any]] = None
        self.kids: Optional[Iterable[Any]] = None
        self.limits: Optional[Iterable[Any]] = None
        # Per-instance memo for the ``values`` property.  A plain attribute
        # is used instead of functools.lru_cache on the method, because a
        # method-level cache is shared by all instances, keeps every
        # instance alive while its entry is cached, and the bare-decorator
        # form is unavailable before Python 3.8.
        self._values: Optional[List[Tuple[int, Any]]] = None

        if 'Nums' in self._obj:
            self.nums = list_value(self._obj['Nums'])
        if 'Kids' in self._obj:
            self.kids = list_value(self._obj['Kids'])
        if 'Limits' in self._obj:
            self.limits = list_value(self._obj['Limits'])

    def _parse(self) -> List[Tuple[int, Any]]:
        """Collect the (key, value) pairs of this node and all descendants.

        Pairs are returned in document order; no sorting or order checking
        happens here, so that it is done only once by ``values`` rather
        than once per recursion level.
        """
        items: List[Tuple[int, Any]] = []
        if self.nums:  # Leaf node
            for k, v in choplist(2, self.nums):
                items.append((int_value(k), v))

        if self.kids:  # Root or intermediate node
            for child_ref in self.kids:
                items += NumberTree(child_ref)._parse()

        return items

    @property
    def values(self) -> List[Tuple[int, Any]]:
        """The tree's (key, value) pairs, ordered by key.

        The list is computed on first access and the same list object is
        returned on later accesses.

        Raises PDFSyntaxError in strict mode (settings.STRICT) when the
        keys are not in non-decreasing order; otherwise the pairs are
        sorted by key.
        """
        if self._values is not None:
            return self._values

        values = self._parse()

        if settings.STRICT:
            if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
                raise PDFSyntaxError('Number tree elements are out of order')
        else:
            values.sort(key=lambda t: t[0])

        # Only a successfully validated result is remembered, so a strict
        # mode failure is raised again on the next access.
        self._values = values
        return values
|
|
@ -1,3 +1,4 @@
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
|
@ -10,12 +11,14 @@ from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
||||||
|
|
||||||
from . import settings
|
from . import settings
|
||||||
from .arcfour import Arcfour
|
from .arcfour import Arcfour
|
||||||
|
from .data_structures import NumberTree
|
||||||
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
|
from .pdfparser import PDFSyntaxError, PDFParser, PDFStreamParser
|
||||||
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
|
from .pdftypes import DecipherCallable, PDFException, PDFTypeError, PDFStream, \
|
||||||
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
|
||||||
uint_value, dict_value, stream_value
|
uint_value, dict_value, stream_value
|
||||||
from .psparser import PSEOF, literal_name, LIT, KWD
|
from .psparser import PSEOF, literal_name, LIT, KWD
|
||||||
from .utils import choplist, nunpack, decode_text
|
from .utils import choplist, decode_text, nunpack, format_int_roman, \
|
||||||
|
format_int_alpha
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -36,6 +39,10 @@ class PDFNoOutlines(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PDFNoPageLabels(PDFException):
    """Raised when a document has no usable /PageLabels catalog entry."""
|
||||||
|
|
||||||
|
|
||||||
class PDFDestinationNotFound(PDFException):
|
class PDFDestinationNotFound(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -890,6 +897,24 @@ class PDFDocument:
|
||||||
return
|
return
|
||||||
return search(self.catalog['Outlines'], 0)
|
return search(self.catalog['Outlines'], 0)
|
||||||
|
|
||||||
|
def get_page_labels(self) -> Iterator[str]:
    """
    Return an iterator over the page label strings of the document.

    The labels are taken from the catalog's 'PageLabels' entry and are
    produced one per page.  The iterator does not stop by itself, so the
    caller has to limit it to the number of pages.

    Raises PDFNoPageLabels when the catalog has no 'PageLabels' entry or
    the entry has the wrong type.
    """
    assert self.catalog is not None

    try:
        label_tree = PageLabels(self.catalog['PageLabels'])
    except (PDFTypeError, KeyError):
        raise PDFNoPageLabels
    else:
        return label_tree.labels
|
||||||
|
|
||||||
def lookup_name(
|
def lookup_name(
|
||||||
self,
|
self,
|
||||||
cat: str,
|
cat: str,
|
||||||
|
@ -989,3 +1014,61 @@ class PDFDocument:
|
||||||
pos = int_value(trailer['Prev'])
|
pos = int_value(trailer['Prev'])
|
||||||
self.read_xref_from(parser, pos, xrefs)
|
self.read_xref_from(parser, pos, xrefs)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
class PageLabels(NumberTree):
    """PageLabels from the document catalog.

    See Section 8.3.1 in the PDF Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate the label string of every page, in page order.

        Every entry of the number tree maps a starting page index to a
        label dictionary, from which the keys 'S' (numbering style),
        'P' (prefix) and 'St' (first number, default 1) are read.  The
        last range is continued with itertools.count(), so the iteration
        never ends on its own.

        This is a generator: the tree is read, and errors are raised, only
        once iteration starts.

        Raises PDFSyntaxError in strict mode (settings.STRICT) when the
        tree has no entry for page index 0.
        """
        # Work on a copy: the list returned by self.values is cached and
        # shared between accesses, and the fallback below inserts into it.
        ranges = list(self.values)

        # The tree must begin with page index 0
        if not ranges or ranges[0][0] != 0:
            if settings.STRICT:
                raise PDFSyntaxError('PageLabels is missing page index 0')
            # Try to cope, by assuming empty labels for the initial pages
            ranges.insert(0, (0, {}))

        for following, (start, label_dict_unchecked) in enumerate(ranges, 1):
            label_dict = dict_value(label_dict_unchecked)
            style = label_dict.get('S')
            prefix = decode_text(str_value(label_dict.get('P', b'')))
            first_value = int_value(label_dict.get('St', 1))

            if following == len(ranges):
                # This is the last specified range. It continues until the
                # end of the document.
                values: Iterable[int] = itertools.count(first_value)
            else:
                end, _ = ranges[following]
                range_length = end - start
                values = range(first_value, first_value + range_length)

            for value in values:
                label = self._format_page_label(value, style)
                yield prefix + label

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Format page label value in a specific style.

        ``style`` is the 'S' entry of a label dictionary.  None yields an
        empty string (the label then consists of the prefix only); an
        unrecognised style is logged as a warning and also yields an empty
        string.
        """
        if style is None:
            label = ''
        elif style is LIT('D'):  # Decimal arabic numerals
            label = str(value)
        elif style is LIT('R'):  # Uppercase roman numerals
            label = format_int_roman(value).upper()
        elif style is LIT('r'):  # Lowercase roman numerals
            label = format_int_roman(value)
        elif style is LIT('A'):  # Uppercase letters A-Z, AA-ZZ...
            label = format_int_alpha(value).upper()
        elif style is LIT('a'):  # Lowercase letters a-z, aa-zz...
            label = format_int_alpha(value)
        else:
            log.warning('Unknown page label style: %r', style)
            label = ''
        return label
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
|
from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple
|
||||||
|
|
||||||
from pdfminer.utils import Rect
|
from pdfminer.utils import Rect
|
||||||
from . import settings
|
from . import settings
|
||||||
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
|
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, \
|
||||||
|
PDFNoPageLabels
|
||||||
from .pdfparser import PDFParser
|
from .pdfparser import PDFParser
|
||||||
from .pdftypes import PDFObjectNotFound
|
from .pdftypes import PDFObjectNotFound
|
||||||
from .pdftypes import dict_value
|
from .pdftypes import dict_value
|
||||||
|
@ -38,23 +40,27 @@ class PDFPage:
|
||||||
rotate: the page rotation (in degree).
|
rotate: the page rotation (in degree).
|
||||||
annots: the page annotations.
|
annots: the page annotations.
|
||||||
beads: a chain that represents natural reading order.
|
beads: a chain that represents natural reading order.
|
||||||
|
label: the page's label (typically, the logical page number).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
doc: PDFDocument,
|
doc: PDFDocument,
|
||||||
pageid: object,
|
pageid: object,
|
||||||
attrs: object
|
attrs: object,
|
||||||
|
label: Optional[str]
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize a page object.
|
"""Initialize a page object.
|
||||||
|
|
||||||
doc: a PDFDocument object.
|
doc: a PDFDocument object.
|
||||||
pageid: any Python object that can uniquely identify the page.
|
pageid: any Python object that can uniquely identify the page.
|
||||||
attrs: a dictionary of page attributes.
|
attrs: a dictionary of page attributes.
|
||||||
|
label: page label string.
|
||||||
"""
|
"""
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.pageid = pageid
|
self.pageid = pageid
|
||||||
self.attrs = dict_value(attrs)
|
self.attrs = dict_value(attrs)
|
||||||
|
self.label = label
|
||||||
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
||||||
self.resources: Dict[object, object] = \
|
self.resources: Dict[object, object] = \
|
||||||
resolve1(self.attrs.get('Resources', dict()))
|
resolve1(self.attrs.get('Resources', dict()))
|
||||||
|
@ -109,11 +115,17 @@ class PDFPage:
|
||||||
elif tree_type is LITERAL_PAGE:
|
elif tree_type is LITERAL_PAGE:
|
||||||
log.info('Page: %r', tree)
|
log.info('Page: %r', tree)
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
|
|
||||||
|
try:
|
||||||
|
page_labels: Iterator[Optional[str]] = document.get_page_labels()
|
||||||
|
except PDFNoPageLabels:
|
||||||
|
page_labels = itertools.repeat(None)
|
||||||
|
|
||||||
pages = False
|
pages = False
|
||||||
if 'Pages' in document.catalog:
|
if 'Pages' in document.catalog:
|
||||||
objects = search(document.catalog['Pages'], document.catalog)
|
objects = search(document.catalog['Pages'], document.catalog)
|
||||||
for (objid, tree) in objects:
|
for (objid, tree) in objects:
|
||||||
yield cls(document, objid, tree)
|
yield cls(document, objid, tree, next(page_labels))
|
||||||
pages = True
|
pages = True
|
||||||
if not pages:
|
if not pages:
|
||||||
# fallback when /Pages is missing.
|
# fallback when /Pages is missing.
|
||||||
|
@ -123,7 +135,7 @@ class PDFPage:
|
||||||
obj = document.getobj(objid)
|
obj = document.getobj(objid)
|
||||||
if isinstance(obj, dict) \
|
if isinstance(obj, dict) \
|
||||||
and obj.get('Type') is LITERAL_PAGE:
|
and obj.get('Type') is LITERAL_PAGE:
|
||||||
yield cls(document, objid, obj)
|
yield cls(document, objid, obj, next(page_labels))
|
||||||
except PDFObjectNotFound:
|
except PDFObjectNotFound:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
|
@ -3,6 +3,7 @@ Miscellaneous Routines.
|
||||||
"""
|
"""
|
||||||
import io
|
import io
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import string
|
||||||
import struct
|
import struct
|
||||||
from html import escape
|
from html import escape
|
||||||
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
|
from typing import (Any, BinaryIO, Callable, Dict, Generic, Iterable, Iterator,
|
||||||
|
@ -527,3 +528,47 @@ class Plane(Generic[LTComponentT]):
|
||||||
or y1 <= obj.y0:
|
or y1 <= obj.y0:
|
||||||
continue
|
continue
|
||||||
yield obj
|
yield obj
|
||||||
|
|
||||||
|
|
||||||
|
# Roman symbols for 1, 10, 100, 1000 and for 5, 50, 500, indexed by the
# decimal place they belong to.
ROMAN_ONES = ['i', 'x', 'c', 'm']
ROMAN_FIVES = ['v', 'l', 'd']


def format_int_roman(value: int) -> str:
    """Format a number as lowercase Roman numerals.

    Only values from 1 to 3999 are accepted (checked with an assertion).
    """
    assert 0 < value < 4000

    numeral = ''
    place = 0
    remaining = value

    # Consume the decimal digits from least to most significant and put
    # the symbols of each digit in front of what has been built so far.
    while remaining != 0:
        remaining, digit = divmod(remaining, 10)
        one = ROMAN_ONES[place]
        if digit == 9:
            # Subtractive form: one unit before the next power of ten
            chunk = one + ROMAN_ONES[place + 1]
        elif digit == 4:
            # Subtractive form: one unit before the five symbol
            chunk = one + ROMAN_FIVES[place]
        elif digit >= 5:
            chunk = ROMAN_FIVES[place] + one * (digit - 5)
        else:
            chunk = one * digit
        numeral = chunk + numeral
        place += 1

    return numeral
|
||||||
|
|
||||||
|
|
||||||
|
def format_int_alpha(value: int) -> str:
    """Format a number as lowercase letters a-z, aa-zz, etc.

    Only positive values are accepted (checked with an assertion).
    """
    assert value > 0

    alphabet = string.ascii_lowercase
    letters = ''
    remaining = value

    # There is no zero digit in this numbering, hence the shift by one
    # before each division.
    while remaining != 0:
        remaining, offset = divmod(remaining - 1, len(alphabet))
        letters = alphabet[offset] + letters

    return letters
|
||||||
|
|
Binary file not shown.
|
@ -1,9 +1,11 @@
|
||||||
|
import itertools
|
||||||
|
|
||||||
from nose.tools import assert_equal, raises
|
from nose.tools import assert_equal, raises
|
||||||
|
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdftypes import PDFObjectNotFound
|
from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value
|
||||||
|
|
||||||
|
|
||||||
class TestPdfDocument(object):
|
class TestPdfDocument(object):
|
||||||
|
@ -25,3 +27,21 @@ class TestPdfDocument(object):
|
||||||
doc = PDFDocument(parser)
|
doc = PDFDocument(parser)
|
||||||
assert_equal(doc.info,
|
assert_equal(doc.info,
|
||||||
[{'Producer': b'European Patent Office'}])
|
[{'Producer': b'European Patent Office'}])
|
||||||
|
|
||||||
|
def test_page_labels(self):
    """The labels of contrib/pagelabels.pdf match the expected list."""
    expected = ['iii', 'iv', '1', '2', '1']
    with open(absolute_sample_path('contrib/pagelabels.pdf'), 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        page_count = int_value(dict_value(doc.catalog['Pages'])['Count'])
        # The label iterator is unbounded, so cut it off at the page count
        labels = itertools.islice(doc.get_page_labels(), page_count)
        assert_equal(list(labels), expected)
|
||||||
|
|
||||||
|
@raises(PDFNoPageLabels)
def test_no_page_labels(self):
    """Asking simple1.pdf for page labels is expected to raise."""
    with open(absolute_sample_path('simple1.pdf'), 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        doc.get_page_labels()
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
from nose.tools import assert_equal
|
||||||
|
|
||||||
|
from helpers import absolute_sample_path
|
||||||
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
|
from pdfminer.pdfparser import PDFParser
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
|
||||||
|
|
||||||
|
class TestPdfPage(object):
    def test_page_labels(self):
        """Pages created from contrib/pagelabels.pdf carry their labels."""
        path = absolute_sample_path('contrib/pagelabels.pdf')
        expected_labels = ['iii', 'iv', '1', '2', '1']

        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            # Compare complete lists: a per-page loop would pass without
            # checking anything when too few (or no) pages are created,
            # and would fail with IndexError instead of an assertion
            # message when there are too many.
            labels = [page.label for page in PDFPage.create_pages(doc)]
            assert_equal(labels, expected_labels)
|
|
@ -3,7 +3,8 @@ import pathlib
|
||||||
|
|
||||||
from helpers import absolute_sample_path
|
from helpers import absolute_sample_path
|
||||||
from pdfminer.layout import LTComponent
|
from pdfminer.layout import LTComponent
|
||||||
from pdfminer.utils import open_filename, Plane, shorten_str
|
from pdfminer.utils import (format_int_alpha, format_int_roman, open_filename,
|
||||||
|
Plane, shorten_str)
|
||||||
|
|
||||||
|
|
||||||
class TestOpenFilename:
|
class TestOpenFilename:
|
||||||
|
@ -76,3 +77,34 @@ class TestFunctions(object):
|
||||||
|
|
||||||
def test_shorten_to_really_short(self):
|
def test_shorten_to_really_short(self):
|
||||||
assert_equal('Hello', shorten_str('Hello World', 5))
|
assert_equal('Hello', shorten_str('Hello World', 5))
|
||||||
|
|
||||||
|
def test_format_int_alpha(self):
    """format_int_alpha() at the boundaries between 1, 2 and 3 letters."""
    cases = [
        ('a', 1),
        ('b', 2),
        ('z', 26),
        ('aa', 27),
        ('ab', 28),
        ('az', 26 * 2),
        ('ba', 26 * 2 + 1),
        ('zz', 26 * 27),
        ('aaa', 26 * 27 + 1),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_alpha(number))
|
||||||
|
|
||||||
|
def test_format_int_roman(self):
    """format_int_roman() for additive and subtractive numerals."""
    cases = [
        ('i', 1),
        ('ii', 2),
        ('iii', 3),
        ('iv', 4),
        ('v', 5),
        ('vi', 6),
        ('vii', 7),
        ('viii', 8),
        ('ix', 9),
        ('x', 10),
        ('xi', 11),
        ('xx', 20),
        ('xl', 40),
        ('xlv', 45),
        ('l', 50),
        ('xc', 90),
        ('xci', 91),
        ('c', 100),
    ]
    for expected, number in cases:
        assert_equal(expected, format_int_roman(number))
|
||||||
|
|
Loading…
Reference in New Issue