From 3e364354dade53be1de71751fdbe95b69c96c59a Mon Sep 17 00:00:00 2001 From: Sergei Maertens Date: Thu, 20 Jul 2017 20:46:35 +0200 Subject: [PATCH] Fixes #64 -- be less strict when inspecting a tree type (#76) In the PDFStream it's possible that the /Type element is not present, but /type is. According to the spec, these are different elements, but in the case in point they had the same meaning. If PDFMiner is not running in STRICT mode and /Type doesn't resolve, a fallback to /type is used to determine the tree type. --- pdfminer/pdfpage.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 2cbe58e..418aeb2 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -1,5 +1,6 @@ import logging +from . import settings from .psparser import LIT from .pdftypes import PDFObjectNotFound from .pdftypes import resolve1 @@ -88,12 +89,17 @@ class PDFPage(object): for (k, v) in six.iteritems(parent): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v - if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: + + tree_type = tree.get('Type') + if tree_type is None and not settings.STRICT: # See #64 + tree_type = tree.get('type') + + if tree_type is LITERAL_PAGES and 'Kids' in tree: log.info('Pages: Kids=%r', tree['Kids']) for c in list_value(tree['Kids']): for x in search(c, tree): yield x - elif tree.get('Type') is LITERAL_PAGE: + elif tree_type is LITERAL_PAGE: log.info('Page: %r', tree) yield (objid, tree) pages = False