pdfminer.six/pdfminer/pdfparser.py

171 lines
5.2 KiB
Python
Raw Permalink Normal View History

2014-06-14 03:00:49 +00:00
import logging
2014-06-25 10:55:41 +00:00
from io import BytesIO
2014-06-26 09:12:39 +00:00
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
from .psparser import KWD
2016-01-10 17:17:38 +00:00
from . import settings
2014-06-26 09:12:39 +00:00
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import PDFObjRef
from .pdftypes import int_value
from .pdftypes import dict_value
2016-05-20 19:12:05 +00:00
log = logging.getLogger(__name__)
2013-11-07 08:35:04 +00:00
class PDFSyntaxError(PDFException):
pass
class PDFParser(PSStackParser):
"""
PDFParser fetch PDF objects from a file stream.
It can handle indirect references by referring to
a PDF document set by set_document method.
It also reads XRefs at the end of every PDF file.
Typical usage:
parser = PDFParser(fp)
parser.read_xref()
2013-10-09 12:39:23 +00:00
parser.read_xref(fallback=True) # optional
parser.set_document(doc)
parser.seek(offset)
parser.nextobject()
2013-11-07 07:14:53 +00:00
"""
def __init__(self, fp):
PSStackParser.__init__(self, fp)
self.doc = None
self.fallback = False
return
def set_document(self, doc):
"""Associates the parser with a PDFDocument object."""
self.doc = doc
return
KEYWORD_R = KWD(b'R')
KEYWORD_NULL = KWD(b'null')
KEYWORD_ENDOBJ = KWD(b'endobj')
KEYWORD_STREAM = KWD(b'stream')
KEYWORD_XREF = KWD(b'xref')
KEYWORD_STARTXREF = KWD(b'startxref')
2013-11-07 08:35:04 +00:00
def do_keyword(self, pos, token):
"""Handles PDF-related keywords."""
2013-11-07 07:14:53 +00:00
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
self.add_results(*self.pop(1))
2013-11-07 07:14:53 +00:00
elif token is self.KEYWORD_ENDOBJ:
self.add_results(*self.pop(4))
elif token is self.KEYWORD_NULL:
# null object
self.push((pos, None))
elif token is self.KEYWORD_R:
# reference to indirect object
try:
2013-11-07 08:35:04 +00:00
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
except PSSyntaxError:
pass
elif token is self.KEYWORD_STREAM:
# stream object
2013-11-07 08:35:04 +00:00
((_, dic),) = self.pop(1)
dic = dict_value(dic)
objlen = 0
if not self.fallback:
try:
objlen = int_value(dic['Length'])
except KeyError:
2016-01-10 17:17:38 +00:00
if settings.STRICT:
raise PDFSyntaxError('/Length is undefined: %r' % dic)
self.seek(pos)
try:
(_, line) = self.nextline() # 'stream'
except PSEOF:
2016-01-10 17:17:38 +00:00
if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF')
return
pos += len(line)
self.fp.seek(pos)
data = bytearray(self.fp.read(objlen))
self.seek(pos+objlen)
while 1:
try:
(linepos, line) = self.nextline()
except PSEOF:
2016-01-10 17:17:38 +00:00
if settings.STRICT:
raise PDFSyntaxError('Unexpected EOF')
break
if b'endstream' in line:
i = line.index(b'endstream')
objlen += i
if self.fallback:
data += line[:i]
break
objlen += len(line)
if self.fallback:
data += line
data = bytes(data)
self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary
Enforce pep8 coding-style (#345) * Code Refractor: Use code-style enforcement #312 * Add flake8 to travis-ci * Remove python 2 3 comment on six library. 891 errors > 870 errors. * Remove class and functions comments that consist of just the name. 870 errors > 855 errors. * Fix flake8 errors in pdftypes.py. 855 errors > 833 errors. * Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before commiting * Cleanup pdfinterp.py and add documentation from PDF Reference * Cleanup pdfpage.py * Cleanup pdffont.py * Clean psparser.py * Cleanup high_level.py * Cleanup layout.py * Cleanup pdfparser.py * Cleanup pdfcolor.py * Cleanup rijndael.py * Cleanup converter.py * Rename klass to cls if it is the class variable, to be more consistent with standard practice * Cleanup cmap.py * Cleanup pdfdevice.py * flake8 ignore fontmetrics.py * Cleanup test_pdfminer_psparser.py * Fix flake8 in pdfdocument.py; 339 errors to go * Fix flake8 utils.py; 326 errors togo * pep8 correction for few files in /tools/ 328 > 160 to go (#342) * pep8 correction for few files in /tools/ 328 > 160 to go * pep8 correction: 160 > 5 to go * Fix ascii85.py errors * Fix error in getting index from target that does not exists * Remove commented print lines * Fix flake8 error in pdfinterp.py * Fix python2 specific error by removing argument from print statement * Ignore invalid python2 syntax * Update contributing.md * Added changelog * Remove unused import Co-authored-by: Fakabbir Amin <f4amin@gmail.com>
2019-12-29 20:20:20 +00:00
log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher)
self.push((pos, obj))
else:
# others
self.push((pos, token))
2013-11-07 07:14:53 +00:00
return
class PDFStreamParser(PDFParser):
"""
PDFStreamParser is used to parse PDF content streams
that is contained in each page and has instructions
for rendering the page. A reference to a PDF document is
needed because a PDF content stream can also have
indirect references to other objects in the same document.
"""
def __init__(self, data):
2014-06-25 10:55:41 +00:00
PDFParser.__init__(self, BytesIO(data))
return
def flush(self):
self.add_results(*self.popall())
return
KEYWORD_OBJ = KWD(b'obj')
Enforce pep8 coding-style (#345) * Code Refractor: Use code-style enforcement #312 * Add flake8 to travis-ci * Remove python 2 3 comment on six library. 891 errors > 870 errors. * Remove class and functions comments that consist of just the name. 870 errors > 855 errors. * Fix flake8 errors in pdftypes.py. 855 errors > 833 errors. * Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before commiting * Cleanup pdfinterp.py and add documentation from PDF Reference * Cleanup pdfpage.py * Cleanup pdffont.py * Clean psparser.py * Cleanup high_level.py * Cleanup layout.py * Cleanup pdfparser.py * Cleanup pdfcolor.py * Cleanup rijndael.py * Cleanup converter.py * Rename klass to cls if it is the class variable, to be more consistent with standard practice * Cleanup cmap.py * Cleanup pdfdevice.py * flake8 ignore fontmetrics.py * Cleanup test_pdfminer_psparser.py * Fix flake8 in pdfdocument.py; 339 errors to go * Fix flake8 utils.py; 326 errors togo * pep8 correction for few files in /tools/ 328 > 160 to go (#342) * pep8 correction for few files in /tools/ 328 > 160 to go * pep8 correction: 160 > 5 to go * Fix ascii85.py errors * Fix error in getting index from target that does not exists * Remove commented print lines * Fix flake8 error in pdfinterp.py * Fix python2 specific error by removing argument from print statement * Ignore invalid python2 syntax * Update contributing.md * Added changelog * Remove unused import Co-authored-by: Fakabbir Amin <f4amin@gmail.com>
2019-12-29 20:20:20 +00:00
def do_keyword(self, pos, token):
if token is self.KEYWORD_R:
# reference to indirect object
try:
2013-11-07 08:35:04 +00:00
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
except PSSyntaxError:
pass
return
elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
2016-01-10 17:17:38 +00:00
if settings.STRICT:
# See PDF Spec 3.4.6: Only the object values are stored in the
# stream; the obj and endobj keywords are not used.
raise PDFSyntaxError('Keyword endobj found in stream')
return
# others
self.push((pos, token))
return