pdfminer.six/pdfminer/pdfparser.py

import logging
from io import BytesIO
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import PSEOF
from .psparser import KWD
from . import settings
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import PDFObjRef
from .pdftypes import int_value
from .pdftypes import dict_value

log = logging.getLogger(__name__)


class PDFSyntaxError(PDFException):
    """Signals malformed PDF syntax met while parsing."""


class PDFParser(PSStackParser):
    """
    PDFParser fetches PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    NOTE(review): read_xref() is not defined in this class; confirm it
    is still provided elsewhere before relying on the example above.

    """

    def __init__(self, fp):
        """Creates a parser over the file object fp."""
        PSStackParser.__init__(self, fp)
        # Set by set_document(). Handed to every PDFObjRef that is built
        # and asked for its decipher attribute when a stream is read.
        self.doc = None
        # When true, /Length is ignored and the stream data is gathered
        # line by line until the 'endstream' marker shows up.
        self.fallback = False

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc

    KEYWORD_R = KWD(b'R')
    KEYWORD_NULL = KWD(b'null')
    KEYWORD_ENDOBJ = KWD(b'endobj')
    KEYWORD_STREAM = KWD(b'stream')
    KEYWORD_XREF = KWD(b'xref')
    KEYWORD_STARTXREF = KWD(b'startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        pos is the position that was reported for the keyword and token
        is the keyword object itself. Depending on the keyword, entries
        are moved from the stack to the results, a new object (None,
        PDFObjRef or PDFStream) is pushed, or the keyword itself is
        pushed unchanged.

        Raises PDFSyntaxError in strict mode when a stream has no
        /Length or the file ends inside a stream.
        """

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            self._do_objref(pos)

        elif token is self.KEYWORD_STREAM:
            # stream object
            self._do_stream(pos)

        else:
            # others
            self.push((pos, token))

    def _do_objref(self, pos):
        """Turns the two operands preceding 'R' into a pushed PDFObjRef."""
        try:
            operands = list(self.pop(2))
            if len(operands) != 2:
                # Too few operands to form a reference. Put back what
                # was taken and ignore the keyword rather than failing
                # on the tuple unpacking below.
                for operand in operands:
                    self.push(operand)
                return
            ((_, objid), (_, genno)) = operands
            try:
                (objid, genno) = (int(objid), int(genno))
            except (TypeError, ValueError):
                # Operands that cannot be converted to integers cannot
                # name an object; drop the malformed reference.
                log.debug('Invalid object reference: %r %r R', objid, genno)
                return
            obj = PDFObjRef(self.doc, objid, genno)
            self.push((pos, obj))
        except PSSyntaxError:
            pass

    def _do_stream(self, pos):
        """Reads the data following a 'stream' keyword, pushes a PDFStream."""
        ((_, dic),) = self.pop(1)
        dic = dict_value(dic)
        objlen = 0
        if not self.fallback:
            try:
                objlen = int_value(dic['Length'])
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError('/Length is undefined: %r' % dic)
        # Re-read the keyword line so that pos can be advanced to the
        # first byte after it.
        self.seek(pos)
        try:
            (_, line) = self.nextline()  # 'stream'
        except PSEOF:
            if settings.STRICT:
                raise PDFSyntaxError('Unexpected EOF')
            return
        pos += len(line)
        self.fp.seek(pos)
        data = bytearray(self.fp.read(objlen))
        self.seek(pos+objlen)
        # Scan forward to the 'endstream' marker. objlen always grows by
        # the bytes that are skipped, but those bytes are appended to
        # the data only in fallback mode.
        while True:
            try:
                (_, line) = self.nextline()
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                break
            i = line.find(b'endstream')
            if i != -1:
                objlen += i
                if self.fallback:
                    data += line[:i]
                break
            objlen += len(line)
            if self.fallback:
                data += line
        data = bytes(data)
        self.seek(pos+objlen)
        # XXX limit objlen not to exceed object boundary
        log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,
                  objlen, dic, data[:10])
        # self.doc must have been set through set_document(): the
        # attribute access below fails while it is still None.
        obj = PDFStream(dic, data, self.doc.decipher)
        self.push((pos, obj))


class PDFStreamParser(PDFParser):
    """
    PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data):
        """Creates a parser over data, which is wrapped in a BytesIO."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self):
        """Passes everything returned by popall() to add_results()."""
        self.add_results(*self.popall())

    KEYWORD_OBJ = KWD(b'obj')

    def do_keyword(self, pos, token):
        """Handles keywords met inside a content stream.

        'R' replaces its two operands with a PDFObjRef. 'obj' and
        'endobj' are dropped, or raise PDFSyntaxError in strict mode.
        Every other keyword is pushed unchanged together with pos.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                operands = list(self.pop(2))
                if len(operands) != 2:
                    # Too few operands to form a reference. Put back
                    # what was taken and ignore the keyword rather than
                    # failing on the tuple unpacking below.
                    for operand in operands:
                        self.push(operand)
                    return
                ((_, objid), (_, genno)) = operands
                try:
                    (objid, genno) = (int(objid), int(genno))
                except (TypeError, ValueError):
                    # Operands that cannot be converted to integers
                    # cannot name an object; drop the reference.
                    log.debug('Invalid object reference: %r %r R',
                              objid, genno)
                    return
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass
            return
        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                # Name the keyword that was actually found.
                keyword = 'obj' if token is self.KEYWORD_OBJ else 'endobj'
                raise PDFSyntaxError('Keyword %s found in stream' % keyword)
            return
        # others
        self.push((pos, token))
Use logging module instead of print. 2014-06-14 03:00:49 +00:00			`import logging`
Changed: StringIO -> io.BytesIO 2014-06-25 10:55:41 +00:00			`from io import BytesIO`
Cleanup imports. Use relative imports. 2014-06-26 09:12:39 +00:00			`from .psparser import PSStackParser`
			`from .psparser import PSSyntaxError`
			`from .psparser import PSEOF`
			`from .psparser import KWD`
Improved settings management 2016-01-10 17:17:38 +00:00			`from . import settings`
Cleanup imports. Use relative imports. 2014-06-26 09:12:39 +00:00			`from .pdftypes import PDFException`
			`from .pdftypes import PDFStream`
			`from .pdftypes import PDFObjRef`
			`from .pdftypes import int_value`
			`from .pdftypes import dict_value`
initial import. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@2 1aa58f4a-7d42-0410-adbc-911cccaed67c 2007-12-30 09:13:51 +00:00
Make the logger run in a namespace. 2016-05-20 19:12:05 +00:00			`log = logging.getLogger(__name__)`

initial import. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@2 1aa58f4a-7d42-0410-adbc-911cccaed67c 2007-12-30 09:13:51 +00:00
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`class PDFSyntaxError(PDFException):`
			`pass`
outline bug fixed git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@249 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-10-17 05:14:52 +00:00
initial import. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@2 1aa58f4a-7d42-0410-adbc-911cccaed67c 2007-12-30 09:13:51 +00:00
			`class PDFParser(PSStackParser):`
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`"""`
			`PDFParser fetch PDF objects from a file stream.`
			`It can handle indirect references by referring to`
			`a PDF document set by set_document method.`
			`It also reads XRefs at the end of every PDF file.`

			`Typical usage:`
			`parser = PDFParser(fp)`
			`parser.read_xref()`
Introducing PDFObjectNotFound 2013-10-09 12:39:23 +00:00			`parser.read_xref(fallback=True) # optional`
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`parser.set_document(doc)`
			`parser.seek(offset)`
			`parser.nextobject()`
PEP8: Remove trailing whitespace 2013-11-07 07:14:53 +00:00
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`"""`

warning removal. code cleanup. cmap bug fixed. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 03:09:26 +00:00			`def __init__(self, fp):`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`PSStackParser.__init__(self, fp)`
warning removal. code cleanup. cmap bug fixed. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 03:09:26 +00:00			`self.doc = None`
improvement in fallback git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@238 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-08-29 06:39:24 +00:00			`self.fallback = False`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`return`

warning removal. code cleanup. cmap bug fixed. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 03:09:26 +00:00			`def set_document(self, doc):`
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`"""Associates the parser with a PDFDocument object."""`
warning removal. code cleanup. cmap bug fixed. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-01 03:09:26 +00:00			`self.doc = doc`
			`return`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00
String-Bytes distinction (first attempt). 2014-06-30 10:05:56 +00:00			`KEYWORD_R = KWD(b'R')`
			`KEYWORD_NULL = KWD(b'null')`
			`KEYWORD_ENDOBJ = KWD(b'endobj')`
			`KEYWORD_STREAM = KWD(b'stream')`
			`KEYWORD_XREF = KWD(b'xref')`
			`KEYWORD_STARTXREF = KWD(b'startxref')`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`def do_keyword(self, pos, token):`
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`"""Handles PDF-related keywords."""`
PEP8: Remove trailing whitespace 2013-11-07 07:14:53 +00:00
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):`
			`self.add_results(*self.pop(1))`
PEP8: Remove trailing whitespace 2013-11-07 07:14:53 +00:00
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`elif token is self.KEYWORD_ENDOBJ:`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.add_results(*self.pop(4))`

documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`elif token is self.KEYWORD_NULL:`
more bugfixes. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@194 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-23 10:29:52 +00:00			`# null object`
			`self.push((pos, None))`

documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`elif token is self.KEYWORD_R:`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`# reference to indirect object`
			`try:`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`((_, objid), (_, genno)) = self.pop(2)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`(objid, genno) = (int(objid), int(genno))`
			`obj = PDFObjRef(self.doc, objid, genno)`
			`self.push((pos, obj))`
			`except PSSyntaxError:`
			`pass`

documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`elif token is self.KEYWORD_STREAM:`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`# stream object`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`((_, dic),) = self.pop(1)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`dic = dict_value(dic)`
improvement in fallback git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@238 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-08-29 06:39:24 +00:00			`objlen = 0`
			`if not self.fallback:`
			`try:`
			`objlen = int_value(dic['Length'])`
			`except KeyError:`
Improved settings management 2016-01-10 17:17:38 +00:00			`if settings.STRICT:`
improvement in fallback git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@238 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-08-29 06:39:24 +00:00			`raise PDFSyntaxError('/Length is undefined: %r' % dic)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.seek(pos)`
			`try:`
			`(_, line) = self.nextline() # 'stream'`
			`except PSEOF:`
Improved settings management 2016-01-10 17:17:38 +00:00			`if settings.STRICT:`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`raise PDFSyntaxError('Unexpected EOF')`
			`return`
			`pos += len(line)`
			`self.fp.seek(pos)`
Speed up handling of PDFs with large images with more minimal change 2018-04-02 21:21:09 +00:00			`data = bytearray(self.fp.read(objlen))`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.seek(pos+objlen)`
			`while 1:`
			`try:`
			`(linepos, line) = self.nextline()`
			`except PSEOF:`
Improved settings management 2016-01-10 17:17:38 +00:00			`if settings.STRICT:`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`raise PDFSyntaxError('Unexpected EOF')`
			`break`
String-Bytes distinction (first attempt). 2014-06-30 10:05:56 +00:00			`if b'endstream' in line:`
			`i = line.index(b'endstream')`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`objlen += i`
Implement revision 4 and 5 encryption handler. 2014-05-19 14:27:43 +00:00			`if self.fallback:`
Speed up handling of PDFs with large images with more minimal change 2018-04-02 21:21:09 +00:00			`data += line[:i]`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`break`
			`objlen += len(line)`
Implement revision 4 and 5 encryption handler. 2014-05-19 14:27:43 +00:00			`if self.fallback:`
Speed up handling of PDFs with large images with more minimal change 2018-04-02 21:21:09 +00:00			`data += line`
Fix cases where a bytearray doesn't work in place of bytes 2018-04-02 21:27:29 +00:00			`data = bytes(data)`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`self.seek(pos+objlen)`
improvement in fallback git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@238 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-08-29 06:39:24 +00:00			`# XXX limit objlen not to exceed object boundary`
Enforce pep8 coding-style (#345) * Code Refractor: Use code-style enforcement #312 * Add flake8 to travis-ci * Remove python 2 3 comment on six library. 891 errors > 870 errors. * Remove class and functions comments that consist of just the name. 870 errors > 855 errors. * Fix flake8 errors in pdftypes.py. 855 errors > 833 errors. * Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before commiting * Cleanup pdfinterp.py and add documentation from PDF Reference * Cleanup pdfpage.py * Cleanup pdffont.py * Clean psparser.py * Cleanup high_level.py * Cleanup layout.py * Cleanup pdfparser.py * Cleanup pdfcolor.py * Cleanup rijndael.py * Cleanup converter.py * Rename klass to cls if it is the class variable, to be more consistent with standard practice * Cleanup cmap.py * Cleanup pdfdevice.py * flake8 ignore fontmetrics.py * Cleanup test_pdfminer_psparser.py * Fix flake8 in pdfdocument.py; 339 errors to go * Fix flake8 utils.py; 326 errors togo * pep8 correction for few files in /tools/ 328 > 160 to go (#342) * pep8 correction for few files in /tools/ 328 > 160 to go * pep8 correction: 160 > 5 to go * Fix ascii85.py errors * Fix error in getting index from target that does not exists * Remove commented print lines * Fix flake8 error in pdfinterp.py * Fix python2 specific error by removing argument from print statement * Ignore invalid python2 syntax * Update contributing.md * Added changelog * Remove unused import Co-authored-by: Fakabbir Amin <f4amin@gmail.com> 2019-12-29 20:20:20 +00:00			`log.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...', pos,`
			`objlen, dic, data[:10])`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`obj = PDFStream(dic, data, self.doc.decipher)`
			`self.push((pos, obj))`

documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`else:`
			`# others`
			`self.push((pos, token))`
PEP8: Remove trailing whitespace 2013-11-07 07:14:53 +00:00
PSEOF check git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@28 1aa58f4a-7d42-0410-adbc-911cccaed67c 2008-05-03 04:10:59 +00:00			`return`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00
tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-01-10 09:14:46 +00:00
writing mode detection git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@196 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-25 11:38:47 +00:00			`class PDFStreamParser(PDFParser):`
documentation bit, ready for release-20100327 git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@198 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-27 06:06:09 +00:00			`"""`
			`PDFStreamParser is used to parse PDF content streams`
			`that is contained in each page and has instructions`
			`for rendering the page. A reference to a PDF document is`
			`needed because a PDF content stream can also have`
			`indirect references to other objects in the same document.`
			`"""`

writing mode detection git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@196 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-03-25 11:38:47 +00:00			`def __init__(self, data):`
Changed: StringIO -> io.BytesIO 2014-06-25 10:55:41 +00:00			`PDFParser.__init__(self, BytesIO(data))`
to 4-space indentation git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@142 1aa58f4a-7d42-0410-adbc-911cccaed67c 2009-10-24 04:41:59 +00:00			`return`

			`def flush(self):`
			`self.add_results(*self.popall())`
			`return`
jpeg extraction support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-30 07:30:01 +00:00
String-Bytes distinction (first attempt). 2014-06-30 10:05:56 +00:00			`KEYWORD_OBJ = KWD(b'obj')`
Enforce pep8 coding-style (#345) * Code Refractor: Use code-style enforcement #312 * Add flake8 to travis-ci * Remove python 2 3 comment on six library. 891 errors > 870 errors. * Remove class and functions comments that consist of just the name. 870 errors > 855 errors. * Fix flake8 errors in pdftypes.py. 855 errors > 833 errors. * Moving flake8 testing from .travis.yml to tox.ini to ensure local testing before commiting * Cleanup pdfinterp.py and add documentation from PDF Reference * Cleanup pdfpage.py * Cleanup pdffont.py * Clean psparser.py * Cleanup high_level.py * Cleanup layout.py * Cleanup pdfparser.py * Cleanup pdfcolor.py * Cleanup rijndael.py * Cleanup converter.py * Rename klass to cls if it is the class variable, to be more consistent with standard practice * Cleanup cmap.py * Cleanup pdfdevice.py * flake8 ignore fontmetrics.py * Cleanup test_pdfminer_psparser.py * Fix flake8 in pdfdocument.py; 339 errors to go * Fix flake8 utils.py; 326 errors togo * pep8 correction for few files in /tools/ 328 > 160 to go (#342) * pep8 correction for few files in /tools/ 328 > 160 to go * pep8 correction: 160 > 5 to go * Fix ascii85.py errors * Fix error in getting index from target that does not exists * Remove commented print lines * Fix flake8 error in pdfinterp.py * Fix python2 specific error by removing argument from print statement * Ignore invalid python2 syntax * Update contributing.md * Added changelog * Remove unused import Co-authored-by: Fakabbir Amin <f4amin@gmail.com> 2019-12-29 20:20:20 +00:00
jpeg extraction support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-30 07:30:01 +00:00			`def do_keyword(self, pos, token):`
			`if token is self.KEYWORD_R:`
			`# reference to indirect object`
			`try:`
PEP8: Whitespace changes to match pep8 2013-11-07 08:35:04 +00:00			`((_, objid), (_, genno)) = self.pop(2)`
jpeg extraction support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-30 07:30:01 +00:00			`(objid, genno) = (int(objid), int(genno))`
			`obj = PDFObjRef(self.doc, objid, genno)`
			`self.push((pos, obj))`
			`except PSSyntaxError:`
			`pass`
			`return`
Fixed: issue #48 (thanks to speedplane) 2014-04-09 08:55:50 +00:00			`elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):`
Improved settings management 2016-01-10 17:17:38 +00:00			`if settings.STRICT:`
Fixed: issue #48 (thanks to speedplane) 2014-04-09 08:55:50 +00:00			`# See PDF Spec 3.4.6: Only the object values are stored in the`
			`# stream; the obj and endobj keywords are not used.`
String-Bytes distinction (first attempt). 2014-06-30 10:05:56 +00:00			`raise PDFSyntaxError('Keyword endobj found in stream')`
Fixed: issue #48 (thanks to speedplane) 2014-04-09 08:55:50 +00:00			`return`
jpeg extraction support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c 2010-01-30 07:30:01 +00:00			`# others`
			`self.push((pos, token))`
			`return`