more bugfixes.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@194 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-03-23 10:29:52 +00:00 · 2010-03-23 10:29:52 +00:00 · e536b3ef11
parent ee34d8d549
commit e536b3ef11
3 changed files with 15 additions and 5 deletions
--- a/docs/index.html
+++ b/docs/index.html
@ -19,7 +19,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Mon Mar 22 06:16:28 UTC 2010
+Last Modified: Tue Mar 23 10:29:24 UTC 2010
 <!-- hhmts end -->
 </div>

@ -335,17 +335,18 @@ no stream header is displayed for the ease of saving it to a file.
 <ul>
 <li> <A href="http://www.python.org/dev/peps/pep-0008/">PEP-8</a> and
 <a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance.
+<li> Better documentation.
 <li> Better text extraction / layout analysis.
-<li> Better API Documentation.
+<li> Robust error handling.
 <li> Crypt stream filter support. (More sample documents are needed!)
 <li> CCITTFax stream filter support.
-<li> Robust error handling.
 </ul>

 <a name="changes"></a>
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2010/03/xx: Bugfixes. Thanks to Brian Berry and Lubos Pintes.
 <li> 2010/03/22: Improved layout analysis. Added regression tests.
 <li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
 <li> 2010/02/27: Changed the way of internal layout handling. (LTTextItem -&gt; LTChar)
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@ -361,7 +361,7 @@ class PDFDocument(object):
            self._initialized = True
            return
        (docid, param) = self.encryption
-        if literal_name(param['Filter']) != 'Standard':
+        if literal_name(param.get('Filter')) != 'Standard':
            raise PDFEncryptionError('Unknown filter: param=%r' % param)
        V = int_value(param.get('V', 0))
        if not (V == 1 or V == 2):
@ -439,6 +439,7 @@ class PDFDocument(object):
            else:
                if STRICT:
                    raise PDFSyntaxError('Cannot locate objid=%r' % objid)
+                # return null for a nonexistent reference.
                return None
            if strmid:
                stream = stream_value(self.getobj(strmid))
@ -588,6 +589,7 @@ class PDFParser(PSStackParser):
        return

    KEYWORD_R = KWD('R')
+    KEYWORD_NULL = KWD('null')
    KEYWORD_ENDOBJ = KWD('endobj')
    KEYWORD_STREAM = KWD('stream')
    KEYWORD_XREF = KWD('xref')
@ -596,10 +598,16 @@ class PDFParser(PSStackParser):
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
            return
+        
        if token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))
            return

+        if token is self.KEYWORD_NULL:
+            # null object
+            self.push((pos, None))
+            return
+
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@ -537,7 +537,8 @@ class PSStackParser(PSBaseParser):
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
-                    d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs))
+                    # construct a Python dictionary.
+                    d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
                    self.push((pos, d))
                except PSTypeError:
                    if STRICT: raise