diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
index 3b2b551..90da1a8 100644
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@@ -644,10 +644,28 @@ class PDFDocument(object):
     def _getobj_parse(self, pos, objid):
         self._parser.seek(pos)
         (_, objid1) = self._parser.nexttoken()  # objid
-        if objid1 != objid:
-            raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
         (_, genno) = self._parser.nexttoken()  # genno
         (_, kwd) = self._parser.nexttoken()
+        # Hack around malformed PDF files: when the id read at `pos` is not
+        # the requested one, skip ahead to the next `obj` keyword and retry
+        # with the tokens collected on the way.  Copied from
+        # https://github.com/jaepil/pdfminer3k/blob/master/pdfminer/pdfparser.py#L399
+        # to solve https://github.com/pdfminer/pdfminer.six/issues/56
+        if objid1 != objid:
+            x = []
+            while kwd is not self.KEYWORD_OBJ:
+                (_, kwd) = self._parser.nexttoken()
+                x.append(kwd)
+            # x[-2] only exists once at least two tokens were read; a bare
+            # `if x:` raised IndexError when `obj` was the very next token.
+            # NOTE(review): x[-1] is the `obj` keyword itself - confirm indices.
+            if len(x) >= 2:
+                objid1 = x[-2]
+                genno = x[-1]
+        # End of hack around malformed PDF files.
+        if objid1 != objid:
+            raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
+
         if kwd != KWD(b'obj'):
             raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
         (_, obj) = self._parser.nextobject()
diff --git a/samples/contrib/2b.pdf b/samples/contrib/2b.pdf
new file mode 100644
index 0000000..6a80676
Binary files /dev/null and b/samples/contrib/2b.pdf differ
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index 82da814..9292b78 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -47,5 +47,8 @@ class TestDumpPDF():
     def test_7(self):
         run('../samples/contrib/','stamp-no')
     """
+
+    def test_8(self):
+        run('../samples/contrib/','2b','-A -t xml')
 if __name__ == '__main__':
     nose.runmodule()