patch from Troy Bollinger.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@71 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-02-28 05:44:08 +00:00 · 2009-02-28 05:44:08 +00:00 · b432a3f4ae
parent 13a6603151
commit b432a3f4ae
5 changed files with 100 additions and 45 deletions
--- a/README.html
+++ b/README.html
@ -164,19 +164,19 @@ $ <strong>python -m pdflib.pdf2txt -P mypassword secret.pdf</strong>
 Options:
 <dl>
 <dt> <code>-o <em>filename</em></code> 
-<dd> Speficies the output file name.
+<dd> Specifies the output file name.
 By default, it prints the extracted contents to stdout.
 <p>
 <dt> <code>-p <em>pageno[,pageno,...]</em></code> 
-<dd> Speficies the comma-separated list of the page numbers to be extracted. 
+<dd> Specifies the comma-separated list of the page numbers to be extracted. 
 Page numbers are starting from one.
 By default, it extracts texts from all the pages.
 <p>
 <dt> <code>-c <em>codec</em></code> 
-<dd> Speficies the output codec for non-ASCII texts.
+<dd> Specifies the output codec for non-ASCII texts.
 <p>
 <dt> <code>-t <em>type</em></code> 
-<dd> Speficies the output format. The following formats are currently supported.
+<dd> Specifies the output format. The following formats are currently supported.
 <ul>
 <li> <code>html</code> : HTML format. (Default)
 <li> <code>sgml</code> : SGML format.
@ -221,14 +221,14 @@ Options:
 By default, it only prints the document trailer (like a header).
 <p>
 <dt> <code>-p <em>pageno</em></code> 
-<dd> Speficies the page number to be extracted. 
+<dd> Specifies the page number to be extracted.
 Multiple <code>-p</code> options are allowed.
 Note that page numbers start from one.
 <p>
 <dt> <code>-r</code> (raw)
 <dt> <code>-b</code> (binary)
 <dt> <code>-t</code> (text)
-<dd> Speficies the output format of stream contents.
+<dd> Specifies the output format of stream contents.
 Because the contents of stream objects can be very large,
 they are omitted when none of the options above is specified.
 <p>
--- a/pdflib/Makefile
+++ b/pdflib/Makefile
@ -1,6 +1,32 @@
 # Makefile for pdfminer

-all:
+DESTDIR=/usr/local/src/pdflib
+
+PDFLIB = ${DESTDIR}/__init__.py \
+	${DESTDIR}/arcfour.py \
+	${DESTDIR}/ascii85.py \
+	${DESTDIR}/cmap.py \
+	${DESTDIR}/fontmetrics.py \
+	${DESTDIR}/glyphlist.py \
+	${DESTDIR}/latin_enc.py \
+	${DESTDIR}/lzw.py \
+	${DESTDIR}/pdf2txt.py \
+	${DESTDIR}/pdfcolor.py \
+	${DESTDIR}/pdfdevice.py \
+	${DESTDIR}/pdffont.py \
+	${DESTDIR}/pdfinterp.py \
+	${DESTDIR}/pdfparser.py \
+	${DESTDIR}/pdftypes.py \
+	${DESTDIR}/psparser.py \
+	${DESTDIR}/pycdb.py \
+	${DESTDIR}/rijndael.py \
+	${DESTDIR}/utils.py \
+
+${DESTDIR}/%: %
+	cp $? $@
+	chmod 755 $@
+
+all: ${PDFLIB}

 clean:
 	-rm *.pyc *.pyo
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -59,9 +59,13 @@ class PDFBaseXRef(object):
    return

  def objids(self):
-    for objid_range in self.objid_ranges:
-      for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
-        yield objid
+    if self.objid_ranges:
+        for objid_range in self.objid_ranges:
+          for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
+            yield objid
+    else:
+        for objid in self.offsets:
+            yield objid
    return

 ##  PDFXRef
@ -70,6 +74,7 @@ class PDFXRef(PDFBaseXRef):
  def __init__(self):
    PDFBaseXRef.__init__(self)
    self.offsets = None
+    self.trailer = {}
    return

  def __repr__(self):
@ -81,6 +86,8 @@ class PDFXRef(PDFBaseXRef):
    while 1:
      try:
        (pos, line) = parser.nextline()
+        if not line.strip():
+            continue
      except PSEOF:
        raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
      if not line:
@ -112,7 +119,7 @@ class PDFXRef(PDFBaseXRef):
      print >>stderr, 'xref objects:', self.offsets
    self.load_trailer(parser)
    return
-  
+
  KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
  def load_trailer(self, parser):
    try:
@ -124,7 +131,7 @@ class PDFXRef(PDFBaseXRef):
      if not x:
        raise PDFNoValidXRef('Unexpected EOF - file corrupted')
      (_,dic) = x[0]
-    self.trailer = dict_value(dic)
+    self.trailer.update( dict_value(dic))
    return

  def getpos(self, objid):
@ -199,7 +206,7 @@ class PDFXRefStream(PDFBaseXRef):
 ##  PDFPage
 ##
 class PDFPage(object):
-  
+
  def __init__(self, doc, pageid, attrs):
    self.doc = doc
    self.pageid = pageid
@ -237,7 +244,7 @@ class PDFPage(object):
 class PDFDocument(object):

  debug = 0
-  
+
  def __init__(self):
    self.xrefs = []
    self.objs = {}
@ -257,7 +264,7 @@ class PDFDocument(object):
    self.parser = parser
    # The document is set to be temporarily ready during collecting
    # all the basic information about the document, e.g.
-    # the header, the encryption information, and the access rights 
+    # the header, the encryption information, and the access rights
    # for the document.
    self.ready = True
    # Retrieve the information of each header that was appended
@ -292,7 +299,7 @@ class PDFDocument(object):
      if STRICT:
        raise PDFSyntaxError('Catalog not found!')
    return
-  
+
  # initialize(password='')
  #   Perform the initialization with a given password.
  #   This step is mandatory even if there's no password associated
@ -316,7 +323,7 @@ class PDFDocument(object):
      raise PDFEncryptionError('Unknown revision: %r' % R)
    U = str_value(param['U'])
    P = int_value(param['P'])
-    self.is_printable = bool(P & 4)        
+    self.is_printable = bool(P & 4)
    self.is_modifiable = bool(P & 8)
    self.is_extractable = bool(P & 16)
    # Algorithm 3.2
@ -418,8 +425,18 @@ class PDFDocument(object):
        self.parser.seek(index)
        (_,objid1) = self.parser.nexttoken() # objid
        (_,genno) = self.parser.nexttoken() # genno
-        #assert objid1 == objid, (objid, objid1)
        (_,kwd) = self.parser.nexttoken()
+# #### hack around malformed pdf files: on objid mismatch, scan ahead to the next 'obj' keyword
+#        assert objid1 == objid, (objid, objid1)
+        if objid1 != objid:
+            x = []
+            while kwd is not self.KEYWORD_OBJ:
+                (_,kwd) = self.parser.nexttoken()
+                x.append(kwd)
+            if x:
+                objid1 = x[-2]
+                genno = x[-1]
+# #### end hack around malformed pdf files
        if kwd is not self.KEYWORD_OBJ:
          raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
        (_,obj) = self.parser.nextobject()
@ -431,7 +448,7 @@ class PDFDocument(object):
    if self.decipher:
      obj = decipher_all(self.decipher, objid, genno, obj)
    return obj
-  
+
  INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
  def get_pages(self):
    if not self.ready:
@ -526,7 +543,7 @@ class PDFParser(PSStackParser):
    if token is self.KEYWORD_ENDOBJ:
      self.add_results(*self.pop(4))
      return
-    
+
    if token is self.KEYWORD_R:
      # reference to indirect object
      try:
@ -537,7 +554,7 @@ class PDFParser(PSStackParser):
      except PSSyntaxError:
        pass
      return
-      
+
    if token is self.KEYWORD_STREAM:
      # stream object
      ((_,dic),) = self.pop(1)
@ -580,7 +597,7 @@ class PDFParser(PSStackParser):
      obj = PDFStream(dic, data, self.doc.decipher)
      self.push((pos, obj))
      return
-    
+
    # others
    self.push((pos, token))
    return
@ -611,17 +628,15 @@ class PDFParser(PSStackParser):
      raise PDFNoValidXRef('Unexpected EOF')
    if 2 <= self.debug:
      print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
-    if isinstance(token, int):
+    try:
      # XRefStream: PDF-1.5
      self.seek(pos)
      self.reset()
      xref = PDFXRefStream()
      xref.load(self, debug=self.debug)
-    else:
-      if token is not self.KEYWORD_XREF:
-        raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % 
-                             (pos, token))
-      self.nextline()
+    except:
+      if token is self.KEYWORD_XREF:
+          self.nextline()
      xref = PDFXRef()
      xref.load(self, debug=self.debug)
    xrefs.append(xref)
@ -636,7 +651,7 @@ class PDFParser(PSStackParser):
      pos = int_value(trailer['Prev'])
      self.read_xref_from(pos, xrefs)
    return
-    
+
  # read xref tables and trailers
  def read_xref(self):
    xrefs = []
@ -656,17 +671,17 @@ class PDFParser(PSStackParser):
          (pos, line) = self.nextline()
        except PSEOF:
          break
-        if line.startswith('trailer'): break
+        if line.startswith('trailer'):
+          xref.offsets = offsets
+          self.seek(pos)
+          xref.load_trailer(self)
+          if 1 <= self.debug:
+            print >>stderr, 'trailer: %r' % xref.trailer
+          continue
        m = pat.match(line)
        if not m: continue
        (objid, genno) = m.groups()
        offsets[int(objid)] = (0, pos)
-      if not offsets: raise
-      xref.offsets = offsets
-      self.seek(pos)
-      xref.load_trailer(self)
-      if 1 <= self.debug:
-        print >>stderr, 'trailer: %r' % xref.trailer
      xrefs.append(xref)
    return xrefs

@ -674,7 +689,7 @@ class PDFParser(PSStackParser):
 ##  PDFObjStrmParser
 ##
 class PDFObjStrmParser(PDFParser):
-  
+
  def __init__(self, doc, data):
    try:
      from cStringIO import StringIO
@ -682,7 +697,7 @@ class PDFObjStrmParser(PDFParser):
      from StringIO import StringIO
    PDFParser.__init__(self, doc, StringIO(data))
    return
-  
+
  def flush(self):
    self.add_results(*self.popall())
    return
--- a/pdflib/pdftypes.py
+++ b/pdflib/pdftypes.py
@ -159,6 +159,20 @@ class PDFStream(PDFObject):
  def __repr__(self):
    return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)

+  def decomp(self,data):
+    import zlib
+    buf = data
+    # Some FlateDecode streams have garbage (newlines, etc.) appended to the
+    # end.  Strip trailing bytes one at a time until the buffer decompresses.
+    while len(buf) > 10:
+      try:
+          # will get errors if the document is encrypted.
+          dco = zlib.decompressobj()
+          return dco.decompress(buf)
+      except:
+          buf = buf[:-1]
+    raise Exception, "zlib.error while decompressing data"
+
  def decode(self):
    assert self.data == None and self.rawdata != None
    data = self.rawdata
@ -175,7 +189,7 @@ class PDFStream(PDFObject):
    for f in filters:
      if f in LITERALS_FLATE_DECODE:
        # will get errors if the document is encrypted.
-        data = zlib.decompress(data)
+        data = self.decomp(data)
      elif f in LITERALS_LZW_DECODE:
        try:
          from cStringIO import StringIO
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -29,7 +29,7 @@ def dumpxml(out, obj, codec=None):
      out.write('</value>\n')
    out.write('</dict>')
    return
-  
+
  if isinstance(obj, list):
    out.write('<list size="%d">\n' % len(obj))
    for v in obj:
@ -37,11 +37,11 @@ def dumpxml(out, obj, codec=None):
      out.write('\n')
    out.write('</list>')
    return
-  
+
  if isinstance(obj, str):
    out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
    return
-  
+
  if isinstance(obj, PDFStream):
    out.write('<stream>\n<props>\n')
    dumpxml(out, obj.dic)
@ -51,11 +51,11 @@ def dumpxml(out, obj, codec=None):
      out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
    out.write('</stream>')
    return
-  
+
  if isinstance(obj, PDFObjRef):
    out.write('<ref id="%d"/>' % obj.objid)
    return
-  
+
  if isinstance(obj, PSKeyword):
    out.write('<keyword>%s</keyword>' % obj.name)
    return
@ -63,7 +63,7 @@ def dumpxml(out, obj, codec=None):
  if isinstance(obj, PSLiteral):
    out.write('<literal>%s</literal>' % obj.name)
    return
-  
+
  if isinstance(obj, int) or isinstance(obj, float):
    out.write('<number>%s</number>' % obj)
    return