ASCII85 filter support added.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent 79f425b164
commit 3ed3b4cfd5
README.html (17 changed lines)

@@ -14,7 +14,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Jul 29 21:34:29 JST 2008
+Last Modified: Sat Aug 30 16:39:32 JST 2008
 <!-- hhmts end -->
 </div>

@@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
 <li> Do the following test:<br>
 <blockquote><pre>
 $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
-<page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0">
-<text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"> Hello World </text>
-</page>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=ascii">
+</head><body>
+<div style="position:absolute; top:50px;"><a name="0">Page 0</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span>
+<span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"> Hello World </span>
+<div style="position:absolute; top:0px;">Page: <a href="#0">0</a></div>
+</body></html>
 </pre></blockquote>
 <li> Done!
 </ol>
@@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
 <p>
 <h3>For non-ASCII languages</h3>
 In order to handle non-ASCII languages (e.g. Japanese),
-you need to install an additional data called <code>CMap</code>.
+you need to install an additional data called <code>CMap</code>,
+which is distributed from Adobe.
 <p>
 Here is how:

@@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
 <li> <code>sgml</code> : SGML format.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
-Tags used here are defined in the PDF specification.
+Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
 </ul>
 <p>
 <dt> <code>-P <em>password</em></code>
@@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2008/08/30: ASCII85 encoding filter support.
 <li> 2008/07/27: Tagged contents extraction support.
 <li> 2008/07/10: Outline (TOC) extraction support.
 <li> 2008/06/29: HTML output added. Reorganized the directory structure.

TODO (3 changed lines)

@@ -1,5 +1,6 @@
 TODOs:
-- Documentation.
+- API Documentation.
+- Sample webapp for pdf->html.
 - Error handling for invalid type.
 - Infer text stream by clustering.

@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 #
-# Arcfour implementation
+# Arcfour implementation in Python
 # * public domain *
 #

+
+## Arcfour
+##
 class Arcfour(object):

   def __init__(self, key):

@@ -30,6 +33,7 @@ class Arcfour(object):
     (self.i, self.j) = (i, j)
     return r

+# test
 if __name__ == '__main__':
   def doit(key, data):
     cipher = Arcfour(key)
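For readers unfamiliar with the cipher that the Arcfour module (presumably arcfour.py) implements: below is a minimal standalone RC4 sketch in the same Python 2 style. The all-in-one function name rc4 is illustrative only and is not the module's API.

# Minimal standalone RC4 sketch (illustrative, not the module's API).
def rc4(key, data):
  # key-scheduling: build a key-dependent permutation of 0..255
  s = range(256)
  j = 0
  for i in range(256):
    j = (j + s[i] + ord(key[i % len(key)])) % 256
    (s[i], s[j]) = (s[j], s[i])
  # keystream generation: swap-and-lookup, XORed into the data
  (i, j) = (0, 0)
  out = ''
  for c in data:
    i = (i + 1) % 256
    j = (j + s[i]) % 256
    (s[i], s[j]) = (s[j], s[i])
    out += chr(ord(c) ^ s[(s[i] + s[j]) % 256])
  return out

# RC4 is symmetric: encrypting twice with the same key round-trips.
assert rc4('Key', rc4('Key', 'Plaintext')) == 'Plaintext'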
ascii85.py (new file)

@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+# ASCII85 decoder (Adobe version) implementation
+# * public domain *
+#
+
+import struct
+
+# ascii85decode(data)
+def ascii85decode(data):
+  n = b = 0
+  out = ''
+  for c in data:
+    if '!' <= c and c <= 'u':
+      n += 1
+      b = b*85+(ord(c)-33)
+      if n == 5:
+        out += struct.pack('>L',b)
+        n = b = 0
+    elif c == 'z':
+      assert n == 0
+      out += '\0\0\0\0'
+    elif c == '~':
+      if n:
+        for _ in range(5-n):
+          b = b*85+84
+        out += struct.pack('>L',b)[:n-1]
+      break
+  return out
+
+# test
+# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
+if __name__ == '__main__':
+  orig = r'''
+9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
+>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+'''
+  data = \
+    'Man is distinguished, not only by his reason, but by this singular passion from '\
+    'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
+    'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
+    'any carnal pleasure.'
+  assert ascii85decode(orig) == data
+  print 'test succeeded'
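A quick sanity check of the new decoder beyond the Wikipedia sample built into the module. The inputs below were worked out by hand from the Ascii85 scheme (5 characters encode 4 bytes; a trailing partial group of n+1 characters encodes n bytes; 'z' abbreviates four zero bytes; '~>' terminates):

>>> from ascii85 import ascii85decode
>>> ascii85decode('87cUR~>')        # one full group: 4 bytes
'Hell'
>>> ascii85decode('87cURDZ~>')      # full group + 2-char partial group: 5 bytes
'Hello'
>>> ascii85decode('z~>')            # 'z' is shorthand for four zero bytes
'\x00\x00\x00\x00'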
@@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
     return

   def execute(self, streams):
-    parser = PDFContentParser(streams, debug=self.debug)
+    try:
+      parser = PDFContentParser(streams, debug=self.debug)
+    except PSEOF:
+      # empty page
+      return
     while 1:
       try:
         (_,obj) = parser.nextobject()
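The hunk above makes execute() tolerate pages whose content stream is missing or truncated: PDFContentParser evidently raises PSEOF when there is nothing to read, and that is now treated as an empty page rather than an error. A toy model of the pattern (all names below are stand-ins, not pdfminer's API):

class PSEOF(Exception):
  pass

def make_parser(streams):
  if not streams:
    raise PSEOF('premature end of content stream')
  return iter(streams)

def execute(streams):
  try:
    parser = make_parser(streams)
  except PSEOF:
    return []                     # empty page: nothing to interpret
  return list(parser)

assert execute([]) == []
assert execute(['BT', 'ET']) == ['BT', 'ET']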
@@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
 LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
-LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
+LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
+                         PSLiteralTable.intern('Fl'))
+LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
+                       PSLiteralTable.intern('LZW'))
+LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
+                           PSLiteralTable.intern('A85'))
 KEYWORD_R = PSKeywordTable.intern('R')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
 KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
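The rename from LITERAL_* to LITERALS_* reflects that each filter may now be named by either its full form or its abbreviation; the PDF specification defines short aliases (e.g. Fl, LZW, A85) for use in inline image dictionaries. Dispatch therefore becomes a membership test. A toy illustration, with plain strings standing in for interned PSLiteral objects:

LITERALS_ASCII85_DECODE = ('ASCII85Decode', 'A85')

for name in ('ASCII85Decode', 'A85'):
  assert name in LITERALS_ASCII85_DECODE   # both spellings select the same decoder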
@@ -200,16 +204,19 @@ class PDFStream(PDFObject):
     if not isinstance(filters, list):
       filters = [ filters ]
     for f in filters:
-      if f == LITERAL_FLATE_DECODE:
+      if f in LITERALS_FLATE_DECODE:
         import zlib
         # will get errors if the document is encrypted.
         data = zlib.decompress(data)
-      elif f == LITERAL_LZW_DECODE:
+      elif f in LITERALS_LZW_DECODE:
         try:
           from cStringIO import StringIO
         except ImportError:
           from StringIO import StringIO
         data = ''.join(LZWDecoder(StringIO(data)).run())
+      elif f in LITERALS_ASCII85_DECODE:
+        import ascii85
+        data = ascii85.ascii85decode(data)
       elif f == LITERAL_CRYPT:
         raise PDFEncryptionError
       else:
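Putting the pieces together: stream filters apply in the order listed in /Filter, each stage consuming the previous stage's output. A self-contained sketch of the same dispatch loop, assuming the new ascii85 module is importable and using plain strings instead of PSLiteral objects; only the Flate path is exercised end to end here:

import zlib
from ascii85 import ascii85decode   # the module added in this commit

def apply_filters(data, filters):
  # apply each declared filter in order
  for f in filters:
    if f in ('FlateDecode', 'Fl'):
      data = zlib.decompress(data)
    elif f in ('ASCII85Decode', 'A85'):
      data = ascii85decode(data)
    else:
      raise ValueError('unsupported filter: %r' % f)
  return data

# e.g. a stream declared as << /Filter [ /A85 /Fl ] >> would be
# ASCII85-decoded first, then inflated.
assert apply_filters('Hello World', []) == 'Hello World'
assert apply_filters(zlib.compress('Hello World'), ['FlateDecode']) == 'Hello World'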
@@ -265,7 +272,10 @@ class PDFPage(object):
     self.rotate = self.attrs.get('Rotate', 0)
     self.annots = self.attrs.get('Annots')
     self.beads = self.attrs.get('B')
-    contents = resolve1(self.attrs['Contents'])
+    if 'Contents' in self.attrs:
+      contents = resolve1(self.attrs['Contents'])
+    else:
+      contents = []
     if not isinstance(contents, list):
       contents = [ contents ]
     self.contents = contents
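The last hunk makes a page's Contents entry optional, since an empty page may omit it entirely. The normalization it performs, isolated as a toy function (get_contents is a hypothetical name; indirect-reference resolution via resolve1 is omitted):

def get_contents(attrs):
  # Contents may be absent, a single stream, or a list of streams;
  # normalize all three cases to a list.
  contents = attrs.get('Contents', [])
  if not isinstance(contents, list):
    contents = [ contents ]
  return contents

assert get_contents({}) == []
assert get_contents({'Contents': 'stream'}) == ['stream']
assert get_contents({'Contents': ['s1', 's2']}) == ['s1', 's2']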