ASCII85 filter support added.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-08-30 07:40:52 +00:00 · 2008-08-30 07:40:52 +00:00 · 3ed3b4cfd5
parent 79f425b164
commit 3ed3b4cfd5
6 changed files with 85 additions and 14 deletions
--- a/README.html
+++ b/README.html
@@ -14,7 +14,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Jul 29 21:34:29 JST 2008
+Last Modified: Sat Aug 30 16:39:32 JST 2008
 <!-- hhmts end -->
 </div>

@@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
 <li> Do the following test:<br>
 <blockquote><pre>
 $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
-&lt;page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"&gt;
-&lt;text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"&gt; Hello World &lt;/text&gt;
-&lt;/page&gt;
+&lt;html&gt;&lt;head&gt;&lt;meta http-equiv="Content-Type" content="text/html; charset=ascii"&gt;
+&lt;/head&gt;&lt;body&gt;
+&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="0"&gt;Page 0&lt;/a&gt;&lt;/div&gt;&lt;span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
+&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"&gt; Hello World &lt;/span&gt;
+&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#0"&gt;0&lt;/a&gt;&lt;/div&gt;
+&lt;/body&gt;&lt;/html&gt;
 </pre></blockquote>
 <li> Done!
 </ol>
@@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
 <p>
 <h3>For non-ASCII languages</h3>
 In order to handle non-ASCII languages (e.g. Japanese),
-you need to install an additional data called <code>CMap</code>.
+you need to install an additional data called <code>CMap</code>,
+which is distributed from Adobe.
 <p>
 Here is how:

@@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
 <li> <code>sgml</code> : SGML format.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
-Tags used here are defined in the PDF specification.
+Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
 </ul>
 <p>
 <dt> <code>-P <em>password</em></code> 
@@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2008/08/30: ASCII85 encoding filter support.
 <li> 2008/07/27: Tagged contents extraction support.
 <li> 2008/07/10: Outline (TOC) extraction support.
 <li> 2008/06/29: HTML output added. Reorganized the directory structure.
--- a/TODO
+++ b/TODO
@@ -1,5 +1,6 @@
 TODOs:
-  - Documentation.
+  - API Documentation.
+  - Sample webapp for pdf->html.
  - Error handling for invalid type.
  - Infer text stream by clustering.

--- a/pdflib/arcfour.py
+++ b/pdflib/arcfour.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 #
-#  Arcfour implementation
+#  Arcfour implementation in Python
 #  * public domain *
 #

+
+##  Arcfour
+##
 class Arcfour(object):
  
  def __init__(self, key):
@@ -30,6 +33,7 @@ class Arcfour(object):
    (self.i, self.j) = (i, j)
    return r

+# test
 if __name__ == '__main__':
  def doit(key, data):
    cipher = Arcfour(key)
--- a/pdflib/ascii85.py
+++ b/pdflib/ascii85.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+#  ASCII85 decoder (Adobe version) implementation
+#  * public domain *
+#
+
+import struct
+
+# ascii85decode(data)
+def ascii85decode(data):
+  n = b = 0
+  out = ''
+  for c in data:
+    if '!' <= c and c <= 'u':
+      n += 1
+      b = b*85+(ord(c)-33)
+      if n == 5:
+        out += struct.pack('>L',b)
+        n = b = 0
+    elif c == 'z':
+      assert n == 0
+      out += '\0\0\0\0'
+    elif c == '~':
+      if n:
+        for _ in range(5-n):
+          b = b*85+84
+        out += struct.pack('>L',b)[:n-1]
+      break
+  return out
+
+# test
+# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
+if __name__ == '__main__':
+  orig = r'''
+  9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+  O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+  i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+  l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
+  >uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+  '''
+  data = \
+       'Man is distinguished, not only by his reason, but by this singular passion from '\
+       'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
+       'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
+       'any carnal pleasure.'
+  assert ascii85decode(orig) == data
+  print 'test succeeded'
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
    return
  
  def execute(self, streams):
+    try:
      parser = PDFContentParser(streams, debug=self.debug)
+    except PSEOF:
+      # empty page
+      return
    while 1:
      try:
        (_,obj) = parser.nextobject()
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
 LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
-LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
+LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
+                         PSLiteralTable.intern('Fl'))
+LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
+                       PSLiteralTable.intern('LZW'))
+LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
+                           PSLiteralTable.intern('A85'))
 KEYWORD_R = PSKeywordTable.intern('R')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
 KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
@@ -200,16 +204,19 @@ class PDFStream(PDFObject):
    if not isinstance(filters, list):
      filters = [ filters ]
    for f in filters:
-      if f == LITERAL_FLATE_DECODE:
+      if f in LITERALS_FLATE_DECODE:
        import zlib
        # will get errors if the document is encrypted.
        data = zlib.decompress(data)
-      elif f == LITERAL_LZW_DECODE:
+      elif f in LITERALS_LZW_DECODE:
        try:
          from cStringIO import StringIO
        except ImportError:
          from StringIO import StringIO
        data = ''.join(LZWDecoder(StringIO(data)).run())
+      elif f in LITERALS_ASCII85_DECODE:
+        import ascii85
+        data = ascii85.ascii85decode(data)
      elif f == LITERAL_CRYPT:
        raise PDFEncryptionError
      else:
@@ -265,7 +272,10 @@ class PDFPage(object):
    self.rotate = self.attrs.get('Rotate', 0)
    self.annots = self.attrs.get('Annots')
    self.beads = self.attrs.get('B')
+    if 'Contents' in self.attrs:
      contents = resolve1(self.attrs['Contents'])
+    else:
+      contents = []
    if not isinstance(contents, list):
      contents = [ contents ]
    self.contents = contents