From 3ed3b4cfd5c4bf5813b644f9e7c4d25170b80f32 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Sat, 30 Aug 2008 07:40:52 +0000
Subject: [PATCH] ASCII85 filter support added.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 README.html         | 17 ++++++++++------
 TODO                |  3 ++-
 pdflib/arcfour.py   |  6 +++++-
 pdflib/ascii85.py   | 47 +++++++++++++++++++++++++++++++++++++++++++++
 pdflib/pdfinterp.py |  6 +++++-
 pdflib/pdfparser.py | 20 ++++++++++++++-----
 6 files changed, 85 insertions(+), 14 deletions(-)
 create mode 100644 pdflib/ascii85.py
diff --git a/README.html b/README.html
index b3edc81..50940ea 100644
--- a/README.html
+++ b/README.html
@@ -14,7 +14,7 @@ Python PDF parser and analyzer
 
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Jul 29 21:34:29 JST 2008
+Last Modified: Sat Aug 30 16:39:32 JST 2008
 <!-- hhmts end -->
 </div>
 
@@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
 <li> Do the following test:<br>
 <blockquote><pre>
 $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
-&lt;page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"&gt;
-&lt;text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"&gt; Hello World &lt;/text&gt;
-&lt;/page&gt;
+&lt;html&gt;&lt;head&gt;&lt;meta http-equiv="Content-Type" content="text/html; charset=ascii"&gt;
+&lt;/head&gt;&lt;body&gt;
+&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="0"&gt;Page 0&lt;/a&gt;&lt;/div&gt;&lt;span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
+&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"&gt; Hello World &lt;/span&gt;
+&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#0"&gt;0&lt;/a&gt;&lt;/div&gt;
+&lt;/body&gt;&lt;/html&gt;
 </pre></blockquote>
 <li> Done!
 </ol>
@@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
 <p>
 <h3>For non-ASCII languages</h3>
 In order to handle non-ASCII languages (e.g. Japanese),
-you need to install an additional data called <code>CMap</code>.
+you need to install additional data called <code>CMap</code>,
+which is distributed by Adobe.
 <p>
 Here is how:
 
@@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
 <li> <code>sgml</code> : SGML format.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
-Tags used here are defined in the PDF specification.
+Tags used here are defined in the PDF specification (see &sect;10.7 "<em>Tagged PDF</em>").
 </ul>
 <p>
 <dt> <code>-P <em>password</em></code> 
@@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2008/08/30: ASCII85 encoding filter support.
 <li> 2008/07/27: Tagged contents extraction support.
 <li> 2008/07/10: Outline (TOC) extraction support.
 <li> 2008/06/29: HTML output added. Reorganized the directory structure.
diff --git a/TODO b/TODO
index 859ffe8..f32ff7a 100644
--- a/TODO
+++ b/TODO
@@ -1,5 +1,6 @@
 TODOs:
-  - Documentation.
+  - API Documentation.
+  - Sample webapp for pdf->html.
   - Error handling for invalid type.
   - Infer text stream by clustering.
 
diff --git a/pdflib/arcfour.py b/pdflib/arcfour.py
index 188acac..a8034d6 100755
--- a/pdflib/arcfour.py
+++ b/pdflib/arcfour.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 #
-#  Arcfour implementation
+#  Arcfour implementation in Python
 #  * public domain *
 #
 
+
+##  Arcfour
+##
 class Arcfour(object):
   
   def __init__(self, key):
@@ -30,6 +33,7 @@ class Arcfour(object):
     (self.i, self.j) = (i, j)
     return r
 
+# test
 if __name__ == '__main__':
   def doit(key, data):
     cipher = Arcfour(key)
diff --git a/pdflib/ascii85.py b/pdflib/ascii85.py
new file mode 100644
index 0000000..19e9374
--- /dev/null
+++ b/pdflib/ascii85.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+#  ASCII85 decoder (Adobe version) implementation
+#  * public domain *
+#
+
+import struct
+
+# ascii85decode(data): decode ASCII85 text, stopping at the first '~'; returns the decoded string.
+def ascii85decode(data):
+  n = b = 0  # n: digits collected for the current group, b: their base-85 value so far
+  out = ''
+  for c in data:  # characters matched by no branch below (e.g. whitespace) are skipped
+    if '!' <= c and c <= 'u':
+      n += 1
+      b = b*85+(ord(c)-33)
+      if n == 5:  # a full group of five digits yields one big-endian 32-bit word
+        out += struct.pack('>L',b)
+        n = b = 0
+    elif c == 'z':  # 'z' is shorthand for four zero bytes
+      assert n == 0  # 'z' is only accepted between groups
+      out += '\0\0\0\0'
+    elif c == '~':  # first character of the end marker: flush the partial group and stop
+      if n:
+        for _ in range(5-n):
+          b = b*85+84  # pad the missing digits with the highest digit value (84, i.e. 'u')
+        out += struct.pack('>L',b)[:n-1]  # a partial group of n digits carries n-1 bytes
+      break
+  return out
+
+# self-test: decode a known sample and compare it with the expected plaintext
+# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
+if __name__ == '__main__':
+  orig = r'''
+  9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+  O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+  i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+  l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
+  >uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+  '''
+  data = \
+       'Man is distinguished, not only by his reason, but by this singular passion from '\
+       'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
+       'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
+       'any carnal pleasure.'
+  assert ascii85decode(orig) == data  # the literal's newlines and indentation are skipped by the decoder
+  print 'test succeeded'
diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py
index 9c4a39d..fe2fe29 100644
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
     return
   
   def execute(self, streams):
-    parser = PDFContentParser(streams, debug=self.debug)
+    try:
+      parser = PDFContentParser(streams, debug=self.debug)
+    except PSEOF:
+      # empty page
+      return
     while 1:
       try:
         (_,obj) = parser.nextobject()
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py
index b2b1509..4a30405 100755
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
 LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
-LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
+LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
+                         PSLiteralTable.intern('Fl'))
+LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
+                       PSLiteralTable.intern('LZW'))
+LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
+                           PSLiteralTable.intern('A85'))
 KEYWORD_R = PSKeywordTable.intern('R')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
 KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
@@ -200,16 +204,19 @@ class PDFStream(PDFObject):
     if not isinstance(filters, list):
       filters = [ filters ]
     for f in filters:
-      if f == LITERAL_FLATE_DECODE:
+      if f in LITERALS_FLATE_DECODE:
         import zlib
         # will get errors if the document is encrypted.
         data = zlib.decompress(data)
-      elif f == LITERAL_LZW_DECODE:
+      elif f in LITERALS_LZW_DECODE:
         try:
           from cStringIO import StringIO
         except ImportError:
           from StringIO import StringIO
         data = ''.join(LZWDecoder(StringIO(data)).run())
+      elif f in LITERALS_ASCII85_DECODE:
+        import ascii85
+        data = ascii85.ascii85decode(data)
       elif f == LITERAL_CRYPT:
         raise PDFEncryptionError
       else:
@@ -265,7 +272,10 @@ class PDFPage(object):
     self.rotate = self.attrs.get('Rotate', 0)
     self.annots = self.attrs.get('Annots')
     self.beads = self.attrs.get('B')
-    contents = resolve1(self.attrs['Contents'])
+    if 'Contents' in self.attrs:
+      contents = resolve1(self.attrs['Contents'])
+    else:
+      contents = []
     if not isinstance(contents, list):
       contents = [ contents ]
     self.contents = contents