diff --git a/README.html b/README.html
index b3edc81..50940ea 100644
--- a/README.html
+++ b/README.html
@@ -14,7 +14,7 @@ Python PDF parser and analyzer
-Last Modified: Tue Jul 29 21:34:29 JST 2008
+Last Modified: Sat Aug 30 16:39:32 JST 2008
@@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
Do the following test:
$ python -m tools.pdf2txt samples/simple1.pdf
-<page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0">
-<text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"> Hello World </text>
-</page>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=ascii">
+</head><body>
+<div style="position:absolute; top:50px;"><a name="0">Page 0</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span>
+<span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"> Hello World </span>
+<div style="position:absolute; top:0px;">Page: <a href="#0">0</a></div>
+</body></html>
Done!
@@ -91,7 +94,8 @@ $ python -m tools.pdf2txt samples/simple1.pdf
For non-ASCII languages
In order to handle non-ASCII languages (e.g. Japanese),
-you need to install an additional data called CMap
.
+you need to install additional data called CMap
,
+which is distributed by Adobe.
Here is how:
@@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
sgml
: SGML format.
tag
: "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
-Tags used here are defined in the PDF specification.
+Tags used here are defined in the PDF specification (see §10.7 "Tagged PDF").
-P password
@@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+- 2008/08/30: ASCII85 encoding filter support.
- 2008/07/27: Tagged contents extraction support.
- 2008/07/10: Outline (TOC) extraction support.
- 2008/06/29: HTML output added. Reorganized the directory structure.
diff --git a/TODO b/TODO
index 859ffe8..f32ff7a 100644
--- a/TODO
+++ b/TODO
@@ -1,5 +1,6 @@
TODOs:
- - Documentation.
+ - API Documentation.
+ - Sample webapp for pdf->html.
- Error handling for invalid type.
- Infer text stream by clustering.
diff --git a/pdflib/arcfour.py b/pdflib/arcfour.py
index 188acac..a8034d6 100755
--- a/pdflib/arcfour.py
+++ b/pdflib/arcfour.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
#
-# Arcfour implementation
+# Arcfour implementation in Python
# * public domain *
#
+
+## Arcfour
+##
class Arcfour(object):
def __init__(self, key):
@@ -30,6 +33,7 @@ class Arcfour(object):
(self.i, self.j) = (i, j)
return r
+# test
if __name__ == '__main__':
def doit(key, data):
cipher = Arcfour(key)
diff --git a/pdflib/ascii85.py b/pdflib/ascii85.py
new file mode 100644
index 0000000..19e9374
--- /dev/null
+++ b/pdflib/ascii85.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+# ASCII85 decoder (Adobe version) implementation
+# * public domain *
+#
+
+import struct
+
+# ascii85decode(data): decode Adobe-style ASCII85 text, stopping at the first '~', and return the decoded bytes as a string
+def ascii85decode(data):
+ n = b = 0  # n: digits read in the current group, b: their base-85 value
+ out = ''
+ for c in data:
+ if '!' <= c and c <= 'u':  # base-85 digit; characters matching no branch (e.g. whitespace) are skipped
+ n += 1
+ b = b*85+(ord(c)-33)
+ if n == 5:  # five digits make one 32-bit big-endian word (4 bytes)
+ out += struct.pack('>L',b)
+ n = b = 0
+ elif c == 'z':  # shorthand for four zero bytes
+ assert n == 0  # 'z' is only accepted between groups
+ out += '\0\0\0\0'
+ elif c == '~':  # end-of-data marker
+ if n:
+ for _ in range(5-n):  # pad the partial group with the largest digit value (84, i.e. 'u')
+ b = b*85+84
+ out += struct.pack('>L',b)[:n-1]  # n digits carry only n-1 bytes
+ break
+ return out  # NOTE: if no '~' is seen, digits of an unfinished group are dropped
+
+# test
+# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
+if __name__ == '__main__':
+    # Self-test: decode the Ascii85 sample and compare it with the known
+    # plaintext.  The sample must be kept verbatim: sequences such as
+    # "<GL>" or "<G:8+EV:.+Cf>" are ordinary base-85 digits, not markup,
+    # and the raw string keeps the backslash in "Df-\0Ec5e" literal.
+    # Leading spaces and newlines inside the sample are harmless because
+    # ascii85decode() skips characters outside the '!'..'u' range.
+    orig = r'''
+    9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+    O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+    i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+    l(DIduD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+    '''
+    # Expected plaintext of the sample above.
+    data = \
+        'Man is distinguished, not only by his reason, but by this singular passion from '\
+        'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
+        'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
+        'any carnal pleasure.'
+    assert ascii85decode(orig) == data
+    print 'test succeeded'
diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py
index 9c4a39d..fe2fe29 100644
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
return
def execute(self, streams):
- parser = PDFContentParser(streams, debug=self.debug)
+ try:
+ parser = PDFContentParser(streams, debug=self.debug)
+ except PSEOF:
+ # empty page
+ return
while 1:
try:
(_,obj) = parser.nextobject()
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py
index b2b1509..4a30405 100755
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
-LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
+LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),  # filter names handled with zlib.decompress
+ PSLiteralTable.intern('Fl'))  # NOTE(review): presumably the abbreviated filter name - confirm against the PDF spec
+LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),  # filter names handled with LZWDecoder
+ PSLiteralTable.intern('LZW'))  # NOTE(review): presumably the abbreviated filter name - confirm against the PDF spec
+LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),  # filter names handled with ascii85.ascii85decode
+ PSLiteralTable.intern('A85'))  # NOTE(review): presumably the abbreviated filter name - confirm against the PDF spec
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
@@ -200,16 +204,19 @@ class PDFStream(PDFObject):
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
- if f == LITERAL_FLATE_DECODE:
+ if f in LITERALS_FLATE_DECODE:
import zlib
# will get errors if the document is encrypted.
data = zlib.decompress(data)
- elif f == LITERAL_LZW_DECODE:
+ elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
+ elif f in LITERALS_ASCII85_DECODE:
+ import ascii85
+ data = ascii85.ascii85decode(data)
elif f == LITERAL_CRYPT:
raise PDFEncryptionError
else:
@@ -265,7 +272,10 @@ class PDFPage(object):
self.rotate = self.attrs.get('Rotate', 0)
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
- contents = resolve1(self.attrs['Contents'])
+ if 'Contents' in self.attrs:
+ contents = resolve1(self.attrs['Contents'])
+ else:
+ contents = []
if not isinstance(contents, list):
contents = [ contents ]
self.contents = contents