ASCII85 filter support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-08-30 07:40:52 +00:00
parent 79f425b164
commit 3ed3b4cfd5
6 changed files with 85 additions and 14 deletions

View File

@ -14,7 +14,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Tue Jul 29 21:34:29 JST 2008
Last Modified: Sat Aug 30 16:39:32 JST 2008
<!-- hhmts end -->
</div>
@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
<li> Do the following test:<br>
<blockquote><pre>
$ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
&lt;page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"&gt;
&lt;text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"&gt; Hello World &lt;/text&gt;
&lt;/page&gt;
&lt;html&gt;&lt;head&gt;&lt;meta http-equiv="Content-Type" content="text/html; charset=ascii"&gt;
&lt;/head&gt;&lt;body&gt;
&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="0"&gt;Page 0&lt;/a&gt;&lt;/div&gt;&lt;span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"&gt; Hello World &lt;/span&gt;
&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#0"&gt;0&lt;/a&gt;&lt;/div&gt;
&lt;/body&gt;&lt;/html&gt;
</pre></blockquote>
<li> Done!
</ol>
@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
<p>
<h3>For non-ASCII languages</h3>
In order to handle non-ASCII languages (e.g. Japanese),
you need to install an additional data called <code>CMap</code>.
you need to install additional data called <code>CMap</code>,
which is distributed by Adobe.
<p>
Here is how:
@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
<li> <code>sgml</code> : SGML format.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification.
Tags used here are defined in the PDF specification (see &sect;10.7, "<em>Tagged PDF</em>").
</ul>
<p>
<dt> <code>-P <em>password</em></code>
@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2008/08/30: ASCII85 encoding filter support.
<li> 2008/07/27: Tagged contents extraction support.
<li> 2008/07/10: Outline (TOC) extraction support.
<li> 2008/06/29: HTML output added. Reorganized the directory structure.

3
TODO
View File

@ -1,5 +1,6 @@
TODOs:
- Documentation.
- API Documentation.
- Sample webapp for pdf->html.
- Error handling for invalid type.
- Infer text stream by clustering.

View File

@ -1,9 +1,12 @@
#!/usr/bin/env python
#
# Arcfour implementation
# Arcfour implementation in Python
# * public domain *
#
## Arcfour
##
class Arcfour(object):
def __init__(self, key):
@ -30,6 +33,7 @@ class Arcfour(object):
(self.i, self.j) = (i, j)
return r
# test
if __name__ == '__main__':
def doit(key, data):
cipher = Arcfour(key)

47
pdflib/ascii85.py Normal file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
#
# ASCII85 decoder (Adobe version) implementation
# * public domain *
#
import struct
# ascii85decode(data)
def ascii85decode(data):
    """Decode Adobe-style ASCII85 data into a raw byte string.

    Characters '!' through 'u' are base-85 digits; every five digits
    yield four output bytes.  'z' stands for four zero bytes and '~'
    (the first character of the '~>' end-of-data marker) stops the
    decoding.  Any other character, such as whitespace, is skipped.

    A final group of n digits (n < 5) is padded with 'u' digits and
    yields n-1 bytes.  The same is done when the input simply ends
    without a '~>' marker, so a truncated stream keeps its last bytes.

    data may be a string or any sequence yielding one-character
    strings or integer byte values.

    Raises ValueError if 'z' appears in the middle of a group.
    """
    chunks = []
    n = b = 0
    for c in data:
        if isinstance(c, int):
            # Iterating a bytes object can yield integers rather than
            # one-character strings.
            c = chr(c)
        if '!' <= c <= 'u':
            n += 1
            b = b*85+(ord(c)-33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 'z':
            if n:
                raise ValueError("'z' inside an ASCII85 group")
            chunks.append(struct.pack('>L', 0))
        elif c == '~':
            break
    if n:
        # Pad the short final group with the highest digit ('u' == 84)
        # and keep only the bytes that were really encoded.
        for _ in range(5-n):
            b = b*85+84
        chunks.append(struct.pack('>L', b)[:n-1])
    # Join with struct's own empty string so that the result has the
    # same string type as the packed chunks.
    return struct.pack('').join(chunks)
# test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__':
    # Self-test: decode the sample and compare it with the known plain
    # text.  The raw literal keeps its line breaks; the decoder skips
    # every character that is not part of the encoding.
    encoded = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
'''
    expected = (
        'Man is distinguished, not only by his reason, but by this singular passion from '
        'other animals, which is a lust of the mind, that by a perseverance of delight in the '
        'continued and indefatigable generation of knowledge, exceeds the short vehemence of '
        'any carnal pleasure.'
    )
    assert ascii85decode(encoded) == expected
    print('test succeeded')

View File

@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
return
def execute(self, streams):
parser = PDFContentParser(streams, debug=self.debug)
try:
parser = PDFContentParser(streams, debug=self.debug)
except PSEOF:
# empty page
return
while 1:
try:
(_,obj) = parser.nextobject()

View File

@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
PSLiteralTable.intern('A85'))
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
@ -200,16 +204,19 @@ class PDFStream(PDFObject):
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
if f == LITERAL_FLATE_DECODE:
if f in LITERALS_FLATE_DECODE:
import zlib
# will get errors if the document is encrypted.
data = zlib.decompress(data)
elif f == LITERAL_LZW_DECODE:
elif f in LITERALS_LZW_DECODE:
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
import ascii85
data = ascii85.ascii85decode(data)
elif f == LITERAL_CRYPT:
raise PDFEncryptionError
else:
@ -265,7 +272,10 @@ class PDFPage(object):
self.rotate = self.attrs.get('Rotate', 0)
self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B')
contents = resolve1(self.attrs['Contents'])
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents'])
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
self.contents = contents