ASCII85 filter support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-08-30 07:40:52 +00:00
parent 79f425b164
commit 3ed3b4cfd5
6 changed files with 85 additions and 14 deletions

View File

@ -14,7 +14,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Tue Jul 29 21:34:29 JST 2008 Last Modified: Sat Aug 30 16:39:32 JST 2008
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
<li> Do the following test:<br> <li> Do the following test:<br>
<blockquote><pre> <blockquote><pre>
$ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong> $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
&lt;page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"&gt; &lt;html&gt;&lt;head&gt;&lt;meta http-equiv="Content-Type" content="text/html; charset=ascii"&gt;
&lt;text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"&gt; Hello World &lt;/text&gt; &lt;/head&gt;&lt;body&gt;
&lt;/page&gt; &lt;div style="position:absolute; top:50px;"&gt;&lt;a name="0"&gt;Page 0&lt;/a&gt;&lt;/div&gt;&lt;span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"&gt; Hello World &lt;/span&gt;
&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#0"&gt;0&lt;/a&gt;&lt;/div&gt;
&lt;/body&gt;&lt;/html&gt;
</pre></blockquote> </pre></blockquote>
<li> Done! <li> Done!
</ol> </ol>
@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
<p> <p>
<h3>For non-ASCII languages</h3> <h3>For non-ASCII languages</h3>
In order to handle non-ASCII languages (e.g. Japanese), In order to handle non-ASCII languages (e.g. Japanese),
you need to install additional data called <code>CMap</code>. you need to install additional data called <code>CMap</code>,
which is distributed by Adobe.
<p> <p>
Here is how: Here is how:
@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
<li> <code>sgml</code> : SGML format. <li> <code>sgml</code> : SGML format.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
Tags used here are defined in the PDF specification. Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
</ul> </ul>
<p> <p>
<dt> <code>-P <em>password</em></code> <dt> <code>-P <em>password</em></code>
@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2008/08/30: ASCII85 encoding filter support.
<li> 2008/07/27: Tagged contents extraction support. <li> 2008/07/27: Tagged contents extraction support.
<li> 2008/07/10: Outline (TOC) extraction support. <li> 2008/07/10: Outline (TOC) extraction support.
<li> 2008/06/29: HTML output added. Reorganized the directory structure. <li> 2008/06/29: HTML output added. Reorganized the directory structure.

3
TODO
View File

@ -1,5 +1,6 @@
TODOs: TODOs:
- Documentation. - API Documentation.
- Sample webapp for pdf->html.
- Error handling for invalid type. - Error handling for invalid type.
- Infer text stream by clustering. - Infer text stream by clustering.

View File

@ -1,9 +1,12 @@
#!/usr/bin/env python #!/usr/bin/env python
# #
# Arcfour implementation # Arcfour implementation in Python
# * public domain * # * public domain *
# #
## Arcfour
##
class Arcfour(object): class Arcfour(object):
def __init__(self, key): def __init__(self, key):
@ -30,6 +33,7 @@ class Arcfour(object):
(self.i, self.j) = (i, j) (self.i, self.j) = (i, j)
return r return r
# test
if __name__ == '__main__': if __name__ == '__main__':
def doit(key, data): def doit(key, data):
cipher = Arcfour(key) cipher = Arcfour(key)

47
pdflib/ascii85.py Normal file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
#
# ASCII85 decoder (Adobe version) implementation
# * public domain *
#
import struct
# ascii85decode(data)
def ascii85decode(data):
    """Decode Adobe-flavoured ASCII85 (base-85) text into raw bytes.

    Characters '!'..'u' are base-85 digits; every five digits form one
    big-endian 32-bit word, i.e. four output bytes.  'z' stands for four
    zero bytes, '~' (the first character of the '~>' end-of-data marker)
    stops decoding, and every other character (whitespace, line breaks,
    ...) is skipped.

    A trailing group of n < 5 digits is padded with 'u' and contributes
    n-1 bytes.  This happens at the '~' marker and also when the input
    simply ends without one, so truncated data does not lose its tail.

    Args:
        data: the encoded data, as a byte string or a text string.

    Returns:
        The decoded data as a byte string (``str`` on Python 2, ``bytes``
        on Python 3).

    Raises:
        ValueError: if 'z' appears in the middle of a five-digit group.
        struct.error: may be raised by struct.pack when a group encodes a
            value that does not fit in 32 bits.
    """
    # Work on integer byte values so the same code runs on Python 2 and 3
    # (iterating bytes yields str on 2 but int on 3; bytearray always
    # yields int).
    if not isinstance(data, (bytes, bytearray)):
        # Text input: non-ASCII characters can never be ASCII85 symbols and
        # would be skipped anyway, so dropping them here loses nothing.
        data = data.encode('ascii', 'ignore')

    chunks = []     # decoded pieces, joined once at the end (avoids O(n^2))
    n = b = 0       # digits collected so far / their accumulated value
    for code in bytearray(data):
        if 33 <= code <= 117:           # '!' .. 'u': one base-85 digit
            n += 1
            b = b * 85 + (code - 33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif code == 122:               # 'z': shorthand for four zero bytes
            if n:
                # Explicit check instead of assert: it must survive -O.
                raise ValueError("'z' inside an ASCII85 group")
            chunks.append(b'\0\0\0\0')
        elif code == 126:               # '~': end-of-data marker begins
            break
        # Anything else is ignored.

    if n:
        # Incomplete final group: pad with the highest digit ('u' == 84)
        # and keep only the n-1 bytes that carry real data.
        for _ in range(5 - n):
            b = b * 85 + 84
        chunks.append(struct.pack('>L', b)[:n - 1])
    return b''.join(chunks)
# test
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
if __name__ == '__main__':
    # Self-test: decode the sample and compare it with the known plain text.
    # The encoded text is a raw string because it contains a backslash; the
    # surrounding line breaks are skipped by the decoder.
    orig = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
'''
    # Byte-string literals: the decoder yields binary data, and on Python 2
    # b'...' is the same type as a plain '...' literal.
    data = (
        b'Man is distinguished, not only by his reason, but by this singular passion from '
        b'other animals, which is a lust of the mind, that by a perseverance of delight in the '
        b'continued and indefatigable generation of knowledge, exceeds the short vehemence of '
        b'any carnal pleasure.'
    )
    assert ascii85decode(orig) == data
    # Parenthesised single argument prints identically on Python 2 and 3.
    print('test succeeded')

View File

@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
return return
def execute(self, streams): def execute(self, streams):
try:
parser = PDFContentParser(streams, debug=self.debug) parser = PDFContentParser(streams, debug=self.debug)
except PSEOF:
# empty page
return
while 1: while 1:
try: try:
(_,obj) = parser.nextobject() (_,obj) = parser.nextobject()

View File

@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode') PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
PSLiteralTable.intern('A85'))
KEYWORD_R = PSKeywordTable.intern('R') KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
@ -200,16 +204,19 @@ class PDFStream(PDFObject):
if not isinstance(filters, list): if not isinstance(filters, list):
filters = [ filters ] filters = [ filters ]
for f in filters: for f in filters:
if f == LITERAL_FLATE_DECODE: if f in LITERALS_FLATE_DECODE:
import zlib import zlib
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
data = zlib.decompress(data) data = zlib.decompress(data)
elif f == LITERAL_LZW_DECODE: elif f in LITERALS_LZW_DECODE:
try: try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
data = ''.join(LZWDecoder(StringIO(data)).run()) data = ''.join(LZWDecoder(StringIO(data)).run())
elif f in LITERALS_ASCII85_DECODE:
import ascii85
data = ascii85.ascii85decode(data)
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
raise PDFEncryptionError raise PDFEncryptionError
else: else:
@ -265,7 +272,10 @@ class PDFPage(object):
self.rotate = self.attrs.get('Rotate', 0) self.rotate = self.attrs.get('Rotate', 0)
self.annots = self.attrs.get('Annots') self.annots = self.attrs.get('Annots')
self.beads = self.attrs.get('B') self.beads = self.attrs.get('B')
if 'Contents' in self.attrs:
contents = resolve1(self.attrs['Contents']) contents = resolve1(self.attrs['Contents'])
else:
contents = []
if not isinstance(contents, list): if not isinstance(contents, list):
contents = [ contents ] contents = [ contents ]
self.contents = contents self.contents = contents