ASCII85 filter support added.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent 79f425b164
commit 3ed3b4cfd5
README.html (17 changed lines)

@@ -14,7 +14,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Tue Jul 29 21:34:29 JST 2008
+Last Modified: Sat Aug 30 16:39:32 JST 2008
 <!-- hhmts end -->
 </div>

@@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
 <li> Do the following test:<br>
 <blockquote><pre>
 $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
-<page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0">
-<text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"> Hello World </text>
-</page>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=ascii">
+</head><body>
+<div style="position:absolute; top:50px;"><a name="0">Page 0</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span>
+<span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"> Hello World </span>
+<div style="position:absolute; top:0px;">Page: <a href="#0">0</a></div>
+</body></html>
 </pre></blockquote>
 <li> Done!
 </ol>
@@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
 <p>
 <h3>For non-ASCII languages</h3>
 In order to handle non-ASCII languages (e.g. Japanese),
-you need to install an additional data called <code>CMap</code>.
+you need to install an additional data called <code>CMap</code>,
+which is distributed from Adobe.
 <p>
 Here is how:

@@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
 <li> <code>sgml</code> : SGML format.
 <li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
 HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
-Tags used here are defined in the PDF specification.
+Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
 </ul>
 <p>
 <dt> <code>-P <em>password</em></code>
@@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2008/08/30: ASCII85 encoding filter support.
 <li> 2008/07/27: Tagged contents extraction support.
 <li> 2008/07/10: Outline (TOC) extraction support.
 <li> 2008/06/29: HTML output added. Reorganized the directory structure.

TODO (3 changed lines)

@@ -1,5 +1,6 @@
 TODOs:
-- Documentation.
+- API Documentation.
+- Sample webapp for pdf->html.
 - Error handling for invalid type.
 - Infer text stream by clustering.

@@ -1,9 +1,12 @@
 #!/usr/bin/env python
 #
-# Arcfour implementation
+# Arcfour implementation in Python
 # * public domain *
 #

+
+## Arcfour
+##
 class Arcfour(object):

   def __init__(self, key):

@@ -30,6 +33,7 @@ class Arcfour(object):
     (self.i, self.j) = (i, j)
     return r

+# test
 if __name__ == '__main__':
   def doit(key, data):
     cipher = Arcfour(key)
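For readers unfamiliar with the cipher that the Arcfour module (presumably arcfour.py) implements: below is a minimal standalone RC4 sketch in the same Python 2 style. The all-in-one function name rc4 is illustrative only and is not the module's API.

# Minimal standalone RC4 sketch (illustrative, not the module's API).
def rc4(key, data):
  # key-scheduling: build a key-dependent permutation of 0..255
  s = range(256)
  j = 0
  for i in range(256):
    j = (j + s[i] + ord(key[i % len(key)])) % 256
    (s[i], s[j]) = (s[j], s[i])
  # keystream generation: swap-and-lookup, XORed into the data
  (i, j) = (0, 0)
  out = ''
  for c in data:
    i = (i + 1) % 256
    j = (j + s[i]) % 256
    (s[i], s[j]) = (s[j], s[i])
    out += chr(ord(c) ^ s[(s[i] + s[j]) % 256])
  return out

# RC4 is symmetric: encrypting twice with the same key round-trips.
assert rc4('Key', rc4('Key', 'Plaintext')) == 'Plaintext'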
ascii85.py (new file)

@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+# ASCII85 decoder (Adobe version) implementation
+# * public domain *
+#
+
+import struct
+
+# ascii85decode(data)
+def ascii85decode(data):
+  n = b = 0
+  out = ''
+  for c in data:
+    if '!' <= c and c <= 'u':
+      n += 1
+      b = b*85+(ord(c)-33)
+      if n == 5:
+        out += struct.pack('>L',b)
+        n = b = 0
+    elif c == 'z':
+      assert n == 0
+      out += '\0\0\0\0'
+    elif c == '~':
+      if n:
+        for _ in range(5-n):
+          b = b*85+84
+        out += struct.pack('>L',b)[:n-1]
+      break
+  return out
+
+# test
+# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
+if __name__ == '__main__':
+  orig = r'''
+9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
+O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
+i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
+l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
+>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
+'''
+  data = \
+    'Man is distinguished, not only by his reason, but by this singular passion from '\
+    'other animals, which is a lust of the mind, that by a perseverance of delight in the '\
+    'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\
+    'any carnal pleasure.'
+  assert ascii85decode(orig) == data
+  print 'test succeeded'
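A quick sanity check of the new decoder beyond the Wikipedia sample built into the module. The inputs below were worked out by hand from the Ascii85 scheme (5 characters encode 4 bytes; a trailing partial group of n+1 characters encodes n bytes; 'z' abbreviates four zero bytes; '~>' terminates):

>>> from ascii85 import ascii85decode
>>> ascii85decode('87cUR~>')        # one full group: 4 bytes
'Hell'
>>> ascii85decode('87cURDZ~>')      # full group + 2-char partial group: 5 bytes
'Hello'
>>> ascii85decode('z~>')            # 'z' is shorthand for four zero bytes
'\x00\x00\x00\x00'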
@@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
     return

   def execute(self, streams):
-    parser = PDFContentParser(streams, debug=self.debug)
+    try:
+      parser = PDFContentParser(streams, debug=self.debug)
+    except PSEOF:
+      # empty page
+      return
     while 1:
       try:
         (_,obj) = parser.nextobject()
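The hunk above makes execute() tolerate pages whose content stream is missing or truncated: PDFContentParser evidently raises PSEOF when there is nothing to read, and that is now treated as an empty page rather than an error. A toy model of the pattern (all names below are stand-ins, not pdfminer's API):

class PSEOF(Exception):
  pass

def make_parser(streams):
  if not streams:
    raise PSEOF('premature end of content stream')
  return iter(streams)

def execute(streams):
  try:
    parser = make_parser(streams)
  except PSEOF:
    return []                     # empty page: nothing to interpret
  return list(parser)

assert execute([]) == []
assert execute(['BT', 'ET']) == ['BT', 'ET']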
@@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
 LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
-LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
+LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
+                         PSLiteralTable.intern('Fl'))
+LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
+                       PSLiteralTable.intern('LZW'))
+LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
+                           PSLiteralTable.intern('A85'))
 KEYWORD_R = PSKeywordTable.intern('R')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
 KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
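The rename from LITERAL_* to LITERALS_* reflects that each filter may now be named by either its full form or its abbreviation; the PDF specification defines short aliases (e.g. Fl, LZW, A85) for use in inline image dictionaries. Dispatch therefore becomes a membership test. A toy illustration, with plain strings standing in for interned PSLiteral objects:

LITERALS_ASCII85_DECODE = ('ASCII85Decode', 'A85')

for name in ('ASCII85Decode', 'A85'):
  assert name in LITERALS_ASCII85_DECODE   # both spellings select the same decoder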
@@ -200,16 +204,19 @@ class PDFStream(PDFObject):
     if not isinstance(filters, list):
       filters = [ filters ]
     for f in filters:
-      if f == LITERAL_FLATE_DECODE:
+      if f in LITERALS_FLATE_DECODE:
         import zlib
         # will get errors if the document is encrypted.
         data = zlib.decompress(data)
-      elif f == LITERAL_LZW_DECODE:
+      elif f in LITERALS_LZW_DECODE:
         try:
           from cStringIO import StringIO
         except ImportError:
           from StringIO import StringIO
         data = ''.join(LZWDecoder(StringIO(data)).run())
+      elif f in LITERALS_ASCII85_DECODE:
+        import ascii85
+        data = ascii85.ascii85decode(data)
       elif f == LITERAL_CRYPT:
         raise PDFEncryptionError
       else:
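Putting the pieces together: stream filters apply in the order listed in /Filter, each stage consuming the previous stage's output. A self-contained sketch of the same dispatch loop, assuming the new ascii85 module is importable and using plain strings instead of PSLiteral objects; only the Flate path is exercised end to end here:

import zlib
from ascii85 import ascii85decode   # the module added in this commit

def apply_filters(data, filters):
  # apply each declared filter in order
  for f in filters:
    if f in ('FlateDecode', 'Fl'):
      data = zlib.decompress(data)
    elif f in ('ASCII85Decode', 'A85'):
      data = ascii85decode(data)
    else:
      raise ValueError('unsupported filter: %r' % f)
  return data

# e.g. a stream declared as << /Filter [ /A85 /Fl ] >> would be
# ASCII85-decoded first, then inflated.
assert apply_filters('Hello World', []) == 'Hello World'
assert apply_filters(zlib.compress('Hello World'), ['FlateDecode']) == 'Hello World'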
@@ -265,7 +272,10 @@ class PDFPage(object):
     self.rotate = self.attrs.get('Rotate', 0)
     self.annots = self.attrs.get('Annots')
     self.beads = self.attrs.get('B')
-    contents = resolve1(self.attrs['Contents'])
+    if 'Contents' in self.attrs:
+      contents = resolve1(self.attrs['Contents'])
+    else:
+      contents = []
     if not isinstance(contents, list):
       contents = [ contents ]
     self.contents = contents
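The last hunk makes a page's Contents entry optional, since an empty page may omit it entirely. The normalization it performs, isolated as a toy function (get_contents is a hypothetical name; indirect-reference resolution via resolve1 is omitted):

def get_contents(attrs):
  # Contents may be absent, a single stream, or a list of streams;
  # normalize all three cases to a list.
  contents = attrs.get('Contents', [])
  if not isinstance(contents, list):
    contents = [ contents ]
  return contents

assert get_contents({}) == []
assert get_contents({'Contents': 'stream'}) == ['stream']
assert get_contents({'Contents': ['s1', 's2']}) == ['s1', 's2']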