ASCII85 filter support added.
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c pull/1/head
parent
79f425b164
commit
3ed3b4cfd5
17
README.html
17
README.html
|
@ -14,7 +14,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Tue Jul 29 21:34:29 JST 2008
|
||||
Last Modified: Sat Aug 30 16:39:32 JST 2008
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
|
|||
<li> Do the following test:<br>
|
||||
<blockquote><pre>
|
||||
$ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
|
||||
<page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0">
|
||||
<text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"> Hello World </text>
|
||||
</page>
|
||||
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ascii">
|
||||
</head><body>
|
||||
<div style="position:absolute; top:50px;"><a name="0">Page 0</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"> Hello World </span>
|
||||
<div style="position:absolute; top:0px;">Page: <a href="#0">0</a></div>
|
||||
</body></html>
|
||||
</pre></blockquote>
|
||||
<li> Done!
|
||||
</ol>
|
||||
|
@ -91,7 +94,8 @@ $ <strong>python -m tools.pdf2txt samples/simple1.pdf</strong>
|
|||
<p>
|
||||
<h3>For non-ASCII languages</h3>
|
||||
In order to handle non-ASCII languages (e.g. Japanese),
|
||||
you need to install an additional data called <code>CMap</code>.
|
||||
you need to install an additional data called <code>CMap</code>,
|
||||
which is distributed from Adobe.
|
||||
<p>
|
||||
Here is how:
|
||||
|
||||
|
@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.
|
|||
<li> <code>sgml</code> : SGML format.
|
||||
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
||||
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
||||
Tags used here are defined in the PDF specification.
|
||||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||
</ul>
|
||||
<p>
|
||||
<dt> <code>-P <em>password</em></code>
|
||||
|
@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2008/08/30: ASCII85 encoding filter support.
|
||||
<li> 2008/07/27: Tagged contents extraction support.
|
||||
<li> 2008/07/10: Outline (TOC) extraction support.
|
||||
<li> 2008/06/29: HTML output added. Reorganized the directory structure.
|
||||
|
|
3
TODO
3
TODO
|
@ -1,5 +1,6 @@
|
|||
TODOs:
|
||||
- Documentation.
|
||||
- API Documentation.
|
||||
- Sample webapp for pdf->html.
|
||||
- Error handling for invalid type.
|
||||
- Infer text stream by clustering.
|
||||
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Arcfour implementation
|
||||
# Arcfour implementation in Python
|
||||
# * public domain *
|
||||
#
|
||||
|
||||
|
||||
## Arcfour
|
||||
##
|
||||
class Arcfour(object):
|
||||
|
||||
def __init__(self, key):
|
||||
|
@ -30,6 +33,7 @@ class Arcfour(object):
|
|||
(self.i, self.j) = (i, j)
|
||||
return r
|
||||
|
||||
# test
|
||||
if __name__ == '__main__':
|
||||
def doit(key, data):
|
||||
cipher = Arcfour(key)
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# ASCII85 decoder (Adobe version) implementation
|
||||
# * public domain *
|
||||
#
|
||||
|
||||
import struct
|
||||
|
||||
# ascii85decode(data)
|
||||
def ascii85decode(data):
    """Decode Adobe-style ASCII85 text and return the raw bytes.

    Characters '!'..'u' are base-85 digits; every five digits produce four
    output bytes (big-endian).  A lone 'z' stands for four zero bytes, and
    '~' marks the end of the data.  Any other character (whitespace, the
    '>' of the '~>' terminator, ...) is ignored.

    data may be a byte string or a text string; the result is always a
    byte string (plain ``str`` on Python 2).

    A trailing group of fewer than five digits is padded with the highest
    digit ('u') and truncated to n-1 bytes.  This is done both when the
    '~' terminator is present and when the input simply ends, so streams
    that lack the terminator do not silently lose their last bytes.

    Raises AssertionError if 'z' appears in the middle of a group, and
    struct.error if a group encodes a value that does not fit in 32 bits.
    """
    if not isinstance(data, (bytes, bytearray)):
        # Text input: non-ASCII characters can never be ASCII85 digits or
        # markers, so dropping them is equivalent to ignoring them below.
        data = data.encode('ascii', 'ignore')

    first_digit = ord('!')
    last_digit = ord('u')
    zero_group = ord('z')
    end_marker = ord('~')

    # Collect the pieces and join once: repeated += on a string is quadratic.
    chunks = []
    n = b = 0  # n: digits seen in the current group, b: its accumulated value
    # bytearray() yields integers on both Python 2 and Python 3.
    for c in bytearray(data):
        if first_digit <= c <= last_digit:
            n += 1
            b = b * 85 + (c - first_digit)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == zero_group:
            # 'z' is only valid between groups.
            assert n == 0
            chunks.append(b'\0\0\0\0')
        elif c == end_marker:
            break

    if n:
        # Flush the partial final group: pad with the maximum digit value so
        # that the leading n-1 bytes come out right, then drop the padding.
        for _ in range(5 - n):
            b = b * 85 + 84
        chunks.append(struct.pack('>L', b)[:n - 1])
    return b''.join(chunks)
|
||||
|
||||
# test
|
||||
# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85
|
||||
if __name__ == '__main__':
    # Self-test: decode the well-known Wikipedia ASCII85 sample and compare
    # it with the original quotation.  The line breaks inside the encoded
    # text are ignored by the decoder.
    orig = r'''
9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY
i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<?3r@:F%a+D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
>uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>
'''
    # Byte-string literals: identical to plain str on Python 2, and the
    # right type to compare against packed binary output on Python 3.
    data = (
        b'Man is distinguished, not only by his reason, but by this singular passion from '
        b'other animals, which is a lust of the mind, that by a perseverance of delight in the '
        b'continued and indefatigable generation of knowledge, exceeds the short vehemence of '
        b'any carnal pleasure.'
    )
    assert ascii85decode(orig) == data
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print('test succeeded')
|
|
@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
def execute(self, streams):
|
||||
try:
|
||||
parser = PDFContentParser(streams, debug=self.debug)
|
||||
except PSEOF:
|
||||
# empty page
|
||||
return
|
||||
while 1:
|
||||
try:
|
||||
(_,obj) = parser.nextobject()
|
||||
|
|
|
@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page')
|
|||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
|
||||
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
|
||||
PSLiteralTable.intern('Fl'))
|
||||
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
|
||||
PSLiteralTable.intern('LZW'))
|
||||
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
|
||||
PSLiteralTable.intern('A85'))
|
||||
KEYWORD_R = PSKeywordTable.intern('R')
|
||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||
|
@ -200,16 +204,19 @@ class PDFStream(PDFObject):
|
|||
if not isinstance(filters, list):
|
||||
filters = [ filters ]
|
||||
for f in filters:
|
||||
if f == LITERAL_FLATE_DECODE:
|
||||
if f in LITERALS_FLATE_DECODE:
|
||||
import zlib
|
||||
# will get errors if the document is encrypted.
|
||||
data = zlib.decompress(data)
|
||||
elif f == LITERAL_LZW_DECODE:
|
||||
elif f in LITERALS_LZW_DECODE:
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
data = ''.join(LZWDecoder(StringIO(data)).run())
|
||||
elif f in LITERALS_ASCII85_DECODE:
|
||||
import ascii85
|
||||
data = ascii85.ascii85decode(data)
|
||||
elif f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError
|
||||
else:
|
||||
|
@ -265,7 +272,10 @@ class PDFPage(object):
|
|||
self.rotate = self.attrs.get('Rotate', 0)
|
||||
self.annots = self.attrs.get('Annots')
|
||||
self.beads = self.attrs.get('B')
|
||||
if 'Contents' in self.attrs:
|
||||
contents = resolve1(self.attrs['Contents'])
|
||||
else:
|
||||
contents = []
|
||||
if not isinstance(contents, list):
|
||||
contents = [ contents ]
|
||||
self.contents = contents
|
||||
|
|
Loading…
Reference in New Issue