From 3ed3b4cfd5c4bf5813b644f9e7c4d25170b80f32 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 30 Aug 2008 07:40:52 +0000 Subject: [PATCH] ASCII85 filter support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@48 1aa58f4a-7d42-0410-adbc-911cccaed67c --- README.html | 17 ++++++++++------ TODO | 3 ++- pdflib/arcfour.py | 6 +++++- pdflib/ascii85.py | 47 +++++++++++++++++++++++++++++++++++++++++++++ pdflib/pdfinterp.py | 6 +++++- pdflib/pdfparser.py | 20 ++++++++++++++----- 6 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 pdflib/ascii85.py diff --git a/README.html b/README.html index b3edc81..50940ea 100644 --- a/README.html +++ b/README.html @@ -14,7 +14,7 @@ Python PDF parser and analyzer
-Last Modified: Tue Jul 29 21:34:29 JST 2008 +Last Modified: Sat Aug 30 16:39:32 JST 2008
@@ -81,9 +81,12 @@ http://pdf2html.tabesugi.net:8080/
  • Do the following test:
     $ python -m tools.pdf2txt samples/simple1.pdf
    -<page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0">
    -<text font="Helvetica" direction="1" bbox="100.000,695.032,237.352,719.032" fontsize="24.000"> Hello World </text>
    -</page>
    +<html><head><meta http-equiv="Content-Type" content="text/html; charset=ascii">
    +</head><body>
    +<div style="position:absolute; top:50px;"><a name="0">Page 0</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span>
    +<span style="position:absolute; writing-mode:lr-tb; left:100px; top:122px; font-size:24px;"> Hello World </span>
    +<div style="position:absolute; top:0px;">Page: <a href="#0">0</a></div>
    +</body></html>
     
  • Done! @@ -91,7 +94,8 @@ $ python -m tools.pdf2txt samples/simple1.pdf

    For non-ASCII languages

    In order to handle non-ASCII languages (e.g. Japanese), -you need to install an additional data called CMap. +you need to install an additional data set called CMap, +which is distributed by Adobe.

    Here is how: @@ -173,7 +177,7 @@ By default, it extracts texts from all the pages.

  • sgml : SGML format.
  • tag : "Tagged PDF" format. A tagged PDF has its own contents annotated with HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. -Tags used here are defined in the PDF specification. +Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF").

    -P password @@ -241,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.

    Changes

      +
    • 2008/08/30: ASCII85 encoding filter support.
    • 2008/07/27: Tagged contents extraction support.
    • 2008/07/10: Outline (TOC) extraction support.
    • 2008/06/29: HTML output added. Reorganized the directory structure. diff --git a/TODO b/TODO index 859ffe8..f32ff7a 100644 --- a/TODO +++ b/TODO @@ -1,5 +1,6 @@ TODOs: - - Documentation. + - API Documentation. + - Sample webapp for pdf->html. - Error handling for invalid type. - Infer text stream by clustering. diff --git a/pdflib/arcfour.py b/pdflib/arcfour.py index 188acac..a8034d6 100755 --- a/pdflib/arcfour.py +++ b/pdflib/arcfour.py @@ -1,9 +1,12 @@ #!/usr/bin/env python # -# Arcfour implementation +# Arcfour implementation in Python # * public domain * # + +## Arcfour +## class Arcfour(object): def __init__(self, key): @@ -30,6 +33,7 @@ class Arcfour(object): (self.i, self.j) = (i, j) return r +# test if __name__ == '__main__': def doit(key, data): cipher = Arcfour(key) diff --git a/pdflib/ascii85.py b/pdflib/ascii85.py new file mode 100644 index 0000000..19e9374 --- /dev/null +++ b/pdflib/ascii85.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# +# ASCII85 decoder (Adobe version) implementation +# * public domain * +# + +import struct + +# ascii85decode(data) +def ascii85decode(data): + n = b = 0 + out = '' + for c in data: + if '!' 
<= c and c <= 'u': + n += 1 + b = b*85+(ord(c)-33) + if n == 5: + out += struct.pack('>L',b) + n = b = 0 + elif c == 'z': + assert n == 0 + out += '\0\0\0\0' + elif c == '~': + if n: + for _ in range(5-n): + b = b*85+84 + out += struct.pack('>L',b)[:n-1] + break + return out + +# test +# sample taken from: http://en.wikipedia.org/w/index.php?title=Ascii85 +if __name__ == '__main__': + orig = r''' + 9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!, + O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF"AGXBPCsi+DGm>@3BB/F*&OCAfu2/AKY + i(DIb:@FD,*)+C]U=@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa + l(DIduD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~> + ''' + data = \ + 'Man is distinguished, not only by his reason, but by this singular passion from '\ + 'other animals, which is a lust of the mind, that by a perseverance of delight in the '\ + 'continued and indefatigable generation of knowledge, exceeds the short vehemence of '\ + 'any carnal pleasure.' + assert ascii85decode(orig) == data + print 'test succeeded' diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 9c4a39d..fe2fe29 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -1016,7 +1016,11 @@ class PDFPageInterpreter(object): return def execute(self, streams): - parser = PDFContentParser(streams, debug=self.debug) + try: + parser = PDFContentParser(streams, debug=self.debug) + except PSEOF: + # empty page + return while 1: try: (_,obj) = parser.nextobject() diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index b2b1509..4a30405 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -35,8 +35,12 @@ LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CRYPT = PSLiteralTable.intern('Crypt') -LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') -LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode') +LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), + PSLiteralTable.intern('Fl')) +LITERALS_LZW_DECODE = 
(PSLiteralTable.intern('LZWDecode'), + PSLiteralTable.intern('LZW')) +LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), + PSLiteralTable.intern('A85')) KEYWORD_R = PSKeywordTable.intern('R') KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') @@ -200,16 +204,19 @@ class PDFStream(PDFObject): if not isinstance(filters, list): filters = [ filters ] for f in filters: - if f == LITERAL_FLATE_DECODE: + if f in LITERALS_FLATE_DECODE: import zlib # will get errors if the document is encrypted. data = zlib.decompress(data) - elif f == LITERAL_LZW_DECODE: + elif f in LITERALS_LZW_DECODE: try: from cStringIO import StringIO except ImportError: from StringIO import StringIO data = ''.join(LZWDecoder(StringIO(data)).run()) + elif f in LITERALS_ASCII85_DECODE: + import ascii85 + data = ascii85.ascii85decode(data) elif f == LITERAL_CRYPT: raise PDFEncryptionError else: @@ -265,7 +272,10 @@ class PDFPage(object): self.rotate = self.attrs.get('Rotate', 0) self.annots = self.attrs.get('Annots') self.beads = self.attrs.get('B') - contents = resolve1(self.attrs['Contents']) + if 'Contents' in self.attrs: + contents = resolve1(self.attrs['Contents']) + else: + contents = [] if not isinstance(contents, list): contents = [ contents ] self.contents = contents