diff --git a/TODO b/TODO new file mode 100644 index 0000000..7658243 --- /dev/null +++ b/TODO @@ -0,0 +1,9 @@ +TODO: + - Code Documentation. + - Error handling for invalid type. + + - Outlines. + - Named Objects. (pages) + - Writers. + - Linearized PDF. + - Encryption? diff --git a/arcfour.py b/arcfour.py new file mode 100755 index 0000000..c9c13a8 --- /dev/null +++ b/arcfour.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# +# Arcfour implementation +# * public domain * +# + +class Arcfour: + + def __init__(self, key): + s = range(256) + j = 0 + klen = len(key) + for i in xrange(256): + j = (j + s[i] + ord(key[i % klen])) % 256 + (s[i], s[j]) = (s[j], s[i]) + self.s = s + (self.i, self.j) = (0, 0) + return + + def process(self, data): + (i, j) = (self.i, self.j) + s = self.s + r = '' + for c in data: + i = (i+1) % 256 + j = (j+s[i]) % 256 + (s[i], s[j]) = (s[j], s[i]) + k = s[(s[i]+s[j]) % 256] + r += chr(ord(c) ^ k) + (self.i, self.j) = (i, j) + return r + +if __name__ == '__main__': + def doit(key, data): + cipher = Arcfour(key) + return ''.join( '%02X' % ord(c) for c in cipher.process(data) ) + assert doit("Key", "Plaintext") == 'BBF316E8D940AF0AD3' + assert doit("Wiki", "pedia") == '1021BF0420' + assert doit("Secret", "Attack at dawn") == '45A01F645FC35B383552544B9BF5' + print 'test succeeded' diff --git a/extent.py b/extent.py index 67e005d..74d08b7 100755 --- a/extent.py +++ b/extent.py @@ -20,6 +20,9 @@ class Rect: self.y1 = y0+h return + def __repr__(self): + return '' % (self.x0, self.y0, self.x1, self.y1) + def overlap(self, rect): return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or rect.y1 <= self.y0 or self.y1 <= rect.y0) @@ -31,7 +34,7 @@ class ExtSet: def __init__(self, gridsize): self.gridsize = gridsize - self.grid = [] + self.grid = {} return def cells(self, x0, x1): @@ -45,13 +48,19 @@ class ExtSet: def add(self, x0, x1, obj): for i in self.cells(x0, x1): - self.grid[i].append(obj) + if i not in self.grid: + a = [] + self.grid[i] = a + else: + a = self.grid[i] + a.append(obj) return def get(self, x0, x1): objs = set() for i in self.cells(x0, x1): - objs.update(self.grid[i]) + if i in self.grid: + objs.update(self.grid[i]) return objs def test_extset(): @@ -78,12 +87,13 @@ class ExtGrid: self.vext = ExtSet(gridsize) return - def add(self, rect): - self.hext.add(rect.x0, rect.x1, rect) - self.vext.add(rect.y0, rect.y1, rect) + def add(self, rect, obj): + self.hext.add(rect.x0, rect.x1, obj) + self.vext.add(rect.y0, rect.y1, obj) return - def get(self, rect): - rects = self.hext.get(rect.x0, rect.x1) - rects.update_intersect(self.vext.get(rect.y0, rect.y1)) - return rects + def get(self, rect, getrect): + objs = self.hext.get(rect.x0, rect.x1) + objs.intersection_update(self.vext.get(rect.y0, rect.y1)) + objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ] + return objs diff --git a/pdf2txt.py b/pdf2txt.py index 9180067..a6bd115 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -7,86 +7,183 @@ from pdfinterp import PDFDevice, PDFResourceManager, \ PDFPageInterpreter, PDFUnicodeNotDefined, \ mult_matrix, apply_matrix from cmap import CMapDB +from extent import Rect, ExtSet, ExtGrid + + +## PageItem +## +class PageItem: + + GRID_SIZE = 20 + + def __init__(self, id, (x0,y0,x1,y1), rotate=0): + self.id = id + self.bbox = Rect(x0, y0, x1-x0, y1-y0) + self.rotate = rotate + self.grid = ExtGrid(self.GRID_SIZE) + self.objs = [] + return + + def __repr__(self): + bbox = self.bbox + return ('' % + (self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate)) + + def add(self, obj): + self.objs.append(obj) + self.grid.add(obj.bbox, obj) + return + + def dump(self, outfp, codec): + outfp.write(repr(self)+'\n') + for obj in self.objs: + obj.dump(outfp, codec) + outfp.write('\n') + return + + def fuse(self): + for obj1 in self.objs: + f = (lambda obj: obj.bbox) + for rect in obj1.search_range(): + neighbors = [ obj2 for obj2 in self.grid.get(rect, f) if obj2 is not obj1 ] + #print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ] + return + + +## FigureItem +## +class FigureItem(PageItem): + + def __repr__(self): + bbox = self.bbox + return ('
' % + (self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1)) + + def dump(self, outfp, codec): + outfp.write(repr(self)+'\n') + for obj in self.objs: + obj.dump(outfp, codec) + outfp.write('
\n') + return + + def search_range(self): + return [] + + +## TextItem +## +class TextItem: + + def __init__(self, matrix, font, size, width, text): + self.matrix = matrix + self.font = font + (a,b,c,d,tx,ty) = self.matrix + (self.width, self.size) = apply_matrix((a,b,c,d,0,0), (width,size)) + self.width = abs(self.width) + self.origin = (tx,ty) + self.direction = 0 + if not self.font.is_vertical(): + self.direction = 1 + (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001)) + (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001)) + self.bbox = Rect(tx, ty+descent, self.width, self.size) + else: + self.direction = 2 + mindisp = min( d for (d,_) in text ) + (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0)) + self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width) + self.text = ''.join( c for (_,c) in text ) + return + + def __repr__(self): + return ('' % + (self.matrix, self.font, self.size, self.width, self.text)) + + def dump(self, outfp, codec): + (a,b,c,d,tx,ty) = self.matrix + outfp.write('' % + (tx, ty, self.font.fontname, self.size, self.width)) + outfp.write(self.text.encode(codec, 'xmlcharrefreplace')) + outfp.write('\n') + return + + def search_range(self): + if self.direction == 1: + return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ] + else: + return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ] ## TextConverter ## class TextConverter(PDFDevice): - def __init__(self, outfp, rsrc, codec, debug=0): + def __init__(self, rsrc, debug=0): PDFDevice.__init__(self, rsrc, debug=debug) - self.outfp = outfp - self.codec = codec - return - - def close(self): - self.outfp.write('\n') + self.pages = [] + self.stack = [] return def begin_page(self, page): - (x0,y0,x1,y1) = page.mediabox - self.outfp.write('' % - (page.pageid, x0,y0,x1,y1, page.rotate)) + self.context = PageItem(str(page.pageid), page.mediabox, page.rotate) return def end_page(self, _): - self.outfp.write('\n') + assert not self.stack + self.pages.append(self.context) return def begin_figure(self, name, bbox): - (x0,y0,x1,y1) = bbox - self.outfp.write('
\n' % - (name, x0,y0,x1,y1)) + self.stack.append(self.context) + self.context = FigureItem(name, bbox) return def end_figure(self, _): - self.outfp.write('
\n') + fig = self.context + self.context = self.stack.pop() + self.context.add(fig) return def handle_undefined_char(self, cidcoding, cid): if self.debug: print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) #return unichr(cid) - #return unichr(cid+32) - return + return None def render_string(self, textstate, textmatrix, size, seq): font = textstate.font spwidth = int(-font.char_width(32) * 0.6) # space width - buf = '' + text = [] for x in seq: if isinstance(x, int) or isinstance(x, float): if not font.is_vertical() and x <= spwidth: - buf += ' ' + text.append((0, ' ')) else: chars = font.decode(x) for cid in chars: try: char = font.to_unicode(cid) - buf += char + text.append((font.char_disp(cid), char)) except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args s = self.handle_undefined_char(cidcoding, cid) if s: - buf += s - (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm) - if font.is_vertical(): - size = -size - tag = 'vtext' - else: - tag = 'htext' - if (b != 0 or c != 0 or a <= 0 or d <= 0): - tag += ' skewed' - s = buf.encode(self.codec, 'xmlcharrefreplace') - (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize)) - def f(x): return '%.03f' % x - self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s\n' % - (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag)) + text.append(s) + item = TextItem(mult_matrix(textmatrix, self.ctm), + font, textstate.fontsize, size, text) + self.context.add(item) + return + + def dump(self, outfp, codec): + outfp.write('\n') + for page in self.pages: + #page.fuse() + page.dump(outfp, codec) + outfp.write('\n') return # pdf2txt def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): - device = TextConverter(outfp, rsrc, codec, debug=debug) - outfp.write('\n') + device = TextConverter(rsrc, debug=debug) doc = PDFDocument(debug=debug) fp = file(fname) parser = PDFParser(doc, fp, debug=debug) @@ -95,7 +192,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): if pages and (i not in pages): continue interpreter.process_page(page) fp.close() - outfp.write('\n') + device.dump(outfp, codec) device.close() return diff --git a/pdfparser.py b/pdfparser.py index e0c9df0..308db76 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -4,30 +4,30 @@ # ver 0.1, Dec 24 2004- # ver 0.2, Dec 24 2007 -# TODO: -# - Code Documentation. -# - Error handling for invalid type. - -# - Outlines. -# - Named Objects. (pages) -# - Writers. -# - Linearized PDF. -# - Encryption? - import sys +import md5, struct stderr = sys.stderr from utils import choplist, nunpack +from arcfour import Arcfour from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, \ PSStackParser, STRICT +def decrypt_rc4(key, objid, genno, data): + key += struct.pack('>stderr, 'register: objid=%r: %r' % (objid, obj) self.objs[objid] = obj + if self.decipher: + obj = decipher_all(self.decipher, objid, genno, obj) return obj def get_pages(self, debug=0): diff --git a/samples/dmca.pdf b/samples/dmca.pdf new file mode 100644 index 0000000..90d1522 Binary files /dev/null and b/samples/dmca.pdf differ diff --git a/samples/f1040nr.pdf b/samples/f1040nr.pdf new file mode 100644 index 0000000..2c0a6d0 Binary files /dev/null and b/samples/f1040nr.pdf differ diff --git a/samples/i1040nr.pdf b/samples/i1040nr.pdf new file mode 100644 index 0000000..7f9621e Binary files /dev/null and b/samples/i1040nr.pdf differ diff --git a/samples/kampo.pdf b/samples/kampo.pdf new file mode 100644 index 0000000..b41689b Binary files /dev/null and b/samples/kampo.pdf differ diff --git a/samples/nlp2004slides.pdf b/samples/nlp2004slides.pdf new file mode 100644 index 0000000..ba29cd0 Binary files /dev/null and b/samples/nlp2004slides.pdf differ