diff --git a/Makefile b/Makefile index ad36e55..7afc258 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile for pdfminer PACKAGE=pdfminer -VERSION=20080727 +VERSION=20080830 GNUTAR=tar SVN=svn PYTHON=python diff --git a/pdflib/page.py b/pdflib/page.py new file mode 100644 index 0000000..bc4e63a --- /dev/null +++ b/pdflib/page.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +import sys +stdout = sys.stdout +stderr = sys.stderr +from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \ + mult_matrix, apply_matrix + + +## PageItem +## +class PageItem(object): + + def __init__(self, id, (x0,y0,x1,y1), rotate=0): + self.id = id + self.bbox = (x0, y0, x1, y1) + self.rotate = rotate + self.objs = [] + return + + def __repr__(self): + return ('' % (self.id, self.bbox, self.rotate)) + + def add(self, obj): + self.objs.append(obj) + return + + +## FigureItem +## +class FigureItem(PageItem): + + def __repr__(self): + return ('
' % (self.id, self.bbox)) + + +## TextItem +## +class TextItem(object): + + def __init__(self, matrix, font, fontsize, width, text): + self.matrix = matrix + self.font = font + (a,b,c,d,tx,ty) = self.matrix + self.origin = (tx,ty) + self.direction = 0 + if not self.font.is_vertical(): + self.direction = 1 + (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) + self.width = abs(self.width) + (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001)) + (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001)) + ty += descent + self.bbox = (tx, ty, tx+self.width, ty+self.height) + else: + self.direction = 2 + (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width)) + self.width = abs(self.width) + (disp,_) = text[0] + (_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001)) + tx -= self.width/2 + ty += disp + self.bbox = (tx, ty+self.height, tx+self.width, ty) + self.text = ''.join( c for (_,c) in text ) + (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize)) + self.fontsize = max(w,h) + return + + def __repr__(self): + return ('' % + (self.matrix, self.font, self.fontsize, self.width, self.height, self.text)) + + +## TextConverter +## +class TextConverter(PDFDevice): + + def __init__(self, rsrc, outfp, codec='utf-8', debug=0): + PDFDevice.__init__(self, rsrc, debug=debug) + self.outfp = outfp + self.codec = codec + self.pageno = 0 + self.stack = [] + return + + def begin_page(self, page): + self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate) + return + def end_page(self, _): + assert not self.stack + assert isinstance(self.cur_item, PageItem) + self.pageno += 1 + return + + def begin_figure(self, name, bbox): + self.stack.append(self.cur_item) + self.cur_item = FigureItem(name, bbox) + return + def end_figure(self, _): + fig = self.cur_item + self.cur_item = self.stack.pop() + self.cur_item.add(fig) + return + + def render_image(self, stream, size, matrix): + return + + def handle_undefined_char(self, cidcoding, cid): + if self.debug: + print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) + return None + + def render_string(self, textstate, textmatrix, size, seq, ratio=0.6): + font = textstate.font + spwidth = int(-font.char_width(32) * ratio) # space width + text = [] + for x in seq: + if isinstance(x, int) or isinstance(x, float): + if not font.is_vertical() and x <= spwidth: + text.append((0, ' ')) + else: + chars = font.decode(x) + for cid in chars: + try: + char = font.to_unicode(cid) + text.append((font.char_disp(cid), char)) + except PDFUnicodeNotDefined, e: + (cidcoding, cid) = e.args + s = self.handle_undefined_char(cidcoding, cid) + if s: + text.append(s) + if text: + item = TextItem(mult_matrix(textmatrix, self.ctm), + font, textstate.fontsize, size, text) + self.cur_item.add(item) + return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 8f153d9..f65874b 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -4,9 +4,9 @@ stdout = sys.stdout stderr = sys.stderr from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \ - PDFPageInterpreter, PDFUnicodeNotDefined, \ - mult_matrix, apply_matrix + PDFPageInterpreter, PDFUnicodeNotDefined from pdflib.cmap import CMapDB +from pdflib.page import PageItem, FigureItem, TextItem, TextConverter def enc(x, codec): @@ -18,142 +18,12 @@ def encprops(props, codec): return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) ) -## PageItem -## -class PageItem(object): - - def __init__(self, id, (x0,y0,x1,y1), rotate=0): - self.id = id - self.bbox = (x0, y0, x1, y1) - self.rotate = rotate - self.objs = [] - return - - def __repr__(self): - return ('' % (self.id, self.bbox, self.rotate)) - - def add(self, obj): - self.objs.append(obj) - return - - -## FigureItem -## -class FigureItem(PageItem): - - def __repr__(self): - return ('
' % (self.id, self.bbox)) - - -## TextItem -## -class TextItem(object): - - def __init__(self, matrix, font, fontsize, width, text): - self.matrix = matrix - self.font = font - (a,b,c,d,tx,ty) = self.matrix - self.origin = (tx,ty) - self.direction = 0 - if not self.font.is_vertical(): - self.direction = 1 - (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) - self.width = abs(self.width) - (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001)) - (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001)) - ty += descent - self.bbox = (tx, ty, tx+self.width, ty+self.height) - else: - self.direction = 2 - (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width)) - self.width = abs(self.width) - (disp,_) = text[0] - (_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001)) - tx -= self.width/2 - ty += disp - self.bbox = (tx, ty+self.height, tx+self.width, ty) - self.text = ''.join( c for (_,c) in text ) - (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize)) - self.fontsize = max(w,h) - return - - def __repr__(self): - return ('' % - (self.matrix, self.font, self.fontsize, self.width, self.height, self.text)) - - -## TextConverter -## -class TextConverter(PDFDevice): - - def __init__(self, rsrc, outfp, codec='utf-8', debug=0): - PDFDevice.__init__(self, rsrc, debug=debug) - self.outfp = outfp - self.codec = codec - self.pageno = 0 - self.stack = [] - return - - def begin_page(self, page): - self.context = PageItem(self.pageno, page.mediabox, page.rotate) - return - def end_page(self, _): - assert not self.stack - assert isinstance(self.context, PageItem) - self.pageno += 1 - self.dump_page(self.context) - return - - def begin_figure(self, name, bbox): - self.stack.append(self.context) - self.context = FigureItem(name, bbox) - return - def end_figure(self, _): - fig = self.context - self.context = self.stack.pop() - self.context.add(fig) - return - - def render_image(self, stream, size, matrix): - return - - def handle_undefined_char(self, cidcoding, cid): - if self.debug: - print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) - #return unichr(cid) - return None - - def render_string(self, textstate, textmatrix, size, seq): - font = textstate.font - spwidth = int(-font.char_width(32) * 0.6) # space width - text = [] - for x in seq: - if isinstance(x, int) or isinstance(x, float): - if not font.is_vertical() and x <= spwidth: - text.append((0, ' ')) - else: - chars = font.decode(x) - for cid in chars: - try: - char = font.to_unicode(cid) - text.append((font.char_disp(cid), char)) - except PDFUnicodeNotDefined, e: - (cidcoding, cid) = e.args - s = self.handle_undefined_char(cidcoding, cid) - if s: - text.append(s) - if text: - item = TextItem(mult_matrix(textmatrix, self.ctm), - font, textstate.fontsize, size, text) - self.context.add(item) - return - - ## SGMLConverter ## class SGMLConverter(TextConverter): - def dump_page(self, page): + def end_page(self, _): + page = self.cur_item def f(item): bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox if isinstance(item, FigureItem): @@ -189,7 +59,8 @@ class HTMLConverter(TextConverter): self.yoffset = self.pagepad return - def dump_page(self, page): + def end_page(self, _): + page = self.cur_item def f(item): if isinstance(item, FigureItem): pass