From a865b28bd934d639f6b729c2fc56dec2fff9600d Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Tue, 5 May 2009 12:26:29 +0000 Subject: [PATCH] fix git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@96 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/layout.py | 500 ++++++++++++++++++++++++-------------------- pdflib/pdf2txt.py | 292 +++++++++++++------------- pdflib/pdfdevice.py | 6 +- pdflib/utils.py | 16 -- 4 files changed, 429 insertions(+), 385 deletions(-) diff --git a/pdflib/layout.py b/pdflib/layout.py index 9d67771..99651e9 100644 --- a/pdflib/layout.py +++ b/pdflib/layout.py @@ -1,122 +1,18 @@ #!/usr/bin/env python import sys -from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm +from utils import apply_matrix_norm INF = sys.maxint -## PageItem +## pick ## -class PageItem(object): - - def __init__(self, (x0,y0,x1,y1)): - #assert x0 <= x1 and y0 <= y1 - self.x0 = x0 - self.y0 = y0 - self.x1 = x1 - self.y1 = y1 - self.width = x1-x0 - self.height = y1-y0 - return - - def __repr__(self): - return ('' % (self.bbox())) - - def bbox(self): - return rect2str((self.x0, self.y0, self.x1, self.y1)) - - def hoverlap(self, obj): - assert isinstance(obj, PageItem) - if self.x1 <= obj.x0 or obj.x1 <= self.x0: - return 0 - else: - return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) - - def voverlap(self, obj): - assert isinstance(obj, PageItem) - if self.y1 <= obj.y0 or obj.y1 <= self.y0: - return 0 - else: - return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) - - -class PageContainer(PageItem): - - def __init__(self, bbox): - PageItem.__init__(self, bbox) - self.objs = [] - return - - def add(self, obj): - self.objs.append(obj) - return - -class Page(PageContainer): - - def __init__(self, id, bbox, rotate=0): - PageContainer.__init__(self, bbox) - self.id = id - self.rotate = rotate - return - - def __repr__(self): - return ('' % (self.id, self.bbox(), self.rotate)) - - -## FigureItem -## -class FigureItem(PageContainer): - - def __init__(self, id, bbox): - PageContainer.__init__(self, bbox) - self.id = id - return - - def __repr__(self): - return ('
' % (self.id, self.bbox())) - - -## TextItem -## -class TextItem(PageItem): - - def __init__(self, matrix, font, fontsize, charspace, scaling, chars): - assert chars - self.matrix = matrix - self.font = font - (_,_,_,_,tx,ty) = self.matrix - self.vertical = self.font.is_vertical() - self.text = ''.join( char for (char,_) in chars ) - adv = sum( font.char_width(cid) for (_,cid) in chars ) - adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 - size = (font.get_ascent() - font.get_descent()) * fontsize - if not self.vertical: - # horizontal text - self.vertical = False - (dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) - (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) - ty += descent - self.adv = (dx, 0) - bbox = (tx, ty, tx+dx, ty+dy) - else: - # vertical text - (_,cid) = chars[0] - (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001)) - (dx,dy) = apply_matrix_norm(self.matrix, (size,adv)) - tx -= dx/2 - ty += disp - self.adv = (0, dy) - bbox = (tx, ty+dy, tx+dx, ty) - self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) - PageItem.__init__(self, bbox) - return - - def __len__(self): - return len(self.text) - - def __repr__(self): - return ('' % - (matrix2str(self.matrix), self.font, self.fontsize, self.bbox(), - point2str(self.adv), self.text)) +def pick(seq, func, maxobj=None): + maxscore = None + for obj in seq: + score = func(obj) + if maxscore == None or maxscore < score: + (maxscore,maxobj) = (score,obj) + return maxobj ## bsearch @@ -156,38 +52,40 @@ def bsearch(objs, v0, v1): ## ## Reorders objects according to its writing direction. ## -def reorder_hv(objs, hdir): +def reorder_vh(objs, hdir): if 0 < hdir: hkey = (lambda obj: obj.x0) + vkey = (lambda obj: -obj.y1) else: hkey = (lambda obj: -obj.x1) - vkey = (lambda obj: -obj.y1) + vkey = (lambda obj: -obj.y1) r = [] line = [] - for obj1 in sorted(objs, key=vkey): - if line and not line[-1].voverlap(obj1): + for obj in sorted(objs, key=vkey): + if line and not line[-1].voverlap(obj): line.sort(key=hkey) r.append(line) line = [] - line.append(obj1) + line.append(obj) line.sort(key=hkey) r.append(line) return r -def reorder_vh(objs, hdir): +def reorder_hv(objs, hdir): if 0 < hdir: hkey = (lambda obj: obj.x0) + vkey = (lambda obj: -obj.y1) else: hkey = (lambda obj: -obj.x1) - vkey = (lambda obj: -obj.y1) + vkey = (lambda obj: -obj.y1) r = [] line = [] - for obj1 in sorted(objs, key=hkey): - if line and not line[-1].hoverlap(obj1): + for obj in sorted(objs, key=hkey): + if line and not line[-1].hoverlap(obj): line.sort(key=vkey) r.append(line) line = [] - line.append(obj1) + line.append(obj) line.sort(key=vkey) r.append(line) return r @@ -212,6 +110,7 @@ class Plane(object): # place(obj): place an object in a certain area. def place(self, obj): + assert isinstance(obj, LayoutItem) self.xobjs.append((obj.x0, obj)) self.xobjs.append((obj.x1, obj)) self.yobjs.append((obj.y0, obj)) @@ -232,80 +131,6 @@ class Plane(object): return objs -## TextBox -## -## A set of text objects that are clustered in -## a certain rectangular area. -## -class TextBox(PageItem): - - def __init__(self, objs): - self.objs = set(objs) - self.vertical = False - self.length = None - return - - def __repr__(self): - return ('' % (self.bbox(), self.vertical, len(self.objs))) - - def __len__(self): - return self.length - - # merge(boxes): merges with other textboxes. - def merge(self, box): - self.objs.update(box.objs) - return - - # finish(): determines its boundery and writing direction. - def finish(self): - assert self.objs - (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) - for obj in self.objs: - bx0 = min(bx0, obj.x0) - by0 = min(by0, obj.y0) - bx1 = max(bx1, obj.x1) - by1 = max(by1, obj.y1) - PageItem.__init__(self, (bx0, by0, bx1, by1)) - self.length = sum( len(obj) for obj in self.objs ) - for obj in self.objs: - self.vertical = obj.vertical - break - if 2 <= len(self.objs): - objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) - if len(objs[0]) == 1 and len(objs[1]) == 1: - h = objs[0].voverlap(objs[1]) - v = objs[0].hoverlap(objs[1]) - self.vertical = (h < v) - return - - def lines(self, ratio): - if self.vertical: - objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) - for line in reorder_vh(objs, -1): - s = '' - y0 = -INF - for obj in line: - margin = abs(obj.fontsize * ratio) - if obj.y1 < y0-margin: - s += ' ' - s += obj.text - y0 = obj.y0 - yield s - else: - objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1) - for line in reorder_hv(objs, +1): - s = '' - x1 = INF - for obj in line: - margin = abs(obj.fontsize * ratio) - if x1+margin < obj.x0: - s += ' ' - s += obj.text - x1 = obj.x1 - yield s - return - - ## ClusterSet ## ## Maintains a set of TextBox objects. @@ -316,43 +141,272 @@ class TextBox(PageItem): ## class ClusterSet(object): - def __init__(self): + def __init__(self, klass): self.clusters = {} + self.klass = klass return # add(objs): groups text objects if necessary. def add(self, objs): - c = TextBox(objs) + group = self.klass(objs) for obj in objs: if obj in self.clusters: - c.merge(self.clusters[obj]) - for obj in c.objs: - self.clusters[obj] = c + group.merge(self.clusters[obj]) + for obj in group: + self.clusters[obj] = group return # finish(): returns all the TextBoxes in a page. def finish(self): r = set(self.clusters.itervalues()) - for textbox in r: - textbox.finish() + for group in r: + group.fixate() return r -# cluster_textobjs -def cluster_textobjs(objs, ratio): - plane = Plane(objs) - cset = ClusterSet() - for obj in objs: - margin = abs(obj.fontsize * ratio) - neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) - cset.add(neighbors) - clusters = cset.finish() - vertical = ((sum( len(textbox) for textbox in clusters )/2) < - sum( len(textbox) for textbox in clusters if textbox.vertical )) - if vertical: - lines = reorder_hv(clusters, -1) - else: - lines = reorder_vh(clusters, +1) - r = [] - for line in lines: - r.extend(line) - return r + +## LayoutItem +## +class LayoutItem(object): + + def __init__(self, id, bbox): + #assert x0 <= x1 and y0 <= y1 + self.id = id + self.set_bbox(bbox) + return + + def set_bbox(self, (x0,y0,x1,y1)): + self.x0 = x0 + self.y0 = y0 + self.x1 = x1 + self.y1 = y1 + self.width = x1-x0 + self.height = y1-y0 + return + + def __repr__(self): + return ('' % (self.get_bbox())) + + def hoverlap(self, obj): + assert isinstance(obj, LayoutItem) + if self.x1 <= obj.x0 or obj.x1 <= self.x0: + return 0 + else: + return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) + + def voverlap(self, obj): + assert isinstance(obj, LayoutItem) + if self.y1 <= obj.y0 or obj.y1 <= self.y0: + return 0 + else: + return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) + + def get_bbox(self): + return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1) + + def get_margin(self, ratio): + return 0 + + def get_weight(self): + return 0 + + def get_direction(self): + return False + + +## LayoutContainer +## +class LayoutContainer(LayoutItem): + + def __init__(self, id, bbox, objs=None): + LayoutItem.__init__(self, id, bbox) + if objs: + self.objs = set(objs) + else: + self.objs = set() + self.weight = None + return + + def __repr__(self): + return ('' % (self.get_bbox(), len(self.objs))) + + def __iter__(self): + return iter(self.objs) + + def add(self, obj): + self.objs.add(obj) + return + + def merge(self, group): + self.objs.update(iter(group)) + return + + # fixate(): determines its boundery and writing direction. + def fixate(self): + if not self.width and self.objs: + (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) + for obj in self.objs: + bx0 = min(bx0, obj.x0) + by0 = min(by0, obj.y0) + bx1 = max(bx1, obj.x1) + by1 = max(by1, obj.y1) + self.set_bbox((bx0, by0, bx1, by1)) + self.weight = sum( obj.get_weight() for obj in self.objs ) + return + + def group_objs(self, ratio, klass): + plane = Plane(self.objs) + cset = ClusterSet(klass) + for obj in self.objs: + margin = abs(obj.get_margin(ratio)) + neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) + cset.add(neighbors) + self.objs = cset.finish() + return + + def get_weight(self): + return self.weight + + def get_direction(self): + return ((sum( obj.get_weight() for obj in self.objs )/2) < + sum( obj.get_weight() for obj in self.objs if obj.get_direction() )) + + +## FigureItem +## +class FigureItem(LayoutContainer): + + def __repr__(self): + return ('
' % (self.id, self.get_bbox())) + + +## TextItem +## +class TextItem(LayoutItem): + + def __init__(self, matrix, font, fontsize, charspace, scaling, chars): + assert chars + self.matrix = matrix + self.font = font + (_,_,_,_,tx,ty) = self.matrix + self.vertical = self.font.is_vertical() + self.text = ''.join( char for (char,_) in chars ) + adv = sum( font.char_width(cid) for (_,cid) in chars ) + adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 + size = (font.get_ascent() - font.get_descent()) * fontsize + if not self.vertical: + # horizontal text + self.vertical = False + (dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) + (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) + ty += descent + self.adv = (dx, 0) + bbox = (tx, ty, tx+dx, ty+dy) + else: + # vertical text + (_,cid) = chars[0] + (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001)) + (dx,dy) = apply_matrix_norm(self.matrix, (size,adv)) + tx -= dx/2 + ty += disp + self.adv = (0, dy) + bbox = (tx, ty+dy, tx+dx, ty) + self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) + LayoutItem.__init__(self, None, bbox) + return + + def __repr__(self): + return ('' % + ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix, + self.font, self.fontsize, self.get_bbox(), + '(%.1f, %.1f)' % self.adv, + self.text)) + + def get_margin(self, ratio): + return self.fontsize * ratio + + def get_weight(self): + return len(self.text) + + def get_direction(self): + return self.vertical + + +## TextBox +## +## A set of text objects that are clustered in +## a certain rectangular area. +## +class TextBox(LayoutContainer): + + def __init__(self, objs): + LayoutContainer.__init__(self, None, (0,0,0,0), objs) + self.vertical = False + return + + def fixate(self): + LayoutContainer.fixate(self) + for obj in self.objs: + self.vertical = bool(obj.get_direction()) + break + if 2 <= len(self.objs): + objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) + if objs[0].get_weight() == 1 and objs[1].get_weight() == 1: + h = objs[0].voverlap(objs[1]) + v = objs[0].hoverlap(objs[1]) + self.vertical = (h < v) + return + + def get_direction(self): + return self.vertical + + def get_lines(self, ratio): + if self.get_direction(): + for line in reorder_hv(self.objs, -1): + s = '' + y0 = -INF + for obj in line: + margin = abs(obj.fontsize * ratio) + if obj.y1+margin < y0: + s += ' ' + s += obj.text + y0 = obj.y0 + yield s + else: + for line in reorder_vh(self.objs, +1): + s = '' + x1 = INF + for obj in line: + margin = abs(obj.fontsize * ratio) + if x1 < obj.x0-margin: + s += ' ' + s += obj.text + x1 = obj.x1 + yield s + return + + +## Page +## +class Page(LayoutContainer): + + def __init__(self, id, bbox, rotate=0): + LayoutContainer.__init__(self, id, bbox) + self.rotate = rotate + return + + def __repr__(self): + return ('' % (self.id, self.get_bbox(), self.rotate)) + + def fixate(self): + return + + def group_text(self, ratio): + self.group_objs(ratio, TextBox) + if self.get_direction(): + lines = reorder_hv(self.objs, -1) + else: + lines = reorder_vh(self.objs, +1) + self.objs = [] + for line in lines: + self.objs.extend(line) + return diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 11ed698..d26b2f2 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -3,167 +3,39 @@ import sys from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfdevice import PDFDevice, PDFPageAggregator -from layout import Page, FigureItem, TextItem, cluster_textobjs +from layout import Page, LayoutContainer, TextItem, TextBox from pdffont import PDFUnicodeNotDefined from cmap import CMapDB -def enc(x, codec): +# e(x): encode string +def e(x, codec='ascii'): x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"') return x.encode(codec, 'xmlcharrefreplace') -def encprops(props, codec): - if not props: return '' - return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) ) - -def get_textobjs(item, r=None): - if r == None: r = [] - if isinstance(item, TextItem): - r.append(item) - elif isinstance(item, Page): - for child in item.objs: - get_textobjs(child, r) - return r - ## PDFConverter +## class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, codec='ascii'): + def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None): PDFPageAggregator.__init__(self, rsrc) + self.cluster_margin = cluster_margin self.outfp = outfp self.codec = codec return - - -## SGMLConverter -## -class SGMLConverter(PDFConverter): def end_page(self, page): - page = PDFConverter.end_page(self, page) - def f(item): - bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox - if isinstance(item, FigureItem): - self.outfp.write('
\n' % (item.id, bbox)) - for child in item.objs: - f(child) - self.outfp.write('
\n') - elif isinstance(item, TextItem): - self.outfp.write('' % - (enc(item.font.fontname, self.codec), item.vertical, bbox, item.fontsize)) - self.outfp.write(enc(item.text, self.codec)) - self.outfp.write('\n') - bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox - self.outfp.write('\n' % - (page.id, bbox, page.rotate)) - for child in page.objs: - f(child) - self.outfp.write('\n') - return - - -## HTMLConverter -## -class HTMLConverter(PDFConverter): - - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None): - PDFConverter.__init__(self, rsrc, outfp, codec=codec) - self.pagenum = pagenum - self.pagepad = pagepad - self.scale = scale - self.outfp.write('\n' % self.codec) - self.outfp.write('\n') - self.yoffset = self.pagepad - self.cluster_margin = cluster_margin - self.show_text_border = False - return - - def end_page(self, page): - page = PDFConverter.end_page(self, page) - self.yoffset += page.y1 - if self.pagenum: - self.outfp.write('' % - ((self.yoffset-page.y1)*self.scale, page.id, page.id)) - self.outfp.write('\n' % - (page.x0*self.scale, (self.yoffset-page.y1)*self.scale, - page.width*self.scale, page.height*self.scale)) - def draw(item): - if isinstance(item, FigureItem): - for child in item.objs: - draw(child) - elif isinstance(item, TextItem): - if item.vertical: - wmode = 'tb-rl' - else: - wmode = 'lr-tb' - self.outfp.write('' % - (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale, - item.fontsize*self.scale)) - self.outfp.write(enc(item.text, self.codec)) - self.outfp.write('\n') - if self.show_text_border: - self.outfp.write('\n' % - (item.x0*self.scale, (self.yoffset-item.y1)*self.scale, - item.width*self.scale, self.height*self.scale)) - for child in page.objs: - draw(child) + page = PDFPageAggregator.end_page(self, page) if self.cluster_margin: - clusters = cluster_textobjs(get_textobjs(page), self.cluster_margin) - for textbox in clusters: - self.outfp.write('\n' % - (textbox.x0*self.scale, (self.yoffset-textbox.y1)*self.scale, - textbox.width*self.scale, textbox.height*self.scale)) - self.yoffset += self.pagepad - return + page.group_text(self.cluster_margin) + return page - def close(self): - self.outfp.write('
Page: %s
\n' % - ', '.join('%s' % (i,i) for i in xrange(1,self.pageno))) - self.outfp.write('\n') - return - - -## TextConverter -## -class TextConverter(PDFConverter): - - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None): - PDFConverter.__init__(self, rsrc, outfp, codec=codec) - self.pagenum = pagenum - if cluster_margin == None: - cluster_margin = 0.5 - self.cluster_margin = cluster_margin - self.word_margin = 0.2 + def write(self, text): + self.outfp.write(e(text, self.codec)) return - def end_page(self, page): - page = PDFConverter.end_page(self, page) - if self.pagenum: - self.outfp.write('Page %d\n' % page.id) - if self.cluster_margin: - textobjs = get_textobjs(page) - clusters = cluster_textobjs(textobjs, self.cluster_margin) - for textbox in clusters: - for line in textbox.lines(self.word_margin): - self.outfp.write(line.encode(self.codec, 'replace')+'\n') - self.outfp.write('\n') - else: - for obj in page.objs: - if isinstance(obj, TextItem): - self.outfp.write(obj.text.encode(self.codec, 'replace')) - self.outfp.write('\n') - self.outfp.write('\f') - return - - def close(self): - return - - + ## TagExtractor ## class TagExtractor(PDFDevice): @@ -191,7 +63,7 @@ class TagExtractor(PDFDevice): text += char except PDFUnicodeNotDefined, e: pass - self.outfp.write(enc(text, self.codec)) + self.write(text) return def begin_page(self, page): @@ -207,18 +79,150 @@ class TagExtractor(PDFDevice): return def begin_tag(self, tag, props=None): - self.outfp.write('<%s%s>' % (enc(tag.name, self.codec), encprops(props, self.codec))) + s = '' + if props: + s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v) + in sorted(props.iteritems()) ) + self.outfp.write('<%s%s>' % (e(tag.name), s)) self.tag = tag return def end_tag(self): assert self.tag - self.outfp.write('' % enc(self.tag.name, self.codec)) + self.outfp.write('' % e(self.tag.name)) self.tag = None return def do_tag(self, tag, props=None): - self.outfp.write('<%s%s/>' % (enc(tag.name, self.codec), encprops(props, self.codec))) + self.begin_tag(tag, props) + self.tag = None + return + + +## SGMLConverter +## +class SGMLConverter(PDFConverter): + + def end_page(self, page): + def draw(item): + if isinstance(item, TextItem): + self.outfp.write('' % + (e(item.font.fontname), item.get_direction(), + item.get_bbox(), item.fontsize)) + self.write(item.text) + self.outfp.write('\n') + elif isinstance(item, LayoutContainer): + self.outfp.write('\n' % (item.id, item.get_bbox())) + for child in item: + draw(child) + self.outfp.write('\n') + return + page = PDFConverter.end_page(self, page) + self.outfp.write('\n' % + (page.id, page.get_bbox(), page.rotate)) + draw(page) + self.outfp.write('\n') + return + + +## HTMLConverter +## +class HTMLConverter(PDFConverter): + + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, + pagepad=50, scale=1, cluster_margin=None): + PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + self.pagenum = pagenum + self.pagepad = pagepad + self.scale = scale + self.outfp.write('\n') + self.outfp.write('\n' % + self.codec) + self.outfp.write('\n') + self.yoffset = self.pagepad + self.show_text_border = False + return + + def write_rect(self, color, x, y, w, h): + self.outfp.write('\n' % + (color, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) + return + + def end_page(self, page): + def draw(item): + if isinstance(item, Page): + self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height) + if self.pagenum: + self.outfp.write('
' % + ((self.yoffset-page.y1)*self.scale)) + self.outfp.write('Page %s
\n' % (page.id, page.id)) + for child in item: + draw(child) + elif isinstance(item, TextItem): + if item.vertical: + wmode = 'tb-rl' + else: + wmode = 'lr-tb' + self.outfp.write('' % + (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale, + item.fontsize*self.scale)) + self.write(item.text) + self.outfp.write('\n') + if self.show_text_border: + self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height) + elif isinstance(item, LayoutContainer): + self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height) + for child in item: + draw(child) + return + page = PDFConverter.end_page(self, page) + self.yoffset += page.y1 + draw(page) + self.yoffset += self.pagepad + return + + def close(self): + self.outfp.write('
Page: %s
\n' % + ', '.join('%s' % (i,i) for i in xrange(1,self.pageno))) + self.outfp.write('\n') + return + + +## TextConverter +## +class TextConverter(PDFConverter): + + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, + cluster_margin=None, word_margin=0.2): + if cluster_margin == None: + cluster_margin = 0.5 + PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + self.pagenum = pagenum + self.word_margin = word_margin + return + + def end_page(self, page): + def draw(item): + if isinstance(item, TextItem): + self.outfp.write(obj.text.encode(self.codec, 'replace')) + self.outfp.write('\n') + elif isinstance(item, TextBox): + for line in item.get_lines(self.word_margin): + self.outfp.write(line.encode(self.codec, 'replace')+'\n') + self.outfp.write('\n') + elif isinstance(item, LayoutContainer): + for child in item: + draw(child) + page = PDFConverter.end_page(self, page) + if self.pagenum: + self.outfp.write('Page %d\n' % page.id) + draw(page) + self.outfp.write('\f') + return + + def close(self): return diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index eacff71..3a3b3b3 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -3,7 +3,7 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdffont import PDFUnicodeNotDefined -from layout import PageItem, Page, FigureItem, TextItem +from layout import Page, FigureItem, TextItem from utils import mult_matrix, translate_matrix @@ -68,7 +68,8 @@ class PDFPageAggregator(PDFDevice): def end_page(self, _): assert not self.stack - assert isinstance(self.cur_item, PageItem) + assert isinstance(self.cur_item, Page) + self.cur_item.fixate() self.pageno += 1 return self.cur_item @@ -79,6 +80,7 @@ class PDFPageAggregator(PDFDevice): def end_figure(self, _): fig = self.cur_item + self.cur_item.fixate() self.cur_item = self.stack.pop() self.cur_item.add(fig) return diff --git a/pdflib/utils.py b/pdflib/utils.py index 6b19752..e2849a7 100644 --- a/pdflib/utils.py +++ b/pdflib/utils.py @@ -23,13 +23,6 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)): '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))''' return (a*p+c*q, b*p+d*q) -# display functions -def matrix2str((a,b,c,d,e,f)): - return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a,b,c,d,e,f) -def rect2str((x0,y0,x1,y1)): - return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0,y0,x1,y1) -def point2str((x,y)): - return '(%.1f, %.1f)' % (x,y) ## Utilities ## @@ -98,12 +91,3 @@ def decode_text(s): return unicode(s[2:], 'utf-16be', 'ignore') else: return ''.join( PDFDocEncoding[ord(c)] for c in s ) - -## -def pick(seq, func, maxobj=None): - maxscore = None - for obj in seq: - score = func(obj) - if maxscore == None or maxscore < score: - (maxscore,maxobj) = (score,obj) - return maxobj