diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 6fefb9f..67d88d4 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -28,8 +28,8 @@ def get_textobjs(item, r=None): ## PDFConverter class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, codec='ascii', splitwords=False): - PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords) + def __init__(self, rsrc, outfp, codec='ascii'): + PDFPageAggregator.__init__(self, rsrc) self.outfp = outfp self.codec = codec return @@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter): ## class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords) + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None): + PDFConverter.__init__(self, rsrc, outfp, codec=codec) self.pagenum = pagenum self.pagepad = pagepad self.scale = scale @@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter): self.outfp.write('\n') self.yoffset = self.pagepad self.cluster_margin = cluster_margin + self.show_text_border = False return def end_page(self, page): from cluster import cluster_pageobjs page = PDFConverter.end_page(self, page) - def f(item): - if isinstance(item, FigureItem): - for child in item.objs: - f(child) - elif isinstance(item, TextItem): - if item.direction == 2: - wmode = 'tb-rl' - else: - wmode = 'lr-tb' - (x,_,_,y) = item.bbox - self.outfp.write('' % - (wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale)) - self.outfp.write(enc(item.text, self.codec)) - self.outfp.write('\n') (x0,y0,x1,y1) = page.bbox self.yoffset += y1 if self.pagenum: @@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter): self.outfp.write('\n' % (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) + def draw(item): + if isinstance(item, FigureItem): + for child in item.objs: + draw(child) + elif isinstance(item, TextItem): + if item.direction == 2: + wmode = 'tb-rl' + else: + wmode = 'lr-tb' + (x0,y0,x1,y1) = item.bbox + self.outfp.write('' % + (wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale)) + self.outfp.write(enc(item.text, self.codec)) + self.outfp.write('\n') + if self.show_text_border: + self.outfp.write('\n' % + (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) for child in page.objs: - f(child) + draw(child) if self.cluster_margin: clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin) for ((x0,y0,x1,y1),_,objs) in clusters: @@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None): + PDFConverter.__init__(self, rsrc, outfp, codec=codec) self.pagenum = pagenum if cluster_margin == None: cluster_margin = 0.5 @@ -288,9 +293,9 @@ def main(argv): CMapDB.initialize(cmapdir, cdbcmapdir) rsrc = PDFResourceManager() if outtype == 'sgml': - device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) + device = SGMLConverter(rsrc, outfp, codec=codec) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin) + device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'text': device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'tag': diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index d450486..7793ee0 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -3,7 +3,8 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdffont import PDFUnicodeNotDefined -from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix +from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \ + matrix2str, rect2str, point2str ## PDFDevice @@ -84,71 +85,48 @@ class TextItem(object): self.matrix = matrix self.font = font (_,_,_,_,tx,ty) = self.matrix - self.origin = (tx,ty) self.direction = 0 self.text = '' - scaling *= .01 + adv = 0 + for (char,cid) in chars: + self.text += char + adv += font.char_width(cid) + adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 size = (font.get_ascent() - font.get_descent()) * fontsize if not self.font.is_vertical(): # horizontal text - spwidth = font.space_width() self.direction = 1 - w = 0 - dx = 0 - prev = ' ' - for (char,cid,t) in chars: - if char: - if prev != ' ' and spwidth < dx: - self.text += ' ' - prev = char - self.text += char - dx = 0 - w += (font.char_width(cid) * fontsize + charspace) * scaling - else: - t *= .001 - dx -= t - w -= t * fontsize * scaling + (dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) ty += descent - (w,h) = apply_matrix_norm(self.matrix, (w,size)) - self.adv = (w, 0) - self.bbox = (tx, ty, tx+w, ty+h) + self.adv = (dx, 0) + self.bbox = (tx, ty, tx+dx, ty+dy) else: # vertical text self.direction = 2 - disp = 0 - h = 0 - for (char,cid,disp) in chars: - if not char: continue - (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) - self.text += font.to_unicode(cid) - h += (font.char_width(cid) * fontsize + charspace) * scaling - break - for (char,cid,_) in chars[1:]: - if not char: continue - self.text += font.to_unicode(cid) - h += (font.char_width(cid) * fontsize + charspace) * scaling - (w,h) = apply_matrix_norm(self.matrix, (size,h)) - tx -= w/2 + (_,cid) = chars[0] + (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001)) + (dx,dy) = apply_matrix_norm(self.matrix, (size,adv)) + tx -= dx/2 ty += disp - self.adv = (0, h) - self.bbox = (tx, ty+h, tx+w, ty) + self.adv = (0, dy) + self.bbox = (tx, ty+dy, tx+dx, ty) self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) return def __repr__(self): - return ('' % - (self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv)) + return ('' % + (matrix2str(self.matrix), self.font, self.fontsize, + rect2str(self.bbox), self.text, point2str(self.adv))) ## PDFPageAggregator ## class PDFPageAggregator(PDFDevice): - def __init__(self, rsrc, pageno=1, splitwords=False): + def __init__(self, rsrc, pageno=1): PDFDevice.__init__(self, rsrc) self.pageno = pageno - self.splitwords = splitwords self.stack = [] return @@ -181,12 +159,22 @@ class PDFPageAggregator(PDFDevice): print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) return None + def render_chars(self, textmatrix, textstate, chars): + if not chars: return (0, 0) + item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars) + self.cur_item.add(item) + return item.adv + def render_string(self, textstate, textmatrix, seq): font = textstate.font + textmatrix = mult_matrix(textmatrix, self.ctm) chars = [] for x in seq: if isinstance(x, int) or isinstance(x, float): - chars.append((None, None, x)) + (dx,dy) = self.render_chars(textmatrix, textstate, chars) + dx -= x * textstate.scaling * .0001 + textmatrix = translate_matrix(textmatrix, (dx, dy)) + chars = [] else: for cid in font.decode(x): try: @@ -194,20 +182,11 @@ class PDFPageAggregator(PDFDevice): except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) - chars.append((char, cid, font.char_disp(cid))) - textmatrix = mult_matrix(textmatrix, self.ctm) - word = [] - for (char, cid, disp) in chars: - word.append((char,cid,disp)) - if self.splitwords and cid == 32 and not font.is_multibyte(): - if word: - item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word) - self.cur_item.add(item) - (dx,dy) = item.adv - dx += textstate.wordspace * textstate.scaling * .01 - textmatrix = translate_matrix(textmatrix, (dx, dy)) - word = [] - if word: - item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word) - self.cur_item.add(item) + chars.append((char, cid)) + if cid == 32 and not font.is_multibyte(): + (dx,dy) = self.render_chars(textmatrix, textstate, chars) + dx += textstate.wordspace * textstate.scaling * .01 + textmatrix = translate_matrix(textmatrix, (dx, dy)) + chars = [] + self.render_chars(textmatrix, textstate, chars) return diff --git a/pdflib/utils.py b/pdflib/utils.py index e2849a7..a0a7e49 100644 --- a/pdflib/utils.py +++ b/pdflib/utils.py @@ -23,6 +23,13 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)): '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))''' return (a*p+c*q, b*p+d*q) +# display functions +def matrix2str((a,b,c,d,e,f)): + return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a,b,c,d,e,f) +def rect2str((x0,y0,x1,y1)): + return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0,y0,x1,y1) +def point2str((x,y)): + return '(%.1f, %.1f)' % (x,y) ## Utilities ## diff --git a/tools/prof.py b/tools/prof.py index 34746db..4041228 100644 --- a/tools/prof.py +++ b/tools/prof.py @@ -5,15 +5,16 @@ def prof_main(argv): import getopt import hotshot, hotshot.stats def usage(): - print 'usage: %s output.prof mod.func [args ...]' % argv[0] + print 'usage: %s module.function [args ...]' % argv[0] return 100 args = argv[1:] - if len(args) < 2: return usage() - prof = args.pop(0) + if len(args) < 1: return usage() name = args.pop(0) + prof = name+'.prof' i = name.rindex('.') (modname, funcname) = (name[:i], name[i+1:]) - func = getattr(__import__(modname, fromlist=[modname]), funcname) + module = __import__(modname, fromlist=1) + func = getattr(module, funcname) if args: args.insert(0, argv[0]) prof = hotshot.Profile(prof)