diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 6fefb9f..67d88d4 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -28,8 +28,8 @@ def get_textobjs(item, r=None): ## PDFConverter class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, codec='ascii', splitwords=False): - PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords) + def __init__(self, rsrc, outfp, codec='ascii'): + PDFPageAggregator.__init__(self, rsrc) self.outfp = outfp self.codec = codec return @@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter): ## class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords) + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None): + PDFConverter.__init__(self, rsrc, outfp, codec=codec) self.pagenum = pagenum self.pagepad = pagepad self.scale = scale @@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter): self.outfp.write('
\n') self.yoffset = self.pagepad self.cluster_margin = cluster_margin + self.show_text_border = False return def end_page(self, page): from cluster import cluster_pageobjs page = PDFConverter.end_page(self, page) - def f(item): - if isinstance(item, FigureItem): - for child in item.objs: - f(child) - elif isinstance(item, TextItem): - if item.direction == 2: - wmode = 'tb-rl' - else: - wmode = 'lr-tb' - (x,_,_,y) = item.bbox - self.outfp.write('' % - (wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale)) - self.outfp.write(enc(item.text, self.codec)) - self.outfp.write('\n') (x0,y0,x1,y1) = page.bbox self.yoffset += y1 if self.pagenum: @@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter): self.outfp.write('\n' % (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) + def draw(item): + if isinstance(item, FigureItem): + for child in item.objs: + draw(child) + elif isinstance(item, TextItem): + if item.direction == 2: + wmode = 'tb-rl' + else: + wmode = 'lr-tb' + (x0,y0,x1,y1) = item.bbox + self.outfp.write('' % + (wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale)) + self.outfp.write(enc(item.text, self.codec)) + self.outfp.write('\n') + if self.show_text_border: + self.outfp.write('\n' % + (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) for child in page.objs: - f(child) + draw(child) if self.cluster_margin: clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin) for ((x0,y0,x1,y1),_,objs) in clusters: @@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False): - PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None): + PDFConverter.__init__(self, rsrc, outfp, codec=codec) self.pagenum = pagenum if cluster_margin == None: cluster_margin = 0.5 @@ -288,9 +293,9 @@ def main(argv): CMapDB.initialize(cmapdir, cdbcmapdir) rsrc = PDFResourceManager() if outtype == 'sgml': - device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) + device = SGMLConverter(rsrc, outfp, codec=codec) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin) + device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'text': device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) elif outtype == 'tag': diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index d450486..7793ee0 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -3,7 +3,8 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdffont import PDFUnicodeNotDefined -from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix +from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \ + matrix2str, rect2str, point2str ## PDFDevice @@ -84,71 +85,48 @@ class TextItem(object): self.matrix = matrix self.font = font (_,_,_,_,tx,ty) = self.matrix - self.origin = (tx,ty) self.direction = 0 self.text = '' - scaling *= .01 + adv = 0 + for (char,cid) in chars: + self.text += char + adv += font.char_width(cid) + adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 size = (font.get_ascent() - font.get_descent()) * fontsize if not self.font.is_vertical(): # horizontal text - spwidth = font.space_width() self.direction = 1 - w = 0 - dx = 0 - prev = ' ' - for (char,cid,t) in chars: - if char: - if prev != ' ' and spwidth < dx: - self.text += ' ' - prev = char - self.text += char - dx = 0 - w += (font.char_width(cid) * fontsize + charspace) * scaling - else: - t *= .001 - dx -= t - w -= t * fontsize * scaling + (dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) ty += descent - (w,h) = apply_matrix_norm(self.matrix, (w,size)) - self.adv = (w, 0) - self.bbox = (tx, ty, tx+w, ty+h) + self.adv = (dx, 0) + self.bbox = (tx, ty, tx+dx, ty+dy) else: # vertical text self.direction = 2 - disp = 0 - h = 0 - for (char,cid,disp) in chars: - if not char: continue - (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) - self.text += font.to_unicode(cid) - h += (font.char_width(cid) * fontsize + charspace) * scaling - break - for (char,cid,_) in chars[1:]: - if not char: continue - self.text += font.to_unicode(cid) - h += (font.char_width(cid) * fontsize + charspace) * scaling - (w,h) = apply_matrix_norm(self.matrix, (size,h)) - tx -= w/2 + (_,cid) = chars[0] + (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001)) + (dx,dy) = apply_matrix_norm(self.matrix, (size,adv)) + tx -= dx/2 ty += disp - self.adv = (0, h) - self.bbox = (tx, ty+h, tx+w, ty) + self.adv = (0, dy) + self.bbox = (tx, ty+dy, tx+dx, ty) self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) return def __repr__(self): - return ('