text positioning got right.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@87 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-04-18 17:15:49 +00:00
parent f8510edffc
commit 6d91453187
4 changed files with 78 additions and 86 deletions

View File

@ -28,8 +28,8 @@ def get_textobjs(item, r=None):
## PDFConverter ## PDFConverter
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False): def __init__(self, rsrc, outfp, codec='ascii'):
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords) PDFPageAggregator.__init__(self, rsrc)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False): def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords) PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum self.pagenum = pagenum
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</head><body>\n') self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad self.yoffset = self.pagepad
self.cluster_margin = cluster_margin self.cluster_margin = cluster_margin
self.show_text_border = False
return return
def end_page(self, page): def end_page(self, page):
from cluster import cluster_pageobjs from cluster import cluster_pageobjs
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
def f(item):
if isinstance(item, FigureItem):
for child in item.objs:
f(child)
elif isinstance(item, TextItem):
if item.direction == 2:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
(x,_,_,y) = item.bbox
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
(x0,y0,x1,y1) = page.bbox (x0,y0,x1,y1) = page.bbox
self.yoffset += y1 self.yoffset += y1
if self.pagenum: if self.pagenum:
@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter):
self.outfp.write('<span style="position:absolute; border: 1px solid gray; ' self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' % 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
def draw(item):
if isinstance(item, FigureItem):
for child in item.objs:
draw(child)
elif isinstance(item, TextItem):
if item.direction == 2:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
(x0,y0,x1,y1) = item.bbox
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
if self.show_text_border:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
for child in page.objs: for child in page.objs:
f(child) draw(child)
if self.cluster_margin: if self.cluster_margin:
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin) clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
for ((x0,y0,x1,y1),_,objs) in clusters: for ((x0,y0,x1,y1),_,objs) in clusters:
@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter):
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False): def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum self.pagenum = pagenum
if cluster_margin == None: if cluster_margin == None:
cluster_margin = 0.5 cluster_margin = 0.5
@ -288,9 +293,9 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir) CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager() rsrc = PDFResourceManager()
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) device = SGMLConverter(rsrc, outfp, codec=codec)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin) device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'text': elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'tag': elif outtype == 'tag':

View File

@ -3,7 +3,8 @@ import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
matrix2str, rect2str, point2str
## PDFDevice ## PDFDevice
@ -84,71 +85,48 @@ class TextItem(object):
self.matrix = matrix self.matrix = matrix
self.font = font self.font = font
(_,_,_,_,tx,ty) = self.matrix (_,_,_,_,tx,ty) = self.matrix
self.origin = (tx,ty)
self.direction = 0 self.direction = 0
self.text = '' self.text = ''
scaling *= .01 adv = 0
for (char,cid) in chars:
self.text += char
adv += font.char_width(cid)
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
size = (font.get_ascent() - font.get_descent()) * fontsize size = (font.get_ascent() - font.get_descent()) * fontsize
if not self.font.is_vertical(): if not self.font.is_vertical():
# horizontal text # horizontal text
spwidth = font.space_width()
self.direction = 1 self.direction = 1
w = 0 (dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
dx = 0
prev = ' '
for (char,cid,t) in chars:
if char:
if prev != ' ' and spwidth < dx:
self.text += ' '
prev = char
self.text += char
dx = 0
w += (font.char_width(cid) * fontsize + charspace) * scaling
else:
t *= .001
dx -= t
w -= t * fontsize * scaling
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent ty += descent
(w,h) = apply_matrix_norm(self.matrix, (w,size)) self.adv = (dx, 0)
self.adv = (w, 0) self.bbox = (tx, ty, tx+dx, ty+dy)
self.bbox = (tx, ty, tx+w, ty+h)
else: else:
# vertical text # vertical text
self.direction = 2 self.direction = 2
disp = 0 (_,cid) = chars[0]
h = 0 (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
for (char,cid,disp) in chars: (dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
if not char: continue tx -= dx/2
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
break
for (char,cid,_) in chars[1:]:
if not char: continue
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (size,h))
tx -= w/2
ty += disp ty += disp
self.adv = (0, h) self.adv = (0, dy)
self.bbox = (tx, ty+h, tx+w, ty) self.bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
return return
def __repr__(self): def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' % return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
(self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv)) (matrix2str(self.matrix), self.font, self.fontsize,
rect2str(self.bbox), self.text, point2str(self.adv)))
## PDFPageAggregator ## PDFPageAggregator
## ##
class PDFPageAggregator(PDFDevice): class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, splitwords=False): def __init__(self, rsrc, pageno=1):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrc)
self.pageno = pageno self.pageno = pageno
self.splitwords = splitwords
self.stack = [] self.stack = []
return return
@ -181,12 +159,22 @@ class PDFPageAggregator(PDFDevice):
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return None return None
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
def render_string(self, textstate, textmatrix, seq): def render_string(self, textstate, textmatrix, seq):
font = textstate.font font = textstate.font
textmatrix = mult_matrix(textmatrix, self.ctm)
chars = [] chars = []
for x in seq: for x in seq:
if isinstance(x, int) or isinstance(x, float): if isinstance(x, int) or isinstance(x, float):
chars.append((None, None, x)) (dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx -= x * textstate.scaling * .0001
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
else: else:
for cid in font.decode(x): for cid in font.decode(x):
try: try:
@ -194,20 +182,11 @@ class PDFPageAggregator(PDFDevice):
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid) char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid, font.char_disp(cid))) chars.append((char, cid))
textmatrix = mult_matrix(textmatrix, self.ctm) if cid == 32 and not font.is_multibyte():
word = [] (dx,dy) = self.render_chars(textmatrix, textstate, chars)
for (char, cid, disp) in chars: dx += textstate.wordspace * textstate.scaling * .01
word.append((char,cid,disp)) textmatrix = translate_matrix(textmatrix, (dx, dy))
if self.splitwords and cid == 32 and not font.is_multibyte(): chars = []
if word: self.render_chars(textmatrix, textstate, chars)
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
word = []
if word:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
self.cur_item.add(item)
return return

View File

@ -23,6 +23,13 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))''' '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
return (a*p+c*q, b*p+d*q) return (a*p+c*q, b*p+d*q)
# display functions
def matrix2str((a,b,c,d,e,f)):
return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a,b,c,d,e,f)
def rect2str((x0,y0,x1,y1)):
return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0,y0,x1,y1)
def point2str((x,y)):
return '(%.1f, %.1f)' % (x,y)
## Utilities ## Utilities
## ##

View File

@ -5,15 +5,16 @@ def prof_main(argv):
import getopt import getopt
import hotshot, hotshot.stats import hotshot, hotshot.stats
def usage(): def usage():
print 'usage: %s output.prof mod.func [args ...]' % argv[0] print 'usage: %s module.function [args ...]' % argv[0]
return 100 return 100
args = argv[1:] args = argv[1:]
if len(args) < 2: return usage() if len(args) < 1: return usage()
prof = args.pop(0)
name = args.pop(0) name = args.pop(0)
prof = name+'.prof'
i = name.rindex('.') i = name.rindex('.')
(modname, funcname) = (name[:i], name[i+1:]) (modname, funcname) = (name[:i], name[i+1:])
func = getattr(__import__(modname, fromlist=[modname]), funcname) module = __import__(modname, fromlist=1)
func = getattr(module, funcname)
if args: if args:
args.insert(0, argv[0]) args.insert(0, argv[0])
prof = hotshot.Profile(prof) prof = hotshot.Profile(prof)