Got text positioning right.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@87 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-04-18 17:15:49 +00:00
parent f8510edffc
commit 6d91453187
4 changed files with 78 additions and 86 deletions

View File

@ -28,8 +28,8 @@ def get_textobjs(item, r=None):
## PDFConverter
class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
def __init__(self, rsrc, outfp, codec='ascii'):
PDFPageAggregator.__init__(self, rsrc)
self.outfp = outfp
self.codec = codec
return
@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
self.pagepad = pagepad
self.scale = scale
@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad
self.cluster_margin = cluster_margin
self.show_text_border = False
return
def end_page(self, page):
from cluster import cluster_pageobjs
page = PDFConverter.end_page(self, page)
def f(item):
if isinstance(item, FigureItem):
for child in item.objs:
f(child)
elif isinstance(item, TextItem):
if item.direction == 2:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
(x,_,_,y) = item.bbox
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
(x0,y0,x1,y1) = page.bbox
self.yoffset += y1
if self.pagenum:
@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter):
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
def draw(item):
if isinstance(item, FigureItem):
for child in item.objs:
draw(child)
elif isinstance(item, TextItem):
if item.direction == 2:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
(x0,y0,x1,y1) = item.bbox
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
if self.show_text_border:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
for child in page.objs:
f(child)
draw(child)
if self.cluster_margin:
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
for ((x0,y0,x1,y1),_,objs) in clusters:
@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
if cluster_margin == None:
cluster_margin = 0.5
@ -288,9 +293,9 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager()
if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
device = SGMLConverter(rsrc, outfp, codec=codec)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'tag':

View File

@ -3,7 +3,8 @@ import sys
stdout = sys.stdout
stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
matrix2str, rect2str, point2str
## PDFDevice
@ -84,71 +85,48 @@ class TextItem(object):
self.matrix = matrix
self.font = font
(_,_,_,_,tx,ty) = self.matrix
self.origin = (tx,ty)
self.direction = 0
self.text = ''
scaling *= .01
adv = 0
for (char,cid) in chars:
self.text += char
adv += font.char_width(cid)
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
size = (font.get_ascent() - font.get_descent()) * fontsize
if not self.font.is_vertical():
# horizontal text
spwidth = font.space_width()
self.direction = 1
w = 0
dx = 0
prev = ' '
for (char,cid,t) in chars:
if char:
if prev != ' ' and spwidth < dx:
self.text += ' '
prev = char
self.text += char
dx = 0
w += (font.char_width(cid) * fontsize + charspace) * scaling
else:
t *= .001
dx -= t
w -= t * fontsize * scaling
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
(w,h) = apply_matrix_norm(self.matrix, (w,size))
self.adv = (w, 0)
self.bbox = (tx, ty, tx+w, ty+h)
self.adv = (dx, 0)
self.bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
self.direction = 2
disp = 0
h = 0
for (char,cid,disp) in chars:
if not char: continue
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
break
for (char,cid,_) in chars[1:]:
if not char: continue
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (size,h))
tx -= w/2
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
self.adv = (0, h)
self.bbox = (tx, ty+h, tx+w, ty)
self.adv = (0, dy)
self.bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
return
def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
(self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
(matrix2str(self.matrix), self.font, self.fontsize,
rect2str(self.bbox), self.text, point2str(self.adv)))
## PDFPageAggregator
##
class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, splitwords=False):
def __init__(self, rsrc, pageno=1):
PDFDevice.__init__(self, rsrc)
self.pageno = pageno
self.splitwords = splitwords
self.stack = []
return
@ -181,12 +159,22 @@ class PDFPageAggregator(PDFDevice):
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return None
# Build a single TextItem from `chars`, add it to self.cur_item, and
# return that item's advance (item.adv) so the caller can move the text matrix.
# An empty `chars` creates no item and returns a zero advance (0, 0).
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
# font, font size, character spacing and scaling are all taken from textstate
item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
textmatrix = mult_matrix(textmatrix, self.ctm)
chars = []
for x in seq:
if isinstance(x, int) or isinstance(x, float):
chars.append((None, None, x))
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx -= x * textstate.scaling * .0001
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
else:
for cid in font.decode(x):
try:
@ -194,20 +182,11 @@ class PDFPageAggregator(PDFDevice):
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid, font.char_disp(cid)))
textmatrix = mult_matrix(textmatrix, self.ctm)
word = []
for (char, cid, disp) in chars:
word.append((char,cid,disp))
if self.splitwords and cid == 32 and not font.is_multibyte():
if word:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
word = []
if word:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
self.cur_item.add(item)
chars.append((char, cid))
if cid == 32 and not font.is_multibyte():
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = []
self.render_chars(textmatrix, textstate, chars)
return

View File

@ -23,6 +23,13 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
return (a*p+c*q, b*p+d*q)
# display functions
def matrix2str(m):
    '''Format a 6-element matrix (a,b,c,d,e,f) as a short display string.

    Each component is shown with one decimal place; the last two
    components (e, f) are grouped in parentheses.
    '''
    # Unpack in the body instead of the signature: tuple parameters were
    # removed in Python 3 (PEP 3113), while this form works on 2 and 3.
    (a, b, c, d, e, f) = m
    return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a, b, c, d, e, f)
def rect2str(rect):
    '''Format a rectangle (x0,y0,x1,y1) as '(x0, y0)-(x1, y1)'.

    Each coordinate is shown with one decimal place.
    '''
    # Unpack in the body instead of the signature: tuple parameters were
    # removed in Python 3 (PEP 3113), while this form works on 2 and 3.
    (x0, y0, x1, y1) = rect
    return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0, y0, x1, y1)
def point2str(point):
    '''Format a point (x,y) as '(x, y)' with one decimal place each.'''
    # Unpack in the body instead of the signature: tuple parameters were
    # removed in Python 3 (PEP 3113), while this form works on 2 and 3.
    (x, y) = point
    return '(%.1f, %.1f)' % (x, y)
## Utilities
##

View File

@ -5,15 +5,16 @@ def prof_main(argv):
import getopt
import hotshot, hotshot.stats
def usage():
print 'usage: %s output.prof mod.func [args ...]' % argv[0]
print 'usage: %s module.function [args ...]' % argv[0]
return 100
args = argv[1:]
if len(args) < 2: return usage()
prof = args.pop(0)
if len(args) < 1: return usage()
name = args.pop(0)
prof = name+'.prof'
i = name.rindex('.')
(modname, funcname) = (name[:i], name[i+1:])
func = getattr(__import__(modname, fromlist=[modname]), funcname)
module = __import__(modname, fromlist=1)
func = getattr(module, funcname)
if args:
args.insert(0, argv[0])
prof = hotshot.Profile(prof)