Got text positioning right.
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@87 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f8510edffc
commit
6d91453187
|
@ -28,8 +28,8 @@ def get_textobjs(item, r=None):
|
||||||
## PDFConverter
|
## PDFConverter
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
|
def __init__(self, rsrc, outfp, codec='ascii'):
|
||||||
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
|
PDFPageAggregator.__init__(self, rsrc)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
|
||||||
self.pagenum = pagenum
|
self.pagenum = pagenum
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('</head><body>\n')
|
self.outfp.write('</head><body>\n')
|
||||||
self.yoffset = self.pagepad
|
self.yoffset = self.pagepad
|
||||||
self.cluster_margin = cluster_margin
|
self.cluster_margin = cluster_margin
|
||||||
|
self.show_text_border = False
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
from cluster import cluster_pageobjs
|
from cluster import cluster_pageobjs
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
def f(item):
|
|
||||||
if isinstance(item, FigureItem):
|
|
||||||
for child in item.objs:
|
|
||||||
f(child)
|
|
||||||
elif isinstance(item, TextItem):
|
|
||||||
if item.direction == 2:
|
|
||||||
wmode = 'tb-rl'
|
|
||||||
else:
|
|
||||||
wmode = 'lr-tb'
|
|
||||||
(x,_,_,y) = item.bbox
|
|
||||||
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
|
||||||
(wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale))
|
|
||||||
self.outfp.write(enc(item.text, self.codec))
|
|
||||||
self.outfp.write('</span>\n')
|
|
||||||
(x0,y0,x1,y1) = page.bbox
|
(x0,y0,x1,y1) = page.bbox
|
||||||
self.yoffset += y1
|
self.yoffset += y1
|
||||||
if self.pagenum:
|
if self.pagenum:
|
||||||
|
@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||||
|
def draw(item):
|
||||||
|
if isinstance(item, FigureItem):
|
||||||
|
for child in item.objs:
|
||||||
|
draw(child)
|
||||||
|
elif isinstance(item, TextItem):
|
||||||
|
if item.direction == 2:
|
||||||
|
wmode = 'tb-rl'
|
||||||
|
else:
|
||||||
|
wmode = 'lr-tb'
|
||||||
|
(x0,y0,x1,y1) = item.bbox
|
||||||
|
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||||
|
(wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
|
||||||
|
self.outfp.write(enc(item.text, self.codec))
|
||||||
|
self.outfp.write('</span>\n')
|
||||||
|
if self.show_text_border:
|
||||||
|
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
|
||||||
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
|
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||||
for child in page.objs:
|
for child in page.objs:
|
||||||
f(child)
|
draw(child)
|
||||||
if self.cluster_margin:
|
if self.cluster_margin:
|
||||||
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
|
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
|
||||||
for ((x0,y0,x1,y1),_,objs) in clusters:
|
for ((x0,y0,x1,y1),_,objs) in clusters:
|
||||||
|
@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False):
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
|
||||||
self.pagenum = pagenum
|
self.pagenum = pagenum
|
||||||
if cluster_margin == None:
|
if cluster_margin == None:
|
||||||
cluster_margin = 0.5
|
cluster_margin = 0.5
|
||||||
|
@ -288,9 +293,9 @@ def main(argv):
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
device = SGMLConverter(rsrc, outfp, codec=codec)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
|
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'text':
|
elif outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
|
|
|
@ -3,7 +3,8 @@ import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
|
||||||
|
matrix2str, rect2str, point2str
|
||||||
|
|
||||||
|
|
||||||
## PDFDevice
|
## PDFDevice
|
||||||
|
@ -84,71 +85,48 @@ class TextItem(object):
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.font = font
|
self.font = font
|
||||||
(_,_,_,_,tx,ty) = self.matrix
|
(_,_,_,_,tx,ty) = self.matrix
|
||||||
self.origin = (tx,ty)
|
|
||||||
self.direction = 0
|
self.direction = 0
|
||||||
self.text = ''
|
self.text = ''
|
||||||
scaling *= .01
|
adv = 0
|
||||||
|
for (char,cid) in chars:
|
||||||
|
self.text += char
|
||||||
|
adv += font.char_width(cid)
|
||||||
|
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
|
||||||
size = (font.get_ascent() - font.get_descent()) * fontsize
|
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||||
if not self.font.is_vertical():
|
if not self.font.is_vertical():
|
||||||
# horizontal text
|
# horizontal text
|
||||||
spwidth = font.space_width()
|
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
w = 0
|
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
|
||||||
dx = 0
|
|
||||||
prev = ' '
|
|
||||||
for (char,cid,t) in chars:
|
|
||||||
if char:
|
|
||||||
if prev != ' ' and spwidth < dx:
|
|
||||||
self.text += ' '
|
|
||||||
prev = char
|
|
||||||
self.text += char
|
|
||||||
dx = 0
|
|
||||||
w += (font.char_width(cid) * fontsize + charspace) * scaling
|
|
||||||
else:
|
|
||||||
t *= .001
|
|
||||||
dx -= t
|
|
||||||
w -= t * fontsize * scaling
|
|
||||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||||
ty += descent
|
ty += descent
|
||||||
(w,h) = apply_matrix_norm(self.matrix, (w,size))
|
self.adv = (dx, 0)
|
||||||
self.adv = (w, 0)
|
self.bbox = (tx, ty, tx+dx, ty+dy)
|
||||||
self.bbox = (tx, ty, tx+w, ty+h)
|
|
||||||
else:
|
else:
|
||||||
# vertical text
|
# vertical text
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
disp = 0
|
(_,cid) = chars[0]
|
||||||
h = 0
|
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
|
||||||
for (char,cid,disp) in chars:
|
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
|
||||||
if not char: continue
|
tx -= dx/2
|
||||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
|
||||||
self.text += font.to_unicode(cid)
|
|
||||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
|
||||||
break
|
|
||||||
for (char,cid,_) in chars[1:]:
|
|
||||||
if not char: continue
|
|
||||||
self.text += font.to_unicode(cid)
|
|
||||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
|
||||||
(w,h) = apply_matrix_norm(self.matrix, (size,h))
|
|
||||||
tx -= w/2
|
|
||||||
ty += disp
|
ty += disp
|
||||||
self.adv = (0, h)
|
self.adv = (0, dy)
|
||||||
self.bbox = (tx, ty+h, tx+w, ty)
|
self.bbox = (tx, ty+dy, tx+dx, ty)
|
||||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
|
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
|
||||||
(self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))
|
(matrix2str(self.matrix), self.font, self.fontsize,
|
||||||
|
rect2str(self.bbox), self.text, point2str(self.adv)))
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
## PDFPageAggregator
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFDevice):
|
class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1, splitwords=False):
|
def __init__(self, rsrc, pageno=1):
|
||||||
PDFDevice.__init__(self, rsrc)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
self.splitwords = splitwords
|
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -181,12 +159,22 @@ class PDFPageAggregator(PDFDevice):
|
||||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def render_chars(self, textmatrix, textstate, chars):
|
||||||
|
if not chars: return (0, 0)
|
||||||
|
item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
|
||||||
|
self.cur_item.add(item)
|
||||||
|
return item.adv
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
|
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||||
chars = []
|
chars = []
|
||||||
for x in seq:
|
for x in seq:
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
if isinstance(x, int) or isinstance(x, float):
|
||||||
chars.append((None, None, x))
|
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||||
|
dx -= x * textstate.scaling * .0001
|
||||||
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
|
chars = []
|
||||||
else:
|
else:
|
||||||
for cid in font.decode(x):
|
for cid in font.decode(x):
|
||||||
try:
|
try:
|
||||||
|
@ -194,20 +182,11 @@ class PDFPageAggregator(PDFDevice):
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined, e:
|
||||||
(cidcoding, cid) = e.args
|
(cidcoding, cid) = e.args
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
chars.append((char, cid, font.char_disp(cid)))
|
chars.append((char, cid))
|
||||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
if cid == 32 and not font.is_multibyte():
|
||||||
word = []
|
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||||
for (char, cid, disp) in chars:
|
dx += textstate.wordspace * textstate.scaling * .01
|
||||||
word.append((char,cid,disp))
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
if self.splitwords and cid == 32 and not font.is_multibyte():
|
chars = []
|
||||||
if word:
|
self.render_chars(textmatrix, textstate, chars)
|
||||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
|
|
||||||
self.cur_item.add(item)
|
|
||||||
(dx,dy) = item.adv
|
|
||||||
dx += textstate.wordspace * textstate.scaling * .01
|
|
||||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
|
||||||
word = []
|
|
||||||
if word:
|
|
||||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
|
|
||||||
self.cur_item.add(item)
|
|
||||||
return
|
return
|
||||||
|
|
|
@ -23,6 +23,13 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
|
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
|
||||||
return (a*p+c*q, b*p+d*q)
|
return (a*p+c*q, b*p+d*q)
|
||||||
|
|
||||||
|
# display functions
|
||||||
|
def matrix2str((a,b,c,d,e,f)):
|
||||||
|
return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a,b,c,d,e,f)
|
||||||
|
def rect2str((x0,y0,x1,y1)):
|
||||||
|
return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0,y0,x1,y1)
|
||||||
|
def point2str((x,y)):
|
||||||
|
return '(%.1f, %.1f)' % (x,y)
|
||||||
|
|
||||||
## Utilities
|
## Utilities
|
||||||
##
|
##
|
||||||
|
|
|
@ -5,15 +5,16 @@ def prof_main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
import hotshot, hotshot.stats
|
import hotshot, hotshot.stats
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s output.prof mod.func [args ...]' % argv[0]
|
print 'usage: %s module.function [args ...]' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
args = argv[1:]
|
args = argv[1:]
|
||||||
if len(args) < 2: return usage()
|
if len(args) < 1: return usage()
|
||||||
prof = args.pop(0)
|
|
||||||
name = args.pop(0)
|
name = args.pop(0)
|
||||||
|
prof = name+'.prof'
|
||||||
i = name.rindex('.')
|
i = name.rindex('.')
|
||||||
(modname, funcname) = (name[:i], name[i+1:])
|
(modname, funcname) = (name[:i], name[i+1:])
|
||||||
func = getattr(__import__(modname, fromlist=[modname]), funcname)
|
module = __import__(modname, fromlist=1)
|
||||||
|
func = getattr(module, funcname)
|
||||||
if args:
|
if args:
|
||||||
args.insert(0, argv[0])
|
args.insert(0, argv[0])
|
||||||
prof = hotshot.Profile(prof)
|
prof = hotshot.Profile(prof)
|
||||||
|
|
Loading…
Reference in New Issue