Got text positioning right.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@87 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f8510edffc
commit
6d91453187
|
@ -28,8 +28,8 @@ def get_textobjs(item, r=None):
|
|||
## PDFConverter
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
|
||||
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
|
||||
def __init__(self, rsrc, outfp, codec='ascii'):
|
||||
PDFPageAggregator.__init__(self, rsrc)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
return
|
||||
|
@ -66,8 +66,8 @@ class SGMLConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
|
||||
self.pagenum = pagenum
|
||||
self.pagepad = pagepad
|
||||
self.scale = scale
|
||||
|
@ -75,25 +75,12 @@ class HTMLConverter(PDFConverter):
|
|||
self.outfp.write('</head><body>\n')
|
||||
self.yoffset = self.pagepad
|
||||
self.cluster_margin = cluster_margin
|
||||
self.show_text_border = False
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_pageobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
def f(item):
|
||||
if isinstance(item, FigureItem):
|
||||
for child in item.objs:
|
||||
f(child)
|
||||
elif isinstance(item, TextItem):
|
||||
if item.direction == 2:
|
||||
wmode = 'tb-rl'
|
||||
else:
|
||||
wmode = 'lr-tb'
|
||||
(x,_,_,y) = item.bbox
|
||||
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||
(wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale))
|
||||
self.outfp.write(enc(item.text, self.codec))
|
||||
self.outfp.write('</span>\n')
|
||||
(x0,y0,x1,y1) = page.bbox
|
||||
self.yoffset += y1
|
||||
if self.pagenum:
|
||||
|
@ -102,8 +89,26 @@ class HTMLConverter(PDFConverter):
|
|||
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
def draw(item):
|
||||
if isinstance(item, FigureItem):
|
||||
for child in item.objs:
|
||||
draw(child)
|
||||
elif isinstance(item, TextItem):
|
||||
if item.direction == 2:
|
||||
wmode = 'tb-rl'
|
||||
else:
|
||||
wmode = 'lr-tb'
|
||||
(x0,y0,x1,y1) = item.bbox
|
||||
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||
(wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
|
||||
self.outfp.write(enc(item.text, self.codec))
|
||||
self.outfp.write('</span>\n')
|
||||
if self.show_text_border:
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
for child in page.objs:
|
||||
f(child)
|
||||
draw(child)
|
||||
if self.cluster_margin:
|
||||
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
|
||||
for ((x0,y0,x1,y1),_,objs) in clusters:
|
||||
|
@ -124,8 +129,8 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None, splitwords=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
|
||||
self.pagenum = pagenum
|
||||
if cluster_margin == None:
|
||||
cluster_margin = 0.5
|
||||
|
@ -288,9 +293,9 @@ def main(argv):
|
|||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||
rsrc = PDFResourceManager()
|
||||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'tag':
|
||||
|
|
|
@ -3,7 +3,8 @@ import sys
|
|||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
||||
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix, \
|
||||
matrix2str, rect2str, point2str
|
||||
|
||||
|
||||
## PDFDevice
|
||||
|
@ -84,71 +85,48 @@ class TextItem(object):
|
|||
self.matrix = matrix
|
||||
self.font = font
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
self.origin = (tx,ty)
|
||||
self.direction = 0
|
||||
self.text = ''
|
||||
scaling *= .01
|
||||
adv = 0
|
||||
for (char,cid) in chars:
|
||||
self.text += char
|
||||
adv += font.char_width(cid)
|
||||
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
|
||||
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||
if not self.font.is_vertical():
|
||||
# horizontal text
|
||||
spwidth = font.space_width()
|
||||
self.direction = 1
|
||||
w = 0
|
||||
dx = 0
|
||||
prev = ' '
|
||||
for (char,cid,t) in chars:
|
||||
if char:
|
||||
if prev != ' ' and spwidth < dx:
|
||||
self.text += ' '
|
||||
prev = char
|
||||
self.text += char
|
||||
dx = 0
|
||||
w += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
else:
|
||||
t *= .001
|
||||
dx -= t
|
||||
w -= t * fontsize * scaling
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||
ty += descent
|
||||
(w,h) = apply_matrix_norm(self.matrix, (w,size))
|
||||
self.adv = (w, 0)
|
||||
self.bbox = (tx, ty, tx+w, ty+h)
|
||||
self.adv = (dx, 0)
|
||||
self.bbox = (tx, ty, tx+dx, ty+dy)
|
||||
else:
|
||||
# vertical text
|
||||
self.direction = 2
|
||||
disp = 0
|
||||
h = 0
|
||||
for (char,cid,disp) in chars:
|
||||
if not char: continue
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||
self.text += font.to_unicode(cid)
|
||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
break
|
||||
for (char,cid,_) in chars[1:]:
|
||||
if not char: continue
|
||||
self.text += font.to_unicode(cid)
|
||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
(w,h) = apply_matrix_norm(self.matrix, (size,h))
|
||||
tx -= w/2
|
||||
(_,cid) = chars[0]
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
|
||||
tx -= dx/2
|
||||
ty += disp
|
||||
self.adv = (0, h)
|
||||
self.bbox = (tx, ty+h, tx+w, ty)
|
||||
self.adv = (0, dy)
|
||||
self.bbox = (tx, ty+dy, tx+dx, ty)
|
||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
|
||||
(self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))
|
||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
|
||||
(matrix2str(self.matrix), self.font, self.fontsize,
|
||||
rect2str(self.bbox), self.text, point2str(self.adv)))
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, pageno=1, splitwords=False):
|
||||
def __init__(self, rsrc, pageno=1):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
self.pageno = pageno
|
||||
self.splitwords = splitwords
|
||||
self.stack = []
|
||||
return
|
||||
|
||||
|
@ -181,12 +159,22 @@ class PDFPageAggregator(PDFDevice):
|
|||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
return None
|
||||
|
||||
def render_chars(self, textmatrix, textstate, chars):
    '''Emit one TextItem for a pending run of characters.

    textmatrix: matrix handed to TextItem as its placement matrix.
    textstate:  supplies font, fontsize, charspace and scaling.
    chars:      the accumulated run; an empty run emits nothing.

    Returns the new item's `adv` pair, or (0, 0) when nothing was emitted,
    so the caller can always unpack the result as (dx, dy).
    '''
    if chars:
        text_item = TextItem(
            textmatrix,
            textstate.font,
            textstate.fontsize,
            textstate.charspace,
            textstate.scaling,
            chars,
        )
        # Attach the run to whatever container is currently being filled.
        self.cur_item.add(text_item)
        return text_item.adv
    return (0, 0)
|
||||
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
font = textstate.font
|
||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||
chars = []
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
chars.append((None, None, x))
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx -= x * textstate.scaling * .0001
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
chars = []
|
||||
else:
|
||||
for cid in font.decode(x):
|
||||
try:
|
||||
|
@ -194,20 +182,11 @@ class PDFPageAggregator(PDFDevice):
|
|||
except PDFUnicodeNotDefined, e:
|
||||
(cidcoding, cid) = e.args
|
||||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
chars.append((char, cid, font.char_disp(cid)))
|
||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||
word = []
|
||||
for (char, cid, disp) in chars:
|
||||
word.append((char,cid,disp))
|
||||
if self.splitwords and cid == 32 and not font.is_multibyte():
|
||||
if word:
|
||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
|
||||
self.cur_item.add(item)
|
||||
(dx,dy) = item.adv
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
word = []
|
||||
if word:
|
||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
|
||||
self.cur_item.add(item)
|
||||
chars.append((char, cid))
|
||||
if cid == 32 and not font.is_multibyte():
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
chars = []
|
||||
self.render_chars(textmatrix, textstate, chars)
|
||||
return
|
||||
|
|
|
@ -23,6 +23,13 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
|||
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
|
||||
return (a*p+c*q, b*p+d*q)
|
||||
|
||||
# display functions
|
||||
def matrix2str(m):
    '''Format a 6-element matrix (a,b,c,d,e,f) as "[a, b, c, d, (e, f)]".

    Every component is printed with one decimal place.  The argument is
    unpacked in the body instead of the signature because tuple parameters
    are a syntax error on Python 3; callers still pass a single
    6-element sequence positionally.
    '''
    (a, b, c, d, e, f) = m
    return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a, b, c, d, e, f)
|
||||
def rect2str(rect):
    '''Format a rectangle (x0,y0,x1,y1) as "(x0, y0)-(x1, y1)".

    Every coordinate is printed with one decimal place.  The argument is
    unpacked in the body instead of the signature because tuple parameters
    are a syntax error on Python 3; callers still pass a single
    4-element sequence positionally.
    '''
    (x0, y0, x1, y1) = rect
    return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0, y0, x1, y1)
|
||||
def point2str(point):
    '''Format a point (x,y) as "(x, y)" with one decimal place each.

    The argument is unpacked in the body instead of the signature because
    tuple parameters are a syntax error on Python 3; callers still pass a
    single 2-element sequence positionally.
    '''
    (x, y) = point
    return '(%.1f, %.1f)' % (x, y)
|
||||
|
||||
## Utilities
|
||||
##
|
||||
|
|
|
@ -5,15 +5,16 @@ def prof_main(argv):
|
|||
import getopt
|
||||
import hotshot, hotshot.stats
|
||||
def usage():
|
||||
print 'usage: %s output.prof mod.func [args ...]' % argv[0]
|
||||
print 'usage: %s module.function [args ...]' % argv[0]
|
||||
return 100
|
||||
args = argv[1:]
|
||||
if len(args) < 2: return usage()
|
||||
prof = args.pop(0)
|
||||
if len(args) < 1: return usage()
|
||||
name = args.pop(0)
|
||||
prof = name+'.prof'
|
||||
i = name.rindex('.')
|
||||
(modname, funcname) = (name[:i], name[i+1:])
|
||||
func = getattr(__import__(modname, fromlist=[modname]), funcname)
|
||||
module = __import__(modname, fromlist=1)
|
||||
func = getattr(module, funcname)
|
||||
if args:
|
||||
args.insert(0, argv[0])
|
||||
prof = hotshot.Profile(prof)
|
||||
|
|
Loading…
Reference in New Issue