wordspace handling improved.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@55 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-12-25 15:09:54 +00:00
parent 33f709a0d8
commit 71be16febe
6 changed files with 108 additions and 48 deletions

View File

@ -3,7 +3,7 @@ import sys
stdout = sys.stdout
stderr = sys.stderr
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix
mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
## PageItem
@ -37,47 +37,73 @@ class FigureItem(PageItem):
##
class TextItem(object):
def __init__(self, matrix, font, fontsize, width, text):
SPACE_WIDTH = 0.6
def __init__(self, matrix, font, fontsize, charspace, scaling, text):
self.matrix = matrix
self.font = font
(a,b,c,d,tx,ty) = self.matrix
(_,_,_,_,tx,ty) = self.matrix
self.origin = (tx,ty)
self.direction = 0
self.text = ''
if not self.font.is_vertical():
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
self.direction = 1
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
self.width = abs(self.width)
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
ty += descent
self.bbox = (tx, ty, tx+self.width, ty+self.height)
w = 0
dx = 0
prev = ' '
for t in text:
if isinstance(t, tuple):
if prev != ' ' and spwidth < dx:
self.text += ' '
(_,char) = t
self.text += char
prev = char
dx = 0
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
else:
dx -= t
w += t * fontsize * .001 * scaling * .01
self.adv = (w, 0)
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
self.bbox = (tx, ty, tx+w, ty+h)
else:
self.direction = 2
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width))
self.width = abs(self.width)
(disp,_) = text[0]
(_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001))
tx -= self.width/2
disp = 0
h = 0
for t in text:
if isinstance(t, tuple):
(disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
break
for t in text:
if isinstance(t, tuple):
(_,char) = t
self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
self.adv = (0, h)
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
tx -= w/2
ty += disp
self.bbox = (tx, ty+self.height, tx+self.width, ty)
self.text = ''.join( c for (_,c) in text )
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
self.fontsize = max(w,h)
self.bbox = (tx, ty+h, tx+w, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
return
def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
(self.matrix, self.font, self.fontsize, self.bbox, self.text))
## TextConverter
## PageAggregator
##
class TextConverter(PDFDevice):
class PageAggregator(PDFDevice):
def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
def __init__(self, rsrc, debug=0):
PDFDevice.__init__(self, rsrc, debug=debug)
self.outfp = outfp
self.codec = codec
self.pageno = 0
self.stack = []
return
@ -109,14 +135,12 @@ class TextConverter(PDFDevice):
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return None
def render_string(self, textstate, textmatrix, size, seq, ratio=0.6):
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
spwidth = int(-font.char_width(32) * ratio) # space width
text = []
for x in seq:
if isinstance(x, int) or isinstance(x, float):
if not font.is_vertical() and x <= spwidth:
text.append((0, ' '))
text.append(x)
else:
chars = font.decode(x)
for cid in chars:
@ -125,11 +149,20 @@ class TextConverter(PDFDevice):
text.append((font.char_disp(cid), char))
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
s = self.handle_undefined_char(cidcoding, cid)
if s:
text.append(s)
unc = self.handle_undefined_char(cidcoding, cid)
if unc:
text.append(unc)
if cid == 32 and not font.is_multibyte():
if text:
item = TextItem(mult_matrix(textmatrix, self.ctm),
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
text = []
if text:
item = TextItem(mult_matrix(textmatrix, self.ctm),
font, textstate.fontsize, size, text)
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item)
return

View File

@ -2,11 +2,11 @@
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined
from pdflib.cmap import CMapDB
from pdflib.page import PageItem, FigureItem, TextItem, TextConverter
from cmap import CMapDB
from page import PageItem, FigureItem, TextItem, PageAggregator
def enc(x, codec):
@ -18,6 +18,16 @@ def encprops(props, codec):
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
## TextConverter
class TextConverter(PageAggregator):

    '''A PageAggregator that also keeps an output stream and a codec name.'''

    def __init__(self, rsrc, outfp, codec='ascii', debug=0):
        # The base class only receives the resource manager and debug level;
        # the output stream and codec are stored on this instance.
        PageAggregator.__init__(self, rsrc, debug=debug)
        (self.outfp, self.codec) = (outfp, codec)
## SGMLConverter
##
class SGMLConverter(TextConverter):
@ -156,7 +166,7 @@ class TagExtractor(PDFDevice):
# pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass
def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
doc = PDFDocument(debug=debug)
fp = file(fname, 'rb')
parser = PDFParser(doc, fp, debug=debug)

View File

@ -73,10 +73,16 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
a0*c1+c0*d1, b0*c1+d0*d1,
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    '''Translates a matrix by an offset given in the matrix's own space.

    m is a 6-tuple (a,b,c,d,e,f) and v is an (x,y) offset. The offset
    is passed through the linear part (a,b,c,d) before it is added to
    the translation part (e,f), so a scaled or rotated matrix moves by
    the scaled or rotated offset. For a matrix whose linear part is the
    identity this is the same as adding (x,y) to (e,f).

    Returns the translated matrix as a new 6-tuple.
    '''
    # Explicit unpacking instead of tuple parameters, which Python 3 rejects.
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, x*a + y*c + e, x*b + y*d + f)
def apply_matrix(m, v):
    '''Applies a matrix to coordinates.

    m is a 6-tuple (a,b,c,d,e,f) and v is an (x,y) point. The point is
    transformed by the linear part (a,b,c,d) and then shifted by the
    translation part (e,f).

    Returns the transformed point as an (x,y) tuple.
    '''
    # Explicit unpacking instead of tuple parameters, which Python 3 rejects.
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x + c*y + e, b*x + d*y + f)
def apply_matrix_norm(m, v):
    '''Applies only the linear part of a matrix to a vector.

    m is a 6-tuple (a,b,c,d,e,f) and v is an (x,y) vector. Unlike
    apply_matrix, the translation part (e,f) is ignored, which suits
    sizes and displacements rather than positions.

    Returns the transformed vector as an (x,y) tuple.
    '''
    # Explicit unpacking instead of tuple parameters, which Python 3 rejects.
    (a, b, c, d, _e, _f) = m
    (x, y) = v
    return (a*x + c*y, b*x + d*y)
## Fonts
##
@ -103,6 +109,9 @@ class PDFFont(object):
# Reports whether this font is vertical. The base font class always
# answers False; PDFCIDFont overrides this with its own value.
def is_vertical(self):
return False
# Reports whether this font is multibyte. The base font class always
# answers False; PDFCIDFont overrides this to answer True.
def is_multibyte(self):
return False
def decode(self, bytes):
    '''Converts a string into a list of character codes, one per element.'''
    return [ ord(c) for c in bytes ]
@ -373,6 +382,9 @@ class PDFCIDFont(PDFFont):
# Reports whether this CID font is vertical by returning the value
# stored in the instance's `vertical` attribute.
def is_vertical(self):
return self.vertical
# Reports whether this font is multibyte: always True for this class.
def is_multibyte(self):
return True
# Decodes `bytes` by delegating to this font's cmap object and returns
# whatever self.cmap.decode returns.
def decode(self, bytes):
return self.cmap.decode(bytes)
@ -498,7 +510,7 @@ class PDFDevice(object):
# Default implementation: ignores `name`, does nothing and returns None.
def end_figure(self, name):
return
def render_string(self, textstate, textmatrix, size, seq):
def render_string(self, textstate, textmatrix, seq):
raise NotImplementedError
# Not implemented at this level: always raises NotImplementedError,
# whatever `stream`, `size` and `matrix` are.
def render_image(self, stream, size, matrix):
raise NotImplementedError
@ -928,15 +940,16 @@ class PDFPageInterpreter(object):
def do_TJ(self, seq):
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
textstate = self.textstate
matrix = translate_matrix(textstate.matrix, textstate.linematrix)
self.device.render_string(textstate, matrix, seq)
font = textstate.font
(a,b,c,d,e,f) = textstate.matrix
(lx,ly) = textstate.linematrix
s = ''.join( x for x in seq if isinstance(x, str) )
n = sum( x for x in seq if not isinstance(x, str) )
w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
len(s) * textstate.charspace +
s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
if not font.is_multibyte():
w += s.count(' ')*textstate.wordspace
w *= (textstate.scaling * .01)
(lx,ly) = textstate.linematrix
if font.is_vertical():
ly += w
else:

View File

@ -586,7 +586,7 @@ class PDFDocument(object):
self.parser.seek(index)
(_,objid1) = self.parser.nexttoken() # objid
(_,genno) = self.parser.nexttoken() # genno
assert objid1 == objid, (objid, objid1)
#assert objid1 == objid, (objid, objid1)
(_,kwd) = self.parser.nexttoken()
if kwd != KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)

View File

@ -32,11 +32,15 @@ endobj
>>
endobj
5 0 obj
<< /Length 46 >>
<< /Length 86 >>
stream
BT
/F1 24 Tf
1 0 0 1 100 700 TD
100 600 Td
0 Tw
( Hello World ) Tj
0 100 Td
100 Tw
( Hello World ) Tj
ET
endstream

View File

@ -89,7 +89,7 @@ def dumpallobjs(out, doc, codec=None):
dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n')
except:
pass
raise
dumptrailers(out, doc)
out.write('</pdf>')
return