Improved handling of word spacing (the PDF `Tw` / wordspace text-state parameter).

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@55 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-12-25 15:09:54 +00:00
parent 33f709a0d8
commit 71be16febe
6 changed files with 108 additions and 48 deletions

View File

@ -3,7 +3,7 @@ import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \ from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
## PageItem ## PageItem
@ -37,47 +37,73 @@ class FigureItem(PageItem):
## ##
class TextItem(object): class TextItem(object):
def __init__(self, matrix, font, fontsize, width, text): SPACE_WIDTH = 0.6
def __init__(self, matrix, font, fontsize, charspace, scaling, text):
self.matrix = matrix self.matrix = matrix
self.font = font self.font = font
(a,b,c,d,tx,ty) = self.matrix (_,_,_,_,tx,ty) = self.matrix
self.origin = (tx,ty) self.origin = (tx,ty)
self.direction = 0 self.direction = 0
self.text = ''
if not self.font.is_vertical(): if not self.font.is_vertical():
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
self.direction = 1 self.direction = 1
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) (_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
self.width = abs(self.width)
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
ty += descent ty += descent
self.bbox = (tx, ty, tx+self.width, ty+self.height) w = 0
dx = 0
prev = ' '
for t in text:
if isinstance(t, tuple):
if prev != ' ' and spwidth < dx:
self.text += ' '
(_,char) = t
self.text += char
prev = char
dx = 0
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
else:
dx -= t
w += t * fontsize * .001 * scaling * .01
self.adv = (w, 0)
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
self.bbox = (tx, ty, tx+w, ty+h)
else: else:
self.direction = 2 self.direction = 2
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width)) disp = 0
self.width = abs(self.width) h = 0
(disp,_) = text[0] for t in text:
(_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001)) if isinstance(t, tuple):
tx -= self.width/2 (disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
break
for t in text:
if isinstance(t, tuple):
(_,char) = t
self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
self.adv = (0, h)
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
tx -= w/2
ty += disp ty += disp
self.bbox = (tx, ty+self.height, tx+self.width, ty) self.bbox = (tx, ty+h, tx+w, ty)
self.text = ''.join( c for (_,c) in text ) self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
self.fontsize = max(w,h)
return return
def __repr__(self): def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' % return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text)) (self.matrix, self.font, self.fontsize, self.bbox, self.text))
## TextConverter ## PageAggregator
## ##
class TextConverter(PDFDevice): class PageAggregator(PDFDevice):
def __init__(self, rsrc, outfp, codec='utf-8', debug=0): def __init__(self, rsrc, debug=0):
PDFDevice.__init__(self, rsrc, debug=debug) PDFDevice.__init__(self, rsrc, debug=debug)
self.outfp = outfp
self.codec = codec
self.pageno = 0 self.pageno = 0
self.stack = [] self.stack = []
return return
@ -109,14 +135,12 @@ class TextConverter(PDFDevice):
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return None return None
def render_string(self, textstate, textmatrix, size, seq, ratio=0.6): def render_string(self, textstate, textmatrix, seq):
font = textstate.font font = textstate.font
spwidth = int(-font.char_width(32) * ratio) # space width
text = [] text = []
for x in seq: for x in seq:
if isinstance(x, int) or isinstance(x, float): if isinstance(x, int) or isinstance(x, float):
if not font.is_vertical() and x <= spwidth: text.append(x)
text.append((0, ' '))
else: else:
chars = font.decode(x) chars = font.decode(x)
for cid in chars: for cid in chars:
@ -125,11 +149,20 @@ class TextConverter(PDFDevice):
text.append((font.char_disp(cid), char)) text.append((font.char_disp(cid), char))
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
s = self.handle_undefined_char(cidcoding, cid) unc = self.handle_undefined_char(cidcoding, cid)
if s: if unc:
text.append(s) text.append(unc)
if cid == 32 and not font.is_multibyte():
if text:
item = TextItem(mult_matrix(textmatrix, self.ctm),
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
text = []
if text: if text:
item = TextItem(mult_matrix(textmatrix, self.ctm), item = TextItem(mult_matrix(textmatrix, self.ctm),
font, textstate.fontsize, size, text) font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item) self.cur_item.add(item)
return return

View File

@ -2,11 +2,11 @@
import sys import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \ from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined PDFPageInterpreter, PDFUnicodeNotDefined
from pdflib.cmap import CMapDB from cmap import CMapDB
from pdflib.page import PageItem, FigureItem, TextItem, TextConverter from page import PageItem, FigureItem, TextItem, PageAggregator
def enc(x, codec): def enc(x, codec):
@ -18,6 +18,16 @@ def encprops(props, codec):
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) ) return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
## TextConverter
class TextConverter(PageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', debug=0):
PageAggregator.__init__(self, rsrc, debug=debug)
self.outfp = outfp
self.codec = codec
return
## SGMLConverter ## SGMLConverter
## ##
class SGMLConverter(TextConverter): class SGMLConverter(TextConverter):
@ -156,7 +166,7 @@ class TagExtractor(PDFDevice):
# pdf2txt # pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass class TextExtractionNotAllowed(RuntimeError): pass
def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0): def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp, debug=debug)

View File

@ -73,10 +73,16 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
a0*c1+c0*d1, b0*c1+d0*d1, a0*c1+c0*d1, b0*c1+d0*d1,
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix((a,b,c,d,e,f), (x,y)):
return (a,b,c,d,e+x,f+y)
def apply_matrix((a,b,c,d,e,f), (x,y)): def apply_matrix((a,b,c,d,e,f), (x,y)):
'''Applies a matrix to coordinates.''' '''Applies a matrix to coordinates.'''
return (a*x+c*y+e, b*x+d*y+f) return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm((a,b,c,d,e,f), (x,y)):
return (a*x+c*y, b*x+d*y)
## Fonts ## Fonts
## ##
@ -103,6 +109,9 @@ class PDFFont(object):
def is_vertical(self): def is_vertical(self):
return False return False
def is_multibyte(self):
return False
def decode(self, bytes): def decode(self, bytes):
return map(ord, bytes) return map(ord, bytes)
@ -372,6 +381,9 @@ class PDFCIDFont(PDFFont):
def is_vertical(self): def is_vertical(self):
return self.vertical return self.vertical
def is_multibyte(self):
return True
def decode(self, bytes): def decode(self, bytes):
return self.cmap.decode(bytes) return self.cmap.decode(bytes)
@ -498,7 +510,7 @@ class PDFDevice(object):
def end_figure(self, name): def end_figure(self, name):
return return
def render_string(self, textstate, textmatrix, size, seq): def render_string(self, textstate, textmatrix, seq):
raise NotImplementedError raise NotImplementedError
def render_image(self, stream, size, matrix): def render_image(self, stream, size, matrix):
raise NotImplementedError raise NotImplementedError
@ -928,15 +940,16 @@ class PDFPageInterpreter(object):
def do_TJ(self, seq): def do_TJ(self, seq):
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
textstate = self.textstate textstate = self.textstate
matrix = translate_matrix(textstate.matrix, textstate.linematrix)
self.device.render_string(textstate, matrix, seq)
font = textstate.font font = textstate.font
(a,b,c,d,e,f) = textstate.matrix
(lx,ly) = textstate.linematrix
s = ''.join( x for x in seq if isinstance(x, str) ) s = ''.join( x for x in seq if isinstance(x, str) )
n = sum( x for x in seq if not isinstance(x, str) ) n = sum( x for x in seq if not isinstance(x, str) )
w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize + w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
len(s) * textstate.charspace + if not font.is_multibyte():
s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0 w += s.count(' ')*textstate.wordspace
self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq) w *= (textstate.scaling * .01)
(lx,ly) = textstate.linematrix
if font.is_vertical(): if font.is_vertical():
ly += w ly += w
else: else:

View File

@ -586,7 +586,7 @@ class PDFDocument(object):
self.parser.seek(index) self.parser.seek(index)
(_,objid1) = self.parser.nexttoken() # objid (_,objid1) = self.parser.nexttoken() # objid
(_,genno) = self.parser.nexttoken() # genno (_,genno) = self.parser.nexttoken() # genno
assert objid1 == objid, (objid, objid1) #assert objid1 == objid, (objid, objid1)
(_,kwd) = self.parser.nexttoken() (_,kwd) = self.parser.nexttoken()
if kwd != KEYWORD_OBJ: if kwd != KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index) raise PDFSyntaxError('Invalid object spec: offset=%r' % index)

View File

@ -32,11 +32,15 @@ endobj
>> >>
endobj endobj
5 0 obj 5 0 obj
<< /Length 46 >> << /Length 86 >>
stream stream
BT BT
/F1 24 Tf /F1 24 Tf
1 0 0 1 100 700 TD 100 600 Td
0 Tw
( Hello World ) Tj
0 100 Td
100 Tw
( Hello World ) Tj ( Hello World ) Tj
ET ET
endstream endstream

View File

@ -89,7 +89,7 @@ def dumpallobjs(out, doc, codec=None):
dumpxml(out, obj, codec=codec) dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n') out.write('\n</object>\n\n')
except: except:
pass raise
dumptrailers(out, doc) dumptrailers(out, doc)
out.write('</pdf>') out.write('</pdf>')
return return