wordspace handling improved.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@55 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
33f709a0d8
commit
71be16febe
|
@ -3,7 +3,7 @@ import sys
|
|||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
|
||||
mult_matrix, apply_matrix
|
||||
mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
||||
|
||||
|
||||
## PageItem
|
||||
|
@ -37,47 +37,73 @@ class FigureItem(PageItem):
|
|||
##
|
||||
class TextItem(object):
|
||||
|
||||
def __init__(self, matrix, font, fontsize, width, text):
|
||||
SPACE_WIDTH = 0.6
|
||||
|
||||
def __init__(self, matrix, font, fontsize, charspace, scaling, text):
|
||||
self.matrix = matrix
|
||||
self.font = font
|
||||
(a,b,c,d,tx,ty) = self.matrix
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
self.origin = (tx,ty)
|
||||
self.direction = 0
|
||||
self.text = ''
|
||||
if not self.font.is_vertical():
|
||||
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
|
||||
self.direction = 1
|
||||
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
||||
self.width = abs(self.width)
|
||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
|
||||
ty += descent
|
||||
self.bbox = (tx, ty, tx+self.width, ty+self.height)
|
||||
w = 0
|
||||
dx = 0
|
||||
prev = ' '
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
if prev != ' ' and spwidth < dx:
|
||||
self.text += ' '
|
||||
(_,char) = t
|
||||
self.text += char
|
||||
prev = char
|
||||
dx = 0
|
||||
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
||||
else:
|
||||
dx -= t
|
||||
w += t * fontsize * .001 * scaling * .01
|
||||
self.adv = (w, 0)
|
||||
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
|
||||
self.bbox = (tx, ty, tx+w, ty+h)
|
||||
else:
|
||||
self.direction = 2
|
||||
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width))
|
||||
self.width = abs(self.width)
|
||||
(disp,_) = text[0]
|
||||
(_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001))
|
||||
tx -= self.width/2
|
||||
disp = 0
|
||||
h = 0
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
(disp,char) = t
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||
self.text += char
|
||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
||||
break
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
(_,char) = t
|
||||
self.text += char
|
||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
||||
self.adv = (0, h)
|
||||
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
|
||||
tx -= w/2
|
||||
ty += disp
|
||||
self.bbox = (tx, ty+self.height, tx+self.width, ty)
|
||||
self.text = ''.join( c for (_,c) in text )
|
||||
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
|
||||
self.fontsize = max(w,h)
|
||||
self.bbox = (tx, ty+h, tx+w, ty)
|
||||
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
|
||||
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
|
||||
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
|
||||
(self.matrix, self.font, self.fontsize, self.bbox, self.text))
|
||||
|
||||
|
||||
## TextConverter
|
||||
## PageAggregator
|
||||
##
|
||||
class TextConverter(PDFDevice):
|
||||
class PageAggregator(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
|
||||
def __init__(self, rsrc, debug=0):
|
||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
self.pageno = 0
|
||||
self.stack = []
|
||||
return
|
||||
|
@ -109,14 +135,12 @@ class TextConverter(PDFDevice):
|
|||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
return None
|
||||
|
||||
def render_string(self, textstate, textmatrix, size, seq, ratio=0.6):
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
font = textstate.font
|
||||
spwidth = int(-font.char_width(32) * ratio) # space width
|
||||
text = []
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
if not font.is_vertical() and x <= spwidth:
|
||||
text.append((0, ' '))
|
||||
text.append(x)
|
||||
else:
|
||||
chars = font.decode(x)
|
||||
for cid in chars:
|
||||
|
@ -125,11 +149,20 @@ class TextConverter(PDFDevice):
|
|||
text.append((font.char_disp(cid), char))
|
||||
except PDFUnicodeNotDefined, e:
|
||||
(cidcoding, cid) = e.args
|
||||
s = self.handle_undefined_char(cidcoding, cid)
|
||||
if s:
|
||||
text.append(s)
|
||||
unc = self.handle_undefined_char(cidcoding, cid)
|
||||
if unc:
|
||||
text.append(unc)
|
||||
if cid == 32 and not font.is_multibyte():
|
||||
if text:
|
||||
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
||||
font, textstate.fontsize, size, text)
|
||||
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||
self.cur_item.add(item)
|
||||
(dx,dy) = item.adv
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
text = []
|
||||
if text:
|
||||
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
||||
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||
self.cur_item.add(item)
|
||||
return
|
||||
|
|
|
@ -2,11 +2,11 @@
|
|||
import sys
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFDevice, PDFResourceManager, \
|
||||
PDFPageInterpreter, PDFUnicodeNotDefined
|
||||
from pdflib.cmap import CMapDB
|
||||
from pdflib.page import PageItem, FigureItem, TextItem, TextConverter
|
||||
from cmap import CMapDB
|
||||
from page import PageItem, FigureItem, TextItem, PageAggregator
|
||||
|
||||
|
||||
def enc(x, codec):
|
||||
|
@ -18,6 +18,16 @@ def encprops(props, codec):
|
|||
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
|
||||
|
||||
|
||||
## TextConverter
|
||||
class TextConverter(PageAggregator):
    '''A PageAggregator that also remembers an output stream and a codec.'''

    def __init__(self, rsrc, outfp, codec='ascii', debug=0):
        '''Initializes the base aggregator, then stores outfp and codec.

        rsrc and debug are passed through to PageAggregator.__init__;
        outfp and codec are kept as attributes of the same name.
        '''
        PageAggregator.__init__(self, rsrc, debug=debug)
        (self.outfp, self.codec) = (outfp, codec)
|
||||
|
||||
|
||||
## SGMLConverter
|
||||
##
|
||||
class SGMLConverter(TextConverter):
|
||||
|
@ -156,7 +166,7 @@ class TagExtractor(PDFDevice):
|
|||
# pdf2txt
|
||||
class TextExtractionNotAllowed(RuntimeError): pass
|
||||
|
||||
def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
|
||||
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
|
||||
doc = PDFDocument(debug=debug)
|
||||
fp = file(fname, 'rb')
|
||||
parser = PDFParser(doc, fp, debug=debug)
|
||||
|
|
|
@ -73,10 +73,16 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
|||
a0*c1+c0*d1, b0*c1+d0*d1,
|
||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||
|
||||
def translate_matrix(matrix, offset):
    '''Shifts the translation part of a matrix by an offset.

    matrix is a 6-tuple (a,b,c,d,e,f) and offset is a pair (x,y).
    Returns a new 6-tuple whose a,b,c,d are unchanged and whose e and f
    have x and y added to them. The offset is added as-is; it is not
    scaled by a,b,c,d.
    '''
    # Unpack in the body rather than in the signature: tuple parameters
    # were removed in Python 3 (PEP 3113). This form runs on both 2 and 3.
    (a, b, c, d, e, f) = matrix
    (x, y) = offset
    return (a, b, c, d, e+x, f+y)
|
||||
|
||||
def apply_matrix(matrix, point):
    '''Applies a matrix to coordinates.

    matrix is a 6-tuple (a,b,c,d,e,f) and point is a pair (x,y).
    Returns the pair (a*x+c*y+e, b*x+d*y+f), i.e. the linear part of the
    matrix applied to the point plus the translation (e,f).
    '''
    # Unpack in the body rather than in the signature: tuple parameters
    # were removed in Python 3 (PEP 3113). This form runs on both 2 and 3.
    (a, b, c, d, e, f) = matrix
    (x, y) = point
    return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
def apply_matrix_norm(matrix, vector):
    '''Applies only the linear part of a matrix to a vector.

    matrix is a 6-tuple (a,b,c,d,e,f) and vector is a pair (x,y).
    Returns the pair (a*x+c*y, b*x+d*y). The translation components
    e and f are ignored, so the result is independent of the matrix origin.
    '''
    # Unpack in the body rather than in the signature: tuple parameters
    # were removed in Python 3 (PEP 3113). This form runs on both 2 and 3.
    (a, b, c, d, _, _) = matrix
    (x, y) = vector
    return (a*x+c*y, b*x+d*y)
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
|
@ -103,6 +109,9 @@ class PDFFont(object):
|
|||
def is_vertical(self):
|
||||
return False
|
||||
|
||||
def is_multibyte(self):
|
||||
return False
|
||||
|
||||
def decode(self, bytes):
|
||||
return map(ord, bytes)
|
||||
|
||||
|
@ -373,6 +382,9 @@ class PDFCIDFont(PDFFont):
|
|||
def is_vertical(self):
|
||||
return self.vertical
|
||||
|
||||
def is_multibyte(self):
|
||||
return True
|
||||
|
||||
def decode(self, bytes):
|
||||
return self.cmap.decode(bytes)
|
||||
|
||||
|
@ -498,7 +510,7 @@ class PDFDevice(object):
|
|||
def end_figure(self, name):
|
||||
return
|
||||
|
||||
def render_string(self, textstate, textmatrix, size, seq):
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
raise NotImplementedError
|
||||
def render_image(self, stream, size, matrix):
|
||||
raise NotImplementedError
|
||||
|
@ -928,15 +940,16 @@ class PDFPageInterpreter(object):
|
|||
def do_TJ(self, seq):
|
||||
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
||||
textstate = self.textstate
|
||||
matrix = translate_matrix(textstate.matrix, textstate.linematrix)
|
||||
self.device.render_string(textstate, matrix, seq)
|
||||
font = textstate.font
|
||||
(a,b,c,d,e,f) = textstate.matrix
|
||||
(lx,ly) = textstate.linematrix
|
||||
s = ''.join( x for x in seq if isinstance(x, str) )
|
||||
n = sum( x for x in seq if not isinstance(x, str) )
|
||||
w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
|
||||
len(s) * textstate.charspace +
|
||||
s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
|
||||
self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
|
||||
w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
|
||||
if not font.is_multibyte():
|
||||
w += s.count(' ')*textstate.wordspace
|
||||
w *= (textstate.scaling * .01)
|
||||
(lx,ly) = textstate.linematrix
|
||||
if font.is_vertical():
|
||||
ly += w
|
||||
else:
|
||||
|
|
|
@ -586,7 +586,7 @@ class PDFDocument(object):
|
|||
self.parser.seek(index)
|
||||
(_,objid1) = self.parser.nexttoken() # objid
|
||||
(_,genno) = self.parser.nexttoken() # genno
|
||||
assert objid1 == objid, (objid, objid1)
|
||||
#assert objid1 == objid, (objid, objid1)
|
||||
(_,kwd) = self.parser.nexttoken()
|
||||
if kwd != KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||
|
|
|
@ -32,11 +32,15 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 46 >>
|
||||
<< /Length 86 >>
|
||||
stream
|
||||
BT
|
||||
/F1 24 Tf
|
||||
1 0 0 1 100 700 TD
|
||||
100 600 Td
|
||||
0 Tw
|
||||
( Hello World ) Tj
|
||||
0 100 Td
|
||||
100 Tw
|
||||
( Hello World ) Tj
|
||||
ET
|
||||
endstream
|
||||
|
|
|
@ -89,7 +89,7 @@ def dumpallobjs(out, doc, codec=None):
|
|||
dumpxml(out, obj, codec=codec)
|
||||
out.write('\n</object>\n\n')
|
||||
except:
|
||||
pass
|
||||
raise
|
||||
dumptrailers(out, doc)
|
||||
out.write('</pdf>')
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue