wordspace handling improved.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@55 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
33f709a0d8
commit
71be16febe
|
@ -3,7 +3,7 @@ import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
|
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
|
||||||
mult_matrix, apply_matrix
|
mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
||||||
|
|
||||||
|
|
||||||
## PageItem
|
## PageItem
|
||||||
|
@ -37,47 +37,73 @@ class FigureItem(PageItem):
|
||||||
##
|
##
|
||||||
class TextItem(object):
|
class TextItem(object):
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, width, text):
|
SPACE_WIDTH = 0.6
|
||||||
|
|
||||||
|
def __init__(self, matrix, font, fontsize, charspace, scaling, text):
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.font = font
|
self.font = font
|
||||||
(a,b,c,d,tx,ty) = self.matrix
|
(_,_,_,_,tx,ty) = self.matrix
|
||||||
self.origin = (tx,ty)
|
self.origin = (tx,ty)
|
||||||
self.direction = 0
|
self.direction = 0
|
||||||
|
self.text = ''
|
||||||
if not self.font.is_vertical():
|
if not self.font.is_vertical():
|
||||||
|
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
(_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
|
||||||
self.width = abs(self.width)
|
|
||||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
|
||||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
|
||||||
ty += descent
|
ty += descent
|
||||||
self.bbox = (tx, ty, tx+self.width, ty+self.height)
|
w = 0
|
||||||
|
dx = 0
|
||||||
|
prev = ' '
|
||||||
|
for t in text:
|
||||||
|
if isinstance(t, tuple):
|
||||||
|
if prev != ' ' and spwidth < dx:
|
||||||
|
self.text += ' '
|
||||||
|
(_,char) = t
|
||||||
|
self.text += char
|
||||||
|
prev = char
|
||||||
|
dx = 0
|
||||||
|
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
||||||
|
else:
|
||||||
|
dx -= t
|
||||||
|
w += t * fontsize * .001 * scaling * .01
|
||||||
|
self.adv = (w, 0)
|
||||||
|
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
|
||||||
|
self.bbox = (tx, ty, tx+w, ty+h)
|
||||||
else:
|
else:
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width))
|
disp = 0
|
||||||
self.width = abs(self.width)
|
h = 0
|
||||||
(disp,_) = text[0]
|
for t in text:
|
||||||
(_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001))
|
if isinstance(t, tuple):
|
||||||
tx -= self.width/2
|
(disp,char) = t
|
||||||
|
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||||
|
self.text += char
|
||||||
|
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
||||||
|
break
|
||||||
|
for t in text:
|
||||||
|
if isinstance(t, tuple):
|
||||||
|
(_,char) = t
|
||||||
|
self.text += char
|
||||||
|
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
||||||
|
self.adv = (0, h)
|
||||||
|
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
|
||||||
|
tx -= w/2
|
||||||
ty += disp
|
ty += disp
|
||||||
self.bbox = (tx, ty+self.height, tx+self.width, ty)
|
self.bbox = (tx, ty+h, tx+w, ty)
|
||||||
self.text = ''.join( c for (_,c) in text )
|
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
|
||||||
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
|
|
||||||
self.fontsize = max(w,h)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
|
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
|
||||||
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
|
(self.matrix, self.font, self.fontsize, self.bbox, self.text))
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## PageAggregator
|
||||||
##
|
##
|
||||||
class TextConverter(PDFDevice):
|
class PageAggregator(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
|
def __init__(self, rsrc, debug=0):
|
||||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||||
self.outfp = outfp
|
|
||||||
self.codec = codec
|
|
||||||
self.pageno = 0
|
self.pageno = 0
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
@ -109,14 +135,12 @@ class TextConverter(PDFDevice):
|
||||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq, ratio=0.6):
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
spwidth = int(-font.char_width(32) * ratio) # space width
|
|
||||||
text = []
|
text = []
|
||||||
for x in seq:
|
for x in seq:
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
if isinstance(x, int) or isinstance(x, float):
|
||||||
if not font.is_vertical() and x <= spwidth:
|
text.append(x)
|
||||||
text.append((0, ' '))
|
|
||||||
else:
|
else:
|
||||||
chars = font.decode(x)
|
chars = font.decode(x)
|
||||||
for cid in chars:
|
for cid in chars:
|
||||||
|
@ -125,11 +149,20 @@ class TextConverter(PDFDevice):
|
||||||
text.append((font.char_disp(cid), char))
|
text.append((font.char_disp(cid), char))
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined, e:
|
||||||
(cidcoding, cid) = e.args
|
(cidcoding, cid) = e.args
|
||||||
s = self.handle_undefined_char(cidcoding, cid)
|
unc = self.handle_undefined_char(cidcoding, cid)
|
||||||
if s:
|
if unc:
|
||||||
text.append(s)
|
text.append(unc)
|
||||||
|
if cid == 32 and not font.is_multibyte():
|
||||||
|
if text:
|
||||||
|
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
||||||
|
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||||
|
self.cur_item.add(item)
|
||||||
|
(dx,dy) = item.adv
|
||||||
|
dx += textstate.wordspace * textstate.scaling * .01
|
||||||
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
|
text = []
|
||||||
if text:
|
if text:
|
||||||
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
||||||
font, textstate.fontsize, size, text)
|
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return
|
return
|
||||||
|
|
|
@ -2,11 +2,11 @@
|
||||||
import sys
|
import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||||
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
|
from pdfinterp import PDFDevice, PDFResourceManager, \
|
||||||
PDFPageInterpreter, PDFUnicodeNotDefined
|
PDFPageInterpreter, PDFUnicodeNotDefined
|
||||||
from pdflib.cmap import CMapDB
|
from cmap import CMapDB
|
||||||
from pdflib.page import PageItem, FigureItem, TextItem, TextConverter
|
from page import PageItem, FigureItem, TextItem, PageAggregator
|
||||||
|
|
||||||
|
|
||||||
def enc(x, codec):
|
def enc(x, codec):
|
||||||
|
@ -18,6 +18,16 @@ def encprops(props, codec):
|
||||||
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
|
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
|
||||||
|
|
||||||
|
|
||||||
|
## TextConverter
|
||||||
|
class TextConverter(PageAggregator):
|
||||||
|
|
||||||
|
def __init__(self, rsrc, outfp, codec='ascii', debug=0):
|
||||||
|
PageAggregator.__init__(self, rsrc, debug=debug)
|
||||||
|
self.outfp = outfp
|
||||||
|
self.codec = codec
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## SGMLConverter
|
## SGMLConverter
|
||||||
##
|
##
|
||||||
class SGMLConverter(TextConverter):
|
class SGMLConverter(TextConverter):
|
||||||
|
@ -156,7 +166,7 @@ class TagExtractor(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
class TextExtractionNotAllowed(RuntimeError): pass
|
class TextExtractionNotAllowed(RuntimeError): pass
|
||||||
|
|
||||||
def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
|
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
|
|
|
@ -73,10 +73,16 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||||
a0*c1+c0*d1, b0*c1+d0*d1,
|
a0*c1+c0*d1, b0*c1+d0*d1,
|
||||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||||
|
|
||||||
|
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
|
return (a,b,c,d,e+x,f+y)
|
||||||
|
|
||||||
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
'''Applies a matrix to coordinates.'''
|
'''Applies a matrix to coordinates.'''
|
||||||
return (a*x+c*y+e, b*x+d*y+f)
|
return (a*x+c*y+e, b*x+d*y+f)
|
||||||
|
|
||||||
|
def apply_matrix_norm((a,b,c,d,e,f), (x,y)):
|
||||||
|
return (a*x+c*y, b*x+d*y)
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
## Fonts
|
||||||
##
|
##
|
||||||
|
@ -103,6 +109,9 @@ class PDFFont(object):
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def is_multibyte(self):
|
||||||
|
return False
|
||||||
|
|
||||||
def decode(self, bytes):
|
def decode(self, bytes):
|
||||||
return map(ord, bytes)
|
return map(ord, bytes)
|
||||||
|
|
||||||
|
@ -372,6 +381,9 @@ class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return self.vertical
|
return self.vertical
|
||||||
|
|
||||||
|
def is_multibyte(self):
|
||||||
|
return True
|
||||||
|
|
||||||
def decode(self, bytes):
|
def decode(self, bytes):
|
||||||
return self.cmap.decode(bytes)
|
return self.cmap.decode(bytes)
|
||||||
|
@ -498,7 +510,7 @@ class PDFDevice(object):
|
||||||
def end_figure(self, name):
|
def end_figure(self, name):
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq):
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def render_image(self, stream, size, matrix):
|
def render_image(self, stream, size, matrix):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
@ -928,15 +940,16 @@ class PDFPageInterpreter(object):
|
||||||
def do_TJ(self, seq):
|
def do_TJ(self, seq):
|
||||||
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
||||||
textstate = self.textstate
|
textstate = self.textstate
|
||||||
|
matrix = translate_matrix(textstate.matrix, textstate.linematrix)
|
||||||
|
self.device.render_string(textstate, matrix, seq)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
(a,b,c,d,e,f) = textstate.matrix
|
|
||||||
(lx,ly) = textstate.linematrix
|
|
||||||
s = ''.join( x for x in seq if isinstance(x, str) )
|
s = ''.join( x for x in seq if isinstance(x, str) )
|
||||||
n = sum( x for x in seq if not isinstance(x, str) )
|
n = sum( x for x in seq if not isinstance(x, str) )
|
||||||
w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
|
w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
|
||||||
len(s) * textstate.charspace +
|
if not font.is_multibyte():
|
||||||
s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
|
w += s.count(' ')*textstate.wordspace
|
||||||
self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
|
w *= (textstate.scaling * .01)
|
||||||
|
(lx,ly) = textstate.linematrix
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
ly += w
|
ly += w
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -586,7 +586,7 @@ class PDFDocument(object):
|
||||||
self.parser.seek(index)
|
self.parser.seek(index)
|
||||||
(_,objid1) = self.parser.nexttoken() # objid
|
(_,objid1) = self.parser.nexttoken() # objid
|
||||||
(_,genno) = self.parser.nexttoken() # genno
|
(_,genno) = self.parser.nexttoken() # genno
|
||||||
assert objid1 == objid, (objid, objid1)
|
#assert objid1 == objid, (objid, objid1)
|
||||||
(_,kwd) = self.parser.nexttoken()
|
(_,kwd) = self.parser.nexttoken()
|
||||||
if kwd != KEYWORD_OBJ:
|
if kwd != KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||||
|
|
|
@ -32,11 +32,15 @@ endobj
|
||||||
>>
|
>>
|
||||||
endobj
|
endobj
|
||||||
5 0 obj
|
5 0 obj
|
||||||
<< /Length 46 >>
|
<< /Length 86 >>
|
||||||
stream
|
stream
|
||||||
BT
|
BT
|
||||||
/F1 24 Tf
|
/F1 24 Tf
|
||||||
1 0 0 1 100 700 TD
|
100 600 Td
|
||||||
|
0 Tw
|
||||||
|
( Hello World ) Tj
|
||||||
|
0 100 Td
|
||||||
|
100 Tw
|
||||||
( Hello World ) Tj
|
( Hello World ) Tj
|
||||||
ET
|
ET
|
||||||
endstream
|
endstream
|
||||||
|
|
|
@ -89,7 +89,7 @@ def dumpallobjs(out, doc, codec=None):
|
||||||
dumpxml(out, obj, codec=codec)
|
dumpxml(out, obj, codec=codec)
|
||||||
out.write('\n</object>\n\n')
|
out.write('\n</object>\n\n')
|
||||||
except:
|
except:
|
||||||
pass
|
raise
|
||||||
dumptrailers(out, doc)
|
dumptrailers(out, doc)
|
||||||
out.write('</pdf>')
|
out.write('</pdf>')
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue