CAUTION! Changed the way internal layout is handled.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@184 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-02-27 03:59:25 +00:00
parent 2555b38836
commit 23be96c49e
5 changed files with 102 additions and 132 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Mon Feb 15 14:41:49 UTC 2010
Last Modified: Sat Feb 27 03:58:45 UTC 2010
<!-- hhmts end -->
</div>
@ -348,7 +348,8 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2010/02/15: Bugfixes. Thanks to Sean.
<li> 2010/02/27: Changed how the internal layout is handled (LTTextItem -&gt; LTChar).
<li> 2010/02/15: Several bugfixes. Thanks to Sean.
<li> 2010/02/13: Bugfix and enhancement. Thanks to Andr&eacute; Auzi.
<li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe.
<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.

View File

@ -5,9 +5,9 @@ from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextLine
from utils import apply_matrix_pt, mult_matrix
from utils import enc, strbbox
from utils import enc, bbox2str
## PDFPageAggregator
@ -97,9 +97,8 @@ class PDFPageAggregator(PDFTextDevice):
self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
if not chars: return (0, 0)
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
def render_char(self, matrix, font, fontsize, scaling, cid):
item = LTChar(matrix, font, fontsize, scaling, cid)
self.cur_item.add(item)
return item.adv
@ -202,15 +201,10 @@ class HTMLConverter(PDFConverter):
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item:
render(child)
elif isinstance(item, LTTextItem):
if item.vertical:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
' left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.fontsize*self.scale))
elif isinstance(item, LTChar):
self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.get_size()*self.scale))
self.write(item.text)
self.outfp.write('</span>\n')
if self.debug:
@ -271,35 +265,40 @@ class XMLConverter(PDFConverter):
def render(item):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, strbbox(item.bbox), item.rotate))
(item.id, bbox2str(item.bbox), item.rotate))
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox)))
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' %
(item.linewidth, item.direction, bbox2str(item.bbox)))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox)))
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
(item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts()))
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
self.outfp.write('<figure id="%s" bbox="%s">\n' %
(item.id, bbox2str(item.bbox)))
for child in item:
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox))
self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
self.outfp.write('<textbox id="%s" bbox="%s">\n' %
(item.id, bbox2str(item.bbox)))
for child in item:
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
strbbox(item.bbox), item.fontsize))
bbox2str(item.bbox), item.get_size()))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
@ -310,7 +309,8 @@ class XMLConverter(PDFConverter):
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' %
(x, item.type, item.width, item.height))
else:
assert 0, item
return
@ -352,7 +352,7 @@ class TagExtractor(PDFDevice):
def begin_page(self, page, ctm):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
(self.pageno, strbbox(page.mediabox), page.rotate))
(self.pageno, bbox2str(page.mediabox), page.rotate))
return
def end_page(self, page):

View File

@ -2,7 +2,8 @@
import sys
from sys import maxint as INF
from utils import apply_matrix_norm, apply_matrix_pt
from utils import bsearch, strbbox
from utils import bsearch, bbox2str, matrix2str
from pdffont import PDFUnicodeNotDefined
@ -136,7 +137,7 @@ class LayoutItem(object):
return
def __repr__(self):
return ('<item bbox=%s>' % strbbox(self.bbox))
return ('<item bbox=%s>' % bbox2str(self.bbox))
def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0)
@ -203,7 +204,7 @@ class LayoutContainer(LayoutItem):
return
def __repr__(self):
return ('<group %s>' % strbbox(self.bbox))
return ('<group %s>' % bbox2str(self.bbox))
def __iter__(self):
return iter(self.objs)
@ -326,55 +327,59 @@ class LTAnon(LTText):
return 0
## LTTextItem
## LTChar
##
class LTTextItem(LayoutItem, LTText):
class LTChar(LayoutItem, LTText):
debug = 1
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
def __init__(self, matrix, font, fontsize, scaling, cid):
self.matrix = matrix
self.font = font
self.fontsize = fontsize
self.vertical = font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling
#size = (font.get_ascent() - font.get_descent()) * fontsize
size = font.get_size() * fontsize
(_,_,_,_,tx,ty) = self.matrix
if not self.vertical:
# horizontal text
self.adv = (adv, 0)
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
self.adv = font.char_width(cid) * fontsize * scaling
try:
text = font.to_unichr(cid)
except PDFUnicodeNotDefined:
text = '?'
LTText.__init__(self, text)
# compute the boundary rectangle.
if self.vertical:
# vertical
size = font.get_size() * fontsize
displacement = (1000 - font.char_disp(cid)) * fontsize * .001
(_,displacement) = apply_matrix_norm(self.matrix, (0, displacement))
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
(_,_,_,_,tx,ty) = self.matrix
tx -= dx/2
ty += displacement
bbox = (tx, ty+dy, tx+dx, ty)
else:
# horizontal
size = font.get_size() * fontsize
descent = font.get_descent() * fontsize
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
(_,_,_,_,tx,ty) = self.matrix
ty += descent
bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
self.adv = (0, adv)
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
LayoutItem.__init__(self, bbox)
return
def __repr__(self):
if self.debug:
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
self.font, self.fontsize, strbbox(self.bbox),
'(%.1f, %.1f)' % self.adv,
self.text))
return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
(matrix2str(self.matrix), self.font, self.fontsize,
bbox2str(self.bbox), self.adv, self.text))
else:
return '<text %r>' % self.text
return '<char %r>' % self.text
def get_margin(self):
return abs(self.fontsize)
return min(self.width, self.height)
# Return the larger of self.width and self.height.
# The HTML converter scales this value for the CSS font-size and the XML
# converter emits it as the "size" attribute of <text>.
# NOTE(review): width/height are not defined in this excerpt; presumably
# they are the bounding-box extents maintained by LayoutItem -- confirm.
def get_size(self):
return max(self.width, self.height)
def is_vertical(self):
return self.vertical
@ -383,7 +388,7 @@ class LTTextItem(LayoutItem, LTText):
(a,b,c,d,e,f) = self.matrix
return 0 < a*d and b*c <= 0
## LTFigure
##
class LTFigure(LayoutContainer):
@ -397,7 +402,8 @@ class LTFigure(LayoutContainer):
return
def __repr__(self):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, strbbox(self.bbox), self.matrix))
return ('<figure id=%r bbox=%s matrix=%s>' %
(self.id, bbox2str(self.bbox), matrix2str(self.matrix)))
## LTTextLine
@ -411,7 +417,7 @@ class LTTextLine(LayoutContainer):
return
def __repr__(self):
return ('<textline %s(%s)>' % (strbbox(self.bbox), self.direction))
return ('<textline %s(%s)>' % (bbox2str(self.bbox), self.direction))
def get_margin(self):
return min(self.width, self.height)
@ -428,7 +434,7 @@ class LTTextLine(LayoutContainer):
if self.direction == 'V':
y0 = -INF
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTTextItem) and self.word_margin:
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin()
if obj.y1+margin < y0:
objs.append(LTAnon(' '))
@ -437,7 +443,7 @@ class LTTextLine(LayoutContainer):
else:
x1 = INF
for obj in sorted(self.objs, key=lambda obj: obj.x0):
if isinstance(obj, LTTextItem) and self.word_margin:
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin()
if x1 < obj.x0-margin:
objs.append(LTAnon(' '))
@ -461,7 +467,7 @@ class LTTextBox(LayoutContainer):
return
def __repr__(self):
return ('<textbox %s(%s) %r...>' % (strbbox(self.bbox), self.direction, self.get_text()[:20]))
return ('<textbox %s(%s) %r...>' % (bbox2str(self.bbox), self.direction, self.get_text()[:20]))
# Return the text of this box as one string: the get_text() results of
# every LTTextLine in self.objs are concatenated in list order; children
# of any other type are ignored.
def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -517,7 +523,7 @@ class LTPage(LayoutContainer):
return
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, strbbox(self.bbox), self.rotate))
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, bbox2str(self.bbox), self.rotate))
def analyze_layout(self, laparams):
textobjs = []

View File

@ -59,9 +59,6 @@ class PDFTextDevice(PDFDevice):
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?'
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
return (0, 0)
def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font
@ -82,76 +79,39 @@ class PDFTextDevice(PDFDevice):
def render_string_horizontal(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale):
chars = []
needspace = False
needcharspace = False
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx - obj*dxscale
y += dy
chars = []
needspace = False
x -= obj*dxscale
needcharspace = False
else:
for cid in font.decode(obj):
try:
char = font.to_unichr(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if needcharspace:
x += charspace
x += self.render_char(translate_matrix(matrix, (x,y)),
font, fontsize, scaling, cid)
needcharspace = True
if cid == 32 and wordspace:
if needspace:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx + wordspace
y += dy
chars = []
if chars:
if needspace:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
x += wordspace
return (x, y)
def render_string_vertical(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale):
chars = []
needspace = False
needcharspace = False
for obj in seq:
if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy - obj*dxscale
chars = []
needspace = False
y -= obj*dxscale
needcharspace = False
else:
for cid in font.decode(obj):
try:
char = font.to_unichr(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid))
if needcharspace:
y += charspace
y += self.render_char(translate_matrix(matrix, (x,y)),
font, fontsize, scaling, cid)
needcharspace = True
if cid == 32 and wordspace:
if needspace:
y += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy + wordspace
chars = []
if chars:
if needspace:
y += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
y += wordspace
return (x, y)
# Default per-character hook of PDFTextDevice.
# render_string_horizontal / render_string_vertical call this once for each
# cid and add the returned number to the running x (or y) position, so the
# return value acts as the advance of the character just rendered.
# This base implementation renders nothing and reports an advance of 0;
# PDFPageAggregator overrides it to create an LTChar and return its adv.
def render_char(self, matrix, font, fontsize, scaling, cid):
return 0

View File

@ -136,9 +136,12 @@ def enc(x, codec='ascii'):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace')
def strbbox((x0,y0,x1,y1)):
def bbox2str((x0,y0,x1,y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
def matrix2str(matrix):
    """Format a 6-element transformation matrix as a compact string.

    matrix is a sequence (a, b, c, d, e, f).  The result has the form
    '[a,b,c,d, (e,f)]' with every value printed to two decimal places.

    The sequence is unpacked in the body rather than in the signature:
    tuple parameters are Python-2-only syntax (removed by PEP 3113), while
    this form is accepted by both Python 2 and Python 3 and is called the
    same way, with a single positional argument.
    """
    (a, b, c, d, e, f) = matrix
    return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
## ObjIdRange
##