CAUTION! Changed the way internal layout is handled (LTTextItem replaced by LTChar).

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@184 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-02-27 03:59:25 +00:00
parent 2555b38836
commit 23be96c49e
5 changed files with 102 additions and 132 deletions

View File

@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Mon Feb 15 14:41:49 UTC 2010 Last Modified: Sat Feb 27 03:58:45 UTC 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -348,7 +348,8 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2010/02/15: Bugfixes. Thanks to Sean. <li> 2010/02/27: Changed the way internal layout is handled. (LTTextItem -&gt; LTChar)
<li> 2010/02/15: Several bugfixes. Thanks to Sean.
<li> 2010/02/13: Bugfix and enhancement. Thanks to Andr&eacute; Auzi. <li> 2010/02/13: Bugfix and enhancement. Thanks to Andr&eacute; Auzi.
<li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe. <li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe.
<li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed. <li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed.

View File

@ -5,9 +5,9 @@ from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine from layout import LTFigure, LTImage, LTChar, LTTextBox, LTTextLine
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
from utils import enc, strbbox from utils import enc, bbox2str
## PDFPageAggregator ## PDFPageAggregator
@ -97,9 +97,8 @@ class PDFPageAggregator(PDFTextDevice):
self.cur_item.add(LTPolygon(gstate.linewidth, pts)) self.cur_item.add(LTPolygon(gstate.linewidth, pts))
return return
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): def render_char(self, matrix, font, fontsize, scaling, cid):
if not chars: return (0, 0) item = LTChar(matrix, font, fontsize, scaling, cid)
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
self.cur_item.add(item) self.cur_item.add(item)
return item.adv return item.adv
@ -202,15 +201,10 @@ class HTMLConverter(PDFConverter):
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTTextItem): elif isinstance(item, LTChar):
if item.vertical: self.outfp.write('<span style="position:absolute; left:%dpx; top:%dpx; font-size:%dpx;">' %
wmode = 'tb-rl' (item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
else: item.get_size()*self.scale))
wmode = 'lr-tb'
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
' left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.fontsize*self.scale))
self.write(item.text) self.write(item.text)
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.debug: if self.debug:
@ -271,35 +265,40 @@ class XMLConverter(PDFConverter):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.id, strbbox(item.bbox), item.rotate)) (item.id, bbox2str(item.bbox), item.rotate))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction: elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, strbbox(item.bbox))) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' %
(item.linewidth, item.direction, bbox2str(item.bbox)))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, strbbox(item.bbox))) self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
(item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTPolygon): elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, strbbox(item.bbox), item.get_pts())) self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox))) self.outfp.write('<figure id="%s" bbox="%s">\n' %
(item.id, bbox2str(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine): elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % strbbox(item.bbox)) self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textline>\n') self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox))) self.outfp.write('<textbox id="%s" bbox="%s">\n' %
(item.id, bbox2str(item.bbox)))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem): elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' % self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
(enc(item.font.fontname), item.is_vertical(), (enc(item.font.fontname), item.is_vertical(),
strbbox(item.bbox), item.fontsize)) bbox2str(item.bbox), item.get_size()))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
@ -310,7 +309,8 @@ class XMLConverter(PDFConverter):
name = self.write_image(item) name = self.write_image(item)
if name: if name:
x = 'name="%s" ' % enc(name) x = 'name="%s" ' % enc(name)
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height)) self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' %
(x, item.type, item.width, item.height))
else: else:
assert 0, item assert 0, item
return return
@ -352,7 +352,7 @@ class TagExtractor(PDFDevice):
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
(self.pageno, strbbox(page.mediabox), page.rotate)) (self.pageno, bbox2str(page.mediabox), page.rotate))
return return
def end_page(self, page): def end_page(self, page):

View File

@ -2,7 +2,8 @@
import sys import sys
from sys import maxint as INF from sys import maxint as INF
from utils import apply_matrix_norm, apply_matrix_pt from utils import apply_matrix_norm, apply_matrix_pt
from utils import bsearch, strbbox from utils import bsearch, bbox2str, matrix2str
from pdffont import PDFUnicodeNotDefined
@ -136,7 +137,7 @@ class LayoutItem(object):
return return
def __repr__(self): def __repr__(self):
return ('<item bbox=%s>' % strbbox(self.bbox)) return ('<item bbox=%s>' % bbox2str(self.bbox))
def set_bbox(self, (x0,y0,x1,y1)): def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0) if x1 < x0: (x0,x1) = (x1,x0)
@ -203,7 +204,7 @@ class LayoutContainer(LayoutItem):
return return
def __repr__(self): def __repr__(self):
return ('<group %s>' % strbbox(self.bbox)) return ('<group %s>' % bbox2str(self.bbox))
def __iter__(self): def __iter__(self):
return iter(self.objs) return iter(self.objs)
@ -326,55 +327,59 @@ class LTAnon(LTText):
return 0 return 0
## LTTextItem ## LTChar
## ##
class LTTextItem(LayoutItem, LTText): class LTChar(LayoutItem, LTText):
debug = 1 debug = 1
def __init__(self, matrix, font, fontsize, charspace, scaling, chars): def __init__(self, matrix, font, fontsize, scaling, cid):
assert chars
self.matrix = matrix self.matrix = matrix
self.font = font self.font = font
self.fontsize = fontsize
self.vertical = font.is_vertical() self.vertical = font.is_vertical()
self.text = ''.join( char for (char,_) in chars ) self.adv = font.char_width(cid) * fontsize * scaling
adv = sum( font.char_width(cid) for (_,cid) in chars ) try:
adv = (adv * fontsize + (len(chars)-1)*charspace) * scaling text = font.to_unichr(cid)
#size = (font.get_ascent() - font.get_descent()) * fontsize except PDFUnicodeNotDefined:
size = font.get_size() * fontsize text = '?'
(_,_,_,_,tx,ty) = self.matrix LTText.__init__(self, text)
if not self.vertical: # compute the boundary rectangle.
# horizontal text if self.vertical:
self.adv = (adv, 0) # vertical
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size)) size = font.get_size() * fontsize
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) displacement = (1000 - font.char_disp(cid)) * fontsize * .001
(_,displacement) = apply_matrix_norm(self.matrix, (0, displacement))
(dx,dy) = apply_matrix_norm(self.matrix, (size, self.adv))
(_,_,_,_,tx,ty) = self.matrix
tx -= dx/2
ty += displacement
bbox = (tx, ty+dy, tx+dx, ty)
else:
# horizontal
size = font.get_size() * fontsize
descent = font.get_descent() * fontsize
(_,descent) = apply_matrix_norm(self.matrix, (0, descent))
(dx,dy) = apply_matrix_norm(self.matrix, (self.adv, size))
(_,_,_,_,tx,ty) = self.matrix
ty += descent ty += descent
bbox = (tx, ty, tx+dx, ty+dy) bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
self.adv = (0, adv)
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
LayoutItem.__init__(self, bbox) LayoutItem.__init__(self, bbox)
return return
def __repr__(self): def __repr__(self):
if self.debug: if self.debug:
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix, (matrix2str(self.matrix), self.font, self.fontsize,
self.font, self.fontsize, strbbox(self.bbox), bbox2str(self.bbox), self.adv, self.text))
'(%.1f, %.1f)' % self.adv,
self.text))
else: else:
return '<text %r>' % self.text return '<char %r>' % self.text
def get_margin(self): def get_margin(self):
return abs(self.fontsize) return min(self.width, self.height)
def get_size(self):
return max(self.width, self.height)
def is_vertical(self): def is_vertical(self):
return self.vertical return self.vertical
@ -383,7 +388,7 @@ class LTTextItem(LayoutItem, LTText):
(a,b,c,d,e,f) = self.matrix (a,b,c,d,e,f) = self.matrix
return 0 < a*d and b*c <= 0 return 0 < a*d and b*c <= 0
## LTFigure ## LTFigure
## ##
class LTFigure(LayoutContainer): class LTFigure(LayoutContainer):
@ -397,7 +402,8 @@ class LTFigure(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, strbbox(self.bbox), self.matrix)) return ('<figure id=%r bbox=%s matrix=%s>' %
(self.id, bbox2str(self.bbox), matrix2str(self.matrix)))
## LTTextLine ## LTTextLine
@ -411,7 +417,7 @@ class LTTextLine(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<textline %s(%s)>' % (strbbox(self.bbox), self.direction)) return ('<textline %s(%s)>' % (bbox2str(self.bbox), self.direction))
def get_margin(self): def get_margin(self):
return min(self.width, self.height) return min(self.width, self.height)
@ -428,7 +434,7 @@ class LTTextLine(LayoutContainer):
if self.direction == 'V': if self.direction == 'V':
y0 = -INF y0 = -INF
for obj in sorted(self.objs, key=lambda obj: -obj.y1): for obj in sorted(self.objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTTextItem) and self.word_margin: if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin() margin = self.word_margin * obj.get_margin()
if obj.y1+margin < y0: if obj.y1+margin < y0:
objs.append(LTAnon(' ')) objs.append(LTAnon(' '))
@ -437,7 +443,7 @@ class LTTextLine(LayoutContainer):
else: else:
x1 = INF x1 = INF
for obj in sorted(self.objs, key=lambda obj: obj.x0): for obj in sorted(self.objs, key=lambda obj: obj.x0):
if isinstance(obj, LTTextItem) and self.word_margin: if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.get_margin() margin = self.word_margin * obj.get_margin()
if x1 < obj.x0-margin: if x1 < obj.x0-margin:
objs.append(LTAnon(' ')) objs.append(LTAnon(' '))
@ -461,7 +467,7 @@ class LTTextBox(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<textbox %s(%s) %r...>' % (strbbox(self.bbox), self.direction, self.get_text()[:20])) return ('<textbox %s(%s) %r...>' % (bbox2str(self.bbox), self.direction, self.get_text()[:20]))
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) ) return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
@ -517,7 +523,7 @@ class LTPage(LayoutContainer):
return return
def __repr__(self): def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, strbbox(self.bbox), self.rotate)) return ('<page id=%r bbox=%s rotate=%r>' % (self.id, bbox2str(self.bbox), self.rotate))
def analyze_layout(self, laparams): def analyze_layout(self, laparams):
textobjs = [] textobjs = []

View File

@ -59,9 +59,6 @@ class PDFTextDevice(PDFDevice):
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
return '?' return '?'
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
return (0, 0)
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm) matrix = mult_matrix(textstate.matrix, self.ctm)
font = textstate.font font = textstate.font
@ -82,76 +79,39 @@ class PDFTextDevice(PDFDevice):
def render_string_horizontal(self, seq, matrix, (x,y), def render_string_horizontal(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale): font, fontsize, scaling, charspace, wordspace, dxscale):
chars = [] needcharspace = False
needspace = False
for obj in seq: for obj in seq:
if isinstance(obj, int) or isinstance(obj, float): if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, x -= obj*dxscale
fontsize, charspace, scaling, chars) needcharspace = False
x += dx - obj*dxscale
y += dy
chars = []
needspace = False
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
try: if needcharspace:
char = font.to_unichr(cid) x += charspace
except PDFUnicodeNotDefined, e: x += self.render_char(translate_matrix(matrix, (x,y)),
(cidcoding, cid) = e.args font, fontsize, scaling, cid)
char = self.handle_undefined_char(cidcoding, cid) needcharspace = True
chars.append((char, cid))
if cid == 32 and wordspace: if cid == 32 and wordspace:
if needspace: x += wordspace
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx + wordspace
y += dy
chars = []
if chars:
if needspace:
x += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
return (x, y) return (x, y)
def render_string_vertical(self, seq, matrix, (x,y), def render_string_vertical(self, seq, matrix, (x,y),
font, fontsize, scaling, charspace, wordspace, dxscale): font, fontsize, scaling, charspace, wordspace, dxscale):
chars = [] needcharspace = False
needspace = False
for obj in seq: for obj in seq:
if isinstance(obj, int) or isinstance(obj, float): if isinstance(obj, int) or isinstance(obj, float):
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, y -= obj*dxscale
fontsize, charspace, scaling, chars) needcharspace = False
x += dx
y += dy - obj*dxscale
chars = []
needspace = False
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
try: if needcharspace:
char = font.to_unichr(cid) y += charspace
except PDFUnicodeNotDefined, e: y += self.render_char(translate_matrix(matrix, (x,y)),
(cidcoding, cid) = e.args font, fontsize, scaling, cid)
char = self.handle_undefined_char(cidcoding, cid) needcharspace = True
chars.append((char, cid))
if cid == 32 and wordspace: if cid == 32 and wordspace:
if needspace: y += wordspace
y += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
needspace = True
x += dx
y += dy + wordspace
chars = []
if chars:
if needspace:
y += charspace
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
fontsize, charspace, scaling, chars)
x += dx
y += dy
return (x, y) return (x, y)
def render_char(self, matrix, font, fontsize, scaling, cid):
return 0

View File

@ -136,9 +136,12 @@ def enc(x, codec='ascii'):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
def strbbox((x0,y0,x1,y1)): def bbox2str((x0,y0,x1,y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
def matrix2str((a,b,c,d,e,f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a,b,c,d,e,f)
## ObjIdRange ## ObjIdRange
## ##