clustering bug fixed again

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@102 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-16 04:16:00 +00:00
parent f628c0d3fe
commit fa678ccb98
5 changed files with 58 additions and 53 deletions

View File

@ -1,8 +1,9 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys
from pdfdevice import PDFDevice from pdfdevice import PDFDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, enc from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
## PDFPageAggregator ## PDFPageAggregator
@ -30,9 +31,9 @@ class PDFPageAggregator(PDFDevice):
self.cur_item.group_text(self.cluster_margin) self.cur_item.group_text(self.cluster_margin)
return self.cur_item return self.cur_item
def begin_figure(self, name, bbox): def begin_figure(self, name, bbox, matrix):
self.stack.append(self.cur_item) self.stack.append(self.cur_item)
self.cur_item = LTFigure(name, bbox) self.cur_item = LTFigure(name, bbox, matrix)
return return
def end_figure(self, _): def end_figure(self, _):
@ -47,11 +48,13 @@ class PDFPageAggregator(PDFDevice):
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return self.undefined_char return self.undefined_char
def paint_path(self, gstate, matrix, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path) shape = ''.join(x[0] for x in path)
if shape == 'ml': # horizontal/vertical line if shape == 'ml': # horizontal/vertical line
(_,x0,y0) = path[0] (_,x0,y0) = path[0]
(_,x1,y1) = path[1] (_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
if y0 == y1: if y0 == y1:
# horizontal ruler # horizontal ruler
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
@ -64,6 +67,10 @@ class PDFPageAggregator(PDFDevice):
(_,x1,y1) = path[1] (_,x1,y1) = path[1]
(_,x2,y2) = path[2] (_,x2,y2) = path[2]
(_,x3,y3) = path[3] (_,x3,y3) = path[3]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y2))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
@ -130,9 +137,6 @@ class TagExtractor(PDFDevice):
self.tag = None self.tag = None
return return
def render_image(self, stream, size, matrix):
return
def render_string(self, textstate, textmatrix, seq): def render_string(self, textstate, textmatrix, seq):
font = textstate.font font = textstate.font
text = '' text = ''
@ -204,7 +208,7 @@ class SGMLConverter(PDFConverter):
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox())) self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s">\n' % (item.id))
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
@ -268,7 +272,7 @@ class HTMLConverter(PDFConverter):
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LayoutContainer): elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)

View File

@ -150,7 +150,7 @@ class ClusterSet(object):
# add(objs): groups text objects if necessary. # add(objs): groups text objects if necessary.
def add(self, objs): def add(self, objs):
group = self.klass(objs, self.i) group = self.klass(self.i, objs)
self.i += 1 self.i += 1
for obj in objs: for obj in objs:
if obj in self.clusters: if obj in self.clusters:
@ -164,7 +164,16 @@ class ClusterSet(object):
r = set(self.clusters.itervalues()) r = set(self.clusters.itervalues())
for group in r: for group in r:
group.fixate() group.fixate()
return r return list(r)
def group_objs(objs, ratio, klass):
    '''Clusters neighboring objects and returns whatever ClusterSet.finish() yields.

    objs:  the layout objects to group; each one must provide
           get_margin(ratio) and the x0/y0/x1/y1 attributes.
    ratio: forwarded unchanged to every object's get_margin().
    klass: handed to ClusterSet, which uses it to build each group.
    '''
    index = Plane(objs)
    clusters = ClusterSet(klass)
    for item in objs:
        # The search margin is always taken as non-negative.
        pad = abs(item.get_margin(ratio))
        # Grow the object's own box by the margin on all four sides and
        # collect everything the plane reports inside that area.
        search_box = (item.x0 - pad, item.y0 - pad,
                      item.x1 + pad, item.y1 + pad)
        clusters.add(index.find(search_box))
    return clusters.finish()
## LayoutItem ## LayoutItem
@ -256,16 +265,6 @@ class LayoutContainer(LayoutItem):
self.weight = sum( obj.get_weight() for obj in self.objs ) self.weight = sum( obj.get_weight() for obj in self.objs )
return return
def group_objs(self, ratio, klass):
plane = Plane(self.objs)
cset = ClusterSet(klass)
for obj in self.objs:
margin = abs(obj.get_margin(ratio))
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
cset.add(neighbors)
self.objs = cset.finish()
return
def get_weight(self): def get_weight(self):
return self.weight return self.weight
@ -301,14 +300,6 @@ class LTRect(LayoutItem):
return return
## LTFigure
##
class LTFigure(LayoutContainer):
def __repr__(self):
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
## LTText ## LTText
## ##
class LTText(LayoutItem): class LTText(LayoutItem):
@ -361,6 +352,19 @@ class LTText(LayoutItem):
return self.vertical return self.vertical
## LTFigure
##
class LTFigure(LayoutContainer):
    '''A layout container representing a figure.

    In addition to the id and bbox handled by LayoutContainer, it keeps
    the matrix it was constructed with in the `matrix` attribute.
    '''

    def __init__(self, id, bbox, matrix):
        '''Initializes the container and remembers the figure matrix.

        id:     identifier passed through to LayoutContainer.
        bbox:   bounding box passed through to LayoutContainer.
        matrix: matrix supplied by the caller; stored as-is.
        '''
        LayoutContainer.__init__(self, id, bbox)
        self.matrix = matrix
        return

    def __repr__(self):
        '''Returns a debugging representation with id, bbox and matrix.'''
        # The matrix lives in self.matrix; this class never sets self.ctm,
        # so reading it here used to raise AttributeError.
        return ('<figure id=%r bbox=%s matrix=%r>' %
                (self.id, self.get_bbox(), self.matrix))
## LTTextBox ## LTTextBox
## ##
## A set of text objects that are grouped within ## A set of text objects that are grouped within
@ -446,7 +450,9 @@ class LTPage(LayoutContainer):
return return
def group_text(self, ratio): def group_text(self, ratio):
self.group_objs(ratio, LTTextBox) textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
if self.get_direction() == 'H': if self.get_direction() == 'H':
lines = reorder_vh(self.objs, +1) lines = reorder_vh(self.objs, +1)
else: else:

View File

@ -32,14 +32,14 @@ class PDFDevice(object):
return return
def end_page(self, page): def end_page(self, page):
return return
def begin_figure(self, name, bbox): def begin_figure(self, name, bbox, matrix):
return return
def end_figure(self, name): def end_figure(self, name):
return return
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path): def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return return
def render_string(self, textstate, textmatrix, seq): def render_string(self, textstate, textmatrix, seq):
return return
def render_image(self, stream, size, matrix): def render_image(self, stream, size):
return return

View File

@ -12,7 +12,7 @@ from psparser import PSException, PSTypeError, PSEOF, \
from pdftypes import PDFException, PDFStream, PDFObjRef, \ from pdftypes import PDFException, PDFStream, PDFObjRef, \
resolve1, int_value, float_value, num_value, \ resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \ from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
@ -424,7 +424,7 @@ class PDFPageInterpreter(object):
# stroke # stroke
def do_S(self): def do_S(self):
self.device.paint_path(self.graphicstate, self.ctm, True, False, False, self.curpath) self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = [] self.curpath = []
return return
# close-and-stroke # close-and-stroke
@ -434,24 +434,24 @@ class PDFPageInterpreter(object):
return return
# fill # fill
def do_f(self): def do_f(self):
self.device.paint_path(self.graphicstate, self.ctm, False, True, False, self.curpath) self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
self.curpath = [] self.curpath = []
return return
# fill (obsolete) # fill (obsolete)
do_F = do_f do_F = do_f
# fill-even-odd # fill-even-odd
def do_f_a(self): def do_f_a(self):
self.device.paint_path(self.graphicstate, self.ctm, False, True, True, self.curpath) self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = [] self.curpath = []
return return
# fill-and-stroke # fill-and-stroke
def do_B(self): def do_B(self):
self.device.paint_path(self.graphicstate, self.ctm, True, True, False, self.curpath) self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = [] self.curpath = []
return return
# fill-and-stroke-even-odd # fill-and-stroke-even-odd
def do_B_a(self): def do_B_a(self):
self.device.paint_path(self.graphicstate, self.ctm, True, True, True, self.curpath) self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = [] self.curpath = []
return return
# close-fill-and-stroke # close-fill-and-stroke
@ -686,20 +686,15 @@ class PDFPageInterpreter(object):
subtype = xobj.dic.get('Subtype') subtype = xobj.dic.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj.dic: if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
interpreter = self.dup() interpreter = self.dup()
(x0,y0,x1,y1) = list_value(xobj.dic['BBox']) bbox = list_value(xobj.dic['BBox'])
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm) matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
(x0,y0) = apply_matrix(ctm, (x0,y0)) self.device.begin_figure(xobjid, bbox, matrix)
(x1,y1) = apply_matrix(ctm, (x1,y1)) interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
bbox = (x0,y0,x1,y1)
self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
(x0,y0) = apply_matrix(self.ctm, (0,0)) self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
(x1,y1) = apply_matrix(self.ctm, (1,1))
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
(w,h) = (xobj.dic['Width'], xobj.dic['Height']) (w,h) = (xobj.dic['Width'], xobj.dic['Height'])
self.device.render_image(xobj, (w,h), self.ctm) self.device.render_image(xobj, (w,h))
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
else: else:
# unsupported xobject type. # unsupported xobject type.

View File

@ -15,12 +15,12 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
def translate_matrix((a,b,c,d,e,f), (x,y)): def translate_matrix((a,b,c,d,e,f), (x,y)):
return (a,b,c,d,e+x,f+y) return (a,b,c,d,e+x,f+y)
def apply_matrix((a,b,c,d,e,f), (x,y)): def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
'''Applies a matrix to coordinates.''' '''Applies a matrix to a point.'''
return (a*x+c*y+e, b*x+d*y+f) return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm((a,b,c,d,e,f), (p,q)): def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))''' '''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))'''
return (a*p+c*q, b*p+d*q) return (a*p+c*q, b*p+d*q)