clustering bug fixed again

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@102 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-16 04:16:00 +00:00
parent f628c0d3fe
commit fa678ccb98
5 changed files with 58 additions and 53 deletions

View File

@ -1,8 +1,9 @@
#!/usr/bin/env python
import sys
from pdfdevice import PDFDevice
from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, enc
from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
## PDFPageAggregator
@ -30,9 +31,9 @@ class PDFPageAggregator(PDFDevice):
self.cur_item.group_text(self.cluster_margin)
return self.cur_item
def begin_figure(self, name, bbox):
def begin_figure(self, name, bbox, matrix):
self.stack.append(self.cur_item)
self.cur_item = LTFigure(name, bbox)
self.cur_item = LTFigure(name, bbox, matrix)
return
def end_figure(self, _):
@ -47,11 +48,13 @@ class PDFPageAggregator(PDFDevice):
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
return self.undefined_char
def paint_path(self, gstate, matrix, stroke, fill, evenodd, path):
def paint_path(self, gstate, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path)
if shape == 'ml': # horizontal/vertical line
(_,x0,y0) = path[0]
(_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
if y0 == y1:
# horizontal ruler
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
@ -64,6 +67,10 @@ class PDFPageAggregator(PDFDevice):
(_,x1,y1) = path[1]
(_,x2,y2) = path[2]
(_,x3,y3) = path[3]
# Transform each of the four path points through self.ctm.
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
# The fourth point must be built from its own y3; using y2 here would
# overwrite y3 with a value derived from the third point.
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
@ -130,9 +137,6 @@ class TagExtractor(PDFDevice):
self.tag = None
return
def render_image(self, stream, size, matrix):
return
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
text = ''
@ -204,7 +208,7 @@ class SGMLConverter(PDFConverter):
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
self.outfp.write('<figure id="%s">\n' % (item.id))
for child in item:
render(child)
self.outfp.write('</figure>\n')
@ -268,7 +272,7 @@ class HTMLConverter(PDFConverter):
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LayoutContainer):
elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item:
render(child)

View File

@ -150,7 +150,7 @@ class ClusterSet(object):
# add(objs): groups text objects if necessary.
def add(self, objs):
group = self.klass(objs, self.i)
group = self.klass(self.i, objs)
self.i += 1
for obj in objs:
if obj in self.clusters:
@ -164,7 +164,16 @@ class ClusterSet(object):
r = set(self.clusters.itervalues())
for group in r:
group.fixate()
return r
return list(r)
def group_objs(objs, ratio, klass):
    '''Groups objects that lie within each other's margin.

    For every object, its bounding box is enlarged on all sides by
    abs(obj.get_margin(ratio)), and the objects that the Plane reports
    for that enlarged box are added to a ClusterSet as one group.

    objs:  the objects to cluster; each must provide x0, y0, x1, y1
           and get_margin(ratio).
    ratio: passed through unchanged to obj.get_margin().
    klass: passed to ClusterSet, which uses it to build the groups.

    Returns the result of ClusterSet.finish().
    '''
    index = Plane(objs)
    clusters = ClusterSet(klass)
    for item in objs:
        pad = abs(item.get_margin(ratio))
        search_box = (item.x0-pad, item.y0-pad, item.x1+pad, item.y1+pad)
        clusters.add(index.find(search_box))
    return clusters.finish()
## LayoutItem
@ -256,16 +265,6 @@ class LayoutContainer(LayoutItem):
self.weight = sum( obj.get_weight() for obj in self.objs )
return
def group_objs(self, ratio, klass):
plane = Plane(self.objs)
cset = ClusterSet(klass)
for obj in self.objs:
margin = abs(obj.get_margin(ratio))
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
cset.add(neighbors)
self.objs = cset.finish()
return
def get_weight(self):
return self.weight
@ -301,14 +300,6 @@ class LTRect(LayoutItem):
return
## LTFigure
##
class LTFigure(LayoutContainer):
def __repr__(self):
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
## LTText
##
class LTText(LayoutItem):
@ -361,6 +352,19 @@ class LTText(LayoutItem):
return self.vertical
## LTFigure
##
class LTFigure(LayoutContainer):
    '''A layout container for a figure, carrying the matrix it was created with.'''

    def __init__(self, id, bbox, matrix):
        '''Initializes the container with id and bbox and stores matrix.

        id, bbox: forwarded to LayoutContainer.__init__.
        matrix:   kept as self.matrix.
        '''
        LayoutContainer.__init__(self, id, bbox)
        self.matrix = matrix
        return

    def __repr__(self):
        # __init__ stores the transform as self.matrix; there is no
        # self.ctm attribute set by this class, so read self.matrix here.
        return ('<figure id=%r bbox=%s ctm=%r>' % (self.id, self.get_bbox(), self.matrix))
## LTTextBox
##
## A set of text objects that are grouped within
@ -446,7 +450,9 @@ class LTPage(LayoutContainer):
return
def group_text(self, ratio):
self.group_objs(ratio, LTTextBox)
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
if self.get_direction() == 'H':
lines = reorder_vh(self.objs, +1)
else:

View File

@ -32,14 +32,14 @@ class PDFDevice(object):
return
def end_page(self, page):
return
def begin_figure(self, name, bbox):
def begin_figure(self, name, bbox, matrix):
return
def end_figure(self, name):
return
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_string(self, textstate, textmatrix, seq):
return
def render_image(self, stream, size, matrix):
def render_image(self, stream, size):
return

View File

@ -12,7 +12,7 @@ from psparser import PSException, PSTypeError, PSEOF, \
from pdftypes import PDFException, PDFStream, PDFObjRef, \
resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
@ -424,7 +424,7 @@ class PDFPageInterpreter(object):
# stroke
def do_S(self):
self.device.paint_path(self.graphicstate, self.ctm, True, False, False, self.curpath)
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = []
return
# close-and-stroke
@ -434,24 +434,24 @@ class PDFPageInterpreter(object):
return
# fill
def do_f(self):
self.device.paint_path(self.graphicstate, self.ctm, False, True, False, self.curpath)
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
self.curpath = []
return
# fill (obsolete)
do_F = do_f
# fill-even-odd
def do_f_a(self):
self.device.paint_path(self.graphicstate, self.ctm, False, True, True, self.curpath)
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = []
return
# fill-and-stroke
def do_B(self):
self.device.paint_path(self.graphicstate, self.ctm, True, True, False, self.curpath)
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = []
return
# fill-and-stroke-even-odd
def do_B_a(self):
self.device.paint_path(self.graphicstate, self.ctm, True, True, True, self.curpath)
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = []
return
# close-fill-and-stroke
@ -686,20 +686,15 @@ class PDFPageInterpreter(object):
subtype = xobj.dic.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
interpreter = self.dup()
(x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
(x0,y0) = apply_matrix(ctm, (x0,y0))
(x1,y1) = apply_matrix(ctm, (x1,y1))
bbox = (x0,y0,x1,y1)
self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
bbox = list_value(xobj.dic['BBox'])
matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
(x0,y0) = apply_matrix(self.ctm, (0,0))
(x1,y1) = apply_matrix(self.ctm, (1,1))
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
self.device.render_image(xobj, (w,h), self.ctm)
self.device.render_image(xobj, (w,h))
self.device.end_figure(xobjid)
else:
# unsupported xobject type.

View File

@ -15,12 +15,12 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
def translate_matrix(m, v):
    '''Returns matrix m with (x,y) added to its last two components.

    m: a 6-element sequence (a,b,c,d,e,f).
    v: a 2-element sequence (x,y).

    The arguments are unpacked in the body rather than in the signature
    so the function is valid on both Python 2 and Python 3.
    '''
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a,b,c,d,e+x,f+y)
def apply_matrix((a,b,c,d,e,f), (x,y)):
'''Applies a matrix to coordinates.'''
def apply_matrix_pt(m, v):
    '''Applies a matrix to a point.

    m: a 6-element sequence (a,b,c,d,e,f).
    v: a 2-element sequence (x,y).

    Returns (a*x+c*y+e, b*x+d*y+f).  The arguments are unpacked in the
    body rather than in the signature so the function is valid on both
    Python 2 and Python 3.
    '''
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    m: a 6-element sequence (a,b,c,d,e,f); e and f do not affect the result.
    v: a 2-element sequence (p,q).

    Returns (a*p+c*q, b*p+d*q).  The arguments are unpacked in the body
    rather than in the signature so the function is valid on both
    Python 2 and Python 3.
    '''
    (a,b,c,d,e,f) = m
    (p,q) = v
    return (a*p+c*q, b*p+d*q)