clustering bug fixed again
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@102 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f628c0d3fe
commit
fa678ccb98
|
@ -1,8 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfdevice import PDFDevice
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
|
||||
from utils import mult_matrix, translate_matrix, enc
|
||||
from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
|
@ -30,9 +31,9 @@ class PDFPageAggregator(PDFDevice):
|
|||
self.cur_item.group_text(self.cluster_margin)
|
||||
return self.cur_item
|
||||
|
||||
def begin_figure(self, name, bbox):
|
||||
def begin_figure(self, name, bbox, matrix):
|
||||
self.stack.append(self.cur_item)
|
||||
self.cur_item = LTFigure(name, bbox)
|
||||
self.cur_item = LTFigure(name, bbox, matrix)
|
||||
return
|
||||
|
||||
def end_figure(self, _):
|
||||
|
@ -47,11 +48,13 @@ class PDFPageAggregator(PDFDevice):
|
|||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||
return self.undefined_char
|
||||
|
||||
def paint_path(self, gstate, matrix, stroke, fill, evenodd, path):
|
||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||
shape = ''.join(x[0] for x in path)
|
||||
if shape == 'ml': # horizontal/vertical line
|
||||
(_,x0,y0) = path[0]
|
||||
(_,x1,y1) = path[1]
|
||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||
if y0 == y1:
|
||||
# horizontal ruler
|
||||
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
||||
|
@ -64,6 +67,10 @@ class PDFPageAggregator(PDFDevice):
|
|||
(_,x1,y1) = path[1]
|
||||
(_,x2,y2) = path[2]
|
||||
(_,x3,y3) = path[3]
|
||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
||||
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y2))
|
||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||
|
@ -130,9 +137,6 @@ class TagExtractor(PDFDevice):
|
|||
self.tag = None
|
||||
return
|
||||
|
||||
def render_image(self, stream, size, matrix):
|
||||
return
|
||||
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
font = textstate.font
|
||||
text = ''
|
||||
|
@ -204,7 +208,7 @@ class SGMLConverter(PDFConverter):
|
|||
elif isinstance(item, LTRect):
|
||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
||||
elif isinstance(item, LTFigure):
|
||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
self.outfp.write('<figure id="%s">\n' % (item.id))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</figure>\n')
|
||||
|
@ -268,7 +272,7 @@ class HTMLConverter(PDFConverter):
|
|||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LayoutContainer):
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
render(child)
|
||||
|
|
|
@ -150,7 +150,7 @@ class ClusterSet(object):
|
|||
|
||||
# add(objs): groups text objects if necessary.
|
||||
def add(self, objs):
|
||||
group = self.klass(objs, self.i)
|
||||
group = self.klass(self.i, objs)
|
||||
self.i += 1
|
||||
for obj in objs:
|
||||
if obj in self.clusters:
|
||||
|
@ -164,7 +164,16 @@ class ClusterSet(object):
|
|||
r = set(self.clusters.itervalues())
|
||||
for group in r:
|
||||
group.fixate()
|
||||
return r
|
||||
return list(r)
|
||||
|
||||
def group_objs(objs, ratio, klass):
    '''Groups neighboring objects and returns the finished clusters.

    objs:  the objects to group; each must provide x0, y0, x1, y1
           and get_margin(ratio).
    ratio: passed through to each object's get_margin().
    klass: passed to ClusterSet, which uses it to build the groups.

    Returns whatever ClusterSet.finish() returns.
    '''
    index = Plane(objs)
    clusters = ClusterSet(klass)
    for item in objs:
        # Grow the object's box by its (non-negative) margin on every side
        # and treat everything the plane finds in that box as one group.
        pad = abs(item.get_margin(ratio))
        search_box = (item.x0-pad, item.y0-pad, item.x1+pad, item.y1+pad)
        clusters.add(index.find(search_box))
    return clusters.finish()
|
||||
|
||||
|
||||
## LayoutItem
|
||||
|
@ -256,16 +265,6 @@ class LayoutContainer(LayoutItem):
|
|||
self.weight = sum( obj.get_weight() for obj in self.objs )
|
||||
return
|
||||
|
||||
def group_objs(self, ratio, klass):
|
||||
plane = Plane(self.objs)
|
||||
cset = ClusterSet(klass)
|
||||
for obj in self.objs:
|
||||
margin = abs(obj.get_margin(ratio))
|
||||
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
|
||||
cset.add(neighbors)
|
||||
self.objs = cset.finish()
|
||||
return
|
||||
|
||||
def get_weight(self):
|
||||
return self.weight
|
||||
|
||||
|
@ -301,14 +300,6 @@ class LTRect(LayoutItem):
|
|||
return
|
||||
|
||||
|
||||
## LTFigure
|
||||
##
|
||||
class LTFigure(LayoutContainer):
|
||||
|
||||
def __repr__(self):
|
||||
return ('<figure id=%r bbox=%s>' % (self.id, self.get_bbox()))
|
||||
|
||||
|
||||
## LTText
|
||||
##
|
||||
class LTText(LayoutItem):
|
||||
|
@ -361,6 +352,19 @@ class LTText(LayoutItem):
|
|||
return self.vertical
|
||||
|
||||
|
||||
## LTFigure
|
||||
##
|
||||
class LTFigure(LayoutContainer):
    '''A layout container that additionally remembers a matrix.

    The id and bbox arguments are handed to LayoutContainer unchanged;
    the matrix is stored as-is on the instance.
    '''

    def __init__(self, id, bbox, matrix):
        '''Initializes the container part and stores the matrix.

        id:     identifier forwarded to LayoutContainer.
        bbox:   bounding box forwarded to LayoutContainer.
        matrix: matrix given by the caller; kept in self.matrix.
        '''
        LayoutContainer.__init__(self, id, bbox)
        self.matrix = matrix
        return

    def __repr__(self):
        '''Returns a debug string with the id, bbox and stored matrix.'''
        # The matrix is stored as self.matrix by __init__; this class never
        # sets self.ctm, so reading it here raised AttributeError.
        # NOTE(review): self.id and get_bbox() are presumably provided by
        # LayoutContainer -- confirm against the base class.
        return ('<figure id=%r bbox=%s matrix=%r>' %
                (self.id, self.get_bbox(), self.matrix))
|
||||
|
||||
|
||||
## LTTextBox
|
||||
##
|
||||
## A set of text objects that are grouped within
|
||||
|
@ -446,7 +450,9 @@ class LTPage(LayoutContainer):
|
|||
return
|
||||
|
||||
def group_text(self, ratio):
|
||||
self.group_objs(ratio, LTTextBox)
|
||||
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
|
||||
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
||||
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
|
||||
if self.get_direction() == 'H':
|
||||
lines = reorder_vh(self.objs, +1)
|
||||
else:
|
||||
|
|
|
@ -32,14 +32,14 @@ class PDFDevice(object):
|
|||
return
|
||||
def end_page(self, page):
|
||||
return
|
||||
def begin_figure(self, name, bbox):
|
||||
def begin_figure(self, name, bbox, matrix):
|
||||
return
|
||||
def end_figure(self, name):
|
||||
return
|
||||
|
||||
def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path):
|
||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||
return
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
return
|
||||
def render_image(self, stream, size, matrix):
|
||||
def render_image(self, stream, size):
|
||||
return
|
||||
|
|
|
@ -12,7 +12,7 @@ from psparser import PSException, PSTypeError, PSEOF, \
|
|||
from pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
|
||||
from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY
|
||||
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \
|
||||
|
@ -424,7 +424,7 @@ class PDFPageInterpreter(object):
|
|||
|
||||
# stroke
|
||||
def do_S(self):
|
||||
self.device.paint_path(self.graphicstate, self.ctm, True, False, False, self.curpath)
|
||||
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
# close-and-stroke
|
||||
|
@ -434,24 +434,24 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
# fill
|
||||
def do_f(self):
|
||||
self.device.paint_path(self.graphicstate, self.ctm, False, True, False, self.curpath)
|
||||
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
# fill (obsolete)
|
||||
do_F = do_f
|
||||
# fill-even-odd
|
||||
def do_f_a(self):
|
||||
self.device.paint_path(self.graphicstate, self.ctm, False, True, True, self.curpath)
|
||||
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
# fill-and-stroke
|
||||
def do_B(self):
|
||||
self.device.paint_path(self.graphicstate, self.ctm, True, True, False, self.curpath)
|
||||
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
# fill-and-stroke-even-odd
|
||||
def do_B_a(self):
|
||||
self.device.paint_path(self.graphicstate, self.ctm, True, True, True, self.curpath)
|
||||
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
# close-fill-and-stroke
|
||||
|
@ -686,20 +686,15 @@ class PDFPageInterpreter(object):
|
|||
subtype = xobj.dic.get('Subtype')
|
||||
if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
|
||||
interpreter = self.dup()
|
||||
(x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
|
||||
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
|
||||
(x0,y0) = apply_matrix(ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||
bbox = (x0,y0,x1,y1)
|
||||
self.device.begin_figure(xobjid, bbox)
|
||||
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
|
||||
bbox = list_value(xobj.dic['BBox'])
|
||||
matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
|
||||
self.device.begin_figure(xobjid, bbox, matrix)
|
||||
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||
self.device.end_figure(xobjid)
|
||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
||||
(x0,y0) = apply_matrix(self.ctm, (0,0))
|
||||
(x1,y1) = apply_matrix(self.ctm, (1,1))
|
||||
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
|
||||
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
|
||||
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
|
||||
self.device.render_image(xobj, (w,h), self.ctm)
|
||||
self.device.render_image(xobj, (w,h))
|
||||
self.device.end_figure(xobjid)
|
||||
else:
|
||||
# unsupported xobject type.
|
||||
|
|
|
@ -15,12 +15,12 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
|||
def translate_matrix(m, v):
    '''Returns the matrix with (x,y) added to its last two components.

    m: a sequence of six numbers (a,b,c,d,e,f).
    v: a sequence of two numbers (x,y).

    The arguments are unpacked in the body instead of in the signature,
    because tuple parameters are rejected by Python 3; positional calls
    behave exactly as before.
    '''
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a,b,c,d,e+x,f+y)
|
||||
|
||||
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
||||
'''Applies a matrix to coordinates.'''
|
||||
def apply_matrix_pt(m, v):
    '''Applies a matrix to a point.

    m: a sequence of six numbers (a,b,c,d,e,f).
    v: a sequence of two numbers (x,y).

    Returns the pair (a*x+c*y+e, b*x+d*y+f).

    The arguments are unpacked in the body instead of in the signature,
    because tuple parameters are rejected by Python 3; positional calls
    behave exactly as before.
    '''
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
def apply_matrix_norm(m, v):
    '''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    m: a sequence of six numbers (a,b,c,d,e,f); e and f are not used.
    v: a sequence of two numbers (p,q).

    Returns the pair (a*p+c*q, b*p+d*q).

    The arguments are unpacked in the body instead of in the signature,
    because tuple parameters are rejected by Python 3; positional calls
    behave exactly as before.
    '''
    (a,b,c,d,e,f) = m
    (p,q) = v
    return (a*p+c*q, b*p+d*q)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue