git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@96 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-05 12:26:29 +00:00
parent 13efd3faf4
commit a865b28bd9
4 changed files with 429 additions and 385 deletions

View File

@ -1,122 +1,18 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm from utils import apply_matrix_norm
INF = sys.maxint INF = sys.maxint
## PageItem ## pick
## ##
class PageItem(object): def pick(seq, func, maxobj=None):
maxscore = None
def __init__(self, (x0,y0,x1,y1)): for obj in seq:
#assert x0 <= x1 and y0 <= y1 score = func(obj)
self.x0 = x0 if maxscore == None or maxscore < score:
self.y0 = y0 (maxscore,maxobj) = (score,obj)
self.x1 = x1 return maxobj
self.y1 = y1
self.width = x1-x0
self.height = y1-y0
return
def __repr__(self):
return ('<pageitem bbox=%s>' % (self.bbox()))
def bbox(self):
return rect2str((self.x0, self.y0, self.x1, self.y1))
def hoverlap(self, obj):
assert isinstance(obj, PageItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
return 0
else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def voverlap(self, obj):
assert isinstance(obj, PageItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
return 0
else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
class PageContainer(PageItem):
def __init__(self, bbox):
PageItem.__init__(self, bbox)
self.objs = []
return
def add(self, obj):
self.objs.append(obj)
return
class Page(PageContainer):
def __init__(self, id, bbox, rotate=0):
PageContainer.__init__(self, bbox)
self.id = id
self.rotate = rotate
return
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.bbox(), self.rotate))
## FigureItem
##
class FigureItem(PageContainer):
def __init__(self, id, bbox):
PageContainer.__init__(self, bbox)
self.id = id
return
def __repr__(self):
return ('<figure id=%r bbox=%s>' % (self.id, self.bbox()))
## TextItem
##
class TextItem(PageItem):
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
self.matrix = matrix
self.font = font
(_,_,_,_,tx,ty) = self.matrix
self.vertical = self.font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
size = (font.get_ascent() - font.get_descent()) * fontsize
if not self.vertical:
# horizontal text
self.vertical = False
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
self.adv = (dx, 0)
bbox = (tx, ty, tx+dx, ty+dy)
else:
# vertical text
(_,cid) = chars[0]
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
tx -= dx/2
ty += disp
self.adv = (0, dy)
bbox = (tx, ty+dy, tx+dx, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
PageItem.__init__(self, bbox)
return
def __len__(self):
return len(self.text)
def __repr__(self):
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
(matrix2str(self.matrix), self.font, self.fontsize, self.bbox(),
point2str(self.adv), self.text))
## bsearch ## bsearch
@ -156,38 +52,40 @@ def bsearch(objs, v0, v1):
## ##
## Reorders objects according to its writing direction. ## Reorders objects according to its writing direction.
## ##
def reorder_hv(objs, hdir): def reorder_vh(objs, hdir):
if 0 < hdir: if 0 < hdir:
hkey = (lambda obj: obj.x0) hkey = (lambda obj: obj.x0)
vkey = (lambda obj: -obj.y1)
else: else:
hkey = (lambda obj: -obj.x1) hkey = (lambda obj: -obj.x1)
vkey = (lambda obj: -obj.y1) vkey = (lambda obj: -obj.y1)
r = [] r = []
line = [] line = []
for obj1 in sorted(objs, key=vkey): for obj in sorted(objs, key=vkey):
if line and not line[-1].voverlap(obj1): if line and not line[-1].voverlap(obj):
line.sort(key=hkey) line.sort(key=hkey)
r.append(line) r.append(line)
line = [] line = []
line.append(obj1) line.append(obj)
line.sort(key=hkey) line.sort(key=hkey)
r.append(line) r.append(line)
return r return r
def reorder_vh(objs, hdir): def reorder_hv(objs, hdir):
if 0 < hdir: if 0 < hdir:
hkey = (lambda obj: obj.x0) hkey = (lambda obj: obj.x0)
vkey = (lambda obj: -obj.y1)
else: else:
hkey = (lambda obj: -obj.x1) hkey = (lambda obj: -obj.x1)
vkey = (lambda obj: -obj.y1) vkey = (lambda obj: -obj.y1)
r = [] r = []
line = [] line = []
for obj1 in sorted(objs, key=hkey): for obj in sorted(objs, key=hkey):
if line and not line[-1].hoverlap(obj1): if line and not line[-1].hoverlap(obj):
line.sort(key=vkey) line.sort(key=vkey)
r.append(line) r.append(line)
line = [] line = []
line.append(obj1) line.append(obj)
line.sort(key=vkey) line.sort(key=vkey)
r.append(line) r.append(line)
return r return r
@ -212,6 +110,7 @@ class Plane(object):
# place(obj): place an object in a certain area. # place(obj): place an object in a certain area.
def place(self, obj): def place(self, obj):
assert isinstance(obj, LayoutItem)
self.xobjs.append((obj.x0, obj)) self.xobjs.append((obj.x0, obj))
self.xobjs.append((obj.x1, obj)) self.xobjs.append((obj.x1, obj))
self.yobjs.append((obj.y0, obj)) self.yobjs.append((obj.y0, obj))
@ -232,80 +131,6 @@ class Plane(object):
return objs return objs
## TextBox
##
## A set of text objects that are clustered in
## a certain rectangular area.
##
class TextBox(PageItem):
def __init__(self, objs):
self.objs = set(objs)
self.vertical = False
self.length = None
return
def __repr__(self):
return ('<textbox %s %s items=%d>' % (self.bbox(), self.vertical, len(self.objs)))
def __len__(self):
return self.length
# merge(boxes): merges with other textboxes.
def merge(self, box):
self.objs.update(box.objs)
return
# finish(): determines its boundary and writing direction.
def finish(self):
assert self.objs
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
bx0 = min(bx0, obj.x0)
by0 = min(by0, obj.y0)
bx1 = max(bx1, obj.x1)
by1 = max(by1, obj.y1)
PageItem.__init__(self, (bx0, by0, bx1, by1))
self.length = sum( len(obj) for obj in self.objs )
for obj in self.objs:
self.vertical = obj.vertical
break
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if len(objs[0]) == 1 and len(objs[1]) == 1:
h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1])
self.vertical = (h < v)
return
def lines(self, ratio):
if self.vertical:
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
for line in reorder_vh(objs, -1):
s = ''
y0 = -INF
for obj in line:
margin = abs(obj.fontsize * ratio)
if obj.y1 < y0-margin:
s += ' '
s += obj.text
y0 = obj.y0
yield s
else:
objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
for line in reorder_hv(objs, +1):
s = ''
x1 = INF
for obj in line:
margin = abs(obj.fontsize * ratio)
if x1+margin < obj.x0:
s += ' '
s += obj.text
x1 = obj.x1
yield s
return
## ClusterSet ## ClusterSet
## ##
## Maintains a set of TextBox objects. ## Maintains a set of TextBox objects.
@ -316,43 +141,272 @@ class TextBox(PageItem):
## ##
class ClusterSet(object): class ClusterSet(object):
def __init__(self): def __init__(self, klass):
self.clusters = {} self.clusters = {}
self.klass = klass
return return
# add(objs): groups text objects if necessary. # add(objs): groups text objects if necessary.
def add(self, objs): def add(self, objs):
c = TextBox(objs) group = self.klass(objs)
for obj in objs: for obj in objs:
if obj in self.clusters: if obj in self.clusters:
c.merge(self.clusters[obj]) group.merge(self.clusters[obj])
for obj in c.objs: for obj in group:
self.clusters[obj] = c self.clusters[obj] = group
return return
# finish(): returns all the TextBoxes in a page. # finish(): returns all the TextBoxes in a page.
def finish(self): def finish(self):
r = set(self.clusters.itervalues()) r = set(self.clusters.itervalues())
for textbox in r: for group in r:
textbox.finish() group.fixate()
return r return r
# cluster_textobjs
def cluster_textobjs(objs, ratio): ## LayoutItem
plane = Plane(objs) ##
cset = ClusterSet() class LayoutItem(object):
for obj in objs:
margin = abs(obj.fontsize * ratio) def __init__(self, id, bbox):
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) #assert x0 <= x1 and y0 <= y1
cset.add(neighbors) self.id = id
clusters = cset.finish() self.set_bbox(bbox)
vertical = ((sum( len(textbox) for textbox in clusters )/2) < return
sum( len(textbox) for textbox in clusters if textbox.vertical ))
if vertical: def set_bbox(self, (x0,y0,x1,y1)):
lines = reorder_hv(clusters, -1) self.x0 = x0
else: self.y0 = y0
lines = reorder_vh(clusters, +1) self.x1 = x1
r = [] self.y1 = y1
for line in lines: self.width = x1-x0
r.extend(line) self.height = y1-y0
return r return
def __repr__(self):
return ('<pageitem bbox=%s>' % (self.get_bbox()))
def hoverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
return 0
else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def voverlap(self, obj):
assert isinstance(obj, LayoutItem)
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
return 0
else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
def get_bbox(self):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
def get_margin(self, ratio):
return 0
def get_weight(self):
return 0
def get_direction(self):
return False
## LayoutContainer
##
class LayoutContainer(LayoutItem):
    """A layout item that owns a collection of child layout items.

    Children are kept in `self.objs` (a set on construction).  `self.weight`
    stays None until fixate() is called.
    """

    def __init__(self, id, bbox, objs=None):
        LayoutItem.__init__(self, id, bbox)
        # A falsy `objs` (None or an empty collection) gives an empty set.
        self.objs = set(objs or ())
        self.weight = None
        return

    def __repr__(self):
        summary = (self.get_bbox(), len(self.objs))
        return '<group %s(%d)>' % summary

    def __iter__(self):
        return iter(self.objs)

    # add(obj): puts a single child into the container.
    def add(self, obj):
        self.objs.add(obj)
        return

    # merge(group): takes over every child of another iterable group.
    def merge(self, group):
        self.objs.update(group)
        return

    # fixate(): determines its boundary (only while the width is still
    # zero and there are children) and sums up the children's weights.
    def fixate(self):
        members = self.objs
        if members and not self.width:
            left = bottom = INF
            right = top = -INF
            for member in members:
                if member.x0 < left:
                    left = member.x0
                if member.y0 < bottom:
                    bottom = member.y0
                if right < member.x1:
                    right = member.x1
                if top < member.y1:
                    top = member.y1
            self.set_bbox((left, bottom, right, top))
        self.weight = sum( member.get_weight() for member in members )
        return

    # group_objs(ratio, klass): replaces the children with clusters of
    # type `klass`; each child is grouped with whatever the plane finds
    # inside its bounding box enlarged by its own margin.
    def group_objs(self, ratio, klass):
        members = self.objs
        plane = Plane(members)
        clusters = ClusterSet(klass)
        for member in members:
            pad = abs(member.get_margin(ratio))
            window = (member.x0-pad, member.y0-pad,
                      member.x1+pad, member.y1+pad)
            clusters.add(plane.find(window))
        self.objs = clusters.finish()
        return

    def get_weight(self):
        return self.weight

    # get_direction(): true when the children reporting a true direction
    # outweigh half (computed with `/`) of the total weight.
    def get_direction(self):
        total = sum( obj.get_weight() for obj in self.objs )
        flagged = sum( obj.get_weight() for obj in self.objs
                       if obj.get_direction() )
        return (total/2) < flagged
## FigureItem
##
class FigureItem(LayoutContainer):
    """A container for the items of one figure; only the repr differs."""

    def __repr__(self):
        details = (self.id, self.get_bbox())
        return '<figure id=%r bbox=%s>' % details
## TextItem
##
class TextItem(LayoutItem):
    """A run of characters rendered with one font and one text matrix.

    The constructor derives the text, the advance vector (`adv`), the
    bounding box and an effective `fontsize` from the font metrics.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        """Build the item.

        matrix    -- 6-element sequence; the last two entries are used as
                     the text origin (tx, ty).
        font      -- object providing is_vertical(), char_width(),
                     char_disp(), get_ascent() and get_descent().
        fontsize, charspace, scaling -- numeric text-state values;
                     `scaling` is multiplied by .01 (presumably a
                     percentage -- confirm against the caller).
        chars     -- non-empty sequence of (char, cid) pairs.
        """
        assert chars
        self.matrix = matrix
        self.font = font
        (_,_,_,_,tx,ty) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join( char for (char,_) in chars )
        adv = sum( font.char_width(cid) for (_,cid) in chars )
        adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
        size = (font.get_ascent() - font.get_descent()) * fontsize
        if not self.vertical:
            # horizontal text
            self.vertical = False
            (dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
            (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
            # shift the origin by the transformed descent
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx+dx, ty+dy)
        else:
            # vertical text
            (_,cid) = chars[0]
            (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
            (dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
            # centre the column horizontally on the text origin
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty+dy, tx+dx, ty)
        self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
        LayoutItem.__init__(self, None, bbox)
        return

    def __repr__(self):
        # `%` needs a real tuple to spread several values; tuple() keeps
        # the repr working when the matrix was supplied as a list or
        # another 6-element sequence.
        matrix = '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % tuple(self.matrix)
        adv = '(%.1f, %.1f)' % tuple(self.adv)
        return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
                (matrix, self.font, self.fontsize, self.get_bbox(),
                 adv, self.text))

    # get_margin(ratio): margin proportional to the effective font size.
    def get_margin(self, ratio):
        return self.fontsize * ratio

    # get_weight(): the number of characters in the text.
    def get_weight(self):
        return len(self.text)

    # get_direction(): true for vertical text.
    def get_direction(self):
        return self.vertical
## TextBox
##
## A set of text objects that are clustered in
## a certain rectangular area.
##
class TextBox(LayoutContainer):
    """A cluster of text objects with a single writing direction.

    The box starts with an all-zero bounding box and no id; fixate()
    computes the real boundary and decides `self.vertical`.
    """

    def __init__(self, objs):
        LayoutContainer.__init__(self, None, (0,0,0,0), objs)
        self.vertical = False
        return

    # fixate(): fixes the boundary/weight, then guesses the direction.
    def fixate(self):
        LayoutContainer.fixate(self)
        # Start from the direction of whichever member comes first.
        for member in self.objs:
            self.vertical = bool(member.get_direction())
            break
        if len(self.objs) < 2:
            return
        ranked = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
        (first, second) = (ranked[0], ranked[1])
        if first.get_weight() == 1 and second.get_weight() == 1:
            # Two single-weight items: compare how much they overlap
            # along each axis to decide the direction.
            h = first.voverlap(second)
            v = first.hoverlap(second)
            self.vertical = (h < v)
        return

    def get_direction(self):
        return self.vertical

    # get_lines(ratio): yields the text of each line; a space is put
    # between two neighbours whose gap exceeds fontsize*ratio.
    def get_lines(self, ratio):
        if self.get_direction():
            for line in reorder_hv(self.objs, -1):
                pieces = []
                prev_y0 = -INF
                for obj in line:
                    gap = abs(obj.fontsize * ratio)
                    if obj.y1+gap < prev_y0:
                        pieces.append(' ')
                    pieces.append(obj.text)
                    prev_y0 = obj.y0
                yield ''.join(pieces)
        else:
            for line in reorder_vh(self.objs, +1):
                pieces = []
                prev_x1 = INF
                for obj in line:
                    gap = abs(obj.fontsize * ratio)
                    if prev_x1 < obj.x0-gap:
                        pieces.append(' ')
                    pieces.append(obj.text)
                    prev_x1 = obj.x1
                yield ''.join(pieces)
        return
## Page
##
class Page(LayoutContainer):
    """The top-level container of one page, with a fixed bounding box."""

    def __init__(self, id, bbox, rotate=0):
        LayoutContainer.__init__(self, id, bbox)
        self.rotate = rotate
        return

    def __repr__(self):
        details = (self.id, self.get_bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % details

    # fixate(): nothing to do; the inherited computation is skipped.
    def fixate(self):
        return

    # group_text(ratio): clusters the children into TextBoxes and stores
    # them as a list, ordered according to the dominant direction.
    def group_text(self, ratio):
        self.group_objs(ratio, TextBox)
        if self.get_direction():
            lines = reorder_hv(self.objs, -1)
        else:
            lines = reorder_vh(self.objs, +1)
        self.objs = [ obj for line in lines for obj in line ]
        return

View File

@ -3,167 +3,39 @@ import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PDFPageAggregator from pdfdevice import PDFDevice, PDFPageAggregator
from layout import Page, FigureItem, TextItem, cluster_textobjs from layout import Page, LayoutContainer, TextItem, TextBox
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB from cmap import CMapDB
def enc(x, codec): # e(x): encode string
def e(x, codec='ascii'):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
def encprops(props, codec):
if not props: return ''
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
def get_textobjs(item, r=None):
if r == None: r = []
if isinstance(item, TextItem):
r.append(item)
elif isinstance(item, Page):
for child in item.objs:
get_textobjs(child, r)
return r
## PDFConverter ## PDFConverter
##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii'): def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None):
PDFPageAggregator.__init__(self, rsrc) PDFPageAggregator.__init__(self, rsrc)
self.cluster_margin = cluster_margin
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
## SGMLConverter
##
class SGMLConverter(PDFConverter):
def end_page(self, page): def end_page(self, page):
page = PDFConverter.end_page(self, page) page = PDFPageAggregator.end_page(self, page)
def f(item):
bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox
if isinstance(item, FigureItem):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, bbox))
for child in item.objs:
f(child)
self.outfp.write('</figure>\n')
elif isinstance(item, TextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname, self.codec), item.vertical, bbox, item.fontsize))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</text>\n')
bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(page.id, bbox, page.rotate))
for child in page.objs:
f(child)
self.outfp.write('</page>\n')
return
## HTMLConverter
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
self.pagepad = pagepad
self.scale = scale
self.outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad
self.cluster_margin = cluster_margin
self.show_text_border = False
return
def end_page(self, page):
page = PDFConverter.end_page(self, page)
self.yoffset += page.y1
if self.pagenum:
self.outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' %
((self.yoffset-page.y1)*self.scale, page.id, page.id))
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(page.x0*self.scale, (self.yoffset-page.y1)*self.scale,
page.width*self.scale, page.height*self.scale))
def draw(item):
if isinstance(item, FigureItem):
for child in item.objs:
draw(child)
elif isinstance(item, TextItem):
if item.vertical:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
' left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
if self.show_text_border:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.width*self.scale, self.height*self.scale))
for child in page.objs:
draw(child)
if self.cluster_margin: if self.cluster_margin:
clusters = cluster_textobjs(get_textobjs(page), self.cluster_margin) page.group_text(self.cluster_margin)
for textbox in clusters: return page
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(textbox.x0*self.scale, (self.yoffset-textbox.y1)*self.scale,
textbox.width*self.scale, textbox.height*self.scale))
self.yoffset += self.pagepad
return
def close(self): def write(self, text):
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % self.outfp.write(e(text, self.codec))
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
self.outfp.write('</body></html>\n')
return
## TextConverter
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
if cluster_margin == None:
cluster_margin = 0.5
self.cluster_margin = cluster_margin
self.word_margin = 0.2
return return
def end_page(self, page):
page = PDFConverter.end_page(self, page)
if self.pagenum:
self.outfp.write('Page %d\n' % page.id)
if self.cluster_margin:
textobjs = get_textobjs(page)
clusters = cluster_textobjs(textobjs, self.cluster_margin)
for textbox in clusters:
for line in textbox.lines(self.word_margin):
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
self.outfp.write('\n')
else:
for obj in page.objs:
if isinstance(obj, TextItem):
self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n')
self.outfp.write('\f')
return
def close(self):
return
## TagExtractor ## TagExtractor
## ##
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
@ -191,7 +63,7 @@ class TagExtractor(PDFDevice):
text += char text += char
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
pass pass
self.outfp.write(enc(text, self.codec)) self.write(text)
return return
def begin_page(self, page): def begin_page(self, page):
@ -207,18 +79,150 @@ class TagExtractor(PDFDevice):
return return
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
self.outfp.write('<%s%s>' % (enc(tag.name, self.codec), encprops(props, self.codec))) s = ''
if props:
s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v)
in sorted(props.iteritems()) )
self.outfp.write('<%s%s>' % (e(tag.name), s))
self.tag = tag self.tag = tag
return return
def end_tag(self): def end_tag(self):
assert self.tag assert self.tag
self.outfp.write('</%s>' % enc(self.tag.name, self.codec)) self.outfp.write('</%s>' % e(self.tag.name))
self.tag = None self.tag = None
return return
def do_tag(self, tag, props=None): def do_tag(self, tag, props=None):
self.outfp.write('<%s%s/>' % (enc(tag.name, self.codec), encprops(props, self.codec))) self.begin_tag(tag, props)
self.tag = None
return
## SGMLConverter
##
class SGMLConverter(PDFConverter):
    """Writes each finished page as nested <page>/<group>/<text> tags."""

    def end_page(self, page):
        out = self.outfp.write

        # draw(item): text items become <text> elements, any other
        # container becomes a <group> wrapping its children; everything
        # else is ignored.
        def draw(item):
            if isinstance(item, TextItem):
                fontname = e(item.font.fontname)
                direction = item.get_direction()
                bbox = item.get_bbox()
                out('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
                    (fontname, direction, bbox, item.fontsize))
                self.write(item.text)
                out('</text>\n')
                return
            if not isinstance(item, LayoutContainer):
                return
            out('<group id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
            for child in item:
                draw(child)
            out('</group>\n')
            return

        page = PDFConverter.end_page(self, page)
        out('<page id="%s" bbox="%s" rotate="%d">\n' %
            (page.id, page.get_bbox(), page.rotate))
        draw(page)
        out('</page>\n')
        return
## HTMLConverter
##
class HTMLConverter(PDFConverter):
    """Writes pages as absolutely positioned HTML spans.

    The HTML header is written on construction and the page index plus
    the closing tags on close().  `yoffset` tracks the vertical position
    at which the next page is placed.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True,
                 pagepad=50, scale=1, cluster_margin=None):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        self.pagenum = pagenum
        self.pagepad = pagepad
        self.scale = scale
        header = ('<html><head>\n',
                  '<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
                  self.codec,
                  '</head><body>\n')
        for chunk in header:
            self.outfp.write(chunk)
        self.yoffset = self.pagepad
        self.show_text_border = False
        return

    # write_rect(color, x, y, w, h): emits an empty bordered span; all
    # four values are multiplied by the scale.
    def write_rect(self, color, x, y, w, h):
        k = self.scale
        self.outfp.write('<span style="position:absolute; border: 1px solid %s; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                         (color, x*k, y*k, w*k, h*k))
        return

    def end_page(self, page):
        out = self.outfp.write

        # outline(color, item): draws the border of an item, flipped
        # vertically against the current page offset.
        def outline(color, item):
            self.write_rect(color, item.x0, self.yoffset-item.y1, item.width, item.height)
            return

        # draw(item): renders a page, a text item or any other container.
        def draw(item):
            if isinstance(item, Page):
                outline('gray', item)
                if self.pagenum:
                    out('<div style="position:absolute; top:%dpx;">' %
                        ((self.yoffset-page.y1)*self.scale))
                    out('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
                for child in item:
                    draw(child)
                return
            if isinstance(item, TextItem):
                wmode = 'lr-tb'
                if item.vertical:
                    wmode = 'tb-rl'
                out('<span style="position:absolute; writing-mode:%s;'
                    ' left:%dpx; top:%dpx; font-size:%dpx;">' %
                    (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
                     item.fontsize*self.scale))
                self.write(item.text)
                out('</span>\n')
                if self.show_text_border:
                    outline('red', item)
                return
            if isinstance(item, LayoutContainer):
                outline('blue', item)
                for child in item:
                    draw(child)
            return

        page = PDFConverter.end_page(self, page)
        self.yoffset += page.y1
        draw(page)
        self.yoffset += self.pagepad
        return

    # close(): writes the page index and closes the document.
    def close(self):
        links = ', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno))
        self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                         links)
        self.outfp.write('</body></html>\n')
        return
## TextConverter
##
class TextConverter(PDFConverter):
    """Writes the text of each finished page as plain encoded lines.

    cluster_margin -- passed on to PDFConverter; None is replaced by 0.5.
    word_margin    -- ratio handed to TextBox.get_lines() when splitting
                      a line into words.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False,
                 cluster_margin=None, word_margin=0.2):
        if cluster_margin is None:
            cluster_margin = 0.5
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        self.pagenum = pagenum
        self.word_margin = word_margin
        return

    def end_page(self, page):
        # draw(item): writes a bare text item as one line, a text box as
        # its lines followed by a blank line, and descends into any
        # other container.
        def draw(item):
            if isinstance(item, TextItem):
                # The original referred to an undefined name `obj` here,
                # which raised NameError for every bare TextItem.
                self.outfp.write(item.text.encode(self.codec, 'replace'))
                self.outfp.write('\n')
            elif isinstance(item, TextBox):
                for line in item.get_lines(self.word_margin):
                    self.outfp.write(line.encode(self.codec, 'replace')+'\n')
                self.outfp.write('\n')
            elif isinstance(item, LayoutContainer):
                for child in item:
                    draw(child)
            return

        page = PDFConverter.end_page(self, page)
        if self.pagenum:
            self.outfp.write('Page %d\n' % page.id)
        draw(page)
        # a form feed terminates every page
        self.outfp.write('\f')
        return

    def close(self):
        return

View File

@ -3,7 +3,7 @@ import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import PageItem, Page, FigureItem, TextItem from layout import Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix from utils import mult_matrix, translate_matrix
@ -68,7 +68,8 @@ class PDFPageAggregator(PDFDevice):
def end_page(self, _): def end_page(self, _):
assert not self.stack assert not self.stack
assert isinstance(self.cur_item, PageItem) assert isinstance(self.cur_item, Page)
self.cur_item.fixate()
self.pageno += 1 self.pageno += 1
return self.cur_item return self.cur_item
@ -79,6 +80,7 @@ class PDFPageAggregator(PDFDevice):
def end_figure(self, _): def end_figure(self, _):
fig = self.cur_item fig = self.cur_item
self.cur_item.fixate()
self.cur_item = self.stack.pop() self.cur_item = self.stack.pop()
self.cur_item.add(fig) self.cur_item.add(fig)
return return

View File

@ -23,13 +23,6 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))''' '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
return (a*p+c*q, b*p+d*q) return (a*p+c*q, b*p+d*q)
# display functions
def matrix2str((a,b,c,d,e,f)):
return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a,b,c,d,e,f)
def rect2str((x0,y0,x1,y1)):
return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0,y0,x1,y1)
def point2str((x,y)):
return '(%.1f, %.1f)' % (x,y)
## Utilities ## Utilities
## ##
@ -98,12 +91,3 @@ def decode_text(s):
return unicode(s[2:], 'utf-16be', 'ignore') return unicode(s[2:], 'utf-16be', 'ignore')
else: else:
return ''.join( PDFDocEncoding[ord(c)] for c in s ) return ''.join( PDFDocEncoding[ord(c)] for c in s )
##
def pick(seq, func, maxobj=None):
maxscore = None
for obj in seq:
score = func(obj)
if maxscore == None or maxscore < score:
(maxscore,maxobj) = (score,obj)
return maxobj