git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@96 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-05 12:26:29 +00:00
parent 13efd3faf4
commit a865b28bd9
4 changed files with 429 additions and 385 deletions

View File

@ -1,122 +1,18 @@
#!/usr/bin/env python
import sys
from utils import matrix2str, rect2str, point2str, pick, apply_matrix_norm
from utils import apply_matrix_norm
INF = sys.maxint
## PageItem
## pick
##
class PageItem(object):
    """Rectangular page element described by its bounding box.

    The constructor takes one (x0, y0, x1, y1) tuple and stores the four
    coordinates plus the derived ``width`` (x1-x0) and ``height`` (y1-y0).
    """

    def __init__(self, bbox):
        # Unpack in the body rather than in the signature: tuple parameters
        # are Python-2-only syntax.  Callers still pass a single 4-tuple.
        (x0, y0, x1, y1) = bbox
        #assert x0 <= x1 and y0 <= y1
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        return

    def __repr__(self):
        return ('<pageitem bbox=%s>' % (self.bbox()))

    def bbox(self):
        """Return the bounding box as formatted by rect2str()."""
        return rect2str((self.x0, self.y0, self.x1, self.y1))

    def hoverlap(self, obj):
        """Return 0 if the x-ranges of self and obj do not intersect,
        otherwise min(|x0 - obj.x1|, |x1 - obj.x0|)."""
        assert isinstance(obj, PageItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def voverlap(self, obj):
        """Return 0 if the y-ranges of self and obj do not intersect,
        otherwise min(|y0 - obj.y1|, |y1 - obj.y0|)."""
        assert isinstance(obj, PageItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
class PageContainer(PageItem):
    """A PageItem that additionally keeps a list of child objects in
    the order they were added."""

    def __init__(self, bbox):
        PageItem.__init__(self, bbox)
        # children, in insertion order
        self.objs = list()

    def add(self, obj):
        """Append obj to the list of children."""
        self.objs.append(obj)
class Page(PageContainer):
    """Top-level container carrying a page id and a rotation value."""

    def __init__(self, id, bbox, rotate=0):
        PageContainer.__init__(self, bbox)
        self.rotate = rotate
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields
## FigureItem
##
class FigureItem(PageContainer):
    """Container for the objects of one figure, identified by ``id``."""

    def __init__(self, id, bbox):
        PageContainer.__init__(self, bbox)
        self.id = id

    def __repr__(self):
        fields = (self.id, self.bbox())
        return '<figure id=%r bbox=%s>' % fields
## TextItem
##
class TextItem(PageItem):
    """A run of characters rendered with one font and one text matrix.

    ``chars`` is a non-empty sequence of (char, cid) pairs.  The
    constructor derives the joined text, the advance vector ``adv``,
    an effective ``fontsize`` and the bounding box from the font
    metrics and the matrix.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        assert chars
        self.matrix = matrix
        self.font = font
        # the last two matrix entries are the translation of the run
        (_,_,_,_,tx,ty) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join( char for (char,_) in chars )
        glyph_total = sum( font.char_width(cid) for (_,cid) in chars )
        # scaling is applied as a percentage (hence the .01 factor)
        advance = (glyph_total * fontsize + len(chars)*charspace) * scaling * .01
        extent = (font.get_ascent() - font.get_descent()) * fontsize
        if self.vertical:
            # vertical text: displacement is taken from the first glyph
            (_,cid) = chars[0]
            (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
            (dx,dy) = apply_matrix_norm(self.matrix, (extent,advance))
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal text: normalise any falsy flag to False
            self.vertical = False
            (dx,dy) = apply_matrix_norm(self.matrix, (advance,extent))
            (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx+dx, ty+dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (extent,extent)))
        PageItem.__init__(self, bbox)

    def __len__(self):
        """Number of characters in the joined text."""
        return len(self.text)

    def __repr__(self):
        fields = (matrix2str(self.matrix), self.font, self.fontsize, self.bbox(),
                  point2str(self.adv), self.text)
        return '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % fields
def pick(seq, func, maxobj=None):
    """Return the element of seq for which func(element) is largest.

    On ties the earliest element wins.  If seq is empty, maxobj is
    returned unchanged.
    """
    maxscore = None
    for obj in seq:
        score = func(obj)
        # identity test: a score that merely compares equal to None
        # must not be mistaken for "no score yet"
        if maxscore is None or maxscore < score:
            (maxscore, maxobj) = (score, obj)
    return maxobj
## bsearch
@ -156,38 +52,40 @@ def bsearch(objs, v0, v1):
##
## Reorders objects according to their writing direction.
##
def reorder_vh(objs, hdir):
    """Split objs into rows, then order each row horizontally.

    Objects are visited from the largest y1 downwards.  A new row is
    started whenever the current object does not vertically overlap
    the last object put into the current row.  Each row is sorted by
    x0 ascending when hdir is positive, otherwise by x1 descending.

    Returns a list of rows (lists).  For empty input the result is
    [[]], i.e. one empty row.
    """
    if 0 < hdir:
        hkey = (lambda obj: obj.x0)
    else:
        hkey = (lambda obj: -obj.x1)
    vkey = (lambda obj: -obj.y1)
    r = []
    line = []
    for obj in sorted(objs, key=vkey):
        if line and not line[-1].voverlap(obj):
            # obj starts a new row: flush the finished one
            line.sort(key=hkey)
            r.append(line)
            line = []
        line.append(obj)
    line.sort(key=hkey)
    r.append(line)
    return r
def reorder_hv(objs, hdir):
    """Split objs into columns, then order each column vertically.

    Objects are visited in horizontal order (x0 ascending when hdir is
    positive, otherwise x1 descending).  A new column is started
    whenever the current object does not horizontally overlap the last
    object put into the current column.  Each column is sorted from
    the largest y1 downwards.

    Returns a list of columns (lists).  For empty input the result is
    [[]], i.e. one empty column.
    """
    if 0 < hdir:
        hkey = (lambda obj: obj.x0)
    else:
        hkey = (lambda obj: -obj.x1)
    vkey = (lambda obj: -obj.y1)
    r = []
    line = []
    for obj in sorted(objs, key=hkey):
        if line and not line[-1].hoverlap(obj):
            # obj starts a new column: flush the finished one
            line.sort(key=vkey)
            r.append(line)
            line = []
        line.append(obj)
    line.sort(key=vkey)
    r.append(line)
    return r
@ -212,6 +110,7 @@ class Plane(object):
# place(obj): place an object in a certain area.
def place(self, obj):
assert isinstance(obj, LayoutItem)
self.xobjs.append((obj.x0, obj))
self.xobjs.append((obj.x1, obj))
self.yobjs.append((obj.y0, obj))
@ -232,80 +131,6 @@ class Plane(object):
return objs
## TextBox
##
## A set of text objects that are clustered in
## a certain rectangular area.
##
class TextBox(PageItem):
    """A set of text objects clustered into one rectangular area.

    The bounding box, ``length`` and ``vertical`` flag are only valid
    after finish() has been called.
    """

    def __init__(self, objs):
        self.objs = set(objs)
        self.vertical = False
        self.length = None

    def __repr__(self):
        fields = (self.bbox(), self.vertical, len(self.objs))
        return '<textbox %s %s items=%d>' % fields

    def __len__(self):
        return self.length

    # merge(box): merges with another textbox.
    def merge(self, box):
        self.objs.update(box.objs)

    # finish(): determines its boundary and writing direction.
    def finish(self):
        assert self.objs
        members = list(self.objs)
        # bounding box of all members, seeded with +/-INF
        bx0 = min([INF] + [obj.x0 for obj in members])
        by0 = min([INF] + [obj.y0 for obj in members])
        bx1 = max([-INF] + [obj.x1 for obj in members])
        by1 = max([-INF] + [obj.y1 for obj in members])
        PageItem.__init__(self, (bx0, by0, bx1, by1))
        self.length = sum( len(obj) for obj in members )
        # start from the direction of an arbitrary member
        for first in self.objs:
            self.vertical = first.vertical
            break
        if 2 <= len(self.objs):
            ranked = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
            (head, second) = (ranked[0], ranked[1])
            if len(head) == 1 and len(second) == 1:
                # two single-character items: compare their overlaps
                h = head.voverlap(second)
                v = head.hoverlap(second)
                self.vertical = (h < v)

    def lines(self, ratio):
        """Yield the text of each line; a space is inserted where the
        gap between neighbours exceeds fontsize*ratio."""
        if not self.vertical:
            ordered = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
            for line in reorder_hv(ordered, +1):
                pieces = []
                x1 = INF
                for obj in line:
                    margin = abs(obj.fontsize * ratio)
                    if x1+margin < obj.x0:
                        pieces.append(' ')
                    pieces.append(obj.text)
                    x1 = obj.x1
                yield ''.join(pieces)
            return
        ordered = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
        for line in reorder_vh(ordered, -1):
            pieces = []
            y0 = -INF
            for obj in line:
                margin = abs(obj.fontsize * ratio)
                if obj.y1 < y0-margin:
                    pieces.append(' ')
                pieces.append(obj.text)
                y0 = obj.y0
            yield ''.join(pieces)
        return
## ClusterSet
##
## Maintains a set of TextBox objects.
@ -316,43 +141,272 @@ class TextBox(PageItem):
##
class ClusterSet(object):
    """Incrementally groups objects into clusters.

    ``klass`` is the group type to instantiate.  It is called as
    klass(objs) and the result must support merge(group), iteration
    over its members and fixate().
    """

    def __init__(self, klass):
        # maps every object seen so far to the group that contains it
        self.clusters = {}
        self.klass = klass
        return

    # add(objs): groups text objects if necessary.
    def add(self, objs):
        group = self.klass(objs)
        for obj in objs:
            if obj in self.clusters:
                # obj already belongs to a group: absorb that group
                group.merge(self.clusters[obj])
        for obj in group:
            self.clusters[obj] = group
        return

    # finish(): fixates and returns the set of distinct groups.
    def finish(self):
        # values() rather than itervalues(): same result, and not
        # restricted to Python 2
        r = set(self.clusters.values())
        for group in r:
            group.fixate()
        return r
# cluster_textobjs
def cluster_textobjs(objs, ratio):
    """Cluster text objects into text boxes and return them as a flat,
    reordered list.

    Each object is grouped with everything the plane reports inside
    its bounding box enlarged by |fontsize * ratio| on every side.
    """
    plane = Plane(objs)
    cset = ClusterSet()
    for item in objs:
        pad = abs(item.fontsize * ratio)
        window = (item.x0-pad, item.y0-pad, item.x1+pad, item.y1+pad)
        cset.add(plane.find(window))
    clusters = cset.finish()
    # the page counts as vertical when vertical boxes hold more than
    # half of the total length
    total = sum( len(textbox) for textbox in clusters )
    upright = sum( len(textbox) for textbox in clusters if textbox.vertical )
    if (total/2) < upright:
        lines = reorder_hv(clusters, -1)
    else:
        lines = reorder_vh(clusters, +1)
    return [ textbox for line in lines for textbox in line ]
## LayoutItem
##
class LayoutItem(object):
    """Base class of layout objects: an id plus a bounding box.

    The bounding box is an (x0, y0, x1, y1) tuple; ``width`` and
    ``height`` are derived from it by set_bbox().
    """

    def __init__(self, id, bbox):
        #assert x0 <= x1 and y0 <= y1
        self.id = id
        self.set_bbox(bbox)
        return

    def set_bbox(self, bbox):
        """Store the (x0, y0, x1, y1) box and recompute width/height."""
        # Unpack in the body rather than in the signature: tuple parameters
        # are Python-2-only syntax.  Callers still pass a single 4-tuple.
        (x0, y0, x1, y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1-x0
        self.height = y1-y0
        return

    def __repr__(self):
        return ('<pageitem bbox=%s>' % (self.get_bbox()))

    def hoverlap(self, obj):
        """Return 0 if the x-ranges of self and obj do not intersect,
        otherwise min(|x0 - obj.x1|, |x1 - obj.x0|)."""
        assert isinstance(obj, LayoutItem)
        if self.x1 <= obj.x0 or obj.x1 <= self.x0:
            return 0
        return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))

    def voverlap(self, obj):
        """Return 0 if the y-ranges of self and obj do not intersect,
        otherwise min(|y0 - obj.y1|, |y1 - obj.y0|)."""
        assert isinstance(obj, LayoutItem)
        if self.y1 <= obj.y0 or obj.y1 <= self.y0:
            return 0
        return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))

    def get_bbox(self):
        """Return the box as 'x0,y0,x1,y1' with three decimals each."""
        return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)

    # Defaults below are overridden by subclasses that carry text.
    def get_margin(self, ratio):
        return 0

    def get_weight(self):
        return 0

    def get_direction(self):
        return False
## LayoutContainer
##
class LayoutContainer(LayoutItem):
    """A LayoutItem that holds a collection of child layout objects.

    ``weight`` is None until fixate() has been called.
    """

    def __init__(self, id, bbox, objs=None):
        LayoutItem.__init__(self, id, bbox)
        # a missing or empty objs yields an empty set
        self.objs = set(objs or ())
        self.weight = None

    def __repr__(self):
        fields = (self.get_bbox(), len(self.objs))
        return '<group %s(%d)>' % fields

    def __iter__(self):
        return iter(self.objs)

    def add(self, obj):
        self.objs.add(obj)

    def merge(self, group):
        """Add every member of group to this container."""
        self.objs.update(group)

    # fixate(): determines its boundary and total weight.
    def fixate(self):
        if self.objs and not self.width:
            # zero-width box: derive it from the members, seeded with +/-INF
            members = list(self.objs)
            bx0 = min([INF] + [obj.x0 for obj in members])
            by0 = min([INF] + [obj.y0 for obj in members])
            bx1 = max([-INF] + [obj.x1 for obj in members])
            by1 = max([-INF] + [obj.y1 for obj in members])
            self.set_bbox((bx0, by0, bx1, by1))
        self.weight = sum( obj.get_weight() for obj in self.objs )

    def group_objs(self, ratio, klass):
        """Replace the children by clusters of type klass.

        Each child is grouped with everything the plane reports inside
        its bounding box enlarged by |get_margin(ratio)| on every side.
        """
        plane = Plane(self.objs)
        cset = ClusterSet(klass)
        for child in self.objs:
            pad = abs(child.get_margin(ratio))
            window = (child.x0-pad, child.y0-pad, child.x1+pad, child.y1+pad)
            cset.add(plane.find(window))
        self.objs = cset.finish()

    def get_weight(self):
        return self.weight

    def get_direction(self):
        """True when children reporting a true direction hold more
        than half of the total weight."""
        total = sum( obj.get_weight() for obj in self.objs )
        directed = sum( obj.get_weight() for obj in self.objs if obj.get_direction() )
        return (total/2) < directed
## FigureItem
##
class FigureItem(LayoutContainer):
    """Container for the objects of one figure."""

    def __repr__(self):
        fields = (self.id, self.get_bbox())
        return '<figure id=%r bbox=%s>' % fields
## TextItem
##
class TextItem(LayoutItem):
    """A run of characters rendered with one font and one text matrix.

    ``chars`` is a non-empty sequence of (char, cid) pairs.  The
    constructor derives the joined text, the advance vector ``adv``,
    an effective ``fontsize`` and the bounding box from the font
    metrics and the matrix.  The item id is always None.
    """

    def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
        assert chars
        self.matrix = matrix
        self.font = font
        # the last two matrix entries are the translation of the run
        (_,_,_,_,tx,ty) = self.matrix
        self.vertical = self.font.is_vertical()
        self.text = ''.join( char for (char,_) in chars )
        glyph_total = sum( font.char_width(cid) for (_,cid) in chars )
        # scaling is applied as a percentage (hence the .01 factor)
        advance = (glyph_total * fontsize + len(chars)*charspace) * scaling * .01
        extent = (font.get_ascent() - font.get_descent()) * fontsize
        if self.vertical:
            # vertical text: displacement is taken from the first glyph
            (_,cid) = chars[0]
            (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
            (dx,dy) = apply_matrix_norm(self.matrix, (extent,advance))
            tx -= dx/2
            ty += disp
            self.adv = (0, dy)
            bbox = (tx, ty+dy, tx+dx, ty)
        else:
            # horizontal text: normalise any falsy flag to False
            self.vertical = False
            (dx,dy) = apply_matrix_norm(self.matrix, (advance,extent))
            (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
            ty += descent
            self.adv = (dx, 0)
            bbox = (tx, ty, tx+dx, ty+dy)
        self.fontsize = max(apply_matrix_norm(self.matrix, (extent,extent)))
        LayoutItem.__init__(self, None, bbox)

    def __repr__(self):
        matrix_str = '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix
        adv_str = '(%.1f, %.1f)' % self.adv
        fields = (matrix_str, self.font, self.fontsize, self.get_bbox(),
                  adv_str, self.text)
        return '<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % fields

    def get_margin(self, ratio):
        """Clustering margin: the effective font size times ratio."""
        return self.fontsize * ratio

    def get_weight(self):
        """Weight of a text run is its number of characters."""
        return len(self.text)

    def get_direction(self):
        return self.vertical
## TextBox
##
## A set of text objects that are clustered in
## a certain rectangular area.
##
class TextBox(LayoutContainer):
    """A cluster of text objects.

    It is created with id None and a zero-sized box, so that
    fixate() derives the real box from the members.
    """

    def __init__(self, objs):
        LayoutContainer.__init__(self, None, (0,0,0,0), objs)
        self.vertical = False

    def fixate(self):
        LayoutContainer.fixate(self)
        # start from the direction of an arbitrary member
        for first in self.objs:
            self.vertical = bool(first.get_direction())
            break
        if 2 <= len(self.objs):
            ranked = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
            (head, second) = (ranked[0], ranked[1])
            if head.get_weight() == 1 and second.get_weight() == 1:
                # two single-character items: compare their overlaps
                h = head.voverlap(second)
                v = head.hoverlap(second)
                self.vertical = (h < v)

    def get_direction(self):
        return self.vertical

    def get_lines(self, ratio):
        """Yield the text of each line; a space is inserted where the
        gap between neighbours exceeds fontsize*ratio."""
        if not self.get_direction():
            for line in reorder_vh(self.objs, +1):
                pieces = []
                x1 = INF
                for obj in line:
                    margin = abs(obj.fontsize * ratio)
                    if x1 < obj.x0-margin:
                        pieces.append(' ')
                    pieces.append(obj.text)
                    x1 = obj.x1
                yield ''.join(pieces)
            return
        for line in reorder_hv(self.objs, -1):
            pieces = []
            y0 = -INF
            for obj in line:
                margin = abs(obj.fontsize * ratio)
                if obj.y1+margin < y0:
                    pieces.append(' ')
                pieces.append(obj.text)
                y0 = obj.y0
            yield ''.join(pieces)
        return
## Page
##
class Page(LayoutContainer):
    """Top-level layout container carrying a rotation value."""

    def __init__(self, id, bbox, rotate=0):
        LayoutContainer.__init__(self, id, bbox)
        self.rotate = rotate

    def __repr__(self):
        fields = (self.id, self.get_bbox(), self.rotate)
        return '<page id=%r bbox=%s rotate=%r>' % fields

    def fixate(self):
        # a page keeps the box it was created with; nothing to compute
        return

    def group_text(self, ratio):
        """Cluster the children into TextBox objects and store them
        as a list in reading order."""
        self.group_objs(ratio, TextBox)
        if self.get_direction():
            lines = reorder_hv(self.objs, -1)
        else:
            lines = reorder_vh(self.objs, +1)
        ordered = []
        for line in lines:
            ordered.extend(line)
        self.objs = ordered

View File

@ -3,167 +3,39 @@ import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, PDFPageAggregator
from layout import Page, FigureItem, TextItem, cluster_textobjs
from layout import Page, LayoutContainer, TextItem, TextBox
from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB
# e(x): encode string
def e(x, codec='ascii'):
    """Escape the characters & > < " in x as XML entities and encode
    the result with codec.

    Characters the codec cannot represent are written as numeric
    character references ('xmlcharrefreplace').
    """
    # '&' must be replaced first so the entities added below stay intact
    x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
    return x.encode(codec, 'xmlcharrefreplace')

# Earlier name of e(); kept so that enc(x, codec) calls keep working.
enc = e
def encprops(props, codec):
    """Render props as a string of ' key="value"' attribute pairs.

    Pairs are emitted in sorted order and both key and str(value) go
    through enc().  An empty or missing props gives ''.
    """
    if not props:
        return ''
    attrs = []
    for (k,v) in sorted(props.items()):
        attrs.append(' %s="%s"' % (enc(k,codec), enc(str(v),codec)))
    return ''.join(attrs)
def get_textobjs(item, r=None):
    """Collect the TextItem objects reachable from item.

    A TextItem is appended directly; a Page is searched recursively
    through its ``objs``; any other item is ignored.  Results are
    appended to r (a new list when r is None) and r is returned.
    """
    # identity test, so a caller-supplied empty list is reused as given
    if r is None:
        r = []
    if isinstance(item, TextItem):
        r.append(item)
    elif isinstance(item, Page):
        for child in item.objs:
            get_textobjs(child, r)
    return r
## PDFConverter
##
class PDFConverter(PDFPageAggregator):
    """Base class of the output converters: a page aggregator that
    also remembers where and how to write its output."""

    def __init__(self, rsrc, outfp, codec='ascii', cluster_margin=None):
        # cluster_margin defaults to None, so calls that pass only
        # (rsrc, outfp, codec) keep working
        PDFPageAggregator.__init__(self, rsrc)
        self.cluster_margin = cluster_margin
        self.outfp = outfp
        self.codec = codec
        return
## SGMLConverter
##
class SGMLConverter(PDFConverter):
    """Writes each finished page as <page>/<figure>/<text> elements."""

    def end_page(self, page):
        page = PDFConverter.end_page(self, page)

        def fmt_bbox(item):
            # Format from the coordinate attributes: ``bbox`` on the
            # layout items is a method, not a 4-tuple, so applying the
            # format string to it directly raises TypeError.
            return '%.3f,%.3f,%.3f,%.3f' % (item.x0, item.y0, item.x1, item.y1)

        def f(item):
            bbox = fmt_bbox(item)
            if isinstance(item, FigureItem):
                self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, bbox))
                for child in item.objs:
                    f(child)
                self.outfp.write('</figure>\n')
            elif isinstance(item, TextItem):
                self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
                                 (enc(item.font.fontname, self.codec), item.vertical, bbox, item.fontsize))
                self.outfp.write(enc(item.text, self.codec))
                self.outfp.write('</text>\n')

        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                         (page.id, fmt_bbox(page), page.rotate))
        for child in page.objs:
            f(child)
        self.outfp.write('</page>\n')
        return
## HTMLConverter
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
self.pagepad = pagepad
self.scale = scale
self.outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad
self.cluster_margin = cluster_margin
self.show_text_border = False
return
def end_page(self, page):
page = PDFConverter.end_page(self, page)
self.yoffset += page.y1
if self.pagenum:
self.outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' %
((self.yoffset-page.y1)*self.scale, page.id, page.id))
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(page.x0*self.scale, (self.yoffset-page.y1)*self.scale,
page.width*self.scale, page.height*self.scale))
def draw(item):
if isinstance(item, FigureItem):
for child in item.objs:
draw(child)
elif isinstance(item, TextItem):
if item.vertical:
wmode = 'tb-rl'
else:
wmode = 'lr-tb'
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
' left:%dpx; top:%dpx; font-size:%dpx;">' %
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.fontsize*self.scale))
self.outfp.write(enc(item.text, self.codec))
self.outfp.write('</span>\n')
if self.show_text_border:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.width*self.scale, self.height*self.scale))
for child in page.objs:
draw(child)
page = PDFPageAggregator.end_page(self, page)
if self.cluster_margin:
clusters = cluster_textobjs(get_textobjs(page), self.cluster_margin)
for textbox in clusters:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(textbox.x0*self.scale, (self.yoffset-textbox.y1)*self.scale,
textbox.width*self.scale, textbox.height*self.scale))
self.yoffset += self.pagepad
return
page.group_text(self.cluster_margin)
return page
def close(self):
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
self.outfp.write('</body></html>\n')
return
## TextConverter
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False, cluster_margin=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
if cluster_margin == None:
cluster_margin = 0.5
self.cluster_margin = cluster_margin
self.word_margin = 0.2
def write(self, text):
self.outfp.write(e(text, self.codec))
return
def end_page(self, page):
page = PDFConverter.end_page(self, page)
if self.pagenum:
self.outfp.write('Page %d\n' % page.id)
if self.cluster_margin:
textobjs = get_textobjs(page)
clusters = cluster_textobjs(textobjs, self.cluster_margin)
for textbox in clusters:
for line in textbox.lines(self.word_margin):
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
self.outfp.write('\n')
else:
for obj in page.objs:
if isinstance(obj, TextItem):
self.outfp.write(obj.text.encode(self.codec, 'replace'))
self.outfp.write('\n')
self.outfp.write('\f')
return
def close(self):
return
## TagExtractor
##
class TagExtractor(PDFDevice):
@ -191,7 +63,7 @@ class TagExtractor(PDFDevice):
text += char
except PDFUnicodeNotDefined, e:
pass
self.outfp.write(enc(text, self.codec))
self.write(text)
return
def begin_page(self, page):
@ -207,18 +79,150 @@ class TagExtractor(PDFDevice):
return
def begin_tag(self, tag, props=None):
self.outfp.write('<%s%s>' % (enc(tag.name, self.codec), encprops(props, self.codec)))
s = ''
if props:
s = ''.join( ' %s="%s"' % (e(k), e(str(v))) for (k,v)
in sorted(props.iteritems()) )
self.outfp.write('<%s%s>' % (e(tag.name), s))
self.tag = tag
return
def end_tag(self):
assert self.tag
self.outfp.write('</%s>' % enc(self.tag.name, self.codec))
self.outfp.write('</%s>' % e(self.tag.name))
self.tag = None
return
def do_tag(self, tag, props=None):
self.outfp.write('<%s%s/>' % (enc(tag.name, self.codec), encprops(props, self.codec)))
self.begin_tag(tag, props)
self.tag = None
return
## SGMLConverter
##
class SGMLConverter(PDFConverter):
    """Writes each finished page as nested <page>/<group>/<text>
    elements."""

    def end_page(self, page):
        out = self.outfp

        def draw(item):
            if isinstance(item, TextItem):
                attrs = (e(item.font.fontname), item.get_direction(),
                         item.get_bbox(), item.fontsize)
                out.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' % attrs)
                self.write(item.text)
                out.write('</text>\n')
            elif isinstance(item, LayoutContainer):
                # containers (the page itself included) become <group>
                out.write('<group id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
                for child in item:
                    draw(child)
                out.write('</group>\n')
            return

        page = PDFConverter.end_page(self, page)
        header = (page.id, page.get_bbox(), page.rotate)
        out.write('<page id="%s" bbox="%s" rotate="%d">\n' % header)
        draw(page)
        out.write('</page>\n')
        return
## HTMLConverter
##
class HTMLConverter(PDFConverter):
    """Writes pages as absolutely positioned HTML elements, stacked
    vertically with ``pagepad`` units between them."""

    def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True,
                 pagepad=50, scale=1, cluster_margin=None):
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        self.pagenum = pagenum
        self.pagepad = pagepad
        self.scale = scale
        # running vertical offset of the current page bottom
        self.yoffset = self.pagepad
        self.show_text_border = False
        out = self.outfp
        out.write('<html><head>\n')
        out.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
                  self.codec)
        out.write('</head><body>\n')

    def write_rect(self, color, x, y, w, h):
        """Emit an empty bordered <span> at (x, y) of size (w, h);
        all four values are multiplied by the scale."""
        k = self.scale
        self.outfp.write('<span style="position:absolute; border: 1px solid %s; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                         (color, x*k, y*k, w*k, h*k))

    def end_page(self, page):
        out = self.outfp

        def draw(item):
            # Page is tested first because it is itself a LayoutContainer
            if isinstance(item, Page):
                self.write_rect('gray', item.x0, self.yoffset-item.y1, item.width, item.height)
                if self.pagenum:
                    out.write('<div style="position:absolute; top:%dpx;">' %
                              ((self.yoffset-page.y1)*self.scale))
                    out.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
                for child in item:
                    draw(child)
            elif isinstance(item, TextItem):
                wmode = 'lr-tb'
                if item.vertical:
                    wmode = 'tb-rl'
                out.write('<span style="position:absolute; writing-mode:%s;'
                          ' left:%dpx; top:%dpx; font-size:%dpx;">' %
                          (wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
                           item.fontsize*self.scale))
                self.write(item.text)
                out.write('</span>\n')
                if self.show_text_border:
                    self.write_rect('red', item.x0, self.yoffset-item.y1, item.width, item.height)
            elif isinstance(item, LayoutContainer):
                self.write_rect('blue', item.x0, self.yoffset-item.y1, item.width, item.height)
                for child in item:
                    draw(child)
            return

        page = PDFConverter.end_page(self, page)
        # y grows downwards in the output: move to this page's bottom edge
        self.yoffset += page.y1
        draw(page)
        self.yoffset += self.pagepad

    def close(self):
        links = ['<a href="#%s">%s</a>' % (i,i) for i in range(1,self.pageno)]
        self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                         ', '.join(links))
        self.outfp.write('</body></html>\n')
## TextConverter
##
class TextConverter(PDFConverter):
    """Writes the text of each page as plain text, followed by a
    form feed.

    cluster_margin defaults to 0.5 when not given; word_margin is the
    ratio handed to TextBox.get_lines().
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pagenum=False,
                 cluster_margin=None, word_margin=0.2):
        if cluster_margin is None:
            cluster_margin = 0.5
        PDFConverter.__init__(self, rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        self.pagenum = pagenum
        self.word_margin = word_margin
        return

    def end_page(self, page):
        def draw(item):
            if isinstance(item, TextItem):
                # was ``obj.text``: no such name exists in this scope, so
                # every bare text item raised NameError
                self.outfp.write(item.text.encode(self.codec, 'replace'))
                self.outfp.write('\n')
            elif isinstance(item, TextBox):
                for line in item.get_lines(self.word_margin):
                    self.outfp.write(line.encode(self.codec, 'replace')+'\n')
                # blank line after every text box
                self.outfp.write('\n')
            elif isinstance(item, LayoutContainer):
                for child in item:
                    draw(child)
        page = PDFConverter.end_page(self, page)
        if self.pagenum:
            self.outfp.write('Page %d\n' % page.id)
        draw(page)
        self.outfp.write('\f')
        return

    def close(self):
        return

View File

@ -3,7 +3,7 @@ import sys
stdout = sys.stdout
stderr = sys.stderr
from pdffont import PDFUnicodeNotDefined
from layout import PageItem, Page, FigureItem, TextItem
from layout import Page, FigureItem, TextItem
from utils import mult_matrix, translate_matrix
@ -68,7 +68,8 @@ class PDFPageAggregator(PDFDevice):
def end_page(self, _):
assert not self.stack
assert isinstance(self.cur_item, PageItem)
assert isinstance(self.cur_item, Page)
self.cur_item.fixate()
self.pageno += 1
return self.cur_item
@ -79,6 +80,7 @@ class PDFPageAggregator(PDFDevice):
def end_figure(self, _):
fig = self.cur_item
self.cur_item.fixate()
self.cur_item = self.stack.pop()
self.cur_item.add(fig)
return

View File

@ -23,13 +23,6 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
'''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
return (a*p+c*q, b*p+d*q)
# display functions
def matrix2str(m):
    """Format a 6-element matrix (a, b, c, d, e, f) as
    '[a, b, c, d, (e, f)]' with one decimal per value."""
    # unpacked here rather than in the signature (tuple parameters are
    # Python-2-only syntax); callers still pass one 6-tuple
    (a, b, c, d, e, f) = m
    return '[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % (a, b, c, d, e, f)
def rect2str(rect):
    """Format a rectangle (x0, y0, x1, y1) as '(x0, y0)-(x1, y1)'
    with one decimal per value."""
    # unpacked here rather than in the signature (tuple parameters are
    # Python-2-only syntax); callers still pass one 4-tuple
    (x0, y0, x1, y1) = rect
    return '(%.1f, %.1f)-(%.1f, %.1f)' % (x0, y0, x1, y1)
def point2str(point):
    """Format a point (x, y) as '(x, y)' with one decimal per value."""
    # unpacked here rather than in the signature (tuple parameters are
    # Python-2-only syntax); callers still pass one 2-tuple
    (x, y) = point
    return '(%.1f, %.1f)' % (x, y)
## Utilities
##
@ -98,12 +91,3 @@ def decode_text(s):
return unicode(s[2:], 'utf-16be', 'ignore')
else:
return ''.join( PDFDocEncoding[ord(c)] for c in s )
##
def pick(seq, func, maxobj=None):
    """Return the element of seq with the highest func(element).

    The first element reaching the highest score is kept.  maxobj is
    the value returned when seq yields nothing.
    """
    maxscore = None
    for obj in seq:
        score = func(obj)
        # ``is None``: only the initial sentinel means "no score yet"
        if maxscore is None or maxscore < score:
            maxscore = score
            maxobj = obj
    return maxobj