layout analysis improved.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@93 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
43e5c05307
commit
fd27d16acc
|
@ -1,11 +1,18 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfdevice import PageItem
|
||||
from utils import pick
|
||||
INF = sys.maxint
|
||||
|
||||
|
||||
## binary search
|
||||
## bsearch
|
||||
##
|
||||
## Finds objects whose coordinates overlap with [v0,v1].
|
||||
## It performs binary search so that the processing time
|
||||
## should be around O(log n).
|
||||
##
|
||||
def bsearch(objs, v0, v1):
|
||||
assert v0 <= v1
|
||||
if v1 <= v0: return []
|
||||
i0 = 0
|
||||
i1 = len(objs)-1
|
||||
while i0 <= i1:
|
||||
|
@ -31,27 +38,79 @@ def bsearch(objs, v0, v1):
|
|||
return []
|
||||
|
||||
|
||||
## reorder_hv, reorder_vh
|
||||
##
|
||||
## Reorders objects according to their writing direction.
|
||||
##
|
||||
def reorder_hv(objs, hdir):
    """Groups objects into lines by vertical overlap, then orders each line.

    Objects are visited in order of descending y1.  An object joins the
    current line as long as it vertically overlaps the object that was
    added just before it; otherwise the current line is closed and a new
    one is started.

    objs: iterable of objects having x0, x1 and y1 attributes and
          a voverlap(obj) method.
    hdir: order within a line; a positive value sorts by ascending x0,
          anything else sorts by descending x1.

    Returns a list of lines, each of which is a list of objects.
    Empty input gives an empty list.
    """
    if 0 < hdir:
        hkey = (lambda obj: obj.x0)
    else:
        hkey = (lambda obj: -obj.x1)
    vkey = (lambda obj: -obj.y1)
    r = []
    line = []
    for obj1 in sorted(objs, key=vkey):
        if line and not line[-1].voverlap(obj1):
            # obj1 does not share a row with the previous object:
            # close the current line.
            line.sort(key=hkey)
            r.append(line)
            line = []
        line.append(obj1)
    # Flush the last line.  Skipped when nothing was collected so that
    # empty input does not produce a spurious empty line.
    if line:
        line.sort(key=hkey)
        r.append(line)
    return r
|
||||
|
||||
def reorder_vh(objs, hdir):
    """Groups objects into columns by horizontal overlap, then orders each column.

    Objects are visited in horizontal order (ascending x0 when hdir is
    positive, descending x1 otherwise).  An object joins the current
    column as long as it horizontally overlaps the object that was added
    just before it; otherwise the current column is closed and a new one
    is started.  Inside a column, objects are sorted by descending y1.

    objs: iterable of objects having x0, x1 and y1 attributes and
          a hoverlap(obj) method.
    hdir: order in which columns are visited (see above).

    Returns a list of columns, each of which is a list of objects.
    Empty input gives an empty list.
    """
    if 0 < hdir:
        hkey = (lambda obj: obj.x0)
    else:
        hkey = (lambda obj: -obj.x1)
    vkey = (lambda obj: -obj.y1)
    r = []
    line = []
    for obj1 in sorted(objs, key=hkey):
        if line and not line[-1].hoverlap(obj1):
            # obj1 does not share a column with the previous object:
            # close the current column.
            line.sort(key=vkey)
            r.append(line)
            line = []
        line.append(obj1)
    # Flush the last column.  Skipped when nothing was collected so
    # that empty input does not produce a spurious empty column.
    if line:
        line.sort(key=vkey)
        r.append(line)
    return r
|
||||
|
||||
|
||||
## Plane
|
||||
##
|
||||
## A data structure for objects placed on a plane.
|
||||
## Can efficiently find objects in a certain rectangular area.
|
||||
## It maintains two parallel lists of objects, each of
|
||||
## which is sorted by its x or y coordinate.
|
||||
##
|
||||
class Plane(object):
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, objs):
|
||||
self.xobjs = []
|
||||
self.yobjs = []
|
||||
for obj in objs:
|
||||
self.place(obj)
|
||||
self.fixate()
|
||||
return
|
||||
|
||||
def add(self, (x0,y0,x1,y1), obj):
|
||||
self.xobjs.append((x0, obj))
|
||||
self.xobjs.append((x1, obj))
|
||||
self.yobjs.append((y0, obj))
|
||||
self.yobjs.append((y1, obj))
|
||||
# place(obj): place an object in a certain area.
|
||||
def place(self, obj):
|
||||
self.xobjs.append((obj.x0, obj))
|
||||
self.xobjs.append((obj.x1, obj))
|
||||
self.yobjs.append((obj.y0, obj))
|
||||
self.yobjs.append((obj.y1, obj))
|
||||
return
|
||||
|
||||
def finish(self):
|
||||
# fixate(): you must call this after adding all objects.
|
||||
def fixate(self):
|
||||
self.xobjs.sort()
|
||||
self.yobjs.sort()
|
||||
return
|
||||
|
||||
# find(): finds objects that are in a certain area.
|
||||
def find(self, (x0,y0,x1,y1)):
|
||||
xobjs = set(bsearch(self.xobjs, x0, x1))
|
||||
yobjs = set(bsearch(self.yobjs, y0, y1))
|
||||
|
@ -59,68 +118,127 @@ class Plane(object):
|
|||
return objs
|
||||
|
||||
|
||||
## TextBox
|
||||
##
|
||||
## A set of text objects that are clustered in
|
||||
## a certain rectangular area.
|
||||
##
|
||||
class TextBox(PageItem):
    """A set of text objects clustered in one rectangular area.

    Typical use: create it from an iterable of text objects, merge()
    other boxes into it if needed, then call finish() to fix its
    bounding box, total text length and writing direction.  The
    coordinates used by bbox() (and therefore by repr()) are only set
    up by finish().
    """

    def __init__(self, objs):
        self.objs = set(objs)
        self.vertical = False
        # Total text length; cached by finish(), None until then.
        self.length = None
        return

    def __repr__(self):
        return ('<textbox %s %s items=%d>' % (self.bbox(), self.vertical, len(self.objs)))

    def __len__(self):
        # Without a cached value, returning self.length (None) would make
        # len() raise TypeError, so count the members on the fly instead.
        if self.length is None:
            return sum( len(obj) for obj in self.objs )
        return self.length

    # merge(box): absorbs the objects of another textbox.
    def merge(self, box):
        self.objs.update(box.objs)
        # Any length cached by an earlier finish() is stale now.
        self.length = None
        return

    # finish(): determines its boundary and writing direction.
    def finish(self):
        assert self.objs
        # Bounding box is the union of all member boxes.
        (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
        for obj in self.objs:
            bx0 = min(bx0, obj.x0)
            by0 = min(by0, obj.y0)
            bx1 = max(bx1, obj.x1)
            by1 = max(by1, obj.y1)
        PageItem.__init__(self, (bx0, by0, bx1, by1))
        self.length = sum( len(obj) for obj in self.objs )
        # Start from the direction of one (arbitrary) member object.
        for obj in self.objs:
            self.vertical = obj.vertical
            break
        # If the first two objects (ordered by -x1-y1) are single
        # characters, decide the direction from how they overlap.
        if 2 <= len(self.objs):
            objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
            if len(objs[0]) == 1 and len(objs[1]) == 1:
                h = objs[0].voverlap(objs[1])
                v = objs[0].hoverlap(objs[1])
                self.vertical = (h < v)
        return

    # lines(ratio): yields the text of each line in the box.
    # A space is inserted between two consecutive objects when the gap
    # between them exceeds fontsize*ratio.
    def lines(self, ratio):
        if self.vertical:
            objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
            for line in reorder_vh(objs, -1):
                s = ''
                # -INF guarantees no space before the first object.
                y0 = -INF
                for obj in line:
                    margin = abs(obj.fontsize * ratio)
                    if obj.y1 < y0-margin:
                        s += ' '
                    s += obj.text
                    y0 = obj.y0
                yield s
        else:
            objs = sorted(self.objs, key=lambda obj: obj.x0-obj.y1)
            for line in reorder_hv(objs, +1):
                s = ''
                # INF guarantees no space before the first object.
                x1 = INF
                for obj in line:
                    margin = abs(obj.fontsize * ratio)
                    if x1+margin < obj.x0:
                        s += ' '
                    s += obj.text
                    x1 = obj.x1
                yield s
        return
|
||||
|
||||
|
||||
## ClusterSet
|
||||
##
|
||||
## Maintains a set of TextBox objects.
|
||||
## It incrementally constructs TextBox objects
|
||||
## and groups them when necessary. It gives
|
||||
## a sequence of TextBox objects that represent
|
||||
## the text stream of that page.
|
||||
##
|
||||
class ClusterSet(object):
|
||||
|
||||
def __init__(self):
|
||||
self.clusters = {}
|
||||
return
|
||||
|
||||
def add(self, obj):
|
||||
self.clusters[obj] = (obj,)
|
||||
return
|
||||
|
||||
def merge(self, objs):
|
||||
allobjs = set(objs)
|
||||
# add(objs): groups text objects if necessary.
|
||||
def add(self, objs):
|
||||
c = TextBox(objs)
|
||||
for obj in objs:
|
||||
if obj in self.clusters:
|
||||
allobjs.update(self.clusters[obj])
|
||||
c = tuple(allobjs)
|
||||
for obj in allobjs:
|
||||
c.merge(self.clusters[obj])
|
||||
for obj in c.objs:
|
||||
self.clusters[obj] = c
|
||||
return
|
||||
|
||||
# finish(): returns all the TextBoxes in a page.
|
||||
def finish(self):
|
||||
return set(self.clusters.itervalues())
|
||||
r = set(self.clusters.itervalues())
|
||||
for textbox in r:
|
||||
textbox.finish()
|
||||
return r
|
||||
|
||||
|
||||
def cluster_pageobjs(objs, ratio):
|
||||
idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
||||
plane = Plane()
|
||||
for obj in objs:
|
||||
plane.add(obj.bbox, obj)
|
||||
plane.finish()
|
||||
# cluster_textobjs
|
||||
def cluster_textobjs(objs, ratio):
|
||||
plane = Plane(objs)
|
||||
cset = ClusterSet()
|
||||
for obj in objs:
|
||||
(bx0,by0,bx1,by1) = obj.bbox
|
||||
margin = abs(obj.fontsize * ratio)
|
||||
x0 = min(bx0,bx1)
|
||||
y0 = min(by0,by1)
|
||||
x1 = max(bx0,bx1)
|
||||
y1 = max(by0,by1)
|
||||
found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
|
||||
if len(found) == 1:
|
||||
cset.add(found.pop())
|
||||
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
|
||||
cset.add(neighbors)
|
||||
clusters = cset.finish()
|
||||
vertical = ((sum( len(textbox) for textbox in clusters )/2) <
|
||||
sum( len(textbox) for textbox in clusters if textbox.vertical ))
|
||||
if vertical:
|
||||
lines = reorder_hv(clusters, -1)
|
||||
else:
|
||||
cset.merge(found)
|
||||
clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
|
||||
lines = reorder_vh(clusters, +1)
|
||||
r = []
|
||||
for objs in clusters:
|
||||
objs = sorted(objs, key=lambda obj: idx[obj])
|
||||
h = v = 0
|
||||
(bx0,by0,bx1,by1) = objs[0].bbox
|
||||
(lx0,ly0,_,_) = objs[0].bbox
|
||||
for obj in objs[1:]:
|
||||
(x0,y0,x1,y1) = obj.bbox
|
||||
if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
|
||||
v += 1
|
||||
else:
|
||||
h += 1
|
||||
(lx0,ly0) = (x0,y0)
|
||||
bx0 = min(bx0, x0)
|
||||
bx1 = max(bx1, x1)
|
||||
by0 = min(by0, y0)
|
||||
by1 = max(by1, y1)
|
||||
r.append(((bx0,by0,bx1,by1), h < v, objs))
|
||||
for line in lines:
|
||||
r.extend(line)
|
||||
return r
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
import sys
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
|
||||
from pdfdevice import PDFDevice, PageItem, Page, FigureItem, TextItem, PDFPageAggregator
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from cmap import CMapDB
|
||||
|
||||
|
@ -19,7 +19,7 @@ def get_textobjs(item, r=None):
|
|||
if r == None: r = []
|
||||
if isinstance(item, TextItem):
|
||||
r.append(item)
|
||||
elif isinstance(item, PageItem):
|
||||
elif isinstance(item, Page):
|
||||
for child in item.objs:
|
||||
get_textobjs(child, r)
|
||||
return r
|
||||
|
@ -49,8 +49,8 @@ class SGMLConverter(PDFConverter):
|
|||
f(child)
|
||||
self.outfp.write('</figure>\n')
|
||||
elif isinstance(item, TextItem):
|
||||
self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
|
||||
(enc(item.font.fontname, self.codec), item.direction, bbox, item.fontsize))
|
||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||
(enc(item.font.fontname, self.codec), item.vertical, bbox, item.fontsize))
|
||||
self.outfp.write(enc(item.text, self.codec))
|
||||
self.outfp.write('</text>\n')
|
||||
bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox
|
||||
|
@ -79,42 +79,45 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_pageobjs
|
||||
from cluster import cluster_textobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
(x0,y0,x1,y1) = page.bbox
|
||||
self.yoffset += y1
|
||||
self.yoffset += page.y1
|
||||
if self.pagenum:
|
||||
self.outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' %
|
||||
((self.yoffset-y1)*self.scale, page.id, page.id))
|
||||
((self.yoffset-page.y1)*self.scale, page.id, page.id))
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
(page.x0*self.scale, (self.yoffset-page.y1)*self.scale,
|
||||
page.width*self.scale, page.height*self.scale))
|
||||
def draw(item):
|
||||
if isinstance(item, FigureItem):
|
||||
for child in item.objs:
|
||||
draw(child)
|
||||
elif isinstance(item, TextItem):
|
||||
if item.direction == 2:
|
||||
if item.vertical:
|
||||
wmode = 'tb-rl'
|
||||
else:
|
||||
wmode = 'lr-tb'
|
||||
(x0,y0,x1,y1) = item.bbox
|
||||
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||
(wmode, x0*self.scale, (self.yoffset-y1)*self.scale, item.fontsize*self.scale))
|
||||
self.outfp.write('<span style="position:absolute; writing-mode:%s;'
|
||||
' left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||
(wmode, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
||||
item.fontsize*self.scale))
|
||||
self.outfp.write(enc(item.text, self.codec))
|
||||
self.outfp.write('</span>\n')
|
||||
if self.show_text_border:
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
(item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
||||
item.width*self.scale, self.height*self.scale))
|
||||
for child in page.objs:
|
||||
draw(child)
|
||||
if self.cluster_margin:
|
||||
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
|
||||
for ((x0,y0,x1,y1),_,objs) in clusters:
|
||||
clusters = cluster_textobjs(get_textobjs(page), self.cluster_margin)
|
||||
for textbox in clusters:
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
(textbox.x0*self.scale, (self.yoffset-textbox.y1)*self.scale,
|
||||
textbox.width*self.scale, textbox.height*self.scale))
|
||||
self.yoffset += self.pagepad
|
||||
return
|
||||
|
||||
|
@ -135,30 +138,25 @@ class TextConverter(PDFConverter):
|
|||
if cluster_margin == None:
|
||||
cluster_margin = 0.5
|
||||
self.cluster_margin = cluster_margin
|
||||
self.word_margin = 0.2
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_pageobjs
|
||||
from cluster import cluster_textobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.pagenum:
|
||||
self.outfp.write('Page %d\n' % page.id)
|
||||
if self.cluster_margin:
|
||||
textobjs = get_textobjs(page)
|
||||
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
|
||||
for (_,vertical,objs) in clusters:
|
||||
for (i,item) in enumerate(objs):
|
||||
(x0,y0,x1,y1) = item.bbox
|
||||
if (i and
|
||||
((not vertical and (y1 < ly0 or ly1 < y0)) or
|
||||
(vertical and (x1 < lx0 or lx1 < x0)))):
|
||||
clusters = cluster_textobjs(textobjs, self.cluster_margin)
|
||||
for textbox in clusters:
|
||||
for line in textbox.lines(self.word_margin):
|
||||
self.outfp.write(line.encode(self.codec, 'replace')+'\n')
|
||||
self.outfp.write('\n')
|
||||
(lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
|
||||
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n\n')
|
||||
else:
|
||||
for item in page.objs:
|
||||
if isinstance(item, TextItem):
|
||||
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
||||
for obj in page.objs:
|
||||
if isinstance(obj, TextItem):
|
||||
self.outfp.write(obj.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
self.outfp.write('\f')
|
||||
return
|
||||
|
|
|
@ -52,74 +52,119 @@ class PDFDevice(object):
|
|||
return
|
||||
|
||||
|
||||
## PageItem
|
||||
## Page
|
||||
##
|
||||
class PageItem(object):
|
||||
|
||||
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
|
||||
self.id = id
|
||||
self.bbox = (x0, y0, x1, y1)
|
||||
self.rotate = rotate
|
||||
self.objs = []
|
||||
def __init__(self, (x0,y0,x1,y1)):
|
||||
#assert x0 <= x1 and y0 <= y1
|
||||
self.x0 = x0
|
||||
self.y0 = y0
|
||||
self.x1 = x1
|
||||
self.y1 = y1
|
||||
self.width = x1-x0
|
||||
self.height = y1-y0
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<page id=%r bbox=%r rotate=%r>' % (self.id, self.bbox, self.rotate))
|
||||
return ('<pageitem bbox=%s>' % (self.bbox()))
|
||||
|
||||
def bbox(self):
|
||||
return rect2str((self.x0, self.y0, self.x1, self.y1))
|
||||
|
||||
def hoverlap(self, obj):
|
||||
assert isinstance(obj, PageItem)
|
||||
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
||||
return 0
|
||||
else:
|
||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||
|
||||
def voverlap(self, obj):
|
||||
assert isinstance(obj, PageItem)
|
||||
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
||||
return 0
|
||||
else:
|
||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||
|
||||
|
||||
class PageContainer(PageItem):
|
||||
|
||||
def __init__(self, bbox):
|
||||
PageItem.__init__(self, bbox)
|
||||
self.objs = []
|
||||
return
|
||||
|
||||
def add(self, obj):
|
||||
self.objs.append(obj)
|
||||
return
|
||||
|
||||
class Page(PageContainer):
|
||||
|
||||
def __init__(self, id, bbox, rotate=0):
|
||||
PageContainer.__init__(self, bbox)
|
||||
self.id = id
|
||||
self.rotate = rotate
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.bbox(), self.rotate))
|
||||
|
||||
|
||||
## FigureItem
|
||||
##
|
||||
class FigureItem(PageItem):
|
||||
class FigureItem(PageContainer):
|
||||
|
||||
def __init__(self, id, bbox):
|
||||
PageContainer.__init__(self, bbox)
|
||||
self.id = id
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<figure id=%r bbox=%r>' % (self.id, self.bbox))
|
||||
return ('<figure id=%r bbox=%s>' % (self.id, self.bbox()))
|
||||
|
||||
|
||||
## TextItem
|
||||
##
|
||||
class TextItem(object):
|
||||
class TextItem(PageItem):
|
||||
|
||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||
assert chars
|
||||
self.matrix = matrix
|
||||
self.font = font
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
self.direction = 0
|
||||
self.text = ''
|
||||
adv = 0
|
||||
for (char,cid) in chars:
|
||||
self.text += char
|
||||
adv += font.char_width(cid)
|
||||
self.vertical = self.font.is_vertical()
|
||||
self.text = ''.join( char for (char,_) in chars )
|
||||
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
||||
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
|
||||
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||
if not self.font.is_vertical():
|
||||
if not self.vertical:
|
||||
# horizontal text
|
||||
self.direction = 1
|
||||
self.vertical = False
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (adv,size))
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||
ty += descent
|
||||
self.adv = (dx, 0)
|
||||
self.bbox = (tx, ty, tx+dx, ty+dy)
|
||||
bbox = (tx, ty, tx+dx, ty+dy)
|
||||
else:
|
||||
# vertical text
|
||||
self.direction = 2
|
||||
(_,cid) = chars[0]
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-font.char_disp(cid))*fontsize*.001))
|
||||
(dx,dy) = apply_matrix_norm(self.matrix, (size,adv))
|
||||
tx -= dx/2
|
||||
ty += disp
|
||||
self.adv = (0, dy)
|
||||
self.bbox = (tx, ty+dy, tx+dx, ty)
|
||||
bbox = (tx, ty+dy, tx+dx, ty)
|
||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||
PageItem.__init__(self, bbox)
|
||||
return
|
||||
|
||||
def __len__(self):
|
||||
return len(self.text)
|
||||
|
||||
def __repr__(self):
|
||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s text=%r adv=%s>' %
|
||||
(matrix2str(self.matrix), self.font, self.fontsize,
|
||||
rect2str(self.bbox), self.text, point2str(self.adv)))
|
||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||
(matrix2str(self.matrix), self.font, self.fontsize, self.bbox(),
|
||||
point2str(self.adv), self.text))
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
|
@ -133,7 +178,7 @@ class PDFPageAggregator(PDFDevice):
|
|||
return
|
||||
|
||||
def begin_page(self, page):
|
||||
self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)
|
||||
self.cur_item = Page(self.pageno, page.mediabox, page.rotate)
|
||||
return
|
||||
|
||||
def end_page(self, _):
|
||||
|
@ -177,7 +222,8 @@ class PDFPageAggregator(PDFDevice):
|
|||
|
||||
def render_chars(self, textmatrix, textstate, chars):
|
||||
if not chars: return (0, 0)
|
||||
item = TextItem(textmatrix, textstate.font, textstate.fontsize, textstate.charspace, textstate.scaling, chars)
|
||||
item = TextItem(textmatrix, textstate.font, textstate.fontsize,
|
||||
textstate.charspace, textstate.scaling, chars)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
||||
|
@ -199,7 +245,7 @@ class PDFPageAggregator(PDFDevice):
|
|||
(cidcoding, cid) = e.args
|
||||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
chars.append((char, cid))
|
||||
if cid == 32 and not font.is_multibyte():
|
||||
if textstate.wordspace and not font.is_multibyte() and cid == 32:
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
|
|
|
@ -359,9 +359,6 @@ class PDFFont(object):
|
|||
def string_width(self, s):
|
||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||
|
||||
def space_width(self):
|
||||
return max(self.char_width(32), self.char_width(44), self.char_width(46)) * 0.5
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
|
||||
|
@ -572,9 +569,6 @@ class PDFCIDFont(PDFFont):
|
|||
chars = unpack('>%dH' % (len(code)/2), code)
|
||||
return ''.join( unichr(c) for c in chars )
|
||||
|
||||
def space_width(self):
|
||||
return 0
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
|
|
|
@ -98,3 +98,12 @@ def decode_text(s):
|
|||
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||
else:
|
||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||
|
||||
##
|
||||
def pick(seq, func, maxobj=None):
    """Returns the object in seq for which func(obj) is the largest.

    seq:    iterable of candidate objects.
    func:   scoring function called once per object.
    maxobj: value returned when seq is empty (default None).

    When several objects share the largest score, the first one
    encountered is returned.
    """
    maxscore = None
    for obj in seq:
        score = func(obj)
        # Identity test: "no score yet" must not depend on how the
        # score type implements ==.
        if maxscore is None or maxscore < score:
            (maxscore,maxobj) = (score,obj)
    return maxobj
|
||||
|
|
Loading…
Reference in New Issue