layout analysis improved.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@120 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-21 07:55:19 +00:00
parent 0113486b76
commit 8a5bec5065
14 changed files with 263 additions and 525 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Jul 12 00:36:44 JST 2009 Last Modified: Tue Jul 21 16:24:26 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -191,6 +191,7 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>"). Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
</ul> </ul>
<p> <p>
<dt> <code>-D <em>direction</em></code>
<dt> <code>-M <em>char_margin</em></code> <dt> <code>-M <em>char_margin</em></code>
<dt> <code>-L <em>line_margin</em></code> <dt> <code>-L <em>line_margin</em></code>
<dt> <code>-W <em>word_margin</em></code> <dt> <code>-W <em>word_margin</em></code>
@ -318,6 +319,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/07/21: Improvement in layout analysis.
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes. <li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported. <li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
<li> 2009/03/30: Text output mode added. <li> 2009/03/30: Text output mode added.

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python
__version__ = '20090711' __version__ = '20090721'
if __name__ == '__main__': print __version__ if __name__ == '__main__': print __version__

View File

@ -199,7 +199,9 @@ class CMapDB(object):
cmapdb = {} cmapdb = {}
@classmethod @classmethod
def initialize(klass, dirname, cdbdirname=None): def initialize(klass, dirname=None, cdbdirname=None):
if not dirname:
dirname = find_cmap_path()
klass.dirname = dirname klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname klass.cdbdirname = cdbdirname or dirname
return return

View File

@ -2,7 +2,7 @@
import sys import sys
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdffont import PDFUnicodeNotDefined from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
@ -10,10 +10,9 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
## ##
class PDFPageAggregator(PDFDevice): class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None): def __init__(self, rsrc, pageno=1, laparams=None):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrc)
self.char_margin = char_margin self.laparams = laparams
self.line_margin = line_margin
self.undefined_char = '?' self.undefined_char = '?'
self.pageno = pageno self.pageno = pageno
self.stack = [] self.stack = []
@ -27,9 +26,9 @@ class PDFPageAggregator(PDFDevice):
assert not self.stack assert not self.stack
assert isinstance(self.cur_item, LTPage) assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate() self.cur_item.fixate()
if self.laparams:
self.cur_item.analyze_layout(self.laparams)
self.pageno += 1 self.pageno += 1
if self.char_margin != None and self.line_margin != None:
self.cur_item.group_text(self.char_margin, self.line_margin)
return self.cur_item return self.cur_item
def begin_figure(self, name, bbox, matrix): def begin_figure(self, name, bbox, matrix):
@ -79,8 +78,8 @@ class PDFPageAggregator(PDFDevice):
def render_chars(self, textmatrix, textstate, chars): def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0) if not chars: return (0, 0)
item = LTText(textmatrix, textstate.font, textstate.fontsize, item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
textstate.charspace, textstate.scaling, chars) textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item) self.cur_item.add(item)
return item.adv return item.adv
@ -116,13 +115,10 @@ class PDFPageAggregator(PDFDevice):
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
char_margin=None, line_margin=None, word_margin=None): PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
char_margin=char_margin, line_margin=line_margin)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.word_margin = word_margin
return return
def write(self, text): def write(self, text):
@ -202,17 +198,6 @@ class SGMLConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTText):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTAnon):
if item.text == ' ':
self.outfp.write('<space>\n')
elif item.text == '\n':
self.outfp.write('<newline>\n')
elif isinstance(item, LTLine): elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox())) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
@ -222,11 +207,26 @@ class SGMLConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
self.outfp.write('</figure>\n') self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
for child in item:
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item.get_lines(self.word_margin): for child in item:
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n', item.text)
else:
assert 0, item
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)
@ -237,11 +237,9 @@ class SGMLConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
char_margin=None, line_margin=None, word_margin=None,
scale=1, showpageno=True, pagepad=50): scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
@ -268,7 +266,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTText): elif isinstance(item, LTTextItem):
if item.vertical: if item.vertical:
wmode = 'tb-rl' wmode = 'tb-rl'
else: else:
@ -281,13 +279,14 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon):
pass
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
for child in item:
render(child)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item.get_lines(self.word_margin): for child in item:
render(child) render(child)
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
@ -307,11 +306,9 @@ class HTMLConverter(PDFConverter):
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
char_margin=None, line_margin=None, word_margin=None,
showpageno=False): showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno self.showpageno = showpageno
return return
@ -322,14 +319,12 @@ class TextConverter(PDFConverter):
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTText): if isinstance(item, LTText):
self.write(item.text+'\n') self.write(item.text)
elif isinstance(item, LTTextBox):
for obj in item.get_lines(self.word_margin):
self.write(obj.text)
self.write('\n')
elif isinstance(item, LayoutContainer): elif isinstance(item, LayoutContainer):
for child in item: for child in item:
render(child) render(child)
if isinstance(item, LTTextBox):
self.write('\n')
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.showpageno: if self.showpageno:
self.write('Page %d\n' % page.id) self.write('Page %d\n' % page.id)

View File

@ -4,50 +4,24 @@ from pdfminer.utils import apply_matrix_norm, bsearch
INF = sys.maxint INF = sys.maxint
## reorder_hv, reorder_vh ## LAParams
## chop_hv, chop_vh
## ##
## Reorders objects according to its writing direction. class LAParams(object):
##
def reorder_vh(objs, hdir):
if 0 < hdir:
hkey = (lambda obj: obj.x0)
vkey = (lambda obj: -obj.y1)
else:
hkey = (lambda obj: -obj.x1)
vkey = (lambda obj: -obj.y1)
r = []
line = []
for obj in sorted(objs, key=vkey):
if line:
v = line[-1].voverlap(obj) * 2
if v < obj.height or v < line[-1].height:
line.sort(key=hkey)
r.append(line)
line = []
line.append(obj)
line.sort(key=hkey)
r.append(line)
return r
def reorder_hv(objs, hdir): def __init__(self,
if 0 < hdir: direction=None,
hkey = (lambda obj: obj.x0) char_margin=1.0,
vkey = (lambda obj: -obj.y1) line_margin=0.5,
else: word_margin=0.1):
hkey = (lambda obj: -obj.x1) self.direction = direction
vkey = (lambda obj: -obj.y1) self.char_margin = char_margin
r = [] self.line_margin = line_margin
line = [] self.word_margin = word_margin
for obj in sorted(objs, key=hkey): return
if line and not line[-1].hoverlap(obj):
line.sort(key=vkey) def __repr__(self):
r.append(line) return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
line = [] (self.direction, self.char_margin, self.line_margin, self.word_margin))
line.append(obj)
line.sort(key=vkey)
r.append(line)
return r
## Plane ## Plane
@ -91,12 +65,6 @@ class Plane(object):
## ClusterSet ## ClusterSet
## ##
## Maintains a set of LTTextBox objects.
## It incrementally constructs LTTextBox objects
## and group them when necessary. It gives
## a sequence of LTTextBox objects that represent
## the text stream of that page.
##
class ClusterSet(object): class ClusterSet(object):
def __init__(self, klass): def __init__(self, klass):
@ -123,16 +91,18 @@ class ClusterSet(object):
group.fixate() group.fixate()
return list(r) return list(r)
def group_objs(objs, hratio, vratio, klass): @classmethod
plane = Plane(objs) def build(klass, objs, hratio, vratio, objtype):
cset = ClusterSet(klass) plane = Plane(objs)
for obj in objs: cset = ClusterSet(objtype)
margin = obj.get_margin() for obj in objs:
hmargin = hratio * margin margin = obj.get_margin()
vmargin = vratio * margin hmargin = hratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin)) vmargin = vratio * margin
cset.add(neighbors) neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
return cset.finish() assert obj in neighbors, obj
cset.add(neighbors)
return cset.finish()
## LayoutItem ## LayoutItem
@ -140,11 +110,12 @@ def group_objs(objs, hratio, vratio, klass):
class LayoutItem(object): class LayoutItem(object):
def __init__(self, bbox): def __init__(self, bbox):
#assert x0 <= x1 and y0 <= y1
self.set_bbox(bbox) self.set_bbox(bbox)
return return
def set_bbox(self, (x0,y0,x1,y1)): def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0)
if y1 < y0: (y0,y1) = (y1,y0)
self.x0 = x0 self.x0 = x0
self.y0 = y0 self.y0 = y0
self.x1 = x1 self.x1 = x1
@ -203,6 +174,9 @@ class LayoutContainer(LayoutItem):
def __iter__(self): def __iter__(self):
return iter(self.objs) return iter(self.objs)
def __len__(self):
return len(self.objs)
def add(self, obj): def add(self, obj):
self.objs.add(obj) self.objs.add(obj)
return return
@ -212,7 +186,7 @@ class LayoutContainer(LayoutItem):
return return
# fixate(): determines its boundery and writing direction. # fixate(): determines its boundery and writing direction.
def fixate(self, direction=None): def fixate(self):
if not self.width and self.objs: if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs: for obj in self.objs:
@ -228,14 +202,7 @@ class LayoutContainer(LayoutItem):
return self.weight return self.weight
def get_direction(self): def get_direction(self):
if not self.objs: return None return None
d = {}
for obj in self.objs:
k = obj.get_direction()
if k not in d: d[k] = 0
d[k] += 1
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
return direction
## LTLine ## LTLine
@ -259,21 +226,37 @@ class LTRect(LayoutItem):
return return
## LTAnon ## LTText
## ##
class LTAnon(object): class LTText(object):
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
return return
def __repr__(self):
return '<text %r>' % self.text
def get_weight(self):
return len(self.text)
def is_upright(self):
return True
## LTAnon
##
class LTAnon(LTText):
def get_weight(self): def get_weight(self):
return 0 return 0
## LTText ## LTTextItem
## ##
class LTText(LayoutItem): class LTTextItem(LayoutItem, LTText):
debug = 1
def __init__(self, matrix, font, fontsize, charspace, scaling, chars): def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars assert chars
@ -307,21 +290,25 @@ class LTText(LayoutItem):
return return
def __repr__(self): def __repr__(self):
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' % if self.debug:
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix, return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
self.font, self.fontsize, self.get_bbox(), ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
'(%.1f, %.1f)' % self.adv, self.font, self.fontsize, self.get_bbox(),
self.text)) '(%.1f, %.1f)' % self.adv,
self.text))
else:
return '<text %r>' % self.text
def get_margin(self): def get_margin(self):
return abs(self.fontsize) return abs(self.fontsize)
def get_weight(self):
return len(self.text)
def is_vertical(self): def is_vertical(self):
return self.vertical return self.vertical
def is_upright(self):
(a,b,c,d,e,f) = self.matrix
return 0 < a*d and b*c <= 0
## LTFigure ## LTFigure
## ##
@ -336,6 +323,54 @@ class LTFigure(LayoutContainer):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix)) return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
## LTTextLine
##
class LTTextLine(LayoutContainer):
def __init__(self, id, objs, direction, word_margin):
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = direction
self.word_margin = word_margin
return
def __repr__(self):
return ('<line %s(%s)>' % (self.get_bbox(), self.direction))
def get_margin(self):
return min(self.width, self.height)
def get_direction(self):
return self.direction
def get_text(self):
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
def fixate(self):
LayoutContainer.fixate(self)
objs = []
if self.direction == 'V':
y0 = -INF
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTTextItem) and self.word_margin:
margin = self.word_margin * obj.get_margin()
if obj.y1+margin < y0:
objs.append(LTAnon(' '))
objs.append(obj)
y0 = obj.y0
else:
x1 = INF
for obj in sorted(self.objs, key=lambda obj: obj.x0):
if isinstance(obj, LTTextItem) and self.word_margin:
margin = self.word_margin * obj.get_margin()
if x1 < obj.x0-margin:
objs.append(LTAnon(' '))
objs.append(obj)
x1 = obj.x1
objs.append(LTAnon('\n'))
self.objs = objs
return
## LTTextBox ## LTTextBox
## ##
## A set of text objects that are grouped within ## A set of text objects that are grouped within
@ -343,65 +378,55 @@ class LTFigure(LayoutContainer):
## ##
class LTTextBox(LayoutContainer): class LTTextBox(LayoutContainer):
def __init__(self, id, objs): def __init__(self, id, objs, direction):
LayoutContainer.__init__(self, id, (0,0,0,0), objs) LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = None self.direction = direction
return return
def __repr__(self): def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction)) return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
def fixate(self, direction='H'): def get_text(self):
LayoutContainer.fixate(self, direction=direction) return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
if not direction:
if any( obj.is_vertical() for obj in self.objs ): def fixate(self):
direction = 'V' LayoutContainer.fixate(self)
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1])
if h < v:
direction = 'V'
self.direction = direction
if self.direction == 'V': if self.direction == 'V':
self.lines = reorder_hv(self.objs, -1) self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else: else:
self.lines = reorder_vh(self.objs, +1) self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
self.objs = []
for line in self.lines:
self.objs.extend(line)
return return
def get_direction(self): def get_direction(self):
return self.direction return self.direction
def get_lines(self, word_margin):
if self.get_direction() == 'V': def tsort(objs, f):
for line in self.lines: gi = dict( (obj,[]) for obj in objs )
y0 = -INF go = dict( (obj,[]) for obj in objs )
for obj in line: for obj1 in objs:
if not isinstance(obj, LTText): continue for obj2 in objs:
if word_margin: if f(obj1, obj2): # obj1 -> obj2
margin = word_margin * obj.get_margin() go[obj1].append(obj2)
if obj.y1+margin < y0: gi[obj2].append(obj1)
yield LTAnon(' ') r = objs[:]
yield obj s = []
y0 = obj.y0 while r:
yield LTAnon('\n') for obj in r:
if not go[obj] or gi[obj]: continue
for c in go[obj]:
gi[c].remove(obj)
del gi[obj]
del go[obj]
r.remove(obj)
s.append(obj)
break
else: else:
for line in self.lines: obj = r.pop()
x1 = INF del gi[obj]
for obj in line: del go[obj]
if not isinstance(obj, LTText): continue s.append(obj)
if word_margin: return s
margin = word_margin * obj.get_margin()
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield LTAnon('\n')
return
## LTPage ## LTPage
@ -416,19 +441,39 @@ class LTPage(LayoutContainer):
def __repr__(self): def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate)) return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
def fixate(self, dirtection='H'): def analyze_layout(self, laparams):
return textobjs = []
otherobjs = []
def group_text(self, char_margin, line_margin): for obj in self.objs:
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ] if isinstance(obj, LTText) and obj.is_upright():
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ] textobjs.append(obj)
if self.get_direction() == 'V': else:
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox) otherobjs.append(obj)
lines = reorder_hv(objs, -1) if laparams.direction == 'V':
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
def vorder(obj1, obj2):
if obj1.voverlap(obj2):
return obj2.x1 < obj1.x1
elif obj1.hoverlap(obj2):
return obj2.y1 < obj1.y1
else:
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
boxes = tsort(boxes, vorder)
else: else:
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox) lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
lines = reorder_vh(objs, +1) (lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
self.objs = [] boxes = ClusterSet.build(lines, 0, laparams.line_margin,
for line in lines: (lambda id,objs: LTTextBox(id, objs, 'H')))
self.objs.extend(line) def horder(obj1, obj2):
if obj1.hoverlap(obj2):
return obj2.y1 < obj1.y1
elif obj1.voverlap(obj2):
return obj1.x1 < obj2.x0
else:
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes
return return

View File

@ -756,9 +756,8 @@ class PDFPageInterpreter(object):
## ##
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''): def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
doc = PDFDocument() doc = PDFDocument()
fp = file(fname, 'rb')
parser = PDFParser(doc, fp) parser = PDFParser(doc, fp)
doc.initialize(password) doc.initialize(password)
if not doc.is_extractable: if not doc.is_extractable:
@ -768,5 +767,4 @@ def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
if pagenos and (pageno not in pagenos): continue if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page) interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break if maxpages and maxpages <= pageno+1: break
fp.close()
return return

View File

@ -47,6 +47,10 @@ def bsearch(objs, v0):
(v, obj) = objs[i] (v, obj) = objs[i]
if v0 == v: if v0 == v:
(i0,i1) = (i,i+1) (i0,i1) = (i,i+1)
while 0 < i0 and objs[i0-1][0] == v0:
i0 -= 1
while i1 < len(objs)-1 and objs[i1][0] == v0:
i1 += 1
break break
elif v0 < v: elif v0 < v:
i1 = i i1 = i

View File

@ -1,7 +1,7 @@
# GNUMakefile for test # GNUMakefile for test
PYTHON=python PYTHON=python
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \

View File

@ -14,7 +14,7 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''', PDF parser that can be used for other purposes instead of text analysis.''',
keywords='pdf parser, pdf converter, text mining', keywords=['pdf parser', 'pdf converter', 'text mining'],
license='MIT/X', license='MIT/X',
author='Yusuke Shinyama', author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu', author_email='yusuke at cs dot nyu dot edu',

View File

View File

@ -19,7 +19,10 @@ import sys
# comment out at runtime. # comment out at runtime.
import cgitb; cgitb.enable() import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback import os, os.path, re, cgi, time, random, codecs, logging, traceback
import pdflib.pdf2txt from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.cmap import CMapDB
# quote HTML metacharacters # quote HTML metacharacters
@ -35,6 +38,7 @@ def url(base, **kw):
r.append('%s=%s' % (k, v)) r.append('%s=%s' % (k, v))
return base+'&'.join(r) return base+'&'.join(r)
## convert ## convert
## ##
class FileSizeExceeded(ValueError): pass class FileSizeExceeded(ValueError): pass
@ -54,13 +58,16 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10,
infp.close() infp.close()
# perform conversion and # perform conversion and
# send the results over the network. # send the results over the network.
pdflib.pdf2txt.CMapDB.initialize('.', './CDBCMap') CMapDB.initialize()
rsrc = pdflib.pdf2txt.PDFResourceManager() rsrc = PDFResourceManager()
laparams = LAParams()
if html: if html:
device = pdflib.pdf2txt.HTMLConverter(rsrc, outfp, codec=codec) device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
else: else:
device = pdflib.pdf2txt.TextConverter(rsrc, outfp, codec=codec) device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
pdflib.pdf2txt.convert(rsrc, device, path, pagenos, maxpages=maxpages) fp = file(path, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
fp.close()
return return

View File

@ -5,17 +5,18 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_p
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB, find_cmap_path from pdfminer.cmap import CMapDB, find_cmap_path
from pdfminer.layout import LAParams
# main # main
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-M char_margin] [-L line_margin] [-W word_margin] ' '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0]) '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -31,12 +32,10 @@ def main(argv):
outfile = None outfile = None
outtype = None outtype = None
codec = 'utf-8' codec = 'utf-8'
char_margin = 1.0
line_margin = 0.3
word_margin = 0.2
pageno = 1 pageno = 1
scale = 1 scale = 1
showpageno = True showpageno = True
laparams = LAParams()
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
@ -47,9 +46,10 @@ def main(argv):
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
elif k == '-M': char_margin = float(v) elif k == '-D': laparams.direction = v
elif k == '-L': line_margin = float(v) elif k == '-M': laparams.char_margin = float(v)
elif k == '-W': word_margin = float(v) elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
# #
CMapDB.debug = debug CMapDB.debug = debug
PDFResourceManager.debug = debug PDFResourceManager.debug = debug
@ -74,20 +74,19 @@ def main(argv):
else: else:
outfp = sys.stdout outfp = sys.stdout
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'sgml': elif outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrc, outfp, codec=codec)
else: else:
return usage() return usage()
for fname in args: for fname in args:
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password) fp = file(fname, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
fp.close()
device.close() device.close()
return return

View File

@ -1,152 +0,0 @@
#!/usr/bin/env python
import sys, sgmllib
__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ]
def fixed(x):
    """Convert a decimal value to an integer number of thousandths.

    x: a number, or a string accepted by float() (e.g. '12.345').
    Returns x multiplied by 1000, rounded to the nearest integer.

    The product is rounded instead of truncated because most three-digit
    decimals have no exact binary representation: float('1.001')*1000 is
    1000.9999999999999, which a bare int() would cut down to 1000.
    """
    return int(round(float(x) * 1000))
def getbbox(s):
    """Parse a bounding box written as four comma-separated numbers.

    s: a string of the form 'a,b,c,d'.
    Returns a 4-tuple with each field converted by fixed().
    Raises ValueError when s does not split into exactly four fields.
    """
    # Unpack first so a wrong field count fails before any conversion.
    (left, bottom, right, top) = s.split(',')
    return tuple(fixed(field) for field in (left, bottom, right, top))
## Document
##
class Document:
    """Ordered collection of the pages read from an SGML dump."""

    def __init__(self):
        # Pages in the order they were added.
        self.pages = []

    def __repr__(self):
        return '<Document: pages=%r>' % (self.pages,)

    def get_pages(self):
        """Return the list of pages added so far."""
        return self.pages

    def add_page(self, page):
        """Append page; it becomes the target of later add_text() calls."""
        self.pages.append(page)

    def add_text(self, text):
        """Hand text to the most recently added page.

        Raises IndexError if no page has been added yet.
        """
        current = self.pages[-1]
        current.add_text(text)
## Page
##
class Page:
    """One page: its id, bounding box, rotation and collected texts."""

    def __init__(self, pageid, bbox, rotate):
        self.pageid, self.bbox, self.rotate = pageid, bbox, rotate
        # Text objects in the order they were added.
        self.texts = []

    def __repr__(self):
        return '<Page(%s): texts=%r>' % (self.pageid, self.texts)

    def get_texts(self):
        """Return the list of texts added so far."""
        return self.texts

    def add_text(self, text):
        """Append text to this page."""
        self.texts.append(text)
## Text
##
class Text:
    """A run of text plus the font, direction, bbox and size it was given."""

    def __init__(self, font, direction, bbox, size):
        self.font, self.direction = font, direction
        self.bbox, self.size = bbox, size
        # Character data accumulated through add_data().
        self.data = ''

    def __repr__(self):
        return '<Text: %r>' % (self.data,)

    def add_data(self, data):
        """Append data to the accumulated text."""
        self.data = self.data + data
## PDFSGMLParser
##
class PDFSGMLParser(sgmllib.SGMLParser):
    """SGML parser that fills a Document from <page> and <text> elements."""

    def __init__(self, doc):
        sgmllib.SGMLParser.__init__(self)
        self.doc = doc
        # Text object of the <text> element currently open, if any.
        self.curtext = None

    def start_document(self, attrs):
        # Nothing to record for the <document> element itself.
        pass

    def end_document(self):
        pass

    def start_page(self, attrs):
        """Create a Page from the id/bbox/rotate attributes and add it."""
        fields = dict(attrs)
        self.doc.add_page(Page(fields['id'],
                               getbbox(fields['bbox']),
                               int(fields['rotate'])))

    def end_page(self):
        pass

    def start_text(self, attrs):
        """Open a new Text built from the element's attributes."""
        fields = dict(attrs)
        # NOTE(review): a missing 'direction' attribute raises KeyError here;
        # confirm that whatever produces the SGML actually emits it.
        self.curtext = Text(fields['font'],
                            fields['direction'],
                            getbbox(fields['bbox']),
                            fixed(fields['fontsize']))

    def end_text(self):
        """Store the open Text in the document and close it."""
        assert self.curtext
        self.doc.add_text(self.curtext)
        self.curtext = None

    def handle_data(self, data):
        # Character data outside a <text> element is dropped.
        if self.curtext:
            self.curtext.add_data(data)

    def feedfile(self, fp, encoding='utf-8'):
        """Decode every line yielded by fp and feed it to the parser.

        Undecodable bytes are dropped ('ignore' error handling).
        """
        for raw in fp:
            self.feed(unicode(raw, encoding, 'ignore'))
# main
def main(argv):
    """Parse each SGML file named on the command line and print its Document.

    Options:
      -d           increase the debug level (may be repeated).
      -c encoding  encoding used to decode the input (default 'utf-8').

    Returns 0 on success, or 100 after printing the usage message when the
    options cannot be parsed.
    """
    import getopt
    def usage():
        print('usage: %s [-d] [-c encoding] [file ...]' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:')
    except getopt.GetoptError:
        return usage()
    # debug has to exist before '-d' can increment it; without this
    # initialisation the first '-d' raised UnboundLocalError.
    debug = 0
    encoding = 'utf-8'
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
    for fname in args:
        doc = Document()
        parser = PDFSGMLParser(doc)
        # feedfile() iterates over the lines of a file object, so the file
        # must be opened here; passing the name itself would make it iterate
        # over the characters of the path.
        fp = open(fname, 'rb')
        try:
            parser.feedfile(fp, encoding)
        finally:
            fp.close()
        parser.close()
        print(doc)
    return 0
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,162 +0,0 @@
#!/usr/bin/env python
import sys
from sgml import PDFSGMLParser, Document
stdout = sys.stdout
stderr = sys.stderr
try:
import pygame
from pygame.locals import *
except ImportError:
print >>stderr, 'you need pygame'
sys.exit(111)
def scale(x):
    """Multiply x by 0.002 and truncate the result to an int."""
    shrunk = x * 0.002
    return int(shrunk)
## FontManager
##
class FontManager:
    """Class-level cache of pygame fonts keyed by (path, size)."""

    # Shared cache: (path, size) -> font created by pygame.font.Font.
    fonts = {}
    # Alternative default: '/Library/Fonts/Vera.ttf'
    default_font = '/usr/share/fonts/truetype/kochi/kochi-gothic.ttf'

    @classmethod
    def get_font(klass, path, size):
        """Return the font for path at size, creating it on first use.

        A false path (None or '') selects default_font; size is converted
        with int() before it is used as part of the cache key.
        """
        key = (path or klass.default_font, int(size))
        cache = klass.fonts
        if key in cache:
            return cache[key]
        (fontpath, fontsize) = key
        created = pygame.font.Font(fontpath, fontsize)
        cache[key] = created
        return created
## PDFViewer
##
class PDFViewer:
    """Interactive pygame viewer that shows one page of a Document at a time.

    Each page is rendered into an off-screen surface (self.buf); the display
    shows the part of that surface starting at the scroll offset self.pos.
    """

    BGCOLOR = (255,255,255)
    FGCOLOR = (0,0,0)
    # Scroll distance, in pixels, applied per loop pass while a key is held.
    STEP = 8

    def __init__(self, display, doc):
        """Remember the display surface and render the first page of doc.

        Raises IndexError if doc has no pages.
        """
        self.display = display
        self.buf = None
        self.pages = doc.get_pages()
        self.render_page(0)

    def render_page(self, pageno):
        """Draw page number pageno into a fresh buffer and show it.

        The third and fourth bbox elements, passed through scale(), are used
        as the buffer width and height.  The scroll offset is reset to (0,0).
        """
        stderr.write('rendering: page=%d...\n' % pageno)
        page = self.pages[pageno]
        (x,y,w,h) = page.bbox
        self.width = scale(w)
        self.height = scale(h)
        self.buf = pygame.Surface((self.width, self.height))
        self.buf.fill(self.BGCOLOR)
        for text in page.get_texts():
            font = FontManager.get_font(None, scale(text.size*0.7))
            (x,y,w,h) = text.bbox
            r = font.render(text.data, 1, self.FGCOLOR)
            # The vertical coordinate is flipped against the buffer height
            # before blitting.
            self.buf.blit(r, (scale(x), self.height-scale(y)))
        self.pageno = pageno
        self.pos = (0,0)
        self.refresh()

    def refresh(self):
        """Copy the visible part of the page buffer to the display."""
        size = self.display.get_size()
        self.display.blit(self.buf, (0,0), (self.pos, size))
        pygame.display.flip()

    def _scroll_limits(self):
        """Return the largest valid (x, y) scroll offset for the current page.

        Both limits are clamped at zero so that a page smaller than the
        display cannot produce a negative offset.
        """
        (w,h) = self.display.get_size()
        return (max(0, self.width - w), max(0, self.height - h))

    def run(self):
        """Run the event loop until ESC, Return or q is pressed.

        Space shows the next page and b the previous one.  While h/Left/KP4,
        l/Right/KP6, k/Up/KP8 or j/Down/KP2 is held the view scrolls by STEP.
        """
        loop = True
        key = None
        while loop:
            for e in pygame.event.get():
                if e.type == VIDEOEXPOSE:
                    self.refresh()
                elif e.type == KEYDOWN:
                    if e.key in (K_ESCAPE, K_RETURN, K_q):
                        loop = False
                        break
                    elif e.key == K_SPACE:
                        if self.pageno < len(self.pages)-1:
                            self.render_page(self.pageno+1)
                    elif e.key == K_b:
                        if 0 < self.pageno:
                            self.render_page(self.pageno-1)
                    else:
                        key = e.key
                elif e.type == KEYUP:
                    key = None
            if key:
                # Limits are recomputed on every step because render_page()
                # changes self.width/self.height when the page changes.
                (xmax,ymax) = self._scroll_limits()
                (x,y) = self.pos
                if key in (K_h, K_LEFT, K_KP4):
                    x = max(0, x-self.STEP)
                elif key in (K_l, K_RIGHT, K_KP6):
                    x = min(xmax, x+self.STEP)
                elif key in (K_k, K_UP, K_KP8):
                    y = max(0, y-self.STEP)
                elif key in (K_j, K_DOWN, K_KP2):
                    y = min(ymax, y+self.STEP)
                self.pos = (x,y)
                self.refresh()
# main
def main(argv):
    """Open a PDF or SGML file and display it in a 640x480 pygame window.

    Options:
      -d           increase the debug level (may be repeated).
      -c encoding  encoding of the SGML data (default 'utf-8').
      -P password  password passed to the PDF converter.

    A file whose name ends in '.pdf' is first converted into a temporary
    file with pdf2txt; any other file is read as SGML directly.

    Returns 100 after printing the usage message when the options cannot be
    parsed or no file is given, otherwise None once the viewer exits.
    """
    import getopt
    def usage():
        # '-P' is accepted by getopt below, so it is listed here as well.
        print('usage: %s [-d] [-c encoding] [-P password] file' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:P:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = 0
    encoding = 'utf-8'
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    password = ''
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
        elif k == '-P': password = v
    #
    fname = args.pop(0)
    is_pdf = fname.endswith('.pdf')
    if is_pdf:
        # convert .pdf to sgml
        import tempfile
        from pdf2txt import CMapDB, PDFResourceManager, pdf2txt
        stderr.write('reading %r...\n' % fname)
        CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
        rsrc = PDFResourceManager(debug=debug)
        fp = tempfile.TemporaryFile()
    else:
        fp = open(fname, 'rb')
    # Close the input even when conversion or parsing raises.
    try:
        if is_pdf:
            pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug)
            fp.seek(0)
        doc = Document()
        parser = PDFSGMLParser(doc)
        parser.feedfile(fp, encoding)
        parser.close()
    finally:
        fp.close()
    #
    pygame.init()
    pygame.display.set_mode((640,480))
    PDFViewer(pygame.display.get_surface(), doc).run()
    return
if __name__ == '__main__': sys.exit(main(sys.argv))