layout analysis improved.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@120 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
0113486b76
commit
8a5bec5065
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun Jul 12 00:36:44 JST 2009
|
||||
Last Modified: Tue Jul 21 16:24:26 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -191,6 +191,7 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
|
|||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||
</ul>
|
||||
<p>
|
||||
<dt> <code>-D <em>direction</em></code>
|
||||
<dt> <code>-M <em>char_margin</em></code>
|
||||
<dt> <code>-L <em>line_margin</em></code>
|
||||
<dt> <code>-W <em>word_margin</em></code>
|
||||
|
@ -318,6 +319,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2009/07/21: Improvement in layout analysis.
|
||||
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
||||
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
||||
<li> 2009/03/30: Text output mode added.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
__version__ = '20090711'
|
||||
__version__ = '20090721'
|
||||
|
||||
if __name__ == '__main__': print __version__
|
||||
|
|
|
@ -199,7 +199,9 @@ class CMapDB(object):
|
|||
cmapdb = {}
|
||||
|
||||
@classmethod
|
||||
def initialize(klass, dirname, cdbdirname=None):
|
||||
def initialize(klass, dirname=None, cdbdirname=None):
|
||||
if not dirname:
|
||||
dirname = find_cmap_path()
|
||||
klass.dirname = dirname
|
||||
klass.cdbdirname = cdbdirname or dirname
|
||||
return
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
import sys
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
|
||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||
|
||||
|
||||
|
@ -10,10 +10,9 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
|||
##
|
||||
class PDFPageAggregator(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
|
||||
def __init__(self, rsrc, pageno=1, laparams=None):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
self.char_margin = char_margin
|
||||
self.line_margin = line_margin
|
||||
self.laparams = laparams
|
||||
self.undefined_char = '?'
|
||||
self.pageno = pageno
|
||||
self.stack = []
|
||||
|
@ -27,9 +26,9 @@ class PDFPageAggregator(PDFDevice):
|
|||
assert not self.stack
|
||||
assert isinstance(self.cur_item, LTPage)
|
||||
self.cur_item.fixate()
|
||||
if self.laparams:
|
||||
self.cur_item.analyze_layout(self.laparams)
|
||||
self.pageno += 1
|
||||
if self.char_margin != None and self.line_margin != None:
|
||||
self.cur_item.group_text(self.char_margin, self.line_margin)
|
||||
return self.cur_item
|
||||
|
||||
def begin_figure(self, name, bbox, matrix):
|
||||
|
@ -79,7 +78,7 @@ class PDFPageAggregator(PDFDevice):
|
|||
|
||||
def render_chars(self, textmatrix, textstate, chars):
|
||||
if not chars: return (0, 0)
|
||||
item = LTText(textmatrix, textstate.font, textstate.fontsize,
|
||||
item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
|
||||
textstate.charspace, textstate.scaling, chars)
|
||||
self.cur_item.add(item)
|
||||
return item.adv
|
||||
|
@ -116,13 +115,10 @@ class PDFPageAggregator(PDFDevice):
|
|||
##
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||
char_margin=None, line_margin=None, word_margin=None):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
|
||||
char_margin=char_margin, line_margin=line_margin)
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
self.word_margin = word_margin
|
||||
return
|
||||
|
||||
def write(self, text):
|
||||
|
@ -202,17 +198,6 @@ class SGMLConverter(PDFConverter):
|
|||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</page>\n')
|
||||
elif isinstance(item, LTText):
|
||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||
(enc(item.font.fontname), item.is_vertical(),
|
||||
item.get_bbox(), item.fontsize))
|
||||
self.write(item.text)
|
||||
self.outfp.write('</text>\n')
|
||||
elif isinstance(item, LTAnon):
|
||||
if item.text == ' ':
|
||||
self.outfp.write('<space>\n')
|
||||
elif item.text == '\n':
|
||||
self.outfp.write('<newline>\n')
|
||||
elif isinstance(item, LTLine):
|
||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||
elif isinstance(item, LTRect):
|
||||
|
@ -222,11 +207,26 @@ class SGMLConverter(PDFConverter):
|
|||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</figure>\n')
|
||||
elif isinstance(item, LTTextLine):
|
||||
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textline>\n')
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||
for child in item.get_lines(self.word_margin):
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textbox>\n')
|
||||
elif isinstance(item, LTTextItem):
|
||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||
(enc(item.font.fontname), item.is_vertical(),
|
||||
item.get_bbox(), item.fontsize))
|
||||
self.write(item.text)
|
||||
self.outfp.write('</text>\n')
|
||||
elif isinstance(item, LTText):
|
||||
self.outfp.write('<text>%s</text>\n', item.text)
|
||||
else:
|
||||
assert 0, item
|
||||
return
|
||||
page = PDFConverter.end_page(self, page)
|
||||
render(page)
|
||||
|
@ -237,11 +237,9 @@ class SGMLConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||
char_margin=None, line_margin=None, word_margin=None,
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, showpageno=True, pagepad=50):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
self.pagepad = pagepad
|
||||
self.scale = scale
|
||||
|
@ -268,7 +266,7 @@ class HTMLConverter(PDFConverter):
|
|||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTText):
|
||||
elif isinstance(item, LTTextItem):
|
||||
if item.vertical:
|
||||
wmode = 'tb-rl'
|
||||
else:
|
||||
|
@ -281,13 +279,14 @@ class HTMLConverter(PDFConverter):
|
|||
self.outfp.write('</span>\n')
|
||||
if self.debug:
|
||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTAnon):
|
||||
pass
|
||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTTextLine):
|
||||
for child in item:
|
||||
render(child)
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
for child in item.get_lines(self.word_margin):
|
||||
for child in item:
|
||||
render(child)
|
||||
return
|
||||
page = PDFConverter.end_page(self, page)
|
||||
|
@ -307,11 +306,9 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||
char_margin=None, line_margin=None, word_margin=None,
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
showpageno=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.showpageno = showpageno
|
||||
return
|
||||
|
||||
|
@ -322,14 +319,12 @@ class TextConverter(PDFConverter):
|
|||
def end_page(self, page):
|
||||
def render(item):
|
||||
if isinstance(item, LTText):
|
||||
self.write(item.text+'\n')
|
||||
elif isinstance(item, LTTextBox):
|
||||
for obj in item.get_lines(self.word_margin):
|
||||
self.write(obj.text)
|
||||
self.write('\n')
|
||||
self.write(item.text)
|
||||
elif isinstance(item, LayoutContainer):
|
||||
for child in item:
|
||||
render(child)
|
||||
if isinstance(item, LTTextBox):
|
||||
self.write('\n')
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.showpageno:
|
||||
self.write('Page %d\n' % page.id)
|
||||
|
|
|
@ -4,50 +4,24 @@ from pdfminer.utils import apply_matrix_norm, bsearch
|
|||
INF = sys.maxint
|
||||
|
||||
|
||||
## reorder_hv, reorder_vh
|
||||
## chop_hv, chop_vh
|
||||
## LAParams
|
||||
##
|
||||
## Reorders objects according to its writing direction.
|
||||
##
|
||||
def reorder_vh(objs, hdir):
|
||||
if 0 < hdir:
|
||||
hkey = (lambda obj: obj.x0)
|
||||
vkey = (lambda obj: -obj.y1)
|
||||
else:
|
||||
hkey = (lambda obj: -obj.x1)
|
||||
vkey = (lambda obj: -obj.y1)
|
||||
r = []
|
||||
line = []
|
||||
for obj in sorted(objs, key=vkey):
|
||||
if line:
|
||||
v = line[-1].voverlap(obj) * 2
|
||||
if v < obj.height or v < line[-1].height:
|
||||
line.sort(key=hkey)
|
||||
r.append(line)
|
||||
line = []
|
||||
line.append(obj)
|
||||
line.sort(key=hkey)
|
||||
r.append(line)
|
||||
return r
|
||||
class LAParams(object):
|
||||
|
||||
def reorder_hv(objs, hdir):
|
||||
if 0 < hdir:
|
||||
hkey = (lambda obj: obj.x0)
|
||||
vkey = (lambda obj: -obj.y1)
|
||||
else:
|
||||
hkey = (lambda obj: -obj.x1)
|
||||
vkey = (lambda obj: -obj.y1)
|
||||
r = []
|
||||
line = []
|
||||
for obj in sorted(objs, key=hkey):
|
||||
if line and not line[-1].hoverlap(obj):
|
||||
line.sort(key=vkey)
|
||||
r.append(line)
|
||||
line = []
|
||||
line.append(obj)
|
||||
line.sort(key=vkey)
|
||||
r.append(line)
|
||||
return r
|
||||
def __init__(self,
|
||||
direction=None,
|
||||
char_margin=1.0,
|
||||
line_margin=0.5,
|
||||
word_margin=0.1):
|
||||
self.direction = direction
|
||||
self.char_margin = char_margin
|
||||
self.line_margin = line_margin
|
||||
self.word_margin = word_margin
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
|
||||
(self.direction, self.char_margin, self.line_margin, self.word_margin))
|
||||
|
||||
|
||||
## Plane
|
||||
|
@ -91,12 +65,6 @@ class Plane(object):
|
|||
|
||||
## ClusterSet
|
||||
##
|
||||
## Maintains a set of LTTextBox objects.
|
||||
## It incrementally constructs LTTextBox objects
|
||||
## and group them when necessary. It gives
|
||||
## a sequence of LTTextBox objects that represent
|
||||
## the text stream of that page.
|
||||
##
|
||||
class ClusterSet(object):
|
||||
|
||||
def __init__(self, klass):
|
||||
|
@ -123,14 +91,16 @@ class ClusterSet(object):
|
|||
group.fixate()
|
||||
return list(r)
|
||||
|
||||
def group_objs(objs, hratio, vratio, klass):
|
||||
@classmethod
|
||||
def build(klass, objs, hratio, vratio, objtype):
|
||||
plane = Plane(objs)
|
||||
cset = ClusterSet(klass)
|
||||
cset = ClusterSet(objtype)
|
||||
for obj in objs:
|
||||
margin = obj.get_margin()
|
||||
hmargin = hratio * margin
|
||||
vmargin = vratio * margin
|
||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||
assert obj in neighbors, obj
|
||||
cset.add(neighbors)
|
||||
return cset.finish()
|
||||
|
||||
|
@ -140,11 +110,12 @@ def group_objs(objs, hratio, vratio, klass):
|
|||
class LayoutItem(object):
|
||||
|
||||
def __init__(self, bbox):
|
||||
#assert x0 <= x1 and y0 <= y1
|
||||
self.set_bbox(bbox)
|
||||
return
|
||||
|
||||
def set_bbox(self, (x0,y0,x1,y1)):
|
||||
if x1 < x0: (x0,x1) = (x1,x0)
|
||||
if y1 < y0: (y0,y1) = (y1,y0)
|
||||
self.x0 = x0
|
||||
self.y0 = y0
|
||||
self.x1 = x1
|
||||
|
@ -203,6 +174,9 @@ class LayoutContainer(LayoutItem):
|
|||
def __iter__(self):
|
||||
return iter(self.objs)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.objs)
|
||||
|
||||
def add(self, obj):
|
||||
self.objs.add(obj)
|
||||
return
|
||||
|
@ -212,7 +186,7 @@ class LayoutContainer(LayoutItem):
|
|||
return
|
||||
|
||||
# fixate(): determines its boundery and writing direction.
|
||||
def fixate(self, direction=None):
|
||||
def fixate(self):
|
||||
if not self.width and self.objs:
|
||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||
for obj in self.objs:
|
||||
|
@ -228,14 +202,7 @@ class LayoutContainer(LayoutItem):
|
|||
return self.weight
|
||||
|
||||
def get_direction(self):
|
||||
if not self.objs: return None
|
||||
d = {}
|
||||
for obj in self.objs:
|
||||
k = obj.get_direction()
|
||||
if k not in d: d[k] = 0
|
||||
d[k] += 1
|
||||
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
|
||||
return direction
|
||||
return None
|
||||
|
||||
|
||||
## LTLine
|
||||
|
@ -259,21 +226,37 @@ class LTRect(LayoutItem):
|
|||
return
|
||||
|
||||
|
||||
## LTAnon
|
||||
## LTText
|
||||
##
|
||||
class LTAnon(object):
|
||||
class LTText(object):
|
||||
|
||||
def __init__(self, text):
|
||||
self.text = text
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<text %r>' % self.text
|
||||
|
||||
def get_weight(self):
|
||||
return len(self.text)
|
||||
|
||||
def is_upright(self):
|
||||
return True
|
||||
|
||||
|
||||
## LTAnon
|
||||
##
|
||||
class LTAnon(LTText):
|
||||
|
||||
def get_weight(self):
|
||||
return 0
|
||||
|
||||
|
||||
## LTText
|
||||
## LTTextItem
|
||||
##
|
||||
class LTText(LayoutItem):
|
||||
class LTTextItem(LayoutItem, LTText):
|
||||
|
||||
debug = 1
|
||||
|
||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||
assert chars
|
||||
|
@ -307,21 +290,25 @@ class LTText(LayoutItem):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
if self.debug:
|
||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
|
||||
self.font, self.fontsize, self.get_bbox(),
|
||||
'(%.1f, %.1f)' % self.adv,
|
||||
self.text))
|
||||
else:
|
||||
return '<text %r>' % self.text
|
||||
|
||||
def get_margin(self):
|
||||
return abs(self.fontsize)
|
||||
|
||||
def get_weight(self):
|
||||
return len(self.text)
|
||||
|
||||
def is_vertical(self):
|
||||
return self.vertical
|
||||
|
||||
def is_upright(self):
|
||||
(a,b,c,d,e,f) = self.matrix
|
||||
return 0 < a*d and b*c <= 0
|
||||
|
||||
|
||||
## LTFigure
|
||||
##
|
||||
|
@ -336,6 +323,54 @@ class LTFigure(LayoutContainer):
|
|||
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
|
||||
|
||||
|
||||
## LTTextLine
|
||||
##
|
||||
class LTTextLine(LayoutContainer):
|
||||
|
||||
def __init__(self, id, objs, direction, word_margin):
|
||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||
self.direction = direction
|
||||
self.word_margin = word_margin
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<line %s(%s)>' % (self.get_bbox(), self.direction))
|
||||
|
||||
def get_margin(self):
|
||||
return min(self.width, self.height)
|
||||
|
||||
def get_direction(self):
|
||||
return self.direction
|
||||
|
||||
def get_text(self):
|
||||
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
||||
|
||||
def fixate(self):
|
||||
LayoutContainer.fixate(self)
|
||||
objs = []
|
||||
if self.direction == 'V':
|
||||
y0 = -INF
|
||||
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
||||
if isinstance(obj, LTTextItem) and self.word_margin:
|
||||
margin = self.word_margin * obj.get_margin()
|
||||
if obj.y1+margin < y0:
|
||||
objs.append(LTAnon(' '))
|
||||
objs.append(obj)
|
||||
y0 = obj.y0
|
||||
else:
|
||||
x1 = INF
|
||||
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
||||
if isinstance(obj, LTTextItem) and self.word_margin:
|
||||
margin = self.word_margin * obj.get_margin()
|
||||
if x1 < obj.x0-margin:
|
||||
objs.append(LTAnon(' '))
|
||||
objs.append(obj)
|
||||
x1 = obj.x1
|
||||
objs.append(LTAnon('\n'))
|
||||
self.objs = objs
|
||||
return
|
||||
|
||||
|
||||
## LTTextBox
|
||||
##
|
||||
## A set of text objects that are grouped within
|
||||
|
@ -343,65 +378,55 @@ class LTFigure(LayoutContainer):
|
|||
##
|
||||
class LTTextBox(LayoutContainer):
|
||||
|
||||
def __init__(self, id, objs):
|
||||
def __init__(self, id, objs, direction):
|
||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||
self.direction = None
|
||||
self.direction = direction
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
|
||||
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
|
||||
|
||||
def fixate(self, direction='H'):
|
||||
LayoutContainer.fixate(self, direction=direction)
|
||||
if not direction:
|
||||
if any( obj.is_vertical() for obj in self.objs ):
|
||||
direction = 'V'
|
||||
if 2 <= len(self.objs):
|
||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
||||
h = objs[0].voverlap(objs[1])
|
||||
v = objs[0].hoverlap(objs[1])
|
||||
if h < v:
|
||||
direction = 'V'
|
||||
self.direction = direction
|
||||
def get_text(self):
|
||||
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||
|
||||
def fixate(self):
|
||||
LayoutContainer.fixate(self)
|
||||
if self.direction == 'V':
|
||||
self.lines = reorder_hv(self.objs, -1)
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
||||
else:
|
||||
self.lines = reorder_vh(self.objs, +1)
|
||||
self.objs = []
|
||||
for line in self.lines:
|
||||
self.objs.extend(line)
|
||||
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
||||
return
|
||||
|
||||
def get_direction(self):
|
||||
return self.direction
|
||||
|
||||
def get_lines(self, word_margin):
|
||||
if self.get_direction() == 'V':
|
||||
for line in self.lines:
|
||||
y0 = -INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if word_margin:
|
||||
margin = word_margin * obj.get_margin()
|
||||
if obj.y1+margin < y0:
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
y0 = obj.y0
|
||||
yield LTAnon('\n')
|
||||
|
||||
def tsort(objs, f):
|
||||
gi = dict( (obj,[]) for obj in objs )
|
||||
go = dict( (obj,[]) for obj in objs )
|
||||
for obj1 in objs:
|
||||
for obj2 in objs:
|
||||
if f(obj1, obj2): # obj1 -> obj2
|
||||
go[obj1].append(obj2)
|
||||
gi[obj2].append(obj1)
|
||||
r = objs[:]
|
||||
s = []
|
||||
while r:
|
||||
for obj in r:
|
||||
if not go[obj] or gi[obj]: continue
|
||||
for c in go[obj]:
|
||||
gi[c].remove(obj)
|
||||
del gi[obj]
|
||||
del go[obj]
|
||||
r.remove(obj)
|
||||
s.append(obj)
|
||||
break
|
||||
else:
|
||||
for line in self.lines:
|
||||
x1 = INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if word_margin:
|
||||
margin = word_margin * obj.get_margin()
|
||||
if x1 < obj.x0-margin:
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
x1 = obj.x1
|
||||
yield LTAnon('\n')
|
||||
return
|
||||
obj = r.pop()
|
||||
del gi[obj]
|
||||
del go[obj]
|
||||
s.append(obj)
|
||||
return s
|
||||
|
||||
|
||||
## LTPage
|
||||
|
@ -416,19 +441,39 @@ class LTPage(LayoutContainer):
|
|||
def __repr__(self):
|
||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
||||
|
||||
def fixate(self, dirtection='H'):
|
||||
return
|
||||
|
||||
def group_text(self, char_margin, line_margin):
|
||||
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
|
||||
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
||||
if self.get_direction() == 'V':
|
||||
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
|
||||
lines = reorder_hv(objs, -1)
|
||||
def analyze_layout(self, laparams):
|
||||
textobjs = []
|
||||
otherobjs = []
|
||||
for obj in self.objs:
|
||||
if isinstance(obj, LTText) and obj.is_upright():
|
||||
textobjs.append(obj)
|
||||
else:
|
||||
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
|
||||
lines = reorder_vh(objs, +1)
|
||||
self.objs = []
|
||||
for line in lines:
|
||||
self.objs.extend(line)
|
||||
otherobjs.append(obj)
|
||||
if laparams.direction == 'V':
|
||||
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
||||
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
|
||||
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
||||
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
||||
def vorder(obj1, obj2):
|
||||
if obj1.voverlap(obj2):
|
||||
return obj2.x1 < obj1.x1
|
||||
elif obj1.hoverlap(obj2):
|
||||
return obj2.y1 < obj1.y1
|
||||
else:
|
||||
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
|
||||
boxes = tsort(boxes, vorder)
|
||||
else:
|
||||
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
||||
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
|
||||
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
||||
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
||||
def horder(obj1, obj2):
|
||||
if obj1.hoverlap(obj2):
|
||||
return obj2.y1 < obj1.y1
|
||||
elif obj1.voverlap(obj2):
|
||||
return obj1.x1 < obj2.x0
|
||||
else:
|
||||
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
|
||||
boxes = tsort(boxes, horder)
|
||||
self.objs = otherobjs + boxes
|
||||
return
|
||||
|
|
|
@ -756,9 +756,8 @@ class PDFPageInterpreter(object):
|
|||
##
|
||||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||
|
||||
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
||||
doc = PDFDocument()
|
||||
fp = file(fname, 'rb')
|
||||
parser = PDFParser(doc, fp)
|
||||
doc.initialize(password)
|
||||
if not doc.is_extractable:
|
||||
|
@ -768,5 +767,4 @@ def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
|||
if pagenos and (pageno not in pagenos): continue
|
||||
interpreter.process_page(page)
|
||||
if maxpages and maxpages <= pageno+1: break
|
||||
fp.close()
|
||||
return
|
||||
|
|
|
@ -47,6 +47,10 @@ def bsearch(objs, v0):
|
|||
(v, obj) = objs[i]
|
||||
if v0 == v:
|
||||
(i0,i1) = (i,i+1)
|
||||
while 0 < i0 and objs[i0-1][0] == v0:
|
||||
i0 -= 1
|
||||
while i1 < len(objs)-1 and objs[i1][0] == v0:
|
||||
i1 += 1
|
||||
break
|
||||
elif v0 < v:
|
||||
i1 = i
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
PYTHON=python
|
||||
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
|
||||
|
||||
HTMLS= \
|
||||
simple1.html \
|
||||
|
|
2
setup.py
2
setup.py
|
@ -14,7 +14,7 @@ other extra information such as font information or ruled lines.
|
|||
It includes a PDF converter that can transform PDF files
|
||||
into other text formats (such as HTML). It has an extensible
|
||||
PDF parser that can be used for other purposes instead of text analysis.''',
|
||||
keywords='pdf parser, pdf converter, text mining',
|
||||
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||
license='MIT/X',
|
||||
author='Yusuke Shinyama',
|
||||
author_email='yusuke at cs dot nyu dot edu',
|
||||
|
|
|
@ -19,7 +19,10 @@ import sys
|
|||
# comment out at runtime.
|
||||
import cgitb; cgitb.enable()
|
||||
import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
||||
import pdflib.pdf2txt
|
||||
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
||||
from pdfminer.converter import HTMLConverter, TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.cmap import CMapDB
|
||||
|
||||
|
||||
# quote HTML metacharacters
|
||||
|
@ -35,6 +38,7 @@ def url(base, **kw):
|
|||
r.append('%s=%s' % (k, v))
|
||||
return base+'&'.join(r)
|
||||
|
||||
|
||||
## convert
|
||||
##
|
||||
class FileSizeExceeded(ValueError): pass
|
||||
|
@ -54,13 +58,16 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10,
|
|||
infp.close()
|
||||
# perform conversion and
|
||||
# send the results over the network.
|
||||
pdflib.pdf2txt.CMapDB.initialize('.', './CDBCMap')
|
||||
rsrc = pdflib.pdf2txt.PDFResourceManager()
|
||||
CMapDB.initialize()
|
||||
rsrc = PDFResourceManager()
|
||||
laparams = LAParams()
|
||||
if html:
|
||||
device = pdflib.pdf2txt.HTMLConverter(rsrc, outfp, codec=codec)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||
else:
|
||||
device = pdflib.pdf2txt.TextConverter(rsrc, outfp, codec=codec)
|
||||
pdflib.pdf2txt.convert(rsrc, device, path, pagenos, maxpages=maxpages)
|
||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||
fp = file(path, 'rb')
|
||||
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
|
||||
fp.close()
|
||||
return
|
||||
|
||||
|
||||
|
|
|
@ -5,17 +5,18 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_p
|
|||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
|
||||
from pdfminer.cmap import CMapDB, find_cmap_path
|
||||
from pdfminer.layout import LAParams
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||
'[-M char_margin] [-L line_margin] [-W word_margin] '
|
||||
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -31,12 +32,10 @@ def main(argv):
|
|||
outfile = None
|
||||
outtype = None
|
||||
codec = 'utf-8'
|
||||
char_margin = 1.0
|
||||
line_margin = 0.3
|
||||
word_margin = 0.2
|
||||
pageno = 1
|
||||
scale = 1
|
||||
showpageno = True
|
||||
laparams = LAParams()
|
||||
for (k, v) in opts:
|
||||
if k == '-d': debug += 1
|
||||
elif k == '-C': cmapdir = v
|
||||
|
@ -47,9 +46,10 @@ def main(argv):
|
|||
elif k == '-c': codec = v
|
||||
elif k == '-o': outfile = v
|
||||
elif k == '-s': scale = float(v)
|
||||
elif k == '-M': char_margin = float(v)
|
||||
elif k == '-L': line_margin = float(v)
|
||||
elif k == '-W': word_margin = float(v)
|
||||
elif k == '-D': laparams.direction = v
|
||||
elif k == '-M': laparams.char_margin = float(v)
|
||||
elif k == '-L': laparams.line_margin = float(v)
|
||||
elif k == '-W': laparams.word_margin = float(v)
|
||||
#
|
||||
CMapDB.debug = debug
|
||||
PDFResourceManager.debug = debug
|
||||
|
@ -74,20 +74,19 @@ def main(argv):
|
|||
else:
|
||||
outfp = sys.stdout
|
||||
if outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||
elif outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
else:
|
||||
return usage()
|
||||
for fname in args:
|
||||
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
|
||||
fp = file(fname, 'rb')
|
||||
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
|
||||
fp.close()
|
||||
device.close()
|
||||
return
|
||||
|
||||
|
|
152
tools/sgml.py
152
tools/sgml.py
|
@ -1,152 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, sgmllib
|
||||
__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ]
|
||||
|
||||
def fixed(x):
|
||||
return int(float(x)*1000)
|
||||
def getbbox(s):
|
||||
(a,b,c,d) = s.split(',')
|
||||
return (fixed(a),fixed(b),fixed(c),fixed(d))
|
||||
|
||||
|
||||
## Document
|
||||
##
|
||||
class Document:
|
||||
|
||||
def __init__(self):
|
||||
self.pages = []
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<Document: pages=%r>' % self.pages
|
||||
|
||||
def get_pages(self):
|
||||
return self.pages
|
||||
|
||||
def add_page(self, page):
|
||||
self.pages.append(page)
|
||||
return
|
||||
|
||||
def add_text(self, text):
|
||||
self.pages[-1].add_text(text)
|
||||
return
|
||||
|
||||
|
||||
## Page
|
||||
##
|
||||
class Page:
|
||||
|
||||
def __init__(self, pageid, bbox, rotate):
|
||||
self.pageid = pageid
|
||||
self.bbox = bbox
|
||||
self.rotate = rotate
|
||||
self.texts = []
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<Page(%s): texts=%r>' % (self.pageid, self.texts)
|
||||
|
||||
def get_texts(self):
|
||||
return self.texts
|
||||
|
||||
def add_text(self, text):
|
||||
self.texts.append(text)
|
||||
return
|
||||
|
||||
|
||||
## Text
|
||||
##
|
||||
class Text:
|
||||
|
||||
def __init__(self, font, direction, bbox, size):
|
||||
self.font = font
|
||||
self.direction = direction
|
||||
self.bbox = bbox
|
||||
self.size = size
|
||||
self.data = ''
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<Text: %r>' % (self.data)
|
||||
|
||||
def add_data(self, data):
|
||||
self.data += data
|
||||
return
|
||||
|
||||
|
||||
## PDFSGMLParser
|
||||
##
|
||||
class PDFSGMLParser(sgmllib.SGMLParser):
    """SGML parser that rebuilds a document from <page> and <text> elements.

    Pages and texts are handed to the `doc` object supplied at construction
    time through its add_page() and add_text() methods.
    """

    def __init__(self, doc):
        sgmllib.SGMLParser.__init__(self)
        # The <text> element currently open, or None outside of one.
        self.curtext = None
        self.doc = doc

    # The <document> element itself carries nothing this parser uses.
    def start_document(self, attrs):
        pass

    def end_document(self):
        pass

    def start_page(self, attrs):
        """Build a Page from the id/bbox/rotate attributes and add it to the document."""
        attrmap = dict(attrs)
        page = Page(attrmap['id'],
                    getbbox(attrmap['bbox']),
                    int(attrmap['rotate']))
        self.doc.add_page(page)

    def end_page(self):
        pass

    def start_text(self, attrs):
        """Open a new Text built from the font/direction/bbox/fontsize attributes."""
        attrmap = dict(attrs)
        self.curtext = Text(attrmap['font'],
                            attrmap['direction'],
                            getbbox(attrmap['bbox']),
                            fixed(attrmap['fontsize']))

    def end_text(self):
        """Hand the finished Text to the document and leave text mode."""
        finished = self.curtext
        assert finished
        self.doc.add_text(finished)
        self.curtext = None

    def handle_data(self, data):
        """Append character data to the open Text; data outside <text> is dropped."""
        if self.curtext:
            self.curtext.add_data(data)

    def feedfile(self, fp, encoding='utf-8'):
        """Decode each line of `fp` with `encoding` (ignoring errors) and feed it to the parser."""
        for raw in fp:
            self.feed(unicode(raw, encoding, 'ignore'))
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
    """Parse each SGML file named on the command line and print the document.

    Options:
      -d           increase the debug level (accepted; not otherwise used here)
      -c encoding  character encoding of the input files (default: utf-8)

    Returns 0 on success, or 100 after printing the usage line when an
    unknown option is given.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-c encoding] [file ...]' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:')
    except getopt.GetoptError:
        return usage()
    # Must be bound before the option loop: '-d' increments it.
    debug = 0
    encoding = 'utf-8'
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
    for fname in args:
        doc = Document()
        parser = PDFSGMLParser(doc)
        # feedfile() iterates over lines, so it needs an open file object
        # rather than the path; close it even if parsing fails.
        fp = open(fname, 'rb')
        try:
            parser.feedfile(fp, encoding)
        finally:
            fp.close()
        parser.close()
        print (doc)
    return 0
|
||||
|
||||
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
162
tools/viewpdf.py
162
tools/viewpdf.py
|
@ -1,162 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from sgml import PDFSGMLParser, Document
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
try:
|
||||
import pygame
|
||||
from pygame.locals import *
|
||||
except ImportError:
|
||||
print >>stderr, 'you need pygame'
|
||||
sys.exit(111)
|
||||
|
||||
|
||||
def scale(x):
    """Shrink a coordinate for display: multiply by 0.002 and truncate to int."""
    scaled = x * 0.002
    return int(scaled)
|
||||
|
||||
|
||||
## FontManager
|
||||
##
|
||||
class FontManager:
    """Class-level cache of pygame fonts keyed by (path, integer size)."""

    # Loaded fonts, shared by all callers.
    fonts = {}
    #default_font = '/Library/Fonts/Vera.ttf'
    default_font = '/usr/share/fonts/truetype/kochi/kochi-gothic.ttf'

    @classmethod
    def get_font(klass, path, size):
        """Return the font for `path` at `size`, loading it on first use.

        A false `path` selects `default_font`; `size` is truncated with int()
        before it is used as part of the cache key.
        """
        key = (path or klass.default_font, int(size))
        try:
            return klass.fonts[key]
        except KeyError:
            pass
        # First request for this (path, size): load it and remember it.
        loaded = pygame.font.Font(key[0], key[1])
        klass.fonts[key] = loaded
        return loaded
|
||||
|
||||
|
||||
## PDFViewer
|
||||
##
|
||||
class PDFViewer:
    """Minimal pygame viewer showing the texts of one page at a time.

    Each page is rendered into an off-screen surface (`self.buf`); a
    window-sized region of it, starting at `self.pos`, is copied to the
    display.

    Keys handled by run():
      Escape / Return / q         quit
      Space                       next page
      b                           previous page
      h j k l, arrows, keypad     scroll by STEP pixels while held
    """

    BGCOLOR = (255,255,255)
    FGCOLOR = (0,0,0)

    # Number of pixels scrolled per loop iteration while a key is held.
    STEP = 8

    def __init__(self, display, doc):
        """Remember the display surface, take the pages of `doc`, and render page 0."""
        self.display = display
        self.buf = None
        self.pages = doc.get_pages()
        self.render_page(0)
        return

    def render_page(self, pageno):
        """Draw page `pageno` into a fresh buffer and show its top-left corner."""
        print >>stderr, 'rendering: page=%d...' % pageno
        page = self.pages[pageno]
        (x,y,w,h) = page.bbox
        self.width = scale(w)
        self.height = scale(h)
        self.buf = pygame.Surface((self.width, self.height))
        self.buf.fill(self.BGCOLOR)
        for text in page.get_texts():
            # Every text is drawn with the default font (path None).
            font = FontManager.get_font(None, scale(text.size*0.7))
            (x,y,w,h) = text.bbox
            r = font.render(text.data, 1, self.FGCOLOR)
            # The y coordinate is flipped against the buffer height.
            self.buf.blit(r, (scale(x), self.height-scale(y)))
        self.pageno = pageno
        self.pos = (0,0)
        self.refresh()
        return

    def refresh(self):
        """Copy the visible region of the buffer to the display and flip it."""
        size = self.display.get_size()
        self.display.blit(self.buf, (0,0), (self.pos, size))
        pygame.display.flip()
        return

    def run(self):
        """Process events until a quit key is pressed."""
        loop = True
        key = None
        while loop:
            for e in pygame.event.get():
                if e.type == VIDEOEXPOSE:
                    self.refresh()
                elif e.type == KEYDOWN:
                    if e.key in (K_ESCAPE, K_RETURN, K_q):
                        loop = False
                        break
                    elif e.key == K_SPACE:
                        if self.pageno < len(self.pages)-1:
                            self.render_page(self.pageno+1)
                    elif e.key == K_b:
                        if 0 < self.pageno:
                            self.render_page(self.pageno-1)
                    else:
                        key = e.key
                elif e.type == KEYUP:
                    key = None
            if key:
                # The limits are derived here, not once before the loop,
                # because render_page() replaces self.width/self.height when
                # the page changes. They are clamped at zero so a page
                # smaller than the window cannot scroll to a negative offset.
                (w,h) = self.display.get_size()
                xmax = max(0, self.width - w)
                ymax = max(0, self.height - h)
                (x,y) = self.pos
                if key in (K_h, K_LEFT, K_KP4):
                    x = max(0, x-self.STEP)
                elif key in (K_l, K_RIGHT, K_KP6):
                    x = min(xmax, x+self.STEP)
                elif key in (K_k, K_UP, K_KP8):
                    y = max(0, y-self.STEP)
                elif key in (K_j, K_DOWN, K_KP2):
                    y = min(ymax, y+self.STEP)
                self.pos = (x,y)
                self.refresh()
        return
|
||||
|
||||
# main
|
||||
def main(argv):
    """Show a PDF file or an SGML dump in a pygame window.

    Options:
      -d           increase the debug level passed to the pdf2txt helpers
      -c encoding  encoding used for conversion and parsing (default: utf-8)
      -P password  password handed to pdf2txt when converting a PDF

    Only the first file argument is used. A name ending in '.pdf' is first
    converted into a temporary file with pdf2txt; any other file is parsed
    directly.

    Returns 100 after printing the usage line when an option is invalid or
    no file is given; otherwise returns None once the viewer exits.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-c encoding] [-P password] file' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:P:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = 0
    encoding = 'utf-8'
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    password = ''
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
        elif k == '-P': password = v
    #
    fname = args.pop(0)
    if fname.endswith('.pdf'):
        # convert .pdf to sgml
        import tempfile
        from pdf2txt import CMapDB, PDFResourceManager, pdf2txt
        print >>stderr, 'reading %r...' % fname
        CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
        rsrc = PDFResourceManager(debug=debug)
        fp = tempfile.TemporaryFile()
        pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug)
        # Rewind so the parser reads the converted output from the start.
        fp.seek(0)
    else:
        fp = file(fname, 'rb')
    doc = Document()
    parser = PDFSGMLParser(doc)
    # Close the input (and thereby drop the temporary file) even when
    # parsing raises.
    try:
        parser.feedfile(fp, encoding)
        parser.close()
    finally:
        fp.close()
    #
    pygame.init()
    pygame.display.set_mode((640,480))
    PDFViewer(pygame.display.get_surface(), doc).run()
    return
|
||||
|
||||
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
Loading…
Reference in New Issue