layout analysis improved.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@120 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
0113486b76
commit
8a5bec5065
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Jul 12 00:36:44 JST 2009
|
Last Modified: Tue Jul 21 16:24:26 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -191,6 +191,7 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
|
||||||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||||
</ul>
|
</ul>
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-D <em>direction</em></code>
|
||||||
<dt> <code>-M <em>char_margin</em></code>
|
<dt> <code>-M <em>char_margin</em></code>
|
||||||
<dt> <code>-L <em>line_margin</em></code>
|
<dt> <code>-L <em>line_margin</em></code>
|
||||||
<dt> <code>-W <em>word_margin</em></code>
|
<dt> <code>-W <em>word_margin</em></code>
|
||||||
|
@ -318,6 +319,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2009/07/21: Improvement in layout analysis.
|
||||||
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
|
||||||
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
|
||||||
<li> 2009/03/30: Text output mode added.
|
<li> 2009/03/30: Text output mode added.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
__version__ = '20090711'
|
__version__ = '20090721'
|
||||||
|
|
||||||
if __name__ == '__main__': print __version__
|
if __name__ == '__main__': print __version__
|
||||||
|
|
|
@ -199,7 +199,9 @@ class CMapDB(object):
|
||||||
cmapdb = {}
|
cmapdb = {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def initialize(klass, dirname, cdbdirname=None):
|
def initialize(klass, dirname=None, cdbdirname=None):
|
||||||
|
if not dirname:
|
||||||
|
dirname = find_cmap_path()
|
||||||
klass.dirname = dirname
|
klass.dirname = dirname
|
||||||
klass.cdbdirname = cdbdirname or dirname
|
klass.cdbdirname = cdbdirname or dirname
|
||||||
return
|
return
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
import sys
|
import sys
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
|
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,10 +10,9 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFDevice):
|
class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
|
def __init__(self, rsrc, pageno=1, laparams=None):
|
||||||
PDFDevice.__init__(self, rsrc)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.char_margin = char_margin
|
self.laparams = laparams
|
||||||
self.line_margin = line_margin
|
|
||||||
self.undefined_char = '?'
|
self.undefined_char = '?'
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
self.stack = []
|
self.stack = []
|
||||||
|
@ -27,9 +26,9 @@ class PDFPageAggregator(PDFDevice):
|
||||||
assert not self.stack
|
assert not self.stack
|
||||||
assert isinstance(self.cur_item, LTPage)
|
assert isinstance(self.cur_item, LTPage)
|
||||||
self.cur_item.fixate()
|
self.cur_item.fixate()
|
||||||
|
if self.laparams:
|
||||||
|
self.cur_item.analyze_layout(self.laparams)
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
if self.char_margin != None and self.line_margin != None:
|
|
||||||
self.cur_item.group_text(self.char_margin, self.line_margin)
|
|
||||||
return self.cur_item
|
return self.cur_item
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def begin_figure(self, name, bbox, matrix):
|
||||||
|
@ -79,7 +78,7 @@ class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
def render_chars(self, textmatrix, textstate, chars):
|
def render_chars(self, textmatrix, textstate, chars):
|
||||||
if not chars: return (0, 0)
|
if not chars: return (0, 0)
|
||||||
item = LTText(textmatrix, textstate.font, textstate.fontsize,
|
item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
|
||||||
textstate.charspace, textstate.scaling, chars)
|
textstate.charspace, textstate.scaling, chars)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return item.adv
|
return item.adv
|
||||||
|
@ -116,13 +115,10 @@ class PDFPageAggregator(PDFDevice):
|
||||||
##
|
##
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||||
char_margin=None, line_margin=None, word_margin=None):
|
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
||||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
|
|
||||||
char_margin=char_margin, line_margin=line_margin)
|
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.word_margin = word_margin
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
|
@ -202,17 +198,6 @@ class SGMLConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
elif isinstance(item, LTText):
|
|
||||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
|
||||||
(enc(item.font.fontname), item.is_vertical(),
|
|
||||||
item.get_bbox(), item.fontsize))
|
|
||||||
self.write(item.text)
|
|
||||||
self.outfp.write('</text>\n')
|
|
||||||
elif isinstance(item, LTAnon):
|
|
||||||
if item.text == ' ':
|
|
||||||
self.outfp.write('<space>\n')
|
|
||||||
elif item.text == '\n':
|
|
||||||
self.outfp.write('<newline>\n')
|
|
||||||
elif isinstance(item, LTLine):
|
elif isinstance(item, LTLine):
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
||||||
elif isinstance(item, LTRect):
|
elif isinstance(item, LTRect):
|
||||||
|
@ -222,11 +207,26 @@ class SGMLConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</figure>\n')
|
self.outfp.write('</figure>\n')
|
||||||
|
elif isinstance(item, LTTextLine):
|
||||||
|
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
|
self.outfp.write('</textline>\n')
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item.get_lines(self.word_margin):
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
|
elif isinstance(item, LTTextItem):
|
||||||
|
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
|
||||||
|
(enc(item.font.fontname), item.is_vertical(),
|
||||||
|
item.get_bbox(), item.fontsize))
|
||||||
|
self.write(item.text)
|
||||||
|
self.outfp.write('</text>\n')
|
||||||
|
elif isinstance(item, LTText):
|
||||||
|
self.outfp.write('<text>%s</text>\n', item.text)
|
||||||
|
else:
|
||||||
|
assert 0, item
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
|
@ -237,11 +237,9 @@ class SGMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
char_margin=None, line_margin=None, word_margin=None,
|
|
||||||
scale=1, showpageno=True, pagepad=50):
|
scale=1, showpageno=True, pagepad=50):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -268,7 +266,7 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTTextItem):
|
||||||
if item.vertical:
|
if item.vertical:
|
||||||
wmode = 'tb-rl'
|
wmode = 'tb-rl'
|
||||||
else:
|
else:
|
||||||
|
@ -281,13 +279,14 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTAnon):
|
|
||||||
pass
|
|
||||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
|
elif isinstance(item, LTTextLine):
|
||||||
|
for child in item:
|
||||||
|
render(child)
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item.get_lines(self.word_margin):
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
|
@ -307,11 +306,9 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
char_margin=None, line_margin=None, word_margin=None,
|
|
||||||
showpageno=False):
|
showpageno=False):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -322,14 +319,12 @@ class TextConverter(PDFConverter):
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTText):
|
if isinstance(item, LTText):
|
||||||
self.write(item.text+'\n')
|
self.write(item.text)
|
||||||
elif isinstance(item, LTTextBox):
|
|
||||||
for obj in item.get_lines(self.word_margin):
|
|
||||||
self.write(obj.text)
|
|
||||||
self.write('\n')
|
|
||||||
elif isinstance(item, LayoutContainer):
|
elif isinstance(item, LayoutContainer):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
|
if isinstance(item, LTTextBox):
|
||||||
|
self.write('\n')
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.write('Page %d\n' % page.id)
|
self.write('Page %d\n' % page.id)
|
||||||
|
|
|
@ -4,50 +4,24 @@ from pdfminer.utils import apply_matrix_norm, bsearch
|
||||||
INF = sys.maxint
|
INF = sys.maxint
|
||||||
|
|
||||||
|
|
||||||
## reorder_hv, reorder_vh
|
## LAParams
|
||||||
## chop_hv, chop_vh
|
|
||||||
##
|
##
|
||||||
## Reorders objects according to its writing direction.
|
class LAParams(object):
|
||||||
##
|
|
||||||
def reorder_vh(objs, hdir):
|
|
||||||
if 0 < hdir:
|
|
||||||
hkey = (lambda obj: obj.x0)
|
|
||||||
vkey = (lambda obj: -obj.y1)
|
|
||||||
else:
|
|
||||||
hkey = (lambda obj: -obj.x1)
|
|
||||||
vkey = (lambda obj: -obj.y1)
|
|
||||||
r = []
|
|
||||||
line = []
|
|
||||||
for obj in sorted(objs, key=vkey):
|
|
||||||
if line:
|
|
||||||
v = line[-1].voverlap(obj) * 2
|
|
||||||
if v < obj.height or v < line[-1].height:
|
|
||||||
line.sort(key=hkey)
|
|
||||||
r.append(line)
|
|
||||||
line = []
|
|
||||||
line.append(obj)
|
|
||||||
line.sort(key=hkey)
|
|
||||||
r.append(line)
|
|
||||||
return r
|
|
||||||
|
|
||||||
def reorder_hv(objs, hdir):
|
def __init__(self,
|
||||||
if 0 < hdir:
|
direction=None,
|
||||||
hkey = (lambda obj: obj.x0)
|
char_margin=1.0,
|
||||||
vkey = (lambda obj: -obj.y1)
|
line_margin=0.5,
|
||||||
else:
|
word_margin=0.1):
|
||||||
hkey = (lambda obj: -obj.x1)
|
self.direction = direction
|
||||||
vkey = (lambda obj: -obj.y1)
|
self.char_margin = char_margin
|
||||||
r = []
|
self.line_margin = line_margin
|
||||||
line = []
|
self.word_margin = word_margin
|
||||||
for obj in sorted(objs, key=hkey):
|
return
|
||||||
if line and not line[-1].hoverlap(obj):
|
|
||||||
line.sort(key=vkey)
|
def __repr__(self):
|
||||||
r.append(line)
|
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
|
||||||
line = []
|
(self.direction, self.char_margin, self.line_margin, self.word_margin))
|
||||||
line.append(obj)
|
|
||||||
line.sort(key=vkey)
|
|
||||||
r.append(line)
|
|
||||||
return r
|
|
||||||
|
|
||||||
|
|
||||||
## Plane
|
## Plane
|
||||||
|
@ -91,12 +65,6 @@ class Plane(object):
|
||||||
|
|
||||||
## ClusterSet
|
## ClusterSet
|
||||||
##
|
##
|
||||||
## Maintains a set of LTTextBox objects.
|
|
||||||
## It incrementally constructs LTTextBox objects
|
|
||||||
## and group them when necessary. It gives
|
|
||||||
## a sequence of LTTextBox objects that represent
|
|
||||||
## the text stream of that page.
|
|
||||||
##
|
|
||||||
class ClusterSet(object):
|
class ClusterSet(object):
|
||||||
|
|
||||||
def __init__(self, klass):
|
def __init__(self, klass):
|
||||||
|
@ -123,14 +91,16 @@ class ClusterSet(object):
|
||||||
group.fixate()
|
group.fixate()
|
||||||
return list(r)
|
return list(r)
|
||||||
|
|
||||||
def group_objs(objs, hratio, vratio, klass):
|
@classmethod
|
||||||
|
def build(klass, objs, hratio, vratio, objtype):
|
||||||
plane = Plane(objs)
|
plane = Plane(objs)
|
||||||
cset = ClusterSet(klass)
|
cset = ClusterSet(objtype)
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
margin = obj.get_margin()
|
margin = obj.get_margin()
|
||||||
hmargin = hratio * margin
|
hmargin = hratio * margin
|
||||||
vmargin = vratio * margin
|
vmargin = vratio * margin
|
||||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||||
|
assert obj in neighbors, obj
|
||||||
cset.add(neighbors)
|
cset.add(neighbors)
|
||||||
return cset.finish()
|
return cset.finish()
|
||||||
|
|
||||||
|
@ -140,11 +110,12 @@ def group_objs(objs, hratio, vratio, klass):
|
||||||
class LayoutItem(object):
|
class LayoutItem(object):
|
||||||
|
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox):
|
||||||
#assert x0 <= x1 and y0 <= y1
|
|
||||||
self.set_bbox(bbox)
|
self.set_bbox(bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_bbox(self, (x0,y0,x1,y1)):
|
def set_bbox(self, (x0,y0,x1,y1)):
|
||||||
|
if x1 < x0: (x0,x1) = (x1,x0)
|
||||||
|
if y1 < y0: (y0,y1) = (y1,y0)
|
||||||
self.x0 = x0
|
self.x0 = x0
|
||||||
self.y0 = y0
|
self.y0 = y0
|
||||||
self.x1 = x1
|
self.x1 = x1
|
||||||
|
@ -203,6 +174,9 @@ class LayoutContainer(LayoutItem):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return iter(self.objs)
|
return iter(self.objs)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.objs)
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
self.objs.add(obj)
|
self.objs.add(obj)
|
||||||
return
|
return
|
||||||
|
@ -212,7 +186,7 @@ class LayoutContainer(LayoutItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
# fixate(): determines its boundery and writing direction.
|
# fixate(): determines its boundery and writing direction.
|
||||||
def fixate(self, direction=None):
|
def fixate(self):
|
||||||
if not self.width and self.objs:
|
if not self.width and self.objs:
|
||||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||||
for obj in self.objs:
|
for obj in self.objs:
|
||||||
|
@ -228,14 +202,7 @@ class LayoutContainer(LayoutItem):
|
||||||
return self.weight
|
return self.weight
|
||||||
|
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
if not self.objs: return None
|
return None
|
||||||
d = {}
|
|
||||||
for obj in self.objs:
|
|
||||||
k = obj.get_direction()
|
|
||||||
if k not in d: d[k] = 0
|
|
||||||
d[k] += 1
|
|
||||||
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
|
|
||||||
return direction
|
|
||||||
|
|
||||||
|
|
||||||
## LTLine
|
## LTLine
|
||||||
|
@ -259,21 +226,37 @@ class LTRect(LayoutItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTAnon
|
## LTText
|
||||||
##
|
##
|
||||||
class LTAnon(object):
|
class LTText(object):
|
||||||
|
|
||||||
def __init__(self, text):
|
def __init__(self, text):
|
||||||
self.text = text
|
self.text = text
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<text %r>' % self.text
|
||||||
|
|
||||||
|
def get_weight(self):
|
||||||
|
return len(self.text)
|
||||||
|
|
||||||
|
def is_upright(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
## LTAnon
|
||||||
|
##
|
||||||
|
class LTAnon(LTText):
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
## LTTextItem
|
||||||
##
|
##
|
||||||
class LTText(LayoutItem):
|
class LTTextItem(LayoutItem, LTText):
|
||||||
|
|
||||||
|
debug = 1
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
assert chars
|
assert chars
|
||||||
|
@ -307,21 +290,25 @@ class LTText(LayoutItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
if self.debug:
|
||||||
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
||||||
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
|
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
|
||||||
self.font, self.fontsize, self.get_bbox(),
|
self.font, self.fontsize, self.get_bbox(),
|
||||||
'(%.1f, %.1f)' % self.adv,
|
'(%.1f, %.1f)' % self.adv,
|
||||||
self.text))
|
self.text))
|
||||||
|
else:
|
||||||
|
return '<text %r>' % self.text
|
||||||
|
|
||||||
def get_margin(self):
|
def get_margin(self):
|
||||||
return abs(self.fontsize)
|
return abs(self.fontsize)
|
||||||
|
|
||||||
def get_weight(self):
|
|
||||||
return len(self.text)
|
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return self.vertical
|
return self.vertical
|
||||||
|
|
||||||
|
def is_upright(self):
|
||||||
|
(a,b,c,d,e,f) = self.matrix
|
||||||
|
return 0 < a*d and b*c <= 0
|
||||||
|
|
||||||
|
|
||||||
## LTFigure
|
## LTFigure
|
||||||
##
|
##
|
||||||
|
@ -336,6 +323,54 @@ class LTFigure(LayoutContainer):
|
||||||
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
|
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
|
||||||
|
|
||||||
|
|
||||||
|
## LTTextLine
|
||||||
|
##
|
||||||
|
class LTTextLine(LayoutContainer):
|
||||||
|
|
||||||
|
def __init__(self, id, objs, direction, word_margin):
|
||||||
|
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||||
|
self.direction = direction
|
||||||
|
self.word_margin = word_margin
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<line %s(%s)>' % (self.get_bbox(), self.direction))
|
||||||
|
|
||||||
|
def get_margin(self):
|
||||||
|
return min(self.width, self.height)
|
||||||
|
|
||||||
|
def get_direction(self):
|
||||||
|
return self.direction
|
||||||
|
|
||||||
|
def get_text(self):
|
||||||
|
return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
|
||||||
|
|
||||||
|
def fixate(self):
|
||||||
|
LayoutContainer.fixate(self)
|
||||||
|
objs = []
|
||||||
|
if self.direction == 'V':
|
||||||
|
y0 = -INF
|
||||||
|
for obj in sorted(self.objs, key=lambda obj: -obj.y1):
|
||||||
|
if isinstance(obj, LTTextItem) and self.word_margin:
|
||||||
|
margin = self.word_margin * obj.get_margin()
|
||||||
|
if obj.y1+margin < y0:
|
||||||
|
objs.append(LTAnon(' '))
|
||||||
|
objs.append(obj)
|
||||||
|
y0 = obj.y0
|
||||||
|
else:
|
||||||
|
x1 = INF
|
||||||
|
for obj in sorted(self.objs, key=lambda obj: obj.x0):
|
||||||
|
if isinstance(obj, LTTextItem) and self.word_margin:
|
||||||
|
margin = self.word_margin * obj.get_margin()
|
||||||
|
if x1 < obj.x0-margin:
|
||||||
|
objs.append(LTAnon(' '))
|
||||||
|
objs.append(obj)
|
||||||
|
x1 = obj.x1
|
||||||
|
objs.append(LTAnon('\n'))
|
||||||
|
self.objs = objs
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## LTTextBox
|
## LTTextBox
|
||||||
##
|
##
|
||||||
## A set of text objects that are grouped within
|
## A set of text objects that are grouped within
|
||||||
|
@ -343,65 +378,55 @@ class LTFigure(LayoutContainer):
|
||||||
##
|
##
|
||||||
class LTTextBox(LayoutContainer):
|
class LTTextBox(LayoutContainer):
|
||||||
|
|
||||||
def __init__(self, id, objs):
|
def __init__(self, id, objs, direction):
|
||||||
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
|
||||||
self.direction = None
|
self.direction = direction
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
|
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
|
||||||
|
|
||||||
def fixate(self, direction='H'):
|
def get_text(self):
|
||||||
LayoutContainer.fixate(self, direction=direction)
|
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
|
||||||
if not direction:
|
|
||||||
if any( obj.is_vertical() for obj in self.objs ):
|
def fixate(self):
|
||||||
direction = 'V'
|
LayoutContainer.fixate(self)
|
||||||
if 2 <= len(self.objs):
|
|
||||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
|
||||||
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
|
|
||||||
h = objs[0].voverlap(objs[1])
|
|
||||||
v = objs[0].hoverlap(objs[1])
|
|
||||||
if h < v:
|
|
||||||
direction = 'V'
|
|
||||||
self.direction = direction
|
|
||||||
if self.direction == 'V':
|
if self.direction == 'V':
|
||||||
self.lines = reorder_hv(self.objs, -1)
|
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
|
||||||
else:
|
else:
|
||||||
self.lines = reorder_vh(self.objs, +1)
|
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
|
||||||
self.objs = []
|
|
||||||
for line in self.lines:
|
|
||||||
self.objs.extend(line)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
return self.direction
|
return self.direction
|
||||||
|
|
||||||
def get_lines(self, word_margin):
|
|
||||||
if self.get_direction() == 'V':
|
def tsort(objs, f):
|
||||||
for line in self.lines:
|
gi = dict( (obj,[]) for obj in objs )
|
||||||
y0 = -INF
|
go = dict( (obj,[]) for obj in objs )
|
||||||
for obj in line:
|
for obj1 in objs:
|
||||||
if not isinstance(obj, LTText): continue
|
for obj2 in objs:
|
||||||
if word_margin:
|
if f(obj1, obj2): # obj1 -> obj2
|
||||||
margin = word_margin * obj.get_margin()
|
go[obj1].append(obj2)
|
||||||
if obj.y1+margin < y0:
|
gi[obj2].append(obj1)
|
||||||
yield LTAnon(' ')
|
r = objs[:]
|
||||||
yield obj
|
s = []
|
||||||
y0 = obj.y0
|
while r:
|
||||||
yield LTAnon('\n')
|
for obj in r:
|
||||||
|
if not go[obj] or gi[obj]: continue
|
||||||
|
for c in go[obj]:
|
||||||
|
gi[c].remove(obj)
|
||||||
|
del gi[obj]
|
||||||
|
del go[obj]
|
||||||
|
r.remove(obj)
|
||||||
|
s.append(obj)
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
for line in self.lines:
|
obj = r.pop()
|
||||||
x1 = INF
|
del gi[obj]
|
||||||
for obj in line:
|
del go[obj]
|
||||||
if not isinstance(obj, LTText): continue
|
s.append(obj)
|
||||||
if word_margin:
|
return s
|
||||||
margin = word_margin * obj.get_margin()
|
|
||||||
if x1 < obj.x0-margin:
|
|
||||||
yield LTAnon(' ')
|
|
||||||
yield obj
|
|
||||||
x1 = obj.x1
|
|
||||||
yield LTAnon('\n')
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## LTPage
|
## LTPage
|
||||||
|
@ -416,19 +441,39 @@ class LTPage(LayoutContainer):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
||||||
|
|
||||||
def fixate(self, dirtection='H'):
|
def analyze_layout(self, laparams):
|
||||||
return
|
textobjs = []
|
||||||
|
otherobjs = []
|
||||||
def group_text(self, char_margin, line_margin):
|
for obj in self.objs:
|
||||||
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
|
if isinstance(obj, LTText) and obj.is_upright():
|
||||||
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
textobjs.append(obj)
|
||||||
if self.get_direction() == 'V':
|
|
||||||
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
|
|
||||||
lines = reorder_hv(objs, -1)
|
|
||||||
else:
|
else:
|
||||||
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
|
otherobjs.append(obj)
|
||||||
lines = reorder_vh(objs, +1)
|
if laparams.direction == 'V':
|
||||||
self.objs = []
|
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
|
||||||
for line in lines:
|
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
|
||||||
self.objs.extend(line)
|
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
|
||||||
|
(lambda id,objs: LTTextBox(id, objs, 'V')))
|
||||||
|
def vorder(obj1, obj2):
|
||||||
|
if obj1.voverlap(obj2):
|
||||||
|
return obj2.x1 < obj1.x1
|
||||||
|
elif obj1.hoverlap(obj2):
|
||||||
|
return obj2.y1 < obj1.y1
|
||||||
|
else:
|
||||||
|
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
|
||||||
|
boxes = tsort(boxes, vorder)
|
||||||
|
else:
|
||||||
|
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
||||||
|
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
|
||||||
|
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
|
||||||
|
(lambda id,objs: LTTextBox(id, objs, 'H')))
|
||||||
|
def horder(obj1, obj2):
|
||||||
|
if obj1.hoverlap(obj2):
|
||||||
|
return obj2.y1 < obj1.y1
|
||||||
|
elif obj1.voverlap(obj2):
|
||||||
|
return obj1.x1 < obj2.x0
|
||||||
|
else:
|
||||||
|
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
|
||||||
|
boxes = tsort(boxes, horder)
|
||||||
|
self.objs = otherobjs + boxes
|
||||||
return
|
return
|
||||||
|
|
|
@ -756,9 +756,8 @@ class PDFPageInterpreter(object):
|
||||||
##
|
##
|
||||||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||||
|
|
||||||
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
||||||
doc = PDFDocument()
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(doc, fp)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
if not doc.is_extractable:
|
if not doc.is_extractable:
|
||||||
|
@ -768,5 +767,4 @@ def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||||
if pagenos and (pageno not in pagenos): continue
|
if pagenos and (pageno not in pagenos): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
if maxpages and maxpages <= pageno+1: break
|
if maxpages and maxpages <= pageno+1: break
|
||||||
fp.close()
|
|
||||||
return
|
return
|
||||||
|
|
|
@ -47,6 +47,10 @@ def bsearch(objs, v0):
|
||||||
(v, obj) = objs[i]
|
(v, obj) = objs[i]
|
||||||
if v0 == v:
|
if v0 == v:
|
||||||
(i0,i1) = (i,i+1)
|
(i0,i1) = (i,i+1)
|
||||||
|
while 0 < i0 and objs[i0-1][0] == v0:
|
||||||
|
i0 -= 1
|
||||||
|
while i1 < len(objs)-1 and objs[i1][0] == v0:
|
||||||
|
i1 += 1
|
||||||
break
|
break
|
||||||
elif v0 < v:
|
elif v0 < v:
|
||||||
i1 = i
|
i1 = i
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# GNUMakefile for test
|
# GNUMakefile for test
|
||||||
|
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
|
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
|
||||||
|
|
||||||
HTMLS= \
|
HTMLS= \
|
||||||
simple1.html \
|
simple1.html \
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -14,7 +14,7 @@ other extra information such as font information or ruled lines.
|
||||||
It includes a PDF converter that can transform PDF files
|
It includes a PDF converter that can transform PDF files
|
||||||
into other text formats (such as HTML). It has an extensible
|
into other text formats (such as HTML). It has an extensible
|
||||||
PDF parser that can be used for other purposes instead of text analysis.''',
|
PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
keywords='pdf parser, pdf converter, text mining',
|
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||||
license='MIT/X',
|
license='MIT/X',
|
||||||
author='Yusuke Shinyama',
|
author='Yusuke Shinyama',
|
||||||
author_email='yusuke at cs dot nyu dot edu',
|
author_email='yusuke at cs dot nyu dot edu',
|
||||||
|
|
|
@ -19,7 +19,10 @@ import sys
|
||||||
# comment out at runtime.
|
# comment out at runtime.
|
||||||
import cgitb; cgitb.enable()
|
import cgitb; cgitb.enable()
|
||||||
import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
import os, os.path, re, cgi, time, random, codecs, logging, traceback
|
||||||
import pdflib.pdf2txt
|
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
|
||||||
|
from pdfminer.converter import HTMLConverter, TextConverter
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
from pdfminer.cmap import CMapDB
|
||||||
|
|
||||||
|
|
||||||
# quote HTML metacharacters
|
# quote HTML metacharacters
|
||||||
|
@ -35,6 +38,7 @@ def url(base, **kw):
|
||||||
r.append('%s=%s' % (k, v))
|
r.append('%s=%s' % (k, v))
|
||||||
return base+'&'.join(r)
|
return base+'&'.join(r)
|
||||||
|
|
||||||
|
|
||||||
## convert
|
## convert
|
||||||
##
|
##
|
||||||
class FileSizeExceeded(ValueError): pass
|
class FileSizeExceeded(ValueError): pass
|
||||||
|
@ -54,13 +58,16 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10,
|
||||||
infp.close()
|
infp.close()
|
||||||
# perform conversion and
|
# perform conversion and
|
||||||
# send the results over the network.
|
# send the results over the network.
|
||||||
pdflib.pdf2txt.CMapDB.initialize('.', './CDBCMap')
|
CMapDB.initialize()
|
||||||
rsrc = pdflib.pdf2txt.PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
|
laparams = LAParams()
|
||||||
if html:
|
if html:
|
||||||
device = pdflib.pdf2txt.HTMLConverter(rsrc, outfp, codec=codec)
|
device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
else:
|
else:
|
||||||
device = pdflib.pdf2txt.TextConverter(rsrc, outfp, codec=codec)
|
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
pdflib.pdf2txt.convert(rsrc, device, path, pagenos, maxpages=maxpages)
|
fp = file(path, 'rb')
|
||||||
|
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
|
||||||
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,17 +5,18 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_p
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice
|
||||||
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
|
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
|
||||||
from pdfminer.cmap import CMapDB, find_cmap_path
|
from pdfminer.cmap import CMapDB, find_cmap_path
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||||
'[-M char_margin] [-L line_margin] [-W word_margin] '
|
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||||
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -31,12 +32,10 @@ def main(argv):
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
char_margin = 1.0
|
|
||||||
line_margin = 0.3
|
|
||||||
word_margin = 0.2
|
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
showpageno = True
|
showpageno = True
|
||||||
|
laparams = LAParams()
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
|
@ -47,9 +46,10 @@ def main(argv):
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
elif k == '-M': char_margin = float(v)
|
elif k == '-D': laparams.direction = v
|
||||||
elif k == '-L': line_margin = float(v)
|
elif k == '-M': laparams.char_margin = float(v)
|
||||||
elif k == '-W': word_margin = float(v)
|
elif k == '-L': laparams.line_margin = float(v)
|
||||||
|
elif k == '-W': laparams.word_margin = float(v)
|
||||||
#
|
#
|
||||||
CMapDB.debug = debug
|
CMapDB.debug = debug
|
||||||
PDFResourceManager.debug = debug
|
PDFResourceManager.debug = debug
|
||||||
|
@ -74,20 +74,19 @@ def main(argv):
|
||||||
else:
|
else:
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outtype == 'text':
|
if outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec,
|
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
|
||||||
elif outtype == 'sgml':
|
elif outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec,
|
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
|
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
||||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
for fname in args:
|
for fname in args:
|
||||||
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
|
fp = file(fname, 'rb')
|
||||||
|
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
|
||||||
|
fp.close()
|
||||||
device.close()
|
device.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
152
tools/sgml.py
152
tools/sgml.py
|
@ -1,152 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
import sys, sgmllib
|
|
||||||
__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ]
|
|
||||||
|
|
||||||
def fixed(x):
|
|
||||||
return int(float(x)*1000)
|
|
||||||
def getbbox(s):
|
|
||||||
(a,b,c,d) = s.split(',')
|
|
||||||
return (fixed(a),fixed(b),fixed(c),fixed(d))
|
|
||||||
|
|
||||||
|
|
||||||
## Document
|
|
||||||
##
|
|
||||||
class Document:
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.pages = []
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<Document: pages=%r>' % self.pages
|
|
||||||
|
|
||||||
def get_pages(self):
|
|
||||||
return self.pages
|
|
||||||
|
|
||||||
def add_page(self, page):
|
|
||||||
self.pages.append(page)
|
|
||||||
return
|
|
||||||
|
|
||||||
def add_text(self, text):
|
|
||||||
self.pages[-1].add_text(text)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## Page
|
|
||||||
##
|
|
||||||
class Page:
|
|
||||||
|
|
||||||
def __init__(self, pageid, bbox, rotate):
|
|
||||||
self.pageid = pageid
|
|
||||||
self.bbox = bbox
|
|
||||||
self.rotate = rotate
|
|
||||||
self.texts = []
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<Page(%s): texts=%r>' % (self.pageid, self.texts)
|
|
||||||
|
|
||||||
def get_texts(self):
|
|
||||||
return self.texts
|
|
||||||
|
|
||||||
def add_text(self, text):
|
|
||||||
self.texts.append(text)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## Text
|
|
||||||
##
|
|
||||||
class Text:
|
|
||||||
|
|
||||||
def __init__(self, font, direction, bbox, size):
|
|
||||||
self.font = font
|
|
||||||
self.direction = direction
|
|
||||||
self.bbox = bbox
|
|
||||||
self.size = size
|
|
||||||
self.data = ''
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<Text: %r>' % (self.data)
|
|
||||||
|
|
||||||
def add_data(self, data):
|
|
||||||
self.data += data
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## PDFSGMLParser
|
|
||||||
##
|
|
||||||
class PDFSGMLParser(sgmllib.SGMLParser):
|
|
||||||
|
|
||||||
def __init__(self, doc):
|
|
||||||
sgmllib.SGMLParser.__init__(self)
|
|
||||||
self.doc = doc
|
|
||||||
self.curtext = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def start_document(self, attrs):
|
|
||||||
return
|
|
||||||
def end_document(self):
|
|
||||||
return
|
|
||||||
|
|
||||||
def start_page(self, attrs):
|
|
||||||
attrs = dict(attrs)
|
|
||||||
pageid = attrs['id']
|
|
||||||
bbox = getbbox(attrs['bbox'])
|
|
||||||
rotate = int(attrs['rotate'])
|
|
||||||
page = Page(pageid, bbox, rotate)
|
|
||||||
self.doc.add_page(page)
|
|
||||||
return
|
|
||||||
def end_page(self):
|
|
||||||
return
|
|
||||||
|
|
||||||
def start_text(self, attrs):
|
|
||||||
attrs = dict(attrs)
|
|
||||||
font = attrs['font']
|
|
||||||
direction = attrs['direction']
|
|
||||||
bbox = getbbox(attrs['bbox'])
|
|
||||||
size = fixed(attrs['fontsize'])
|
|
||||||
text = Text(font, direction, bbox, size)
|
|
||||||
self.curtext = text
|
|
||||||
return
|
|
||||||
def end_text(self):
|
|
||||||
assert self.curtext
|
|
||||||
self.doc.add_text(self.curtext)
|
|
||||||
self.curtext = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def handle_data(self, data):
|
|
||||||
if not self.curtext: return
|
|
||||||
self.curtext.add_data(data)
|
|
||||||
return
|
|
||||||
|
|
||||||
def feedfile(self, fp, encoding='utf-8'):
|
|
||||||
for line in fp:
|
|
||||||
line = unicode(line, encoding, 'ignore')
|
|
||||||
self.feed(line)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# main
|
|
||||||
def main(argv):
|
|
||||||
import getopt
|
|
||||||
def usage():
|
|
||||||
print 'usage: %s [-d] [-c encoding] [file ...]' % argv[0]
|
|
||||||
return 100
|
|
||||||
try:
|
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dc:')
|
|
||||||
except getopt.GetoptError:
|
|
||||||
return usage()
|
|
||||||
encoding = 'utf-8'
|
|
||||||
for (k, v) in opts:
|
|
||||||
if k == '-d': debug += 1
|
|
||||||
elif k == '-c': encoding = v
|
|
||||||
for fname in args:
|
|
||||||
doc = Document()
|
|
||||||
parser = PDFSGMLParser(doc)
|
|
||||||
parser.feedfile(fname, encoding)
|
|
||||||
parser.close()
|
|
||||||
print doc
|
|
||||||
return 0
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
|
162
tools/viewpdf.py
162
tools/viewpdf.py
|
@ -1,162 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
import sys
|
|
||||||
from sgml import PDFSGMLParser, Document
|
|
||||||
stdout = sys.stdout
|
|
||||||
stderr = sys.stderr
|
|
||||||
try:
|
|
||||||
import pygame
|
|
||||||
from pygame.locals import *
|
|
||||||
except ImportError:
|
|
||||||
print >>stderr, 'you need pygame'
|
|
||||||
sys.exit(111)
|
|
||||||
|
|
||||||
|
|
||||||
def scale(x):
|
|
||||||
return int(x*0.002)
|
|
||||||
|
|
||||||
|
|
||||||
## FontManager
|
|
||||||
##
|
|
||||||
class FontManager:
|
|
||||||
|
|
||||||
fonts = {}
|
|
||||||
#default_font = '/Library/Fonts/Vera.ttf'
|
|
||||||
default_font = '/usr/share/fonts/truetype/kochi/kochi-gothic.ttf'
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_font(klass, path, size):
|
|
||||||
if not path:
|
|
||||||
path = klass.default_font
|
|
||||||
size = int(size)
|
|
||||||
k = (path,size)
|
|
||||||
if k not in klass.fonts:
|
|
||||||
font = pygame.font.Font(path, size)
|
|
||||||
klass.fonts[k] = font
|
|
||||||
else:
|
|
||||||
font = klass.fonts[k]
|
|
||||||
return font
|
|
||||||
|
|
||||||
|
|
||||||
## PDFViewer
|
|
||||||
##
|
|
||||||
class PDFViewer:
|
|
||||||
|
|
||||||
BGCOLOR = (255,255,255)
|
|
||||||
FGCOLOR = (0,0,0)
|
|
||||||
|
|
||||||
def __init__(self, display, doc):
|
|
||||||
self.display = display
|
|
||||||
self.buf = None
|
|
||||||
self.pages = doc.get_pages()
|
|
||||||
self.render_page(0)
|
|
||||||
return
|
|
||||||
|
|
||||||
def render_page(self, pageno):
|
|
||||||
print >>stderr, 'rendering: page=%d...' % pageno
|
|
||||||
page = self.pages[pageno]
|
|
||||||
(x,y,w,h) = page.bbox
|
|
||||||
self.width = scale(w)
|
|
||||||
self.height = scale(h)
|
|
||||||
self.buf = pygame.Surface((self.width, self.height))
|
|
||||||
self.buf.fill(self.BGCOLOR)
|
|
||||||
for text in page.get_texts():
|
|
||||||
font = FontManager.get_font(None, scale(text.size*0.7))
|
|
||||||
(x,y,w,h) = text.bbox
|
|
||||||
r = font.render(text.data, 1, self.FGCOLOR)
|
|
||||||
self.buf.blit(r, (scale(x), self.height-scale(y)))
|
|
||||||
self.pageno = pageno
|
|
||||||
self.pos = (0,0)
|
|
||||||
self.refresh()
|
|
||||||
return
|
|
||||||
|
|
||||||
def refresh(self):
|
|
||||||
size = self.display.get_size()
|
|
||||||
self.display.blit(self.buf, (0,0), (self.pos, size))
|
|
||||||
pygame.display.flip()
|
|
||||||
return
|
|
||||||
|
|
||||||
STEP = 8
|
|
||||||
def run(self):
|
|
||||||
loop = True
|
|
||||||
key = None
|
|
||||||
(w,h) = self.display.get_size()
|
|
||||||
xmax = self.width - w
|
|
||||||
ymax = self.height - h
|
|
||||||
while loop:
|
|
||||||
for e in pygame.event.get():
|
|
||||||
if e.type == VIDEOEXPOSE:
|
|
||||||
self.refresh()
|
|
||||||
elif e.type == KEYDOWN:
|
|
||||||
if e.key in (K_ESCAPE, K_RETURN, K_q):
|
|
||||||
loop = False
|
|
||||||
break
|
|
||||||
elif e.key == K_SPACE:
|
|
||||||
if self.pageno < len(self.pages)-1:
|
|
||||||
self.render_page(self.pageno+1)
|
|
||||||
elif e.key == K_b:
|
|
||||||
if 0 < self.pageno:
|
|
||||||
self.render_page(self.pageno-1)
|
|
||||||
else:
|
|
||||||
key = e.key
|
|
||||||
elif e.type == KEYUP:
|
|
||||||
key = None
|
|
||||||
if key:
|
|
||||||
(x,y) = self.pos
|
|
||||||
if key in (K_h, K_LEFT, K_KP4):
|
|
||||||
x = max(0, x-self.STEP)
|
|
||||||
elif key in (K_l, K_RIGHT, K_KP6):
|
|
||||||
x = min(xmax, x+self.STEP)
|
|
||||||
elif key in (K_k, K_UP, K_KP8):
|
|
||||||
y = max(0, y-self.STEP)
|
|
||||||
elif key in (K_j, K_DOWN, K_KP2):
|
|
||||||
y = min(ymax, y+self.STEP)
|
|
||||||
self.pos = (x,y)
|
|
||||||
self.refresh()
|
|
||||||
return
|
|
||||||
|
|
||||||
# main
|
|
||||||
def main(argv):
|
|
||||||
import getopt
|
|
||||||
def usage():
|
|
||||||
print 'usage: %s [-d] [-c encoding] file' % argv[0]
|
|
||||||
return 100
|
|
||||||
try:
|
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dc:P:')
|
|
||||||
except getopt.GetoptError:
|
|
||||||
return usage()
|
|
||||||
if not args: return usage()
|
|
||||||
debug = 0
|
|
||||||
encoding = 'utf-8'
|
|
||||||
cmapdir = 'CMap'
|
|
||||||
cdbcmapdir = 'CDBCMap'
|
|
||||||
password = ''
|
|
||||||
for (k, v) in opts:
|
|
||||||
if k == '-d': debug += 1
|
|
||||||
elif k == '-c': encoding = v
|
|
||||||
elif k == '-P': password = v
|
|
||||||
#
|
|
||||||
fname = args.pop(0)
|
|
||||||
if fname.endswith('.pdf'):
|
|
||||||
# convert .pdf to sgml
|
|
||||||
import tempfile
|
|
||||||
from pdf2txt import CMapDB, PDFResourceManager, pdf2txt
|
|
||||||
print >>stderr, 'reading %r...' % fname
|
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
|
||||||
rsrc = PDFResourceManager(debug=debug)
|
|
||||||
fp = tempfile.TemporaryFile()
|
|
||||||
pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug)
|
|
||||||
fp.seek(0)
|
|
||||||
else:
|
|
||||||
fp = file(fname, 'rb')
|
|
||||||
doc = Document()
|
|
||||||
parser = PDFSGMLParser(doc)
|
|
||||||
parser.feedfile(fp, encoding)
|
|
||||||
parser.close()
|
|
||||||
fp.close()
|
|
||||||
#
|
|
||||||
pygame.init()
|
|
||||||
pygame.display.set_mode((640,480))
|
|
||||||
PDFViewer(pygame.display.get_surface(), doc).run()
|
|
||||||
return
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
|
Loading…
Reference in New Issue