layout analysis improved.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@120 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-21 07:55:19 +00:00
parent 0113486b76
commit 8a5bec5065
14 changed files with 263 additions and 525 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun Jul 12 00:36:44 JST 2009
Last Modified: Tue Jul 21 16:24:26 JST 2009
<!-- hhmts end -->
</div>
@ -191,6 +191,7 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
</ul>
<p>
<dt> <code>-D <em>direction</em></code>
<dt> <code>-M <em>char_margin</em></code>
<dt> <code>-L <em>line_margin</em></code>
<dt> <code>-W <em>word_margin</em></code>
@ -318,6 +319,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2009/07/21: Improvement in layout analysis.
<li> 2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
<li> 2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
<li> 2009/03/30: Text output mode added.

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
__version__ = '20090711'
__version__ = '20090721'
if __name__ == '__main__': print __version__

View File

@ -199,7 +199,9 @@ class CMapDB(object):
cmapdb = {}
@classmethod
def initialize(klass, dirname, cdbdirname=None):
def initialize(klass, dirname=None, cdbdirname=None):
if not dirname:
dirname = find_cmap_path()
klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname
return

View File

@ -2,7 +2,7 @@
import sys
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
@ -10,10 +10,9 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
##
class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
def __init__(self, rsrc, pageno=1, laparams=None):
PDFDevice.__init__(self, rsrc)
self.char_margin = char_margin
self.line_margin = line_margin
self.laparams = laparams
self.undefined_char = '?'
self.pageno = pageno
self.stack = []
@ -27,9 +26,9 @@ class PDFPageAggregator(PDFDevice):
assert not self.stack
assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate()
if self.laparams:
self.cur_item.analyze_layout(self.laparams)
self.pageno += 1
if self.char_margin != None and self.line_margin != None:
self.cur_item.group_text(self.char_margin, self.line_margin)
return self.cur_item
def begin_figure(self, name, bbox, matrix):
@ -79,8 +78,8 @@ class PDFPageAggregator(PDFDevice):
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
item = LTText(textmatrix, textstate.font, textstate.fontsize,
textstate.charspace, textstate.scaling, chars)
item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
@ -116,13 +115,10 @@ class PDFPageAggregator(PDFDevice):
##
class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
char_margin=char_margin, line_margin=line_margin)
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
self.outfp = outfp
self.codec = codec
self.word_margin = word_margin
return
def write(self, text):
@ -202,17 +198,6 @@ class SGMLConverter(PDFConverter):
for child in item:
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, LTText):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTAnon):
if item.text == ' ':
self.outfp.write('<space>\n')
elif item.text == '\n':
self.outfp.write('<newline>\n')
elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
@ -222,11 +207,26 @@ class SGMLConverter(PDFConverter):
for child in item:
render(child)
self.outfp.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
for child in item:
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item.get_lines(self.word_margin):
for child in item:
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTTextItem):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
(enc(item.font.fontname), item.is_vertical(),
item.get_bbox(), item.fontsize))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n', item.text)
else:
assert 0, item
return
page = PDFConverter.end_page(self, page)
render(page)
@ -237,11 +237,9 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None,
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.pagepad = pagepad
self.scale = scale
@ -268,7 +266,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.id, page.id))
for child in item:
render(child)
elif isinstance(item, LTText):
elif isinstance(item, LTTextItem):
if item.vertical:
wmode = 'tb-rl'
else:
@ -281,13 +279,14 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n')
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon):
pass
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
for child in item:
render(child)
elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item.get_lines(self.word_margin):
for child in item:
render(child)
return
page = PDFConverter.end_page(self, page)
@ -307,11 +306,9 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None,
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
return
@ -322,14 +319,12 @@ class TextConverter(PDFConverter):
def end_page(self, page):
def render(item):
if isinstance(item, LTText):
self.write(item.text+'\n')
elif isinstance(item, LTTextBox):
for obj in item.get_lines(self.word_margin):
self.write(obj.text)
self.write('\n')
self.write(item.text)
elif isinstance(item, LayoutContainer):
for child in item:
render(child)
if isinstance(item, LTTextBox):
self.write('\n')
page = PDFConverter.end_page(self, page)
if self.showpageno:
self.write('Page %d\n' % page.id)

View File

@ -4,50 +4,24 @@ from pdfminer.utils import apply_matrix_norm, bsearch
INF = sys.maxint
## reorder_hv, reorder_vh
## chop_hv, chop_vh
## LAParams
##
## Reorders objects according to its writing direction.
##
def reorder_vh(objs, hdir):
if 0 < hdir:
hkey = (lambda obj: obj.x0)
vkey = (lambda obj: -obj.y1)
else:
hkey = (lambda obj: -obj.x1)
vkey = (lambda obj: -obj.y1)
r = []
line = []
for obj in sorted(objs, key=vkey):
if line:
v = line[-1].voverlap(obj) * 2
if v < obj.height or v < line[-1].height:
line.sort(key=hkey)
r.append(line)
line = []
line.append(obj)
line.sort(key=hkey)
r.append(line)
return r
class LAParams(object):
def __init__(self,
direction=None,
char_margin=1.0,
line_margin=0.5,
word_margin=0.1):
self.direction = direction
self.char_margin = char_margin
self.line_margin = line_margin
self.word_margin = word_margin
return
def reorder_hv(objs, hdir):
if 0 < hdir:
hkey = (lambda obj: obj.x0)
vkey = (lambda obj: -obj.y1)
else:
hkey = (lambda obj: -obj.x1)
vkey = (lambda obj: -obj.y1)
r = []
line = []
for obj in sorted(objs, key=hkey):
if line and not line[-1].hoverlap(obj):
line.sort(key=vkey)
r.append(line)
line = []
line.append(obj)
line.sort(key=vkey)
r.append(line)
return r
def __repr__(self):
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
(self.direction, self.char_margin, self.line_margin, self.word_margin))
## Plane
@ -91,12 +65,6 @@ class Plane(object):
## ClusterSet
##
## Maintains a set of LTTextBox objects.
## It incrementally constructs LTTextBox objects
## and group them when necessary. It gives
## a sequence of LTTextBox objects that represent
## the text stream of that page.
##
class ClusterSet(object):
def __init__(self, klass):
@ -123,16 +91,18 @@ class ClusterSet(object):
group.fixate()
return list(r)
def group_objs(objs, hratio, vratio, klass):
plane = Plane(objs)
cset = ClusterSet(klass)
for obj in objs:
margin = obj.get_margin()
hmargin = hratio * margin
vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
cset.add(neighbors)
return cset.finish()
@classmethod
def build(klass, objs, hratio, vratio, objtype):
    """Group objs into clusters created through objtype.

    For every object, its bounding box is widened by hratio and
    heightened by vratio times the object's own get_margin(); the
    objects that plane.find() returns for that enlarged box are
    passed to the ClusterSet as one neighborhood.

    Returns the result of ClusterSet.finish().
    """
    plane = Plane(objs)
    clusters = ClusterSet(objtype)
    for obj in objs:
        unit = obj.get_margin()
        dx = hratio * unit
        dy = vratio * unit
        search_box = (obj.x0-dx, obj.y0-dy, obj.x1+dx, obj.y1+dy)
        found = plane.find(search_box)
        # The enlarged box always contains obj's own box, so the
        # lookup is expected to report obj itself.
        assert obj in found, obj
        clusters.add(found)
    return clusters.finish()
## LayoutItem
@ -140,11 +110,12 @@ def group_objs(objs, hratio, vratio, klass):
class LayoutItem(object):
def __init__(self, bbox):
#assert x0 <= x1 and y0 <= y1
self.set_bbox(bbox)
return
def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0)
if y1 < y0: (y0,y1) = (y1,y0)
self.x0 = x0
self.y0 = y0
self.x1 = x1
@ -202,6 +173,9 @@ class LayoutContainer(LayoutItem):
def __iter__(self):
return iter(self.objs)
def __len__(self):
return len(self.objs)
def add(self, obj):
self.objs.add(obj)
@ -212,7 +186,7 @@ class LayoutContainer(LayoutItem):
return
# fixate(): determines its boundary and writing direction.
def fixate(self, direction=None):
def fixate(self):
if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
@ -228,14 +202,7 @@ class LayoutContainer(LayoutItem):
return self.weight
def get_direction(self):
if not self.objs: return None
d = {}
for obj in self.objs:
k = obj.get_direction()
if k not in d: d[k] = 0
d[k] += 1
(direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
return direction
return None
## LTLine
@ -259,21 +226,37 @@ class LTRect(LayoutItem):
return
## LTAnon
## LTText
##
class LTAnon(object):
class LTText(object):
    """A bare piece of text that carries no geometry of its own."""

    def __init__(self, text):
        self.text = text

    def __repr__(self):
        return '<text %r>' % self.text

    def get_weight(self):
        """Return the number of characters held."""
        return len(self.text)

    def is_upright(self):
        """Plain text has no transformation, so it always counts as upright."""
        return True
## LTAnon
##
class LTAnon(LTText):
    """Text inserted by the layout analyzer (e.g. a space or a newline)."""

    def get_weight(self):
        """Inserted text never contributes to a container's weight."""
        return 0
## LTText
## LTTextItem
##
class LTText(LayoutItem):
class LTTextItem(LayoutItem, LTText):
debug = 1
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
@ -307,21 +290,25 @@ class LTText(LayoutItem):
return
def __repr__(self):
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
self.font, self.fontsize, self.get_bbox(),
'(%.1f, %.1f)' % self.adv,
self.text))
if self.debug:
return ('<text matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
self.font, self.fontsize, self.get_bbox(),
'(%.1f, %.1f)' % self.adv,
self.text))
else:
return '<text %r>' % self.text
def get_margin(self):
return abs(self.fontsize)
def get_weight(self):
return len(self.text)
def is_vertical(self):
return self.vertical
def is_upright(self):
(a,b,c,d,e,f) = self.matrix
return 0 < a*d and b*c <= 0
## LTFigure
##
@ -336,6 +323,54 @@ class LTFigure(LayoutContainer):
return ('<figure id=%r bbox=%s matrix=%r>' % (self.id, self.get_bbox(), self.matrix))
## LTTextLine
##
class LTTextLine(LayoutContainer):
    """A single line of text items sharing one writing direction.

    direction is 'V' for vertical writing; any other value is handled
    as horizontal.  word_margin is the gap, as a multiple of an item's
    get_margin(), beyond which a space is inserted between two items.
    """

    def __init__(self, id, objs, direction, word_margin):
        LayoutContainer.__init__(self, id, (0,0,0,0), objs)
        self.direction = direction
        self.word_margin = word_margin

    def __repr__(self):
        return ('<line %s(%s)>' % (self.get_bbox(), self.direction))

    def get_margin(self):
        """Return the shorter side of the line's bounding box."""
        return min(self.width, self.height)

    def get_direction(self):
        return self.direction

    def get_text(self):
        """Concatenate the text of every LTText child, in stored order."""
        pieces = [ obj.text for obj in self.objs if isinstance(obj, LTText) ]
        return ''.join(pieces)

    def fixate(self):
        """Order the children along the line and insert anonymous spacing.

        Children are sorted top-to-bottom for 'V' and left-to-right
        otherwise.  An LTAnon(' ') is placed before a text item whose
        distance from the previous child exceeds its word margin, and
        an LTAnon('\\n') terminates the line.
        """
        LayoutContainer.fixate(self)
        vertical = (self.direction == 'V')
        if vertical:
            ordered = sorted(self.objs, key=lambda obj: -obj.y1)
            # Sentinel that can never trigger a gap before the first child.
            prev_edge = -INF
        else:
            ordered = sorted(self.objs, key=lambda obj: obj.x0)
            prev_edge = INF
        result = []
        for obj in ordered:
            if isinstance(obj, LTTextItem) and self.word_margin:
                margin = self.word_margin * obj.get_margin()
                if vertical:
                    has_gap = obj.y1+margin < prev_edge
                else:
                    has_gap = prev_edge < obj.x0-margin
                if has_gap:
                    result.append(LTAnon(' '))
            result.append(obj)
            if vertical:
                prev_edge = obj.y0
            else:
                prev_edge = obj.x1
        result.append(LTAnon('\n'))
        self.objs = result
## LTTextBox
##
## A set of text objects that are grouped within
@ -343,65 +378,55 @@ class LTFigure(LayoutContainer):
##
class LTTextBox(LayoutContainer):
def __init__(self, id, objs):
def __init__(self, id, objs, direction):
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
self.direction = None
self.direction = direction
return
def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
return ('<textbox %s(%s) %r...>' % (self.get_bbox(), self.direction, self.get_text()[:20]))
def fixate(self, direction='H'):
LayoutContainer.fixate(self, direction=direction)
if not direction:
if any( obj.is_vertical() for obj in self.objs ):
direction = 'V'
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1])
if h < v:
direction = 'V'
self.direction = direction
def get_text(self):
return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
def fixate(self):
LayoutContainer.fixate(self)
if self.direction == 'V':
self.lines = reorder_hv(self.objs, -1)
self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else:
self.lines = reorder_vh(self.objs, +1)
self.objs = []
for line in self.lines:
self.objs.extend(line)
self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
return
def get_direction(self):
return self.direction
def get_lines(self, word_margin):
if self.get_direction() == 'V':
for line in self.lines:
y0 = -INF
for obj in line:
if not isinstance(obj, LTText): continue
if word_margin:
margin = word_margin * obj.get_margin()
if obj.y1+margin < y0:
yield LTAnon(' ')
yield obj
y0 = obj.y0
yield LTAnon('\n')
def tsort(objs, f):
gi = dict( (obj,[]) for obj in objs )
go = dict( (obj,[]) for obj in objs )
for obj1 in objs:
for obj2 in objs:
if f(obj1, obj2): # obj1 -> obj2
go[obj1].append(obj2)
gi[obj2].append(obj1)
r = objs[:]
s = []
while r:
for obj in r:
if not go[obj] or gi[obj]: continue
for c in go[obj]:
gi[c].remove(obj)
del gi[obj]
del go[obj]
r.remove(obj)
s.append(obj)
break
else:
for line in self.lines:
x1 = INF
for obj in line:
if not isinstance(obj, LTText): continue
if word_margin:
margin = word_margin * obj.get_margin()
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield LTAnon('\n')
return
obj = r.pop()
del gi[obj]
del go[obj]
s.append(obj)
return s
## LTPage
@ -416,19 +441,39 @@ class LTPage(LayoutContainer):
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
def fixate(self, dirtection='H'):
return
def group_text(self, char_margin, line_margin):
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
if self.get_direction() == 'V':
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
lines = reorder_hv(objs, -1)
def analyze_layout(self, laparams):
textobjs = []
otherobjs = []
for obj in self.objs:
if isinstance(obj, LTText) and obj.is_upright():
textobjs.append(obj)
else:
otherobjs.append(obj)
if laparams.direction == 'V':
lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
(lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
boxes = ClusterSet.build(lines, laparams.line_margin, 0,
(lambda id,objs: LTTextBox(id, objs, 'V')))
def vorder(obj1, obj2):
if obj1.voverlap(obj2):
return obj2.x1 < obj1.x1
elif obj1.hoverlap(obj2):
return obj2.y1 < obj1.y1
else:
return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
boxes = tsort(boxes, vorder)
else:
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
lines = reorder_vh(objs, +1)
self.objs = []
for line in lines:
self.objs.extend(line)
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
boxes = ClusterSet.build(lines, 0, laparams.line_margin,
(lambda id,objs: LTTextBox(id, objs, 'H')))
def horder(obj1, obj2):
if obj1.hoverlap(obj2):
return obj2.y1 < obj1.y1
elif obj1.voverlap(obj2):
return obj1.x1 < obj2.x0
else:
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
boxes = tsort(boxes, horder)
self.objs = otherobjs + boxes
return

View File

@ -756,9 +756,8 @@ class PDFPageInterpreter(object):
##
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
doc = PDFDocument()
fp = file(fname, 'rb')
parser = PDFParser(doc, fp)
doc.initialize(password)
if not doc.is_extractable:
@ -768,5 +767,4 @@ def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break
fp.close()
return

View File

@ -47,6 +47,10 @@ def bsearch(objs, v0):
(v, obj) = objs[i]
if v0 == v:
(i0,i1) = (i,i+1)
while 0 < i0 and objs[i0-1][0] == v0:
i0 -= 1
while i1 < len(objs)-1 and objs[i1][0] == v0:
i1 += 1
break
elif v0 < v:
i1 = i

View File

@ -1,7 +1,7 @@
# GNUMakefile for test
PYTHON=python
PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
HTMLS= \
simple1.html \

View File

@ -14,7 +14,7 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''',
keywords='pdf parser, pdf converter, text mining',
keywords=['pdf parser', 'pdf converter', 'text mining'],
license='MIT/X',
author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu',

View File

View File

@ -19,7 +19,10 @@ import sys
# comment out at runtime.
import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback
import pdflib.pdf2txt
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.cmap import CMapDB
# quote HTML metacharacters
@ -35,6 +38,7 @@ def url(base, **kw):
r.append('%s=%s' % (k, v))
return base+'&'.join(r)
## convert
##
class FileSizeExceeded(ValueError): pass
@ -54,13 +58,16 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10,
infp.close()
# perform conversion and
# send the results over the network.
pdflib.pdf2txt.CMapDB.initialize('.', './CDBCMap')
rsrc = pdflib.pdf2txt.PDFResourceManager()
CMapDB.initialize()
rsrc = PDFResourceManager()
laparams = LAParams()
if html:
device = pdflib.pdf2txt.HTMLConverter(rsrc, outfp, codec=codec)
device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
else:
device = pdflib.pdf2txt.TextConverter(rsrc, outfp, codec=codec)
pdflib.pdf2txt.convert(rsrc, device, path, pagenos, maxpages=maxpages)
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
fp = file(path, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
fp.close()
return

View File

@ -5,17 +5,18 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_p
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB, find_cmap_path
from pdfminer.layout import LAParams
# main
def main(argv):
import getopt
def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-M char_margin] [-L line_margin] [-W word_margin] '
'[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@ -31,12 +32,10 @@ def main(argv):
outfile = None
outtype = None
codec = 'utf-8'
char_margin = 1.0
line_margin = 0.3
word_margin = 0.2
pageno = 1
scale = 1
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-C': cmapdir = v
@ -47,9 +46,10 @@ def main(argv):
elif k == '-c': codec = v
elif k == '-o': outfile = v
elif k == '-s': scale = float(v)
elif k == '-M': char_margin = float(v)
elif k == '-L': line_margin = float(v)
elif k == '-W': word_margin = float(v)
elif k == '-D': laparams.direction = v
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
#
CMapDB.debug = debug
PDFResourceManager.debug = debug
@ -74,20 +74,19 @@ def main(argv):
else:
outfp = sys.stdout
if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else:
return usage()
for fname in args:
process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
fp = file(fname, 'rb')
process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
fp.close()
device.close()
return

View File

@ -1,152 +0,0 @@
#!/usr/bin/env python
import sys, sgmllib
__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ]
def fixed(x):
    """Convert x to a fixed-point integer with three decimal digits.

    x is anything float() accepts; the scaled value is truncated
    toward zero by int().
    """
    scaled = float(x) * 1000
    return int(scaled)
def getbbox(s):
    """Parse a comma-separated 4-value bbox string into fixed-point ints.

    Raises ValueError when s does not split into exactly four fields.
    """
    x0, y0, x1, y1 = s.split(',')
    return tuple([ fixed(v) for v in (x0, y0, x1, y1) ])
## Document
##
class Document:
    """Ordered collection of pages read from an SGML dump."""

    def __init__(self):
        self.pages = []

    def __repr__(self):
        return '<Document: pages=%r>' % self.pages

    def get_pages(self):
        """Return the list of pages (the live list, not a copy)."""
        return self.pages

    def add_page(self, page):
        """Append page as the new last page."""
        self.pages.append(page)

    def add_text(self, text):
        """Attach text to the most recently added page.

        Raises IndexError when no page has been added yet.
        """
        current = self.pages[-1]
        current.add_text(text)
## Page
##
class Page:
    """One page: its id, bounding box, rotation and the texts on it."""

    def __init__(self, pageid, bbox, rotate):
        self.pageid = pageid
        self.bbox = bbox
        self.rotate = rotate
        self.texts = []

    def __repr__(self):
        return '<Page(%s): texts=%r>' % (self.pageid, self.texts)

    def get_texts(self):
        """Return the list of texts (the live list, not a copy)."""
        return self.texts

    def add_text(self, text):
        """Append text to this page."""
        self.texts.append(text)
## Text
##
class Text:
    """A run of text with its font, direction, bounding box and size."""

    def __init__(self, font, direction, bbox, size):
        self.font = font
        self.direction = direction
        self.bbox = bbox
        self.size = size
        # Character data is accumulated later through add_data().
        self.data = ''

    def __repr__(self):
        return '<Text: %r>' % (self.data)

    def add_data(self, data):
        """Append data to the text collected so far."""
        self.data = self.data + data
## PDFSGMLParser
##
class PDFSGMLParser(sgmllib.SGMLParser):
    """SGML parser that fills a Document from <page> and <text> elements.

    Each <page> start tag adds a Page to the document; each <text>
    element becomes a Text whose character data is collected between
    its start and end tags and which is added to the latest page.
    """

    def __init__(self, doc):
        sgmllib.SGMLParser.__init__(self)
        self.doc = doc
        # Text element currently being filled; None outside <text>...</text>.
        self.curtext = None
        return

    def start_document(self, attrs):
        return

    def end_document(self):
        return

    def start_page(self, attrs):
        """Create a Page from the id, bbox and rotate attributes."""
        attrs = dict(attrs)
        pageid = attrs['id']
        bbox = getbbox(attrs['bbox'])
        rotate = int(attrs['rotate'])
        self.doc.add_page(Page(pageid, bbox, rotate))
        return

    def end_page(self):
        return

    def start_text(self, attrs):
        """Open a new Text from the font, bbox and fontsize attributes."""
        attrs = dict(attrs)
        font = attrs['font']
        # The SGML writer emits the writing direction as a "vertical"
        # attribute; accept it when "direction" is absent instead of
        # failing with KeyError on every <text> element.
        direction = attrs.get('direction', attrs.get('vertical'))
        bbox = getbbox(attrs['bbox'])
        size = fixed(attrs['fontsize'])
        self.curtext = Text(font, direction, bbox, size)
        return

    def end_text(self):
        """Hand the finished Text over to the document."""
        assert self.curtext
        self.doc.add_text(self.curtext)
        self.curtext = None
        return

    def handle_data(self, data):
        # Character data outside a <text> element is dropped.
        if not self.curtext: return
        self.curtext.add_data(data)
        return

    def feedfile(self, fp, encoding='utf-8'):
        """Decode fp line by line (ignoring undecodable bytes) and parse it."""
        for line in fp:
            line = unicode(line, encoding, 'ignore')
            self.feed(line)
        return
# main
def main(argv):
    """Parse every SGML file named on the command line and print it.

    Options: -d raises the debug level, -c sets the input encoding
    (default utf-8).  Returns 0 on success and 100 after printing the
    usage message for a bad option.
    """
    import getopt
    def usage():
        print('usage: %s [-d] [-c encoding] [file ...]' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:')
    except getopt.GetoptError:
        return usage()
    # debug must exist before "-d" increments it; it was previously
    # unbound, so passing -d raised UnboundLocalError.
    debug = 0
    encoding = 'utf-8'
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
    for fname in args:
        doc = Document()
        parser = PDFSGMLParser(doc)
        # feedfile() iterates over lines of a file object; handing it the
        # file name would make it iterate over the characters of the path.
        fp = open(fname, 'rb')
        try:
            parser.feedfile(fp, encoding)
        finally:
            fp.close()
        parser.close()
        print(doc)
    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,162 +0,0 @@
#!/usr/bin/env python
import sys
from sgml import PDFSGMLParser, Document
stdout = sys.stdout
stderr = sys.stderr
try:
import pygame
from pygame.locals import *
except ImportError:
print >>stderr, 'you need pygame'
sys.exit(111)
def scale(x):
    """Shrink a document coordinate by the factor 0.002 to a whole number.

    The product is truncated toward zero by int().
    """
    shrunk = x * 0.002
    return int(shrunk)
## FontManager
##
class FontManager:
    """Class-level cache of pygame fonts keyed by (path, size)."""

    # (path, size) -> font object; shared by every caller.
    fonts = {}
    #default_font = '/Library/Fonts/Vera.ttf'
    default_font = '/usr/share/fonts/truetype/kochi/kochi-gothic.ttf'

    @classmethod
    def get_font(klass, path, size):
        """Return the font for path at int(size), loading it on first use.

        A false path (None or '') selects default_font.
        """
        path = path or klass.default_font
        size = int(size)
        key = (path, size)
        try:
            return klass.fonts[key]
        except KeyError:
            pass
        font = pygame.font.Font(path, size)
        klass.fonts[key] = font
        return font
## PDFViewer
##
# Interactive viewer: draws one page of a parsed document into an
# off-screen pygame surface and shows a scrollable window of it on the
# display surface.
class PDFViewer:
# Fill color of the page buffer and color used to render text.
BGCOLOR = (255,255,255)
FGCOLOR = (0,0,0)
# display: surface to draw on; doc: object providing get_pages().
# The first page is rendered immediately.
def __init__(self, display, doc):
self.display = display
self.buf = None
self.pages = doc.get_pages()
self.render_page(0)
return
# Renders page number pageno into a fresh buffer sized from the scaled
# page bbox, resets the scroll position to (0,0) and refreshes the display.
def render_page(self, pageno):
print >>stderr, 'rendering: page=%d...' % pageno
page = self.pages[pageno]
(x,y,w,h) = page.bbox
self.width = scale(w)
self.height = scale(h)
self.buf = pygame.Surface((self.width, self.height))
self.buf.fill(self.BGCOLOR)
for text in page.get_texts():
# Every text uses the default font (path None) at 70% of its scaled size.
font = FontManager.get_font(None, scale(text.size*0.7))
(x,y,w,h) = text.bbox
r = font.render(text.data, 1, self.FGCOLOR)
# The y coordinate is flipped against the buffer height.
# NOTE(review): presumably because document coordinates grow upward - confirm.
self.buf.blit(r, (scale(x), self.height-scale(y)))
self.pageno = pageno
self.pos = (0,0)
self.refresh()
return
# Copies the display-sized region of the buffer starting at self.pos
# onto the display and flips it.
def refresh(self):
size = self.display.get_size()
self.display.blit(self.buf, (0,0), (self.pos, size))
pygame.display.flip()
return
# Scroll increment applied to self.pos by run() for a movement key.
STEP = 8
# Event loop.  ESC/RETURN/q quit, SPACE shows the next page, b the
# previous one; any other pressed key is remembered until KEYUP and,
# if it is a movement key (h/j/k/l, arrows, keypad 4/2/8/6), scrolls.
# NOTE(review): xmax/ymax are computed once from the page rendered
# before the loop starts and are not recomputed after a page change.
def run(self):
loop = True
key = None
(w,h) = self.display.get_size()
xmax = self.width - w
ymax = self.height - h
while loop:
for e in pygame.event.get():
if e.type == VIDEOEXPOSE:
self.refresh()
elif e.type == KEYDOWN:
if e.key in (K_ESCAPE, K_RETURN, K_q):
loop = False
break
elif e.key == K_SPACE:
if self.pageno < len(self.pages)-1:
self.render_page(self.pageno+1)
elif e.key == K_b:
if 0 < self.pageno:
self.render_page(self.pageno-1)
else:
key = e.key
elif e.type == KEYUP:
key = None
if key:
(x,y) = self.pos
if key in (K_h, K_LEFT, K_KP4):
x = max(0, x-self.STEP)
elif key in (K_l, K_RIGHT, K_KP6):
x = min(xmax, x+self.STEP)
elif key in (K_k, K_UP, K_KP8):
y = max(0, y-self.STEP)
elif key in (K_j, K_DOWN, K_KP2):
y = min(ymax, y+self.STEP)
self.pos = (x,y)
self.refresh()
return
# main
def main(argv):
    """Open a .pdf or SGML file and show it in a 640x480 pygame window.

    Options: -d raises the debug level, -c sets the encoding, -P gives
    the PDF password.  A file name ending in '.pdf' is first converted
    into a temporary file, which is then parsed as SGML.  Returns 100
    after printing the usage message; otherwise returns None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-c encoding] file' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:P:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = 0
    encoding = 'utf-8'
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    password = ''
    for (flag, value) in opts:
        if flag == '-d':
            debug += 1
        elif flag == '-c':
            encoding = value
        elif flag == '-P':
            password = value
    #
    fname = args.pop(0)
    if not fname.endswith('.pdf'):
        fp = open(fname, 'rb')
    else:
        # convert .pdf to sgml
        import tempfile
        from pdf2txt import CMapDB, PDFResourceManager, pdf2txt
        stderr.write('reading %r...\n' % fname)
        CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
        rsrc = PDFResourceManager(debug=debug)
        fp = tempfile.TemporaryFile()
        pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug)
        # Rewind so the parser reads what was just written.
        fp.seek(0)
    doc = Document()
    parser = PDFSGMLParser(doc)
    parser.feedfile(fp, encoding)
    parser.close()
    fp.close()
    #
    pygame.init()
    pygame.display.set_mode((640,480))
    viewer = PDFViewer(pygame.display.get_surface(), doc)
    viewer.run()
    return

if __name__ == '__main__': sys.exit(main(sys.argv))