diff --git a/README.html b/README.html
index 9255959..08fb7f6 100644
--- a/README.html
+++ b/README.html
@@ -18,7 +18,7 @@ Python PDF parser and analyzer
-Last Modified: Sun Jul 12 00:36:44 JST 2009
+Last Modified: Tue Jul 21 16:24:26 JST 2009
@@ -191,6 +191,7 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF ").
+
-D direction
-M char_margin
-L line_margin
-W word_margin
@@ -318,6 +319,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+ 2009/07/21: Improvement in layout analysis.
2009/07/11: Improvement in layout analysis. Thanks to Lubos Pintes.
2009/05/17: Bugfixes, massive code restructuring, and simple graphic element support added. setup.py is supported.
2009/03/30: Text output mode added.
diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py
index 150bcd1..07e92a7 100644
--- a/pdfminer/__init__.py
+++ b/pdfminer/__init__.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
-__version__ = '20090711'
+__version__ = '20090721'
if __name__ == '__main__': print __version__
diff --git a/pdfminer/cmap.py b/pdfminer/cmap.py
index 444e90a..8f0731f 100644
--- a/pdfminer/cmap.py
+++ b/pdfminer/cmap.py
@@ -199,7 +199,9 @@ class CMapDB(object):
cmapdb = {}
@classmethod
- def initialize(klass, dirname, cdbdirname=None):
+ def initialize(klass, dirname=None, cdbdirname=None):
+ if not dirname:
+ dirname = find_cmap_path()
klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname
return
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 9cf86d5..b811001 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -2,7 +2,7 @@
import sys
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdffont import PDFUnicodeNotDefined
-from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox, LTAnon
+from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
@@ -10,10 +10,9 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
##
class PDFPageAggregator(PDFDevice):
- def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
+ def __init__(self, rsrc, pageno=1, laparams=None):
PDFDevice.__init__(self, rsrc)
- self.char_margin = char_margin
- self.line_margin = line_margin
+ self.laparams = laparams
self.undefined_char = '?'
self.pageno = pageno
self.stack = []
@@ -27,9 +26,9 @@ class PDFPageAggregator(PDFDevice):
assert not self.stack
assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate()
+ if self.laparams:
+ self.cur_item.analyze_layout(self.laparams)
self.pageno += 1
- if self.char_margin != None and self.line_margin != None:
- self.cur_item.group_text(self.char_margin, self.line_margin)
return self.cur_item
def begin_figure(self, name, bbox, matrix):
@@ -79,8 +78,8 @@ class PDFPageAggregator(PDFDevice):
def render_chars(self, textmatrix, textstate, chars):
if not chars: return (0, 0)
- item = LTText(textmatrix, textstate.font, textstate.fontsize,
- textstate.charspace, textstate.scaling, chars)
+ item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
+ textstate.charspace, textstate.scaling, chars)
self.cur_item.add(item)
return item.adv
@@ -116,13 +115,10 @@ class PDFPageAggregator(PDFDevice):
##
class PDFConverter(PDFPageAggregator):
- def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
- char_margin=None, line_margin=None, word_margin=None):
- PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
- char_margin=char_margin, line_margin=line_margin)
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
+ PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
self.outfp = outfp
self.codec = codec
- self.word_margin = word_margin
return
def write(self, text):
@@ -202,17 +198,6 @@ class SGMLConverter(PDFConverter):
for child in item:
render(child)
self.outfp.write('\n')
- elif isinstance(item, LTText):
- self.outfp.write('' %
- (enc(item.font.fontname), item.is_vertical(),
- item.get_bbox(), item.fontsize))
- self.write(item.text)
- self.outfp.write(' \n')
- elif isinstance(item, LTAnon):
- if item.text == ' ':
- self.outfp.write('\n')
- elif item.text == '\n':
- self.outfp.write('\n')
elif isinstance(item, LTLine):
self.outfp.write(' ' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
@@ -222,11 +207,26 @@ class SGMLConverter(PDFConverter):
for child in item:
render(child)
self.outfp.write('\n')
+ elif isinstance(item, LTTextLine):
+ self.outfp.write('\n' % (item.get_bbox()))
+ for child in item:
+ render(child)
+ self.outfp.write(' \n')
elif isinstance(item, LTTextBox):
self.outfp.write('\n' % (item.id, item.get_bbox()))
- for child in item.get_lines(self.word_margin):
+ for child in item:
render(child)
self.outfp.write(' \n')
+ elif isinstance(item, LTTextItem):
+ self.outfp.write('' %
+ (enc(item.font.fontname), item.is_vertical(),
+ item.get_bbox(), item.fontsize))
+ self.write(item.text)
+ self.outfp.write(' \n')
+ elif isinstance(item, LTText):
+ self.outfp.write('%s \n', item.text)
+ else:
+ assert 0, item
return
page = PDFConverter.end_page(self, page)
render(page)
@@ -237,11 +237,9 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
- def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
- char_margin=None, line_margin=None, word_margin=None,
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50):
- PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
- char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+ PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.pagepad = pagepad
self.scale = scale
@@ -268,7 +266,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('Page %s \n' % (page.id, page.id))
for child in item:
render(child)
- elif isinstance(item, LTText):
+ elif isinstance(item, LTTextItem):
if item.vertical:
wmode = 'tb-rl'
else:
@@ -281,13 +279,14 @@ class HTMLConverter(PDFConverter):
self.outfp.write('\n')
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
- elif isinstance(item, LTAnon):
- pass
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
+ elif isinstance(item, LTTextLine):
+ for child in item:
+ render(child)
elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
- for child in item.get_lines(self.word_margin):
+ for child in item:
render(child)
return
page = PDFConverter.end_page(self, page)
@@ -307,11 +306,9 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
- def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
- char_margin=None, line_margin=None, word_margin=None,
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False):
- PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
- char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+ PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
return
@@ -322,14 +319,12 @@ class TextConverter(PDFConverter):
def end_page(self, page):
def render(item):
if isinstance(item, LTText):
- self.write(item.text+'\n')
- elif isinstance(item, LTTextBox):
- for obj in item.get_lines(self.word_margin):
- self.write(obj.text)
- self.write('\n')
+ self.write(item.text)
elif isinstance(item, LayoutContainer):
for child in item:
render(child)
+ if isinstance(item, LTTextBox):
+ self.write('\n')
page = PDFConverter.end_page(self, page)
if self.showpageno:
self.write('Page %d\n' % page.id)
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index fc09278..e0de3af 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -4,50 +4,24 @@ from pdfminer.utils import apply_matrix_norm, bsearch
INF = sys.maxint
-## reorder_hv, reorder_vh
-## chop_hv, chop_vh
+## LAParams
##
-## Reorders objects according to its writing direction.
-##
-def reorder_vh(objs, hdir):
- if 0 < hdir:
- hkey = (lambda obj: obj.x0)
- vkey = (lambda obj: -obj.y1)
- else:
- hkey = (lambda obj: -obj.x1)
- vkey = (lambda obj: -obj.y1)
- r = []
- line = []
- for obj in sorted(objs, key=vkey):
- if line:
- v = line[-1].voverlap(obj) * 2
- if v < obj.height or v < line[-1].height:
- line.sort(key=hkey)
- r.append(line)
- line = []
- line.append(obj)
- line.sort(key=hkey)
- r.append(line)
- return r
+class LAParams(object):
+
+ def __init__(self,
+ direction=None,
+ char_margin=1.0,
+ line_margin=0.5,
+ word_margin=0.1):
+ self.direction = direction
+ self.char_margin = char_margin
+ self.line_margin = line_margin
+ self.word_margin = word_margin
+ return
-def reorder_hv(objs, hdir):
- if 0 < hdir:
- hkey = (lambda obj: obj.x0)
- vkey = (lambda obj: -obj.y1)
- else:
- hkey = (lambda obj: -obj.x1)
- vkey = (lambda obj: -obj.y1)
- r = []
- line = []
- for obj in sorted(objs, key=hkey):
- if line and not line[-1].hoverlap(obj):
- line.sort(key=vkey)
- r.append(line)
- line = []
- line.append(obj)
- line.sort(key=vkey)
- r.append(line)
- return r
+ def __repr__(self):
+ return ('' %
+ (self.direction, self.char_margin, self.line_margin, self.word_margin))
## Plane
@@ -91,12 +65,6 @@ class Plane(object):
## ClusterSet
##
-## Maintains a set of LTTextBox objects.
-## It incrementally constructs LTTextBox objects
-## and group them when necessary. It gives
-## a sequence of LTTextBox objects that represent
-## the text stream of that page.
-##
class ClusterSet(object):
def __init__(self, klass):
@@ -123,16 +91,18 @@ class ClusterSet(object):
group.fixate()
return list(r)
-def group_objs(objs, hratio, vratio, klass):
- plane = Plane(objs)
- cset = ClusterSet(klass)
- for obj in objs:
- margin = obj.get_margin()
- hmargin = hratio * margin
- vmargin = vratio * margin
- neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
- cset.add(neighbors)
- return cset.finish()
+ @classmethod
+ def build(klass, objs, hratio, vratio, objtype):
+ plane = Plane(objs)
+ cset = ClusterSet(objtype)
+ for obj in objs:
+ margin = obj.get_margin()
+ hmargin = hratio * margin
+ vmargin = vratio * margin
+ neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
+ assert obj in neighbors, obj
+ cset.add(neighbors)
+ return cset.finish()
## LayoutItem
@@ -140,11 +110,12 @@ def group_objs(objs, hratio, vratio, klass):
class LayoutItem(object):
def __init__(self, bbox):
- #assert x0 <= x1 and y0 <= y1
self.set_bbox(bbox)
return
def set_bbox(self, (x0,y0,x1,y1)):
+ if x1 < x0: (x0,x1) = (x1,x0)
+ if y1 < y0: (y0,y1) = (y1,y0)
self.x0 = x0
self.y0 = y0
self.x1 = x1
@@ -202,6 +173,9 @@ class LayoutContainer(LayoutItem):
def __iter__(self):
return iter(self.objs)
+
+ def __len__(self):
+ return len(self.objs)
def add(self, obj):
self.objs.add(obj)
@@ -212,7 +186,7 @@ class LayoutContainer(LayoutItem):
return
# fixate(): determines its boundery and writing direction.
- def fixate(self, direction=None):
+ def fixate(self):
if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
@@ -228,14 +202,7 @@ class LayoutContainer(LayoutItem):
return self.weight
def get_direction(self):
- if not self.objs: return None
- d = {}
- for obj in self.objs:
- k = obj.get_direction()
- if k not in d: d[k] = 0
- d[k] += 1
- (direction,_) = sorted(d.iteritems(), key=lambda (k,v):v)[0]
- return direction
+ return None
## LTLine
@@ -259,21 +226,37 @@ class LTRect(LayoutItem):
return
-## LTAnon
+## LTText
##
-class LTAnon(object):
+class LTText(object):
def __init__(self, text):
self.text = text
return
+ def __repr__(self):
+ return '' % self.text
+
+ def get_weight(self):
+ return len(self.text)
+
+ def is_upright(self):
+ return True
+
+
+## LTAnon
+##
+class LTAnon(LTText):
+
def get_weight(self):
return 0
-## LTText
+## LTTextItem
##
-class LTText(LayoutItem):
+class LTTextItem(LayoutItem, LTText):
+
+ debug = 1
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
assert chars
@@ -307,21 +290,25 @@ class LTText(LayoutItem):
return
def __repr__(self):
- return ('' %
- ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
- self.font, self.fontsize, self.get_bbox(),
- '(%.1f, %.1f)' % self.adv,
- self.text))
+ if self.debug:
+ return ('' %
+ ('[%.1f, %.1f, %.1f, %.1f, (%.1f, %.1f)]' % self.matrix,
+ self.font, self.fontsize, self.get_bbox(),
+ '(%.1f, %.1f)' % self.adv,
+ self.text))
+ else:
+ return '' % self.text
def get_margin(self):
return abs(self.fontsize)
- def get_weight(self):
- return len(self.text)
-
def is_vertical(self):
return self.vertical
+ def is_upright(self):
+ (a,b,c,d,e,f) = self.matrix
+ return 0 < a*d and b*c <= 0
+
## LTFigure
##
@@ -336,6 +323,54 @@ class LTFigure(LayoutContainer):
return ('' % (self.id, self.get_bbox(), self.matrix))
+## LTTextLine
+##
+class LTTextLine(LayoutContainer):
+
+ def __init__(self, id, objs, direction, word_margin):
+ LayoutContainer.__init__(self, id, (0,0,0,0), objs)
+ self.direction = direction
+ self.word_margin = word_margin
+ return
+
+ def __repr__(self):
+ return ('' % (self.get_bbox(), self.direction))
+
+ def get_margin(self):
+ return min(self.width, self.height)
+
+ def get_direction(self):
+ return self.direction
+
+ def get_text(self):
+ return ''.join( obj.text for obj in self.objs if isinstance(obj, LTText) )
+
+ def fixate(self):
+ LayoutContainer.fixate(self)
+ objs = []
+ if self.direction == 'V':
+ y0 = -INF
+ for obj in sorted(self.objs, key=lambda obj: -obj.y1):
+ if isinstance(obj, LTTextItem) and self.word_margin:
+ margin = self.word_margin * obj.get_margin()
+ if obj.y1+margin < y0:
+ objs.append(LTAnon(' '))
+ objs.append(obj)
+ y0 = obj.y0
+ else:
+ x1 = INF
+ for obj in sorted(self.objs, key=lambda obj: obj.x0):
+ if isinstance(obj, LTTextItem) and self.word_margin:
+ margin = self.word_margin * obj.get_margin()
+ if x1 < obj.x0-margin:
+ objs.append(LTAnon(' '))
+ objs.append(obj)
+ x1 = obj.x1
+ objs.append(LTAnon('\n'))
+ self.objs = objs
+ return
+
+
## LTTextBox
##
## A set of text objects that are grouped within
@@ -343,65 +378,55 @@ class LTFigure(LayoutContainer):
##
class LTTextBox(LayoutContainer):
- def __init__(self, id, objs):
+ def __init__(self, id, objs, direction):
LayoutContainer.__init__(self, id, (0,0,0,0), objs)
- self.direction = None
+ self.direction = direction
return
def __repr__(self):
- return ('' % (self.get_bbox(), self.direction))
+ return ('' % (self.get_bbox(), self.direction, self.get_text()[:20]))
- def fixate(self, direction='H'):
- LayoutContainer.fixate(self, direction=direction)
- if not direction:
- if any( obj.is_vertical() for obj in self.objs ):
- direction = 'V'
- if 2 <= len(self.objs):
- objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
- if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
- h = objs[0].voverlap(objs[1])
- v = objs[0].hoverlap(objs[1])
- if h < v:
- direction = 'V'
- self.direction = direction
+ def get_text(self):
+ return ''.join( obj.get_text() for obj in self.objs if isinstance(obj, LTTextLine) )
+
+ def fixate(self):
+ LayoutContainer.fixate(self)
if self.direction == 'V':
- self.lines = reorder_hv(self.objs, -1)
+ self.objs = sorted(self.objs, key=lambda obj: -obj.x1)
else:
- self.lines = reorder_vh(self.objs, +1)
- self.objs = []
- for line in self.lines:
- self.objs.extend(line)
+ self.objs = sorted(self.objs, key=lambda obj: -obj.y1)
return
def get_direction(self):
return self.direction
- def get_lines(self, word_margin):
- if self.get_direction() == 'V':
- for line in self.lines:
- y0 = -INF
- for obj in line:
- if not isinstance(obj, LTText): continue
- if word_margin:
- margin = word_margin * obj.get_margin()
- if obj.y1+margin < y0:
- yield LTAnon(' ')
- yield obj
- y0 = obj.y0
- yield LTAnon('\n')
+
+def tsort(objs, f):
+ gi = dict( (obj,[]) for obj in objs )
+ go = dict( (obj,[]) for obj in objs )
+ for obj1 in objs:
+ for obj2 in objs:
+ if f(obj1, obj2): # obj1 -> obj2
+ go[obj1].append(obj2)
+ gi[obj2].append(obj1)
+ r = objs[:]
+ s = []
+ while r:
+ for obj in r:
+ if not go[obj] or gi[obj]: continue
+ for c in go[obj]:
+ gi[c].remove(obj)
+ del gi[obj]
+ del go[obj]
+ r.remove(obj)
+ s.append(obj)
+ break
else:
- for line in self.lines:
- x1 = INF
- for obj in line:
- if not isinstance(obj, LTText): continue
- if word_margin:
- margin = word_margin * obj.get_margin()
- if x1 < obj.x0-margin:
- yield LTAnon(' ')
- yield obj
- x1 = obj.x1
- yield LTAnon('\n')
- return
+ obj = r.pop()
+ del gi[obj]
+ del go[obj]
+ s.append(obj)
+ return s
## LTPage
@@ -416,19 +441,39 @@ class LTPage(LayoutContainer):
def __repr__(self):
return ('' % (self.id, self.get_bbox(), self.rotate))
- def fixate(self, dirtection='H'):
- return
-
- def group_text(self, char_margin, line_margin):
- textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
- objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
- if self.get_direction() == 'V':
- objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
- lines = reorder_hv(objs, -1)
+ def analyze_layout(self, laparams):
+ textobjs = []
+ otherobjs = []
+ for obj in self.objs:
+ if isinstance(obj, LTText) and obj.is_upright():
+ textobjs.append(obj)
+ else:
+ otherobjs.append(obj)
+ if laparams.direction == 'V':
+ lines = ClusterSet.build(textobjs, 0, laparams.char_margin,
+ (lambda id,objs: LTTextLine(id, objs, 'V', laparams.word_margin)))
+ boxes = ClusterSet.build(lines, laparams.line_margin, 0,
+ (lambda id,objs: LTTextBox(id, objs, 'V')))
+ def vorder(obj1, obj2):
+ if obj1.voverlap(obj2):
+ return obj2.x1 < obj1.x1
+ elif obj1.hoverlap(obj2):
+ return obj2.y1 < obj1.y1
+ else:
+ return obj2.x1 < obj1.x1 and obj2.y1 < obj1.y1
+ boxes = tsort(boxes, vorder)
else:
- objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
- lines = reorder_vh(objs, +1)
- self.objs = []
- for line in lines:
- self.objs.extend(line)
+ lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
+ (lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)))
+ boxes = ClusterSet.build(lines, 0, laparams.line_margin,
+ (lambda id,objs: LTTextBox(id, objs, 'H')))
+ def horder(obj1, obj2):
+ if obj1.hoverlap(obj2):
+ return obj2.y1 < obj1.y1
+ elif obj1.voverlap(obj2):
+ return obj1.x1 < obj2.x0
+ else:
+ return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
+ boxes = tsort(boxes, horder)
+ self.objs = otherobjs + boxes
return
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index 8f3c143..7ab9351 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -756,9 +756,8 @@ class PDFPageInterpreter(object):
##
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
-def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
+def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
doc = PDFDocument()
- fp = file(fname, 'rb')
parser = PDFParser(doc, fp)
doc.initialize(password)
if not doc.is_extractable:
@@ -768,5 +767,4 @@ def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break
- fp.close()
return
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 368b146..75e65b6 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -47,6 +47,10 @@ def bsearch(objs, v0):
(v, obj) = objs[i]
if v0 == v:
(i0,i1) = (i,i+1)
+ while 0 < i0 and objs[i0-1][0] == v0:
+ i0 -= 1
+ while i1 < len(objs)-1 and objs[i1][0] == v0:
+ i1 += 1
break
elif v0 < v:
i1 = i
diff --git a/samples/Makefile b/samples/Makefile
index bfc0eb0..a358227 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,7 +1,7 @@
# GNUMakefile for test
PYTHON=python
-PDF2TXT=$(PYTHON) ../tools/pdf2txt.py
+PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py
HTMLS= \
simple1.html \
diff --git a/setup.py b/setup.py
index d51caa7..c95ac38 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''',
- keywords='pdf parser, pdf converter, text mining',
+ keywords=['pdf parser', 'pdf converter', 'text mining'],
license='MIT/X',
author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu',
diff --git a/tools/__init__.py b/tools/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tools/pdf2html.cgi b/tools/pdf2html.cgi
index 29e1b01..15f9ecc 100755
--- a/tools/pdf2html.cgi
+++ b/tools/pdf2html.cgi
@@ -19,7 +19,10 @@ import sys
# comment out at runtime.
import cgitb; cgitb.enable()
import os, os.path, re, cgi, time, random, codecs, logging, traceback
-import pdflib.pdf2txt
+from pdfminer.pdfinterp import PDFResourceManager, process_pdf
+from pdfminer.converter import HTMLConverter, TextConverter
+from pdfminer.layout import LAParams
+from pdfminer.cmap import CMapDB
# quote HTML metacharacters
@@ -35,6 +38,7 @@ def url(base, **kw):
r.append('%s=%s' % (k, v))
return base+'&'.join(r)
+
## convert
##
class FileSizeExceeded(ValueError): pass
@@ -54,13 +58,16 @@ def convert(outfp, infp, path, codec='utf-8', maxpages=10,
infp.close()
# perform conversion and
# send the results over the network.
- pdflib.pdf2txt.CMapDB.initialize('.', './CDBCMap')
- rsrc = pdflib.pdf2txt.PDFResourceManager()
+ CMapDB.initialize()
+ rsrc = PDFResourceManager()
+ laparams = LAParams()
if html:
- device = pdflib.pdf2txt.HTMLConverter(rsrc, outfp, codec=codec)
+ device = HTMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
else:
- device = pdflib.pdf2txt.TextConverter(rsrc, outfp, codec=codec)
- pdflib.pdf2txt.convert(rsrc, device, path, pagenos, maxpages=maxpages)
+ device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
+ fp = file(path, 'rb')
+ process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages)
+ fp.close()
return
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 6ad95e6..006e8ce 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -5,17 +5,18 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_p
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB, find_cmap_path
+from pdfminer.layout import LAParams
# main
def main(argv):
import getopt
def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
- '[-M char_margin] [-L line_margin] [-W word_margin] '
+ '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100
try:
- (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
+ (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@@ -31,12 +32,10 @@ def main(argv):
outfile = None
outtype = None
codec = 'utf-8'
- char_margin = 1.0
- line_margin = 0.3
- word_margin = 0.2
pageno = 1
scale = 1
showpageno = True
+ laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-C': cmapdir = v
@@ -47,9 +46,10 @@ def main(argv):
elif k == '-c': codec = v
elif k == '-o': outfile = v
elif k == '-s': scale = float(v)
- elif k == '-M': char_margin = float(v)
- elif k == '-L': line_margin = float(v)
- elif k == '-W': word_margin = float(v)
+ elif k == '-D': laparams.direction = v
+ elif k == '-M': laparams.char_margin = float(v)
+ elif k == '-L': laparams.line_margin = float(v)
+ elif k == '-W': laparams.word_margin = float(v)
#
CMapDB.debug = debug
PDFResourceManager.debug = debug
@@ -74,20 +74,19 @@ def main(argv):
else:
outfp = sys.stdout
if outtype == 'text':
- device = TextConverter(rsrc, outfp, codec=codec,
- char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+ device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'sgml':
- device = SGMLConverter(rsrc, outfp, codec=codec,
- char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+ device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'html':
- device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
- char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+ device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else:
return usage()
for fname in args:
- process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
+ fp = file(fname, 'rb')
+ process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
+ fp.close()
device.close()
return
diff --git a/tools/sgml.py b/tools/sgml.py
deleted file mode 100755
index b79a5ff..0000000
--- a/tools/sgml.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/usr/bin/env python
-import sys, sgmllib
-__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ]
-
-def fixed(x):
- return int(float(x)*1000)
-def getbbox(s):
- (a,b,c,d) = s.split(',')
- return (fixed(a),fixed(b),fixed(c),fixed(d))
-
-
-## Document
-##
-class Document:
-
- def __init__(self):
- self.pages = []
- return
-
- def __repr__(self):
- return '' % self.pages
-
- def get_pages(self):
- return self.pages
-
- def add_page(self, page):
- self.pages.append(page)
- return
-
- def add_text(self, text):
- self.pages[-1].add_text(text)
- return
-
-
-## Page
-##
-class Page:
-
- def __init__(self, pageid, bbox, rotate):
- self.pageid = pageid
- self.bbox = bbox
- self.rotate = rotate
- self.texts = []
- return
-
- def __repr__(self):
- return '' % (self.pageid, self.texts)
-
- def get_texts(self):
- return self.texts
-
- def add_text(self, text):
- self.texts.append(text)
- return
-
-
-## Text
-##
-class Text:
-
- def __init__(self, font, direction, bbox, size):
- self.font = font
- self.direction = direction
- self.bbox = bbox
- self.size = size
- self.data = ''
- return
-
- def __repr__(self):
- return '' % (self.data)
-
- def add_data(self, data):
- self.data += data
- return
-
-
-## PDFSGMLParser
-##
-class PDFSGMLParser(sgmllib.SGMLParser):
-
- def __init__(self, doc):
- sgmllib.SGMLParser.__init__(self)
- self.doc = doc
- self.curtext = None
- return
-
- def start_document(self, attrs):
- return
- def end_document(self):
- return
-
- def start_page(self, attrs):
- attrs = dict(attrs)
- pageid = attrs['id']
- bbox = getbbox(attrs['bbox'])
- rotate = int(attrs['rotate'])
- page = Page(pageid, bbox, rotate)
- self.doc.add_page(page)
- return
- def end_page(self):
- return
-
- def start_text(self, attrs):
- attrs = dict(attrs)
- font = attrs['font']
- direction = attrs['direction']
- bbox = getbbox(attrs['bbox'])
- size = fixed(attrs['fontsize'])
- text = Text(font, direction, bbox, size)
- self.curtext = text
- return
- def end_text(self):
- assert self.curtext
- self.doc.add_text(self.curtext)
- self.curtext = None
- return
-
- def handle_data(self, data):
- if not self.curtext: return
- self.curtext.add_data(data)
- return
-
- def feedfile(self, fp, encoding='utf-8'):
- for line in fp:
- line = unicode(line, encoding, 'ignore')
- self.feed(line)
- return
-
-
-# main
-def main(argv):
- import getopt
- def usage():
- print 'usage: %s [-d] [-c encoding] [file ...]' % argv[0]
- return 100
- try:
- (opts, args) = getopt.getopt(argv[1:], 'dc:')
- except getopt.GetoptError:
- return usage()
- encoding = 'utf-8'
- for (k, v) in opts:
- if k == '-d': debug += 1
- elif k == '-c': encoding = v
- for fname in args:
- doc = Document()
- parser = PDFSGMLParser(doc)
- parser.feedfile(fname, encoding)
- parser.close()
- print doc
- return 0
-
-if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/tools/viewpdf.py b/tools/viewpdf.py
deleted file mode 100755
index ec25d12..0000000
--- a/tools/viewpdf.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/usr/bin/env python
-import sys
-from sgml import PDFSGMLParser, Document
-stdout = sys.stdout
-stderr = sys.stderr
-try:
- import pygame
- from pygame.locals import *
-except ImportError:
- print >>stderr, 'you need pygame'
- sys.exit(111)
-
-
-def scale(x):
- return int(x*0.002)
-
-
-## FontManager
-##
-class FontManager:
-
- fonts = {}
- #default_font = '/Library/Fonts/Vera.ttf'
- default_font = '/usr/share/fonts/truetype/kochi/kochi-gothic.ttf'
-
- @classmethod
- def get_font(klass, path, size):
- if not path:
- path = klass.default_font
- size = int(size)
- k = (path,size)
- if k not in klass.fonts:
- font = pygame.font.Font(path, size)
- klass.fonts[k] = font
- else:
- font = klass.fonts[k]
- return font
-
-
-## PDFViewer
-##
-class PDFViewer:
-
- BGCOLOR = (255,255,255)
- FGCOLOR = (0,0,0)
-
- def __init__(self, display, doc):
- self.display = display
- self.buf = None
- self.pages = doc.get_pages()
- self.render_page(0)
- return
-
- def render_page(self, pageno):
- print >>stderr, 'rendering: page=%d...' % pageno
- page = self.pages[pageno]
- (x,y,w,h) = page.bbox
- self.width = scale(w)
- self.height = scale(h)
- self.buf = pygame.Surface((self.width, self.height))
- self.buf.fill(self.BGCOLOR)
- for text in page.get_texts():
- font = FontManager.get_font(None, scale(text.size*0.7))
- (x,y,w,h) = text.bbox
- r = font.render(text.data, 1, self.FGCOLOR)
- self.buf.blit(r, (scale(x), self.height-scale(y)))
- self.pageno = pageno
- self.pos = (0,0)
- self.refresh()
- return
-
- def refresh(self):
- size = self.display.get_size()
- self.display.blit(self.buf, (0,0), (self.pos, size))
- pygame.display.flip()
- return
-
- STEP = 8
- def run(self):
- loop = True
- key = None
- (w,h) = self.display.get_size()
- xmax = self.width - w
- ymax = self.height - h
- while loop:
- for e in pygame.event.get():
- if e.type == VIDEOEXPOSE:
- self.refresh()
- elif e.type == KEYDOWN:
- if e.key in (K_ESCAPE, K_RETURN, K_q):
- loop = False
- break
- elif e.key == K_SPACE:
- if self.pageno < len(self.pages)-1:
- self.render_page(self.pageno+1)
- elif e.key == K_b:
- if 0 < self.pageno:
- self.render_page(self.pageno-1)
- else:
- key = e.key
- elif e.type == KEYUP:
- key = None
- if key:
- (x,y) = self.pos
- if key in (K_h, K_LEFT, K_KP4):
- x = max(0, x-self.STEP)
- elif key in (K_l, K_RIGHT, K_KP6):
- x = min(xmax, x+self.STEP)
- elif key in (K_k, K_UP, K_KP8):
- y = max(0, y-self.STEP)
- elif key in (K_j, K_DOWN, K_KP2):
- y = min(ymax, y+self.STEP)
- self.pos = (x,y)
- self.refresh()
- return
-
-# main
-def main(argv):
- import getopt
- def usage():
- print 'usage: %s [-d] [-c encoding] file' % argv[0]
- return 100
- try:
- (opts, args) = getopt.getopt(argv[1:], 'dc:P:')
- except getopt.GetoptError:
- return usage()
- if not args: return usage()
- debug = 0
- encoding = 'utf-8'
- cmapdir = 'CMap'
- cdbcmapdir = 'CDBCMap'
- password = ''
- for (k, v) in opts:
- if k == '-d': debug += 1
- elif k == '-c': encoding = v
- elif k == '-P': password = v
- #
- fname = args.pop(0)
- if fname.endswith('.pdf'):
- # convert .pdf to sgml
- import tempfile
- from pdf2txt import CMapDB, PDFResourceManager, pdf2txt
- print >>stderr, 'reading %r...' % fname
- CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
- rsrc = PDFResourceManager(debug=debug)
- fp = tempfile.TemporaryFile()
- pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug)
- fp.seek(0)
- else:
- fp = file(fname, 'rb')
- doc = Document()
- parser = PDFSGMLParser(doc)
- parser.feedfile(fp, encoding)
- parser.close()
- fp.close()
- #
- pygame.init()
- pygame.display.set_mode((640,480))
- PDFViewer(pygame.display.get_surface(), doc).run()
- return
-
-if __name__ == '__main__': sys.exit(main(sys.argv))