diff --git a/README.html b/README.html
index 53ad545..da23724 100644
--- a/README.html
+++ b/README.html
@@ -18,7 +18,7 @@ Python PDF parser and analyzer
-Last Modified: Sat May 23 10:06:04 JST 2009
+Last Modified: Sat Jun 20 19:51:02 JST 2009
@@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
Do the following test:
$ pdf2txt.py samples/simple1.pdf
-<html><head>
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-</head><body>
-<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
-<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
-<span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"> World </span>
-<span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"> </span>
-<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> Hello </span>
-<span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;">World </span>
-<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span>
-<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
-</body></html>
+
+
+Hello
+
+World
+
+ Hello World
Done!
@@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
Examples:
-$ pdf2txt.py samples/naacl06-shinyama.pdf > output.html
+$ pdf2txt.py -o output.html samples/naacl06-shinyama.pdf
(extract text as an HTML file whose filename is output.html)
-$ pdf2txt.py -c euc-jp samples/jo.pdf > output.html
+$ pdf2txt.py -c euc-jp -o output.html samples/jo.pdf
(extract a Japanese HTML file in vertical writing, CMap is required)
-$ pdf2txt.py -P mypassword -t text secret.pdf > output.txt
+$ pdf2txt.py -P mypassword -o output.txt secret.pdf
(extract a text from an encrypted PDF file)
@@ -175,7 +170,7 @@ Options:
-
-o filename
- Specifies the output file name.
-By default, it prints the extracted contents to stdout.
+By default, it prints the extracted contents to stdout in text format.
-
-p pageno[,pageno,...]
- Specifies the comma-separated list of the page numbers to be extracted.
@@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF").
+
-
-T cluster_margin
+ -
+
+
-
-W word_margin
+ -
+
+
-
-s scale
+ -
+
+
-
-m maxpages
+ -
+
-
-P password
- Provides the user password to open the PDF file.
+
-
-C CMap directory
+ -
+
-
-d
- Increases the debug level.
@@ -231,7 +241,10 @@ Options:
Instructs to dump all the objects.
By default, it only prints the document trailer (like a header).
-
-p pageno
+ -i objno,objno, ...
+
+
+
-p pageno,pageno, ...
Specifies the page number to be extracted.
Multiple -p
options are allowed.
Note that page numbers start from one.
@@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
-P password
Provides the user password to open the PDF file.
+
-T
+
+
-d
Increases the debug level.
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 666c10d..9cf86d5 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
##
class PDFPageAggregator(PDFDevice):
- def __init__(self, rsrc, pageno=1, cluster_margin=None):
+ def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
PDFDevice.__init__(self, rsrc)
- self.cluster_margin = cluster_margin
+ self.char_margin = char_margin
+ self.line_margin = line_margin
self.undefined_char = '?'
self.pageno = pageno
self.stack = []
@@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate()
self.pageno += 1
- if self.cluster_margin:
- self.cur_item.group_text(self.cluster_margin)
+ if self.char_margin != None and self.line_margin != None:
+ self.cur_item.group_text(self.char_margin, self.line_margin)
return self.cur_item
def begin_figure(self, name, bbox, matrix):
@@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
##
class PDFConverter(PDFPageAggregator):
- def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
- PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+ char_margin=None, line_margin=None, word_margin=None):
+ PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
+ char_margin=char_margin, line_margin=line_margin)
self.outfp = outfp
self.codec = codec
self.word_margin = word_margin
@@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
- def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+ char_margin=None, line_margin=None, word_margin=None,
scale=1, showpageno=True, pagepad=50):
- PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
+ PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
+ char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno
self.pagepad = pagepad
self.scale = scale
@@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon):
- self.write(item.text)
+ pass
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextBox):
@@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
- def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
- showpageno=False, word_margin=None):
- PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
+ def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
+ char_margin=None, line_margin=None, word_margin=None,
+ showpageno=False):
+ PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
+ char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno
return
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index cb2598c..7e575cf 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
## It performs binary search so that the processing time
## should be around O(log n).
##
-def bsearch(objs, v0, v1):
- if v1 <= v0: return []
+def bsearch(objs, v0):
i0 = 0
- i1 = len(objs)-1
- while i0 <= i1:
+ i1 = len(objs)
+ while i0 < i1:
i = (i0+i1)/2
- assert 0 <= i and i < len(objs)
(v, obj) = objs[i]
- if v < v0:
- i0 = i+1
- elif v1 < v:
- i1 = i-1
- else:
- i0 = i
- while 0 < i0:
- (v,_) = objs[i0-1]
- if v < v0: break
- i0 -= 1
+ if v0 == v:
+ (i0,i1) = (i,i+1)
+ break
+ elif v0 < v:
i1 = i
- while i1 < len(objs)-1:
- (v,_) = objs[i1+1]
- if v1 < v: break
- i1 += 1
- return [ obj for (_,obj) in objs[i0:i1+1] ]
- return []
+ else:
+ i0 = i+1
+ return (i0,i1)
## reorder_hv, reorder_vh
@@ -63,10 +52,12 @@ def reorder_vh(objs, hdir):
r = []
line = []
for obj in sorted(objs, key=vkey):
- if line and not line[-1].voverlap(obj):
- line.sort(key=hkey)
- r.append(line)
- line = []
+ if line:
+ v = line[-1].voverlap(obj) * 2
+ if v < obj.height or v < line[-1].height:
+ line.sort(key=hkey)
+ r.append(line)
+ line = []
line.append(obj)
line.sort(key=hkey)
r.append(line)
@@ -106,7 +97,8 @@ class Plane(object):
self.yobjs = []
for obj in objs:
self.place(obj)
- self.fixate()
+ self.xobjs.sort()
+ self.yobjs.sort()
return
# place(obj): place an object in a certain area.
@@ -118,16 +110,14 @@ class Plane(object):
self.yobjs.append((obj.y1, obj))
return
- # fixate(): you must call this after adding all objects.
- def fixate(self):
- self.xobjs.sort()
- self.yobjs.sort()
- return
-
# find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)):
- xobjs = set(bsearch(self.xobjs, x0, x1))
- yobjs = set(bsearch(self.yobjs, y0, y1))
+ (i0,_) = bsearch(self.xobjs, x0)
+ (_,i1) = bsearch(self.xobjs, x1)
+ xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
+ (i0,_) = bsearch(self.yobjs, y0)
+ (_,i1) = bsearch(self.yobjs, y1)
+ yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
objs = xobjs.intersection(yobjs)
return objs
@@ -166,12 +156,14 @@ class ClusterSet(object):
group.fixate()
return list(r)
-def group_objs(objs, ratio, klass):
+def group_objs(objs, hratio, vratio, klass):
plane = Plane(objs)
cset = ClusterSet(klass)
for obj in objs:
- margin = abs(obj.get_margin(ratio))
- neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
+ margin = obj.get_margin()
+ hmargin = hratio * margin
+ vmargin = vratio * margin
+ neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
cset.add(neighbors)
return cset.finish()
@@ -214,7 +206,7 @@ class LayoutItem(object):
def get_bbox(self):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
- def get_margin(self, ratio):
+ def get_margin(self):
return 0
def get_weight(self):
@@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
return
# fixate(): determines its boundery and writing direction.
- def fixate(self):
+ def fixate(self, direction=None):
if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
@@ -354,8 +346,8 @@ class LTText(LayoutItem):
'(%.1f, %.1f)' % self.adv,
self.text))
- def get_margin(self, ratio):
- return self.fontsize * ratio
+ def get_margin(self):
+ return abs(self.fontsize)
def get_weight(self):
return len(self.text)
@@ -392,24 +384,25 @@ class LTTextBox(LayoutContainer):
def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
- def fixate(self):
- LayoutContainer.fixate(self)
- self.direction = 'H'
- for obj in self.objs:
- if obj.is_vertical():
- self.direction = 'V'
- break
- if 2 <= len(self.objs):
- objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
- if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
- h = objs[0].voverlap(objs[1])
- v = objs[0].hoverlap(objs[1])
- if h < v:
- self.direction = 'V'
- if self.direction == 'H':
- self.lines = reorder_vh(self.objs, +1)
- else:
+ def fixate(self, direction='H'):
+ LayoutContainer.fixate(self, direction=direction)
+ if not direction:
+ for obj in self.objs:
+ if obj.is_vertical():
+ direction = 'V'
+ break
+ if 2 <= len(self.objs):
+ objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
+ if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
+ h = objs[0].voverlap(objs[1])
+ v = objs[0].hoverlap(objs[1])
+ if h < v:
+ direction = 'V'
+ self.direction = direction
+ if self.direction == 'V':
self.lines = reorder_hv(self.objs, -1)
+ else:
+ self.lines = reorder_vh(self.objs, +1)
self.objs = []
for line in self.lines:
self.objs.extend(line)
@@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
def get_direction(self):
return self.direction
- def get_lines(self, ratio):
- if self.get_direction() == 'H':
- for line in self.lines:
- x1 = INF
- for obj in line:
- if not isinstance(obj, LTText): continue
- if ratio:
- margin = obj.get_margin(ratio)
- if x1 < obj.x0-margin:
- yield LTAnon(' ')
- yield obj
- x1 = obj.x1
- yield LTAnon('\n')
- else:
+ def get_lines(self, word_margin):
+ if self.get_direction() == 'V':
for line in self.lines:
y0 = -INF
for obj in line:
if not isinstance(obj, LTText): continue
- if ratio:
- margin = obj.get_margin(ratio)
+ if word_margin:
+ margin = word_margin * obj.get_margin()
if obj.y1+margin < y0:
yield LTAnon(' ')
yield obj
y0 = obj.y0
yield LTAnon('\n')
+ else:
+ for line in self.lines:
+ x1 = INF
+ for obj in line:
+ if not isinstance(obj, LTText): continue
+ if word_margin:
+ margin = word_margin * obj.get_margin()
+ if x1 < obj.x0-margin:
+ yield LTAnon(' ')
+ yield obj
+ x1 = obj.x1
+ yield LTAnon('\n')
return
@@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
- def fixate(self):
+ def fixate(self, direction='H'):
return
- def group_text(self, ratio):
+ def group_text(self, char_margin, line_margin):
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
- otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
- self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
- if self.get_direction() == 'H':
- lines = reorder_vh(self.objs, +1)
+ objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
+ if self.get_direction() == 'V':
+ objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
+ lines = reorder_hv(objs, -1)
else:
- lines = reorder_hv(self.objs, -1)
+ objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
+ lines = reorder_vh(objs, +1)
self.objs = []
for line in lines:
self.objs.extend(line)
diff --git a/setup.py b/setup.py
index 3ea37f9..8ab539f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,9 @@
#!/usr/bin/env python
from distutils.core import setup
+from pdfminer import __version__
setup(name='pdfminer',
- version='20090330',
+ version=__version__,
description='PDF parser and analyzer',
license='MIT/X',
author='Yusuke Shinyama',
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index c44eb01..6ad95e6 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
def main(argv):
import getopt
def usage():
- print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
+ print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
+ '[-M char_margin] [-L line_margin] [-W word_margin] '
+ '[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100
try:
- (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
+ (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@@ -29,7 +31,8 @@ def main(argv):
outfile = None
outtype = None
codec = 'utf-8'
- cluster_margin = 0.5
+ char_margin = 1.0
+ line_margin = 0.3
word_margin = 0.2
pageno = 1
scale = 1
@@ -44,7 +47,8 @@ def main(argv):
elif k == '-c': codec = v
elif k == '-o': outfile = v
elif k == '-s': scale = float(v)
- elif k == '-T': cluster_margin = float(v)
+ elif k == '-M': char_margin = float(v)
+ elif k == '-L': line_margin = float(v)
elif k == '-W': word_margin = float(v)
#
CMapDB.debug = debug
@@ -69,12 +73,15 @@ def main(argv):
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
- if outtype == 'sgml':
- device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
+ if outtype == 'text':
+ device = TextConverter(rsrc, outfp, codec=codec,
+ char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
+ elif outtype == 'sgml':
+ device = SGMLConverter(rsrc, outfp, codec=codec,
+ char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'html':
- device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
- elif outtype == 'text':
- device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
+ device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
+ char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else: