documentation fix

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-11 12:42:12 +00:00
parent 97dd4dda5e
commit 787ae4f814
5 changed files with 146 additions and 121 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat May 23 10:06:04 JST 2009
Last Modified: Sat Jun 20 19:51:02 JST 2009
<!-- hhmts end -->
</div>
@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
<li> Do the following test:<br>
<blockquote><pre>
$ <strong>pdf2txt.py samples/simple1.pdf</strong>
&lt;html&gt;&lt;head&gt;
&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
&lt;/head&gt;&lt;body&gt;
&lt;span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"&gt; World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; Hello &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt;
&lt;/body&gt;&lt;/html&gt;
Hello
World
Hello World
</pre></blockquote>
<li> Done!
</ol>
@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
<p>
Examples:
<blockquote><pre>
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf &gt; output.html</strong>
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
(extract text as an HTML file whose filename is output.html)
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf &gt; output.html</strong>
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
(extract a Japanese HTML file in vertical writing, CMap is required)
$ <strong>pdf2txt.py -P mypassword -t text secret.pdf &gt; output.txt</strong>
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
(extract text from an encrypted PDF file)
</pre></blockquote>
@ -175,7 +170,7 @@ Options:
<dl>
<dt> <code>-o <em>filename</em></code>
<dd> Specifies the output file name.
By default, it prints the extracted contents to stdout.
By default, it prints the extracted contents to stdout in text format.
<p>
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
<dd> Specifies the comma-separated list of the page numbers to be extracted.
@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
</ul>
<p>
<dt> <code>-T <em>cluster_margin</em></code>
<dd>
<p>
<dt> <code>-W <em>word_margin</em></code>
<dd>
<p>
<dt> <code>-s <em>scale</em></code>
<dd>
<p>
<dt> <code>-m <em>maxpages</em></code>
<dd>
<p>
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file.
<p>
<dt> <code>-C <em>CMap directory</em></code>
<dd>
<p>
<dt> <code>-d</code>
<dd> Increases the debug level.
</dl>
@ -231,7 +241,10 @@ Options:
<dd> Instructs to dump all the objects.
By default, it only prints the document trailer (like a header).
<p>
<dt> <code>-p <em>pageno</em></code>
<dt> <code>-i <em>objno,objno, ...</em></code>
<dd>
<p>
<dt> <code>-p <em>pageno,pageno, ...</em></code>
<dd> Specifies the page number to be extracted.
Multiple <code>-p</code> options are allowed.
Note that page numbers start from one.
@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file.
<p>
<dt> <code>-T</code>
<dd>
<p>
<dt> <code>-d</code>
<dd> Increases the debug level.
</dl>

View File

@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
##
class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, cluster_margin=None):
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
PDFDevice.__init__(self, rsrc)
self.cluster_margin = cluster_margin
self.char_margin = char_margin
self.line_margin = line_margin
self.undefined_char = '?'
self.pageno = pageno
self.stack = []
@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate()
self.pageno += 1
if self.cluster_margin:
self.cur_item.group_text(self.cluster_margin)
if self.char_margin != None and self.line_margin != None:
self.cur_item.group_text(self.char_margin, self.line_margin)
return self.cur_item
def begin_figure(self, name, bbox, matrix):
@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
##
class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
char_margin=char_margin, line_margin=line_margin)
self.outfp = outfp
self.codec = codec
self.word_margin = word_margin
@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
##
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None,
scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno
self.pagepad = pagepad
self.scale = scale
@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon):
self.write(item.text)
pass
elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextBox):
@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
showpageno=False, word_margin=None):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None,
showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno
return

View File

@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
## It performs binary search so that the processing time
## should be around O(log n).
##
def bsearch(objs, v0, v1):
if v1 <= v0: return []
def bsearch(objs, v0):
i0 = 0
i1 = len(objs)-1
while i0 <= i1:
i1 = len(objs)
while i0 < i1:
i = (i0+i1)/2
assert 0 <= i and i < len(objs)
(v, obj) = objs[i]
if v < v0:
i0 = i+1
elif v1 < v:
i1 = i-1
else:
i0 = i
while 0 < i0:
(v,_) = objs[i0-1]
if v < v0: break
i0 -= 1
if v0 == v:
(i0,i1) = (i,i+1)
break
elif v0 < v:
i1 = i
while i1 < len(objs)-1:
(v,_) = objs[i1+1]
if v1 < v: break
i1 += 1
return [ obj for (_,obj) in objs[i0:i1+1] ]
return []
else:
i0 = i+1
return (i0,i1)
## reorder_hv, reorder_vh
@ -63,10 +52,12 @@ def reorder_vh(objs, hdir):
r = []
line = []
for obj in sorted(objs, key=vkey):
if line and not line[-1].voverlap(obj):
line.sort(key=hkey)
r.append(line)
line = []
if line:
v = line[-1].voverlap(obj) * 2
if v < obj.height or v < line[-1].height:
line.sort(key=hkey)
r.append(line)
line = []
line.append(obj)
line.sort(key=hkey)
r.append(line)
@ -106,7 +97,8 @@ class Plane(object):
self.yobjs = []
for obj in objs:
self.place(obj)
self.fixate()
self.xobjs.sort()
self.yobjs.sort()
return
# place(obj): place an object in a certain area.
@ -118,16 +110,14 @@ class Plane(object):
self.yobjs.append((obj.y1, obj))
return
# fixate(): you must call this after adding all objects.
def fixate(self):
self.xobjs.sort()
self.yobjs.sort()
return
# find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)):
xobjs = set(bsearch(self.xobjs, x0, x1))
yobjs = set(bsearch(self.yobjs, y0, y1))
(i0,_) = bsearch(self.xobjs, x0)
(_,i1) = bsearch(self.xobjs, x1)
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
(i0,_) = bsearch(self.yobjs, y0)
(_,i1) = bsearch(self.yobjs, y1)
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
objs = xobjs.intersection(yobjs)
return objs
@ -166,12 +156,14 @@ class ClusterSet(object):
group.fixate()
return list(r)
def group_objs(objs, ratio, klass):
def group_objs(objs, hratio, vratio, klass):
plane = Plane(objs)
cset = ClusterSet(klass)
for obj in objs:
margin = abs(obj.get_margin(ratio))
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
margin = obj.get_margin()
hmargin = hratio * margin
vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
cset.add(neighbors)
return cset.finish()
@ -214,7 +206,7 @@ class LayoutItem(object):
def get_bbox(self):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
def get_margin(self, ratio):
def get_margin(self):
return 0
def get_weight(self):
@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
return
# fixate(): determines its boundary and writing direction.
def fixate(self):
def fixate(self, direction=None):
if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs:
@ -354,8 +346,8 @@ class LTText(LayoutItem):
'(%.1f, %.1f)' % self.adv,
self.text))
def get_margin(self, ratio):
return self.fontsize * ratio
def get_margin(self):
return abs(self.fontsize)
def get_weight(self):
return len(self.text)
@ -392,24 +384,25 @@ class LTTextBox(LayoutContainer):
def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
def fixate(self):
LayoutContainer.fixate(self)
self.direction = 'H'
for obj in self.objs:
if obj.is_vertical():
self.direction = 'V'
break
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1])
if h < v:
self.direction = 'V'
if self.direction == 'H':
self.lines = reorder_vh(self.objs, +1)
else:
def fixate(self, direction='H'):
LayoutContainer.fixate(self, direction=direction)
if not direction:
for obj in self.objs:
if obj.is_vertical():
direction = 'V'
break
if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
if objs[0].get_weight() == 1 and objs[1].get_weight() == 1:
h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1])
if h < v:
direction = 'V'
self.direction = direction
if self.direction == 'V':
self.lines = reorder_hv(self.objs, -1)
else:
self.lines = reorder_vh(self.objs, +1)
self.objs = []
for line in self.lines:
self.objs.extend(line)
@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
def get_direction(self):
return self.direction
def get_lines(self, ratio):
if self.get_direction() == 'H':
for line in self.lines:
x1 = INF
for obj in line:
if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio)
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield LTAnon('\n')
else:
def get_lines(self, word_margin):
if self.get_direction() == 'V':
for line in self.lines:
y0 = -INF
for obj in line:
if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio)
if word_margin:
margin = word_margin * obj.get_margin()
if obj.y1+margin < y0:
yield LTAnon(' ')
yield obj
y0 = obj.y0
yield LTAnon('\n')
else:
for line in self.lines:
x1 = INF
for obj in line:
if not isinstance(obj, LTText): continue
if word_margin:
margin = word_margin * obj.get_margin()
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield LTAnon('\n')
return
@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
def fixate(self):
def fixate(self, dirtection='H'):
return
def group_text(self, ratio):
def group_text(self, char_margin, line_margin):
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
if self.get_direction() == 'H':
lines = reorder_vh(self.objs, +1)
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
if self.get_direction() == 'V':
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
lines = reorder_hv(objs, -1)
else:
lines = reorder_hv(self.objs, -1)
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
lines = reorder_vh(objs, +1)
self.objs = []
for line in lines:
self.objs.extend(line)

View File

@ -1,8 +1,9 @@
#!/usr/bin/env python
from distutils.core import setup
from pdfminer import __version__
setup(name='pdfminer',
version='20090330',
version=__version__,
description='PDF parser and analyzer',
license='MIT/X',
author='Yusuke Shinyama',

View File

@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@ -29,7 +31,8 @@ def main(argv):
outfile = None
outtype = None
codec = 'utf-8'
cluster_margin = 0.5
char_margin = 1.0
line_margin = 0.3
word_margin = 0.2
pageno = 1
scale = 1
@ -44,7 +47,8 @@ def main(argv):
elif k == '-c': codec = v
elif k == '-o': outfile = v
elif k == '-s': scale = float(v)
elif k == '-T': cluster_margin = float(v)
elif k == '-M': char_margin = float(v)
elif k == '-L': line_margin = float(v)
elif k == '-W': word_margin = float(v)
#
CMapDB.debug = debug
@ -69,12 +73,15 @@ def main(argv):
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else: