documentation fix

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-11 12:42:12 +00:00
parent 97dd4dda5e
commit 787ae4f814
5 changed files with 146 additions and 121 deletions

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat May 23 10:06:04 JST 2009 Last Modified: Sat Jun 20 19:51:02 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
<li> Do the following test:<br> <li> Do the following test:<br>
<blockquote><pre> <blockquote><pre>
$ <strong>pdf2txt.py samples/simple1.pdf</strong> $ <strong>pdf2txt.py samples/simple1.pdf</strong>
&lt;html&gt;&lt;head&gt;
&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
&lt;/head&gt;&lt;body&gt; Hello
&lt;span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt; World
&lt;span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"&gt; World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt; Hello World
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; Hello &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt;
&lt;/body&gt;&lt;/html&gt;
</pre></blockquote> </pre></blockquote>
<li> Done! <li> Done!
</ol> </ol>
@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
<p> <p>
Examples: Examples:
<blockquote><pre> <blockquote><pre>
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf &gt; output.html</strong> $ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
(extract text as an HTML file whose filename is output.html) (extract text as an HTML file whose filename is output.html)
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf &gt; output.html</strong> $ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
(extract a Japanese HTML file in vertical writing, CMap is required) (extract a Japanese HTML file in vertical writing, CMap is required)
$ <strong>pdf2txt.py -P mypassword -t text secret.pdf &gt; output.txt</strong> $ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
(extract text from an encrypted PDF file) (extract text from an encrypted PDF file)
</pre></blockquote> </pre></blockquote>
@ -175,7 +170,7 @@ Options:
<dl> <dl>
<dt> <code>-o <em>filename</em></code> <dt> <code>-o <em>filename</em></code>
<dd> Specifies the output file name. <dd> Specifies the output file name.
By default, it prints the extracted contents to stdout. By default, it prints the extracted contents to stdout in text format.
<p> <p>
<dt> <code>-p <em>pageno[,pageno,...]</em></code> <dt> <code>-p <em>pageno[,pageno,...]</em></code>
<dd> Specifies the comma-separated list of the page numbers to be extracted. <dd> Specifies the comma-separated list of the page numbers to be extracted.
@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>"). Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged PDF</em>").
</ul> </ul>
<p> <p>
<dt> <code>-T <em>cluster_margin</em></code>
<dd>
<p>
<dt> <code>-W <em>word_margin</em></code>
<dd>
<p>
<dt> <code>-s <em>scale</em></code>
<dd>
<p>
<dt> <code>-m <em>maxpages</em></code>
<dd>
<p>
<dt> <code>-P <em>password</em></code> <dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file. <dd> Provides the user password to open the PDF file.
<p> <p>
<dt> <code>-C <em>CMap directory</em></code>
<dd>
<p>
<dt> <code>-d</code> <dt> <code>-d</code>
<dd> Increases the debug level. <dd> Increases the debug level.
</dl> </dl>
@ -231,7 +241,10 @@ Options:
<dd> Instructs to dump all the objects. <dd> Instructs to dump all the objects.
By default, it only prints the document trailer (like a header). By default, it only prints the document trailer (like a header).
<p> <p>
<dt> <code>-p <em>pageno</em></code> <dt> <code>-i <em>objno,objno, ...</em></code>
<dd>
<p>
<dt> <code>-p <em>pageno,pageno, ...</em></code>
<dd> Specifies the page number to be extracted. <dd> Specifies the page number to be extracted.
Multiple <code>-p</code> options are allowed. Multiple <code>-p</code> options are allowed.
Note that page numbers start from one. Note that page numbers start from one.
@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
<dt> <code>-P <em>password</em></code> <dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file. <dd> Provides the user password to open the PDF file.
<p> <p>
<dt> <code>-T</code>
<dd>
<p>
<dt> <code>-d</code> <dt> <code>-d</code>
<dd> Increases the debug level. <dd> Increases the debug level.
</dl> </dl>

View File

@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
## ##
class PDFPageAggregator(PDFDevice): class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1, cluster_margin=None): def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
PDFDevice.__init__(self, rsrc) PDFDevice.__init__(self, rsrc)
self.cluster_margin = cluster_margin self.char_margin = char_margin
self.line_margin = line_margin
self.undefined_char = '?' self.undefined_char = '?'
self.pageno = pageno self.pageno = pageno
self.stack = [] self.stack = []
@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
assert isinstance(self.cur_item, LTPage) assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate() self.cur_item.fixate()
self.pageno += 1 self.pageno += 1
if self.cluster_margin: if self.char_margin != None and self.line_margin != None:
self.cur_item.group_text(self.cluster_margin) self.cur_item.group_text(self.char_margin, self.line_margin)
return self.cur_item return self.cur_item
def begin_figure(self, name, bbox, matrix): def begin_figure(self, name, bbox, matrix):
@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin) char_margin=None, line_margin=None, word_margin=None):
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
char_margin=char_margin, line_margin=line_margin)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.word_margin = word_margin self.word_margin = word_margin
@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8', def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
char_margin=None, line_margin=None, word_margin=None,
scale=1, showpageno=True, pagepad=50): scale=1, showpageno=True, pagepad=50):
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
if self.debug: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTAnon): elif isinstance(item, LTAnon):
self.write(item.text) pass
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
showpageno=False, word_margin=None): char_margin=None, line_margin=None, word_margin=None,
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec) showpageno=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
self.showpageno = showpageno self.showpageno = showpageno
return return

View File

@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
## It performs binary search so that the processing time ## It performs binary search so that the processing time
## should be around O(log n). ## should be around O(log n).
## ##
def bsearch(objs, v0, v1): def bsearch(objs, v0):
if v1 <= v0: return []
i0 = 0 i0 = 0
i1 = len(objs)-1 i1 = len(objs)
while i0 <= i1: while i0 < i1:
i = (i0+i1)/2 i = (i0+i1)/2
assert 0 <= i and i < len(objs)
(v, obj) = objs[i] (v, obj) = objs[i]
if v < v0: if v0 == v:
i0 = i+1 (i0,i1) = (i,i+1)
elif v1 < v: break
i1 = i-1 elif v0 < v:
else:
i0 = i
while 0 < i0:
(v,_) = objs[i0-1]
if v < v0: break
i0 -= 1
i1 = i i1 = i
while i1 < len(objs)-1: else:
(v,_) = objs[i1+1] i0 = i+1
if v1 < v: break return (i0,i1)
i1 += 1
return [ obj for (_,obj) in objs[i0:i1+1] ]
return []
## reorder_hv, reorder_vh ## reorder_hv, reorder_vh
@ -63,7 +52,9 @@ def reorder_vh(objs, hdir):
r = [] r = []
line = [] line = []
for obj in sorted(objs, key=vkey): for obj in sorted(objs, key=vkey):
if line and not line[-1].voverlap(obj): if line:
v = line[-1].voverlap(obj) * 2
if v < obj.height or v < line[-1].height:
line.sort(key=hkey) line.sort(key=hkey)
r.append(line) r.append(line)
line = [] line = []
@ -106,7 +97,8 @@ class Plane(object):
self.yobjs = [] self.yobjs = []
for obj in objs: for obj in objs:
self.place(obj) self.place(obj)
self.fixate() self.xobjs.sort()
self.yobjs.sort()
return return
# place(obj): place an object in a certain area. # place(obj): place an object in a certain area.
@ -118,16 +110,14 @@ class Plane(object):
self.yobjs.append((obj.y1, obj)) self.yobjs.append((obj.y1, obj))
return return
# fixate(): you must call this after adding all objects.
def fixate(self):
self.xobjs.sort()
self.yobjs.sort()
return
# find(): finds objects that are in a certain area. # find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)): def find(self, (x0,y0,x1,y1)):
xobjs = set(bsearch(self.xobjs, x0, x1)) (i0,_) = bsearch(self.xobjs, x0)
yobjs = set(bsearch(self.yobjs, y0, y1)) (_,i1) = bsearch(self.xobjs, x1)
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
(i0,_) = bsearch(self.yobjs, y0)
(_,i1) = bsearch(self.yobjs, y1)
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
objs = xobjs.intersection(yobjs) objs = xobjs.intersection(yobjs)
return objs return objs
@ -166,12 +156,14 @@ class ClusterSet(object):
group.fixate() group.fixate()
return list(r) return list(r)
def group_objs(objs, ratio, klass): def group_objs(objs, hratio, vratio, klass):
plane = Plane(objs) plane = Plane(objs)
cset = ClusterSet(klass) cset = ClusterSet(klass)
for obj in objs: for obj in objs:
margin = abs(obj.get_margin(ratio)) margin = obj.get_margin()
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) hmargin = hratio * margin
vmargin = vratio * margin
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
cset.add(neighbors) cset.add(neighbors)
return cset.finish() return cset.finish()
@ -214,7 +206,7 @@ class LayoutItem(object):
def get_bbox(self): def get_bbox(self):
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1) return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
def get_margin(self, ratio): def get_margin(self):
return 0 return 0
def get_weight(self): def get_weight(self):
@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
return return
# fixate(): determines its boundary and writing direction. # fixate(): determines its boundary and writing direction.
def fixate(self): def fixate(self, direction=None):
if not self.width and self.objs: if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
for obj in self.objs: for obj in self.objs:
@ -354,8 +346,8 @@ class LTText(LayoutItem):
'(%.1f, %.1f)' % self.adv, '(%.1f, %.1f)' % self.adv,
self.text)) self.text))
def get_margin(self, ratio): def get_margin(self):
return self.fontsize * ratio return abs(self.fontsize)
def get_weight(self): def get_weight(self):
return len(self.text) return len(self.text)
@ -392,12 +384,12 @@ class LTTextBox(LayoutContainer):
def __repr__(self): def __repr__(self):
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction)) return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
def fixate(self): def fixate(self, direction='H'):
LayoutContainer.fixate(self) LayoutContainer.fixate(self, direction=direction)
self.direction = 'H' if not direction:
for obj in self.objs: for obj in self.objs:
if obj.is_vertical(): if obj.is_vertical():
self.direction = 'V' direction = 'V'
break break
if 2 <= len(self.objs): if 2 <= len(self.objs):
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
@ -405,11 +397,12 @@ class LTTextBox(LayoutContainer):
h = objs[0].voverlap(objs[1]) h = objs[0].voverlap(objs[1])
v = objs[0].hoverlap(objs[1]) v = objs[0].hoverlap(objs[1])
if h < v: if h < v:
self.direction = 'V' direction = 'V'
if self.direction == 'H': self.direction = direction
self.lines = reorder_vh(self.objs, +1) if self.direction == 'V':
else:
self.lines = reorder_hv(self.objs, -1) self.lines = reorder_hv(self.objs, -1)
else:
self.lines = reorder_vh(self.objs, +1)
self.objs = [] self.objs = []
for line in self.lines: for line in self.lines:
self.objs.extend(line) self.objs.extend(line)
@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
def get_direction(self): def get_direction(self):
return self.direction return self.direction
def get_lines(self, ratio): def get_lines(self, word_margin):
if self.get_direction() == 'H': if self.get_direction() == 'V':
for line in self.lines:
x1 = INF
for obj in line:
if not isinstance(obj, LTText): continue
if ratio:
margin = obj.get_margin(ratio)
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield LTAnon('\n')
else:
for line in self.lines: for line in self.lines:
y0 = -INF y0 = -INF
for obj in line: for obj in line:
if not isinstance(obj, LTText): continue if not isinstance(obj, LTText): continue
if ratio: if word_margin:
margin = obj.get_margin(ratio) margin = word_margin * obj.get_margin()
if obj.y1+margin < y0: if obj.y1+margin < y0:
yield LTAnon(' ') yield LTAnon(' ')
yield obj yield obj
y0 = obj.y0 y0 = obj.y0
yield LTAnon('\n') yield LTAnon('\n')
else:
for line in self.lines:
x1 = INF
for obj in line:
if not isinstance(obj, LTText): continue
if word_margin:
margin = word_margin * obj.get_margin()
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
x1 = obj.x1
yield LTAnon('\n')
return return
@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
def __repr__(self): def __repr__(self):
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate)) return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
def fixate(self): def fixate(self, dirtection='H'):
return return
def group_text(self, ratio): def group_text(self, char_margin, line_margin):
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ] textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ] objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs if self.get_direction() == 'V':
if self.get_direction() == 'H': objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
lines = reorder_vh(self.objs, +1) lines = reorder_hv(objs, -1)
else: else:
lines = reorder_hv(self.objs, -1) objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
lines = reorder_vh(objs, +1)
self.objs = [] self.objs = []
for line in lines: for line in lines:
self.objs.extend(line) self.objs.extend(line)

View File

@ -1,8 +1,9 @@
#!/usr/bin/env python #!/usr/bin/env python
from distutils.core import setup from distutils.core import setup
from pdfminer import __version__
setup(name='pdfminer', setup(name='pdfminer',
version='20090330', version=__version__,
description='PDF parser and analyzer', description='PDF parser and analyzer',
license='MIT/X', license='MIT/X',
author='Yusuke Shinyama', author='Yusuke Shinyama',

View File

@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -29,7 +31,8 @@ def main(argv):
outfile = None outfile = None
outtype = None outtype = None
codec = 'utf-8' codec = 'utf-8'
cluster_margin = 0.5 char_margin = 1.0
line_margin = 0.3
word_margin = 0.2 word_margin = 0.2
pageno = 1 pageno = 1
scale = 1 scale = 1
@ -44,7 +47,8 @@ def main(argv):
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
elif k == '-T': cluster_margin = float(v) elif k == '-M': char_margin = float(v)
elif k == '-L': line_margin = float(v)
elif k == '-W': word_margin = float(v) elif k == '-W': word_margin = float(v)
# #
CMapDB.debug = debug CMapDB.debug = debug
@ -69,12 +73,15 @@ def main(argv):
outfp = file(outfile, 'w') outfp = file(outfile, 'w')
else: else:
outfp = sys.stdout outfp = sys.stdout
if outtype == 'sgml': if outtype == 'text':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin) device = TextConverter(rsrc, outfp, codec=codec,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec,
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale) device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
elif outtype == 'text': char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrc, outfp, codec=codec)
else: else: