documentation fix
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
97dd4dda5e
commit
787ae4f814
52
README.html
52
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sat May 23 10:06:04 JST 2009
|
||||
Last Modified: Sat Jun 20 19:51:02 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
|
|||
<li> Do the following test:<br>
|
||||
<blockquote><pre>
|
||||
$ <strong>pdf2txt.py samples/simple1.pdf</strong>
|
||||
<html><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
</head><body>
|
||||
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
|
||||
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"> World </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"> </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> Hello </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;">World </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span>
|
||||
<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
|
||||
</body></html>
|
||||
|
||||
|
||||
Hello
|
||||
|
||||
World
|
||||
|
||||
Hello World
|
||||
</pre></blockquote>
|
||||
<li> Done!
|
||||
</ol>
|
||||
|
@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
|
|||
<p>
|
||||
Examples:
|
||||
<blockquote><pre>
|
||||
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf > output.html</strong>
|
||||
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
|
||||
(extract text as an HTML file whose filename is output.html)
|
||||
|
||||
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf > output.html</strong>
|
||||
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
|
||||
(extract a Japanese HTML file in vertical writing, CMap is required)
|
||||
|
||||
$ <strong>pdf2txt.py -P mypassword -t text secret.pdf > output.txt</strong>
|
||||
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
|
||||
(extract text from an encrypted PDF file)
|
||||
</pre></blockquote>
|
||||
|
||||
|
@ -175,7 +170,7 @@ Options:
|
|||
<dl>
|
||||
<dt> <code>-o <em>filename</em></code>
|
||||
<dd> Specifies the output file name.
|
||||
By default, it prints the extracted contents to stdout.
|
||||
By default, it prints the extracted contents to stdout in text format.
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
||||
<dd> Specifies the comma-separated list of the page numbers to be extracted.
|
||||
|
@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
|
|||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||
</ul>
|
||||
<p>
|
||||
<dt> <code>-T <em>cluster_margin</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-W <em>word_margin</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-s <em>scale</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-m <em>maxpages</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-P <em>password</em></code>
|
||||
<dd> Provides the user password to open the PDF file.
|
||||
<p>
|
||||
<dt> <code>-C <em>CMap directory</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-d</code>
|
||||
<dd> Increases the debug level.
|
||||
</dl>
|
||||
|
@ -231,7 +241,10 @@ Options:
|
|||
<dd> Instructs the program to dump all the objects.
|
||||
By default, it only prints the document trailer (like a header).
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno</em></code>
|
||||
<dt> <code>-i <em>objno,objno, ...</em></code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno,pageno, ...</em></code>
|
||||
<dd> Specifies the page number to be extracted.
|
||||
Multiple <code>-p</code> options are allowed.
|
||||
Note that page numbers start from one.
|
||||
|
@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<dt> <code>-P <em>password</em></code>
|
||||
<dd> Provides the user password to open the PDF file.
|
||||
<p>
|
||||
<dt> <code>-T</code>
|
||||
<dd>
|
||||
<p>
|
||||
<dt> <code>-d</code>
|
||||
<dd> Increases the debug level.
|
||||
</dl>
|
||||
|
|
|
@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
|||
##
|
||||
class PDFPageAggregator(PDFDevice):
|
||||
|
||||
def __init__(self, rsrc, pageno=1, cluster_margin=None):
|
||||
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
|
||||
PDFDevice.__init__(self, rsrc)
|
||||
self.cluster_margin = cluster_margin
|
||||
self.char_margin = char_margin
|
||||
self.line_margin = line_margin
|
||||
self.undefined_char = '?'
|
||||
self.pageno = pageno
|
||||
self.stack = []
|
||||
|
@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
|
|||
assert isinstance(self.cur_item, LTPage)
|
||||
self.cur_item.fixate()
|
||||
self.pageno += 1
|
||||
if self.cluster_margin:
|
||||
self.cur_item.group_text(self.cluster_margin)
|
||||
if self.char_margin != None and self.line_margin != None:
|
||||
self.cur_item.group_text(self.char_margin, self.line_margin)
|
||||
return self.cur_item
|
||||
|
||||
def begin_figure(self, name, bbox, matrix):
|
||||
|
@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
|
|||
##
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||
char_margin=None, line_margin=None, word_margin=None):
|
||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
|
||||
char_margin=char_margin, line_margin=line_margin)
|
||||
self.outfp = outfp
|
||||
self.codec = codec
|
||||
self.word_margin = word_margin
|
||||
|
@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||
char_margin=None, line_margin=None, word_margin=None,
|
||||
scale=1, showpageno=True, pagepad=50):
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
self.showpageno = showpageno
|
||||
self.pagepad = pagepad
|
||||
self.scale = scale
|
||||
|
@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
|
|||
if self.debug:
|
||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTAnon):
|
||||
self.write(item.text)
|
||||
pass
|
||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTTextBox):
|
||||
|
@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
||||
showpageno=False, word_margin=None):
|
||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||
char_margin=None, line_margin=None, word_margin=None,
|
||||
showpageno=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
self.showpageno = showpageno
|
||||
return
|
||||
|
||||
|
|
|
@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
|
|||
## It performs binary search so that the processing time
|
||||
## should be around O(log n).
|
||||
##
|
||||
def bsearch(objs, v0, v1):
|
||||
if v1 <= v0: return []
|
||||
def bsearch(objs, v0):
|
||||
i0 = 0
|
||||
i1 = len(objs)-1
|
||||
while i0 <= i1:
|
||||
i1 = len(objs)
|
||||
while i0 < i1:
|
||||
i = (i0+i1)/2
|
||||
assert 0 <= i and i < len(objs)
|
||||
(v, obj) = objs[i]
|
||||
if v < v0:
|
||||
i0 = i+1
|
||||
elif v1 < v:
|
||||
i1 = i-1
|
||||
else:
|
||||
i0 = i
|
||||
while 0 < i0:
|
||||
(v,_) = objs[i0-1]
|
||||
if v < v0: break
|
||||
i0 -= 1
|
||||
if v0 == v:
|
||||
(i0,i1) = (i,i+1)
|
||||
break
|
||||
elif v0 < v:
|
||||
i1 = i
|
||||
while i1 < len(objs)-1:
|
||||
(v,_) = objs[i1+1]
|
||||
if v1 < v: break
|
||||
i1 += 1
|
||||
return [ obj for (_,obj) in objs[i0:i1+1] ]
|
||||
return []
|
||||
else:
|
||||
i0 = i+1
|
||||
return (i0,i1)
|
||||
|
||||
|
||||
## reorder_hv, reorder_vh
|
||||
|
@ -63,7 +52,9 @@ def reorder_vh(objs, hdir):
|
|||
r = []
|
||||
line = []
|
||||
for obj in sorted(objs, key=vkey):
|
||||
if line and not line[-1].voverlap(obj):
|
||||
if line:
|
||||
v = line[-1].voverlap(obj) * 2
|
||||
if v < obj.height or v < line[-1].height:
|
||||
line.sort(key=hkey)
|
||||
r.append(line)
|
||||
line = []
|
||||
|
@ -106,7 +97,8 @@ class Plane(object):
|
|||
self.yobjs = []
|
||||
for obj in objs:
|
||||
self.place(obj)
|
||||
self.fixate()
|
||||
self.xobjs.sort()
|
||||
self.yobjs.sort()
|
||||
return
|
||||
|
||||
# place(obj): place an object in a certain area.
|
||||
|
@ -118,16 +110,14 @@ class Plane(object):
|
|||
self.yobjs.append((obj.y1, obj))
|
||||
return
|
||||
|
||||
# fixate(): you must call this after adding all objects.
|
||||
def fixate(self):
|
||||
self.xobjs.sort()
|
||||
self.yobjs.sort()
|
||||
return
|
||||
|
||||
# find(): finds objects that are in a certain area.
|
||||
def find(self, (x0,y0,x1,y1)):
|
||||
xobjs = set(bsearch(self.xobjs, x0, x1))
|
||||
yobjs = set(bsearch(self.yobjs, y0, y1))
|
||||
(i0,_) = bsearch(self.xobjs, x0)
|
||||
(_,i1) = bsearch(self.xobjs, x1)
|
||||
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
|
||||
(i0,_) = bsearch(self.yobjs, y0)
|
||||
(_,i1) = bsearch(self.yobjs, y1)
|
||||
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
|
||||
objs = xobjs.intersection(yobjs)
|
||||
return objs
|
||||
|
||||
|
@ -166,12 +156,14 @@ class ClusterSet(object):
|
|||
group.fixate()
|
||||
return list(r)
|
||||
|
||||
def group_objs(objs, ratio, klass):
|
||||
def group_objs(objs, hratio, vratio, klass):
|
||||
plane = Plane(objs)
|
||||
cset = ClusterSet(klass)
|
||||
for obj in objs:
|
||||
margin = abs(obj.get_margin(ratio))
|
||||
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
|
||||
margin = obj.get_margin()
|
||||
hmargin = hratio * margin
|
||||
vmargin = vratio * margin
|
||||
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||
cset.add(neighbors)
|
||||
return cset.finish()
|
||||
|
||||
|
@ -214,7 +206,7 @@ class LayoutItem(object):
|
|||
def get_bbox(self):
|
||||
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
||||
|
||||
def get_margin(self, ratio):
|
||||
def get_margin(self):
|
||||
return 0
|
||||
|
||||
def get_weight(self):
|
||||
|
@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
|
|||
return
|
||||
|
||||
# fixate(): determines its boundary and writing direction.
|
||||
def fixate(self):
|
||||
def fixate(self, direction=None):
|
||||
if not self.width and self.objs:
|
||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||
for obj in self.objs:
|
||||
|
@ -354,8 +346,8 @@ class LTText(LayoutItem):
|
|||
'(%.1f, %.1f)' % self.adv,
|
||||
self.text))
|
||||
|
||||
def get_margin(self, ratio):
|
||||
return self.fontsize * ratio
|
||||
def get_margin(self):
|
||||
return abs(self.fontsize)
|
||||
|
||||
def get_weight(self):
|
||||
return len(self.text)
|
||||
|
@ -392,12 +384,12 @@ class LTTextBox(LayoutContainer):
|
|||
def __repr__(self):
|
||||
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
|
||||
|
||||
def fixate(self):
|
||||
LayoutContainer.fixate(self)
|
||||
self.direction = 'H'
|
||||
def fixate(self, direction='H'):
|
||||
LayoutContainer.fixate(self, direction=direction)
|
||||
if not direction:
|
||||
for obj in self.objs:
|
||||
if obj.is_vertical():
|
||||
self.direction = 'V'
|
||||
direction = 'V'
|
||||
break
|
||||
if 2 <= len(self.objs):
|
||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||
|
@ -405,11 +397,12 @@ class LTTextBox(LayoutContainer):
|
|||
h = objs[0].voverlap(objs[1])
|
||||
v = objs[0].hoverlap(objs[1])
|
||||
if h < v:
|
||||
self.direction = 'V'
|
||||
if self.direction == 'H':
|
||||
self.lines = reorder_vh(self.objs, +1)
|
||||
else:
|
||||
direction = 'V'
|
||||
self.direction = direction
|
||||
if self.direction == 'V':
|
||||
self.lines = reorder_hv(self.objs, -1)
|
||||
else:
|
||||
self.lines = reorder_vh(self.objs, +1)
|
||||
self.objs = []
|
||||
for line in self.lines:
|
||||
self.objs.extend(line)
|
||||
|
@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
|
|||
def get_direction(self):
|
||||
return self.direction
|
||||
|
||||
def get_lines(self, ratio):
|
||||
if self.get_direction() == 'H':
|
||||
for line in self.lines:
|
||||
x1 = INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if ratio:
|
||||
margin = obj.get_margin(ratio)
|
||||
if x1 < obj.x0-margin:
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
x1 = obj.x1
|
||||
yield LTAnon('\n')
|
||||
else:
|
||||
def get_lines(self, word_margin):
|
||||
if self.get_direction() == 'V':
|
||||
for line in self.lines:
|
||||
y0 = -INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if ratio:
|
||||
margin = obj.get_margin(ratio)
|
||||
if word_margin:
|
||||
margin = word_margin * obj.get_margin()
|
||||
if obj.y1+margin < y0:
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
y0 = obj.y0
|
||||
yield LTAnon('\n')
|
||||
else:
|
||||
for line in self.lines:
|
||||
x1 = INF
|
||||
for obj in line:
|
||||
if not isinstance(obj, LTText): continue
|
||||
if word_margin:
|
||||
margin = word_margin * obj.get_margin()
|
||||
if x1 < obj.x0-margin:
|
||||
yield LTAnon(' ')
|
||||
yield obj
|
||||
x1 = obj.x1
|
||||
yield LTAnon('\n')
|
||||
return
|
||||
|
||||
|
||||
|
@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
|
|||
def __repr__(self):
|
||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
||||
|
||||
def fixate(self):
|
||||
def fixate(self, dirtection='H'):
|
||||
return
|
||||
|
||||
def group_text(self, ratio):
|
||||
def group_text(self, char_margin, line_margin):
|
||||
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
|
||||
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
||||
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
|
||||
if self.get_direction() == 'H':
|
||||
lines = reorder_vh(self.objs, +1)
|
||||
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
||||
if self.get_direction() == 'V':
|
||||
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
|
||||
lines = reorder_hv(objs, -1)
|
||||
else:
|
||||
lines = reorder_hv(self.objs, -1)
|
||||
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
|
||||
lines = reorder_vh(objs, +1)
|
||||
self.objs = []
|
||||
for line in lines:
|
||||
self.objs.extend(line)
|
||||
|
|
3
setup.py
3
setup.py
|
@ -1,8 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
from distutils.core import setup
|
||||
from pdfminer import __version__
|
||||
|
||||
setup(name='pdfminer',
|
||||
version='20090330',
|
||||
version=__version__,
|
||||
description='PDF parser and analyzer',
|
||||
license='MIT/X',
|
||||
author='Yusuke Shinyama',
|
||||
|
|
|
@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
|
|||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||
'[-M char_margin] [-L line_margin] [-W word_margin] '
|
||||
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -29,7 +31,8 @@ def main(argv):
|
|||
outfile = None
|
||||
outtype = None
|
||||
codec = 'utf-8'
|
||||
cluster_margin = 0.5
|
||||
char_margin = 1.0
|
||||
line_margin = 0.3
|
||||
word_margin = 0.2
|
||||
pageno = 1
|
||||
scale = 1
|
||||
|
@ -44,7 +47,8 @@ def main(argv):
|
|||
elif k == '-c': codec = v
|
||||
elif k == '-o': outfile = v
|
||||
elif k == '-s': scale = float(v)
|
||||
elif k == '-T': cluster_margin = float(v)
|
||||
elif k == '-M': char_margin = float(v)
|
||||
elif k == '-L': line_margin = float(v)
|
||||
elif k == '-W': word_margin = float(v)
|
||||
#
|
||||
CMapDB.debug = debug
|
||||
|
@ -69,12 +73,15 @@ def main(argv):
|
|||
outfp = file(outfile, 'w')
|
||||
else:
|
||||
outfp = sys.stdout
|
||||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
||||
if outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
elif outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
|
||||
elif outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
|
||||
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue