documentation fix
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c pull/1/head
parent
97dd4dda5e
commit
787ae4f814
52
README.html
52
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat May 23 10:06:04 JST 2009
|
Last Modified: Sat Jun 20 19:51:02 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
|
||||||
<li> Do the following test:<br>
|
<li> Do the following test:<br>
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>pdf2txt.py samples/simple1.pdf</strong>
|
$ <strong>pdf2txt.py samples/simple1.pdf</strong>
|
||||||
<html><head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
|
||||||
</head><body>
|
Hello
|
||||||
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
|
|
||||||
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
|
World
|
||||||
<span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"> World </span>
|
|
||||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"> </span>
|
Hello World
|
||||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> Hello </span>
|
|
||||||
<span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;">World </span>
|
|
||||||
<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span>
|
|
||||||
<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
|
|
||||||
</body></html>
|
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
<li> Done!
|
<li> Done!
|
||||||
</ol>
|
</ol>
|
||||||
|
@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding
|
||||||
<p>
|
<p>
|
||||||
Examples:
|
Examples:
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf > output.html</strong>
|
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
|
||||||
(extract text as an HTML file whose filename is output.html)
|
(extract text as an HTML file whose filename is output.html)
|
||||||
|
|
||||||
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf > output.html</strong>
|
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
|
||||||
(extract a Japanese HTML file in vertical writing, CMap is required)
|
(extract a Japanese HTML file in vertical writing, CMap is required)
|
||||||
|
|
||||||
$ <strong>pdf2txt.py -P mypassword -t text secret.pdf > output.txt</strong>
|
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
|
||||||
(extract text from an encrypted PDF file)
|
(extract text from an encrypted PDF file)
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
|
||||||
|
@ -175,7 +170,7 @@ Options:
|
||||||
<dl>
|
<dl>
|
||||||
<dt> <code>-o <em>filename</em></code>
|
<dt> <code>-o <em>filename</em></code>
|
||||||
<dd> Specifies the output file name.
|
<dd> Specifies the output file name.
|
||||||
By default, it prints the extracted contents to stdout.
|
By default, it prints the extracted contents to stdout in text format.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
||||||
<dd> Specifies a comma-separated list of page numbers to be extracted.
|
<dd> Specifies a comma-separated list of page numbers to be extracted.
|
||||||
|
@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri
|
||||||
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
Tags used here are defined in the PDF specification (See §10.7 "<em>Tagged PDF</em>").
|
||||||
</ul>
|
</ul>
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-T <em>cluster_margin</em></code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
|
<dt> <code>-W <em>word_margin</em></code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
|
<dt> <code>-s <em>scale</em></code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
|
<dt> <code>-m <em>maxpages</em></code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
<dt> <code>-P <em>password</em></code>
|
<dt> <code>-P <em>password</em></code>
|
||||||
<dd> Provides the user password to open the PDF file.
|
<dd> Provides the user password to open the PDF file.
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-C <em>CMap directory</em></code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
<dt> <code>-d</code>
|
<dt> <code>-d</code>
|
||||||
<dd> Increases the debug level.
|
<dd> Increases the debug level.
|
||||||
</dl>
|
</dl>
|
||||||
|
@ -231,7 +241,10 @@ Options:
|
||||||
<dd> Instructs the program to dump all the objects.
|
<dd> Instructs the program to dump all the objects.
|
||||||
By default, it only prints the document trailer (like a header).
|
By default, it only prints the document trailer (like a header).
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno</em></code>
|
<dt> <code>-i <em>objno,objno, ...</em></code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
|
<dt> <code>-p <em>pageno,pageno, ...</em></code>
|
||||||
<dd> Specifies the page number to be extracted.
|
<dd> Specifies the page number to be extracted.
|
||||||
Multiple <code>-p</code> options are allowed.
|
Multiple <code>-p</code> options are allowed.
|
||||||
Note that page numbers start from one.
|
Note that page numbers start from one.
|
||||||
|
@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<dt> <code>-P <em>password</em></code>
|
<dt> <code>-P <em>password</em></code>
|
||||||
<dd> Provides the user password to open the PDF file.
|
<dd> Provides the user password to open the PDF file.
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-T</code>
|
||||||
|
<dd>
|
||||||
|
<p>
|
||||||
<dt> <code>-d</code>
|
<dt> <code>-d</code>
|
||||||
<dd> Increases the debug level.
|
<dd> Increases the debug level.
|
||||||
</dl>
|
</dl>
|
||||||
|
|
|
@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFDevice):
|
class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1, cluster_margin=None):
|
def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None):
|
||||||
PDFDevice.__init__(self, rsrc)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.cluster_margin = cluster_margin
|
self.char_margin = char_margin
|
||||||
|
self.line_margin = line_margin
|
||||||
self.undefined_char = '?'
|
self.undefined_char = '?'
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
self.stack = []
|
self.stack = []
|
||||||
|
@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice):
|
||||||
assert isinstance(self.cur_item, LTPage)
|
assert isinstance(self.cur_item, LTPage)
|
||||||
self.cur_item.fixate()
|
self.cur_item.fixate()
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
if self.cluster_margin:
|
if self.char_margin != None and self.line_margin != None:
|
||||||
self.cur_item.group_text(self.cluster_margin)
|
self.cur_item.group_text(self.char_margin, self.line_margin)
|
||||||
return self.cur_item
|
return self.cur_item
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def begin_figure(self, name, bbox, matrix):
|
||||||
|
@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice):
|
||||||
##
|
##
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'):
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin)
|
char_margin=None, line_margin=None, word_margin=None):
|
||||||
|
PDFPageAggregator.__init__(self, rsrc, pageno=pageno,
|
||||||
|
char_margin=char_margin, line_margin=line_margin)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
|
@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8',
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||||
|
char_margin=None, line_margin=None, word_margin=None,
|
||||||
scale=1, showpageno=True, pagepad=50):
|
scale=1, showpageno=True, pagepad=50):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
||||||
|
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTAnon):
|
elif isinstance(item, LTAnon):
|
||||||
self.write(item.text)
|
pass
|
||||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
|
@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8',
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1,
|
||||||
showpageno=False, word_margin=None):
|
char_margin=None, line_margin=None, word_margin=None,
|
||||||
PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec)
|
showpageno=False):
|
||||||
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno,
|
||||||
|
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None):
|
||||||
## It performs binary search so that the processing time
|
## It performs binary search so that the processing time
|
||||||
## should be around O(log n).
|
## should be around O(log n).
|
||||||
##
|
##
|
||||||
def bsearch(objs, v0, v1):
|
def bsearch(objs, v0):
|
||||||
if v1 <= v0: return []
|
|
||||||
i0 = 0
|
i0 = 0
|
||||||
i1 = len(objs)-1
|
i1 = len(objs)
|
||||||
while i0 <= i1:
|
while i0 < i1:
|
||||||
i = (i0+i1)/2
|
i = (i0+i1)/2
|
||||||
assert 0 <= i and i < len(objs)
|
|
||||||
(v, obj) = objs[i]
|
(v, obj) = objs[i]
|
||||||
if v < v0:
|
if v0 == v:
|
||||||
i0 = i+1
|
(i0,i1) = (i,i+1)
|
||||||
elif v1 < v:
|
break
|
||||||
i1 = i-1
|
elif v0 < v:
|
||||||
else:
|
|
||||||
i0 = i
|
|
||||||
while 0 < i0:
|
|
||||||
(v,_) = objs[i0-1]
|
|
||||||
if v < v0: break
|
|
||||||
i0 -= 1
|
|
||||||
i1 = i
|
i1 = i
|
||||||
while i1 < len(objs)-1:
|
else:
|
||||||
(v,_) = objs[i1+1]
|
i0 = i+1
|
||||||
if v1 < v: break
|
return (i0,i1)
|
||||||
i1 += 1
|
|
||||||
return [ obj for (_,obj) in objs[i0:i1+1] ]
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
## reorder_hv, reorder_vh
|
## reorder_hv, reorder_vh
|
||||||
|
@ -63,7 +52,9 @@ def reorder_vh(objs, hdir):
|
||||||
r = []
|
r = []
|
||||||
line = []
|
line = []
|
||||||
for obj in sorted(objs, key=vkey):
|
for obj in sorted(objs, key=vkey):
|
||||||
if line and not line[-1].voverlap(obj):
|
if line:
|
||||||
|
v = line[-1].voverlap(obj) * 2
|
||||||
|
if v < obj.height or v < line[-1].height:
|
||||||
line.sort(key=hkey)
|
line.sort(key=hkey)
|
||||||
r.append(line)
|
r.append(line)
|
||||||
line = []
|
line = []
|
||||||
|
@ -106,7 +97,8 @@ class Plane(object):
|
||||||
self.yobjs = []
|
self.yobjs = []
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
self.place(obj)
|
self.place(obj)
|
||||||
self.fixate()
|
self.xobjs.sort()
|
||||||
|
self.yobjs.sort()
|
||||||
return
|
return
|
||||||
|
|
||||||
# place(obj): place an object in a certain area.
|
# place(obj): place an object in a certain area.
|
||||||
|
@ -118,16 +110,14 @@ class Plane(object):
|
||||||
self.yobjs.append((obj.y1, obj))
|
self.yobjs.append((obj.y1, obj))
|
||||||
return
|
return
|
||||||
|
|
||||||
# fixate(): you must call this after adding all objects.
|
|
||||||
def fixate(self):
|
|
||||||
self.xobjs.sort()
|
|
||||||
self.yobjs.sort()
|
|
||||||
return
|
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
# find(): finds objects that are in a certain area.
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0,y0,x1,y1)):
|
||||||
xobjs = set(bsearch(self.xobjs, x0, x1))
|
(i0,_) = bsearch(self.xobjs, x0)
|
||||||
yobjs = set(bsearch(self.yobjs, y0, y1))
|
(_,i1) = bsearch(self.xobjs, x1)
|
||||||
|
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
|
||||||
|
(i0,_) = bsearch(self.yobjs, y0)
|
||||||
|
(_,i1) = bsearch(self.yobjs, y1)
|
||||||
|
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
|
||||||
objs = xobjs.intersection(yobjs)
|
objs = xobjs.intersection(yobjs)
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
|
@ -166,12 +156,14 @@ class ClusterSet(object):
|
||||||
group.fixate()
|
group.fixate()
|
||||||
return list(r)
|
return list(r)
|
||||||
|
|
||||||
def group_objs(objs, ratio, klass):
|
def group_objs(objs, hratio, vratio, klass):
|
||||||
plane = Plane(objs)
|
plane = Plane(objs)
|
||||||
cset = ClusterSet(klass)
|
cset = ClusterSet(klass)
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
margin = abs(obj.get_margin(ratio))
|
margin = obj.get_margin()
|
||||||
neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin))
|
hmargin = hratio * margin
|
||||||
|
vmargin = vratio * margin
|
||||||
|
neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin))
|
||||||
cset.add(neighbors)
|
cset.add(neighbors)
|
||||||
return cset.finish()
|
return cset.finish()
|
||||||
|
|
||||||
|
@ -214,7 +206,7 @@ class LayoutItem(object):
|
||||||
def get_bbox(self):
|
def get_bbox(self):
|
||||||
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
||||||
|
|
||||||
def get_margin(self, ratio):
|
def get_margin(self):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
|
@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
# fixate(): determines its boundary and writing direction.
|
# fixate(): determines its boundary and writing direction.
|
||||||
def fixate(self):
|
def fixate(self, direction=None):
|
||||||
if not self.width and self.objs:
|
if not self.width and self.objs:
|
||||||
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
|
||||||
for obj in self.objs:
|
for obj in self.objs:
|
||||||
|
@ -354,8 +346,8 @@ class LTText(LayoutItem):
|
||||||
'(%.1f, %.1f)' % self.adv,
|
'(%.1f, %.1f)' % self.adv,
|
||||||
self.text))
|
self.text))
|
||||||
|
|
||||||
def get_margin(self, ratio):
|
def get_margin(self):
|
||||||
return self.fontsize * ratio
|
return abs(self.fontsize)
|
||||||
|
|
||||||
def get_weight(self):
|
def get_weight(self):
|
||||||
return len(self.text)
|
return len(self.text)
|
||||||
|
@ -392,12 +384,12 @@ class LTTextBox(LayoutContainer):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
|
return ('<textbox %s(%s)>' % (self.get_bbox(), self.direction))
|
||||||
|
|
||||||
def fixate(self):
|
def fixate(self, direction='H'):
|
||||||
LayoutContainer.fixate(self)
|
LayoutContainer.fixate(self, direction=direction)
|
||||||
self.direction = 'H'
|
if not direction:
|
||||||
for obj in self.objs:
|
for obj in self.objs:
|
||||||
if obj.is_vertical():
|
if obj.is_vertical():
|
||||||
self.direction = 'V'
|
direction = 'V'
|
||||||
break
|
break
|
||||||
if 2 <= len(self.objs):
|
if 2 <= len(self.objs):
|
||||||
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1)
|
||||||
|
@ -405,11 +397,12 @@ class LTTextBox(LayoutContainer):
|
||||||
h = objs[0].voverlap(objs[1])
|
h = objs[0].voverlap(objs[1])
|
||||||
v = objs[0].hoverlap(objs[1])
|
v = objs[0].hoverlap(objs[1])
|
||||||
if h < v:
|
if h < v:
|
||||||
self.direction = 'V'
|
direction = 'V'
|
||||||
if self.direction == 'H':
|
self.direction = direction
|
||||||
self.lines = reorder_vh(self.objs, +1)
|
if self.direction == 'V':
|
||||||
else:
|
|
||||||
self.lines = reorder_hv(self.objs, -1)
|
self.lines = reorder_hv(self.objs, -1)
|
||||||
|
else:
|
||||||
|
self.lines = reorder_vh(self.objs, +1)
|
||||||
self.objs = []
|
self.objs = []
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
self.objs.extend(line)
|
self.objs.extend(line)
|
||||||
|
@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer):
|
||||||
def get_direction(self):
|
def get_direction(self):
|
||||||
return self.direction
|
return self.direction
|
||||||
|
|
||||||
def get_lines(self, ratio):
|
def get_lines(self, word_margin):
|
||||||
if self.get_direction() == 'H':
|
if self.get_direction() == 'V':
|
||||||
for line in self.lines:
|
|
||||||
x1 = INF
|
|
||||||
for obj in line:
|
|
||||||
if not isinstance(obj, LTText): continue
|
|
||||||
if ratio:
|
|
||||||
margin = obj.get_margin(ratio)
|
|
||||||
if x1 < obj.x0-margin:
|
|
||||||
yield LTAnon(' ')
|
|
||||||
yield obj
|
|
||||||
x1 = obj.x1
|
|
||||||
yield LTAnon('\n')
|
|
||||||
else:
|
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
y0 = -INF
|
y0 = -INF
|
||||||
for obj in line:
|
for obj in line:
|
||||||
if not isinstance(obj, LTText): continue
|
if not isinstance(obj, LTText): continue
|
||||||
if ratio:
|
if word_margin:
|
||||||
margin = obj.get_margin(ratio)
|
margin = word_margin * obj.get_margin()
|
||||||
if obj.y1+margin < y0:
|
if obj.y1+margin < y0:
|
||||||
yield LTAnon(' ')
|
yield LTAnon(' ')
|
||||||
yield obj
|
yield obj
|
||||||
y0 = obj.y0
|
y0 = obj.y0
|
||||||
yield LTAnon('\n')
|
yield LTAnon('\n')
|
||||||
|
else:
|
||||||
|
for line in self.lines:
|
||||||
|
x1 = INF
|
||||||
|
for obj in line:
|
||||||
|
if not isinstance(obj, LTText): continue
|
||||||
|
if word_margin:
|
||||||
|
margin = word_margin * obj.get_margin()
|
||||||
|
if x1 < obj.x0-margin:
|
||||||
|
yield LTAnon(' ')
|
||||||
|
yield obj
|
||||||
|
x1 = obj.x1
|
||||||
|
yield LTAnon('\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -458,17 +451,18 @@ class LTPage(LayoutContainer):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
return ('<page id=%r bbox=%s rotate=%r>' % (self.id, self.get_bbox(), self.rotate))
|
||||||
|
|
||||||
def fixate(self):
|
def fixate(self, dirtection='H'):
|
||||||
return
|
return
|
||||||
|
|
||||||
def group_text(self, ratio):
|
def group_text(self, char_margin, line_margin):
|
||||||
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
|
textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ]
|
||||||
otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ]
|
||||||
self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs
|
if self.get_direction() == 'V':
|
||||||
if self.get_direction() == 'H':
|
objs += group_objs(textobjs, line_margin, char_margin, LTTextBox)
|
||||||
lines = reorder_vh(self.objs, +1)
|
lines = reorder_hv(objs, -1)
|
||||||
else:
|
else:
|
||||||
lines = reorder_hv(self.objs, -1)
|
objs += group_objs(textobjs, char_margin, line_margin, LTTextBox)
|
||||||
|
lines = reorder_vh(objs, +1)
|
||||||
self.objs = []
|
self.objs = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
self.objs.extend(line)
|
self.objs.extend(line)
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -1,8 +1,9 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
|
from pdfminer import __version__
|
||||||
|
|
||||||
setup(name='pdfminer',
|
setup(name='pdfminer',
|
||||||
version='20090330',
|
version=__version__,
|
||||||
description='PDF parser and analyzer',
|
description='PDF parser and analyzer',
|
||||||
license='MIT/X',
|
license='MIT/X',
|
||||||
author='Yusuke Shinyama',
|
author='Yusuke Shinyama',
|
||||||
|
|
|
@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||||
|
'[-M char_margin] [-L line_margin] [-W word_margin] '
|
||||||
|
'[-t text|html|sgml|tag] [-o output] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -29,7 +31,8 @@ def main(argv):
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
cluster_margin = 0.5
|
char_margin = 1.0
|
||||||
|
line_margin = 0.3
|
||||||
word_margin = 0.2
|
word_margin = 0.2
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
|
@ -44,7 +47,8 @@ def main(argv):
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
elif k == '-T': cluster_margin = float(v)
|
elif k == '-M': char_margin = float(v)
|
||||||
|
elif k == '-L': line_margin = float(v)
|
||||||
elif k == '-W': word_margin = float(v)
|
elif k == '-W': word_margin = float(v)
|
||||||
#
|
#
|
||||||
CMapDB.debug = debug
|
CMapDB.debug = debug
|
||||||
|
@ -69,12 +73,15 @@ def main(argv):
|
||||||
outfp = file(outfile, 'w')
|
outfp = file(outfile, 'w')
|
||||||
else:
|
else:
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outtype == 'sgml':
|
if outtype == 'text':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
device = TextConverter(rsrc, outfp, codec=codec,
|
||||||
|
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||||
|
elif outtype == 'sgml':
|
||||||
|
device = SGMLConverter(rsrc, outfp, codec=codec,
|
||||||
|
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale)
|
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
|
||||||
elif outtype == 'text':
|
char_margin=char_margin, line_margin=line_margin, word_margin=word_margin)
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin)
|
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue