20090330 release
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@80 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
68cc99379d
commit
37a6a0450d
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
# Makefile for pdfminer
|
# Makefile for pdfminer
|
||||||
|
|
||||||
PACKAGE=pdfminer
|
PACKAGE=pdfminer
|
||||||
VERSION=20090325
|
VERSION=20090330
|
||||||
GNUTAR=tar
|
GNUTAR=tar
|
||||||
SVN=svn
|
SVN=svn
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
|
|
|
@ -17,7 +17,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Mar 29 19:09:46 JST 2009
|
Last Modified: Mon Mar 30 00:13:34 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -50,8 +50,8 @@ which could be useful for analyzing the document.
|
||||||
<a name="source"></a>
|
<a name="source"></a>
|
||||||
<p>
|
<p>
|
||||||
<strong>Download:</strong><br>
|
<strong>Download:</strong><br>
|
||||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz">
|
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
|
||||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz
|
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
|
||||||
</a>
|
</a>
|
||||||
(1.8Mbytes)
|
(1.8Mbytes)
|
||||||
|
|
||||||
|
@ -274,6 +274,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2009/03/30: Text output mode added.
|
||||||
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
||||||
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
|
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
|
||||||
<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
|
<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
|
||||||
|
|
|
@ -66,7 +66,7 @@ class SGMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False):
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||||
self.pagenum = pagenum
|
self.pagenum = pagenum
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
|
@ -124,16 +124,19 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
||||||
self.pagenum = pagenum
|
self.pagenum = pagenum
|
||||||
|
if cluster_margin == None:
|
||||||
|
cluster_margin = 0.5
|
||||||
self.cluster_margin = cluster_margin
|
self.cluster_margin = cluster_margin
|
||||||
self.hyphenation = hyphenation
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
from cluster import cluster_pageobjs
|
from cluster import cluster_pageobjs
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
|
if self.pagenum:
|
||||||
|
self.outfp.write('Page %d\n' % page.id)
|
||||||
if self.cluster_margin:
|
if self.cluster_margin:
|
||||||
textobjs = get_textobjs(page)
|
textobjs = get_textobjs(page)
|
||||||
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
|
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
|
||||||
|
@ -152,6 +155,7 @@ class TextConverter(PDFConverter):
|
||||||
if isinstance(item, TextItem):
|
if isinstance(item, TextItem):
|
||||||
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
|
self.outfp.write('\f')
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
@ -246,7 +250,7 @@ def main(argv):
|
||||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -258,7 +262,9 @@ def main(argv):
|
||||||
maxpages = 0
|
maxpages = 0
|
||||||
outtype = 'html'
|
outtype = 'html'
|
||||||
password = ''
|
password = ''
|
||||||
|
pagenum = True
|
||||||
splitwords = False
|
splitwords = False
|
||||||
|
cluster_margin = None
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
|
@ -268,6 +274,7 @@ def main(argv):
|
||||||
elif k == '-m': maxpages = int(v)
|
elif k == '-m': maxpages = int(v)
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
elif k == '-D': cdbcmapdir = v
|
elif k == '-D': cdbcmapdir = v
|
||||||
|
elif k == '-T': cluster_margin = float(v)
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
elif k == '-w': splitwords = True
|
elif k == '-w': splitwords = True
|
||||||
|
@ -283,9 +290,9 @@ def main(argv):
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'text':
|
elif outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec)
|
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue