20090330 release
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@80 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
68cc99379d
commit
37a6a0450d
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
# Makefile for pdfminer
|
||||
|
||||
PACKAGE=pdfminer
|
||||
VERSION=20090325
|
||||
VERSION=20090330
|
||||
GNUTAR=tar
|
||||
SVN=svn
|
||||
PYTHON=python
|
||||
|
|
|
@ -17,7 +17,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun Mar 29 19:09:46 JST 2009
|
||||
Last Modified: Mon Mar 30 00:13:34 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -50,8 +50,8 @@ which could be useful for analyzing the document.
|
|||
<a name="source"></a>
|
||||
<p>
|
||||
<strong>Download:</strong><br>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
|
||||
</a>
|
||||
(1.8Mbytes)
|
||||
|
||||
|
@ -274,6 +274,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2009/03/30: Text output mode added.
|
||||
<li> 2009/03/25: Encoding problems fixed. Word splitting option added.
|
||||
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
|
||||
<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
|
||||
|
|
|
@ -66,7 +66,7 @@ class SGMLConverter(PDFConverter):
|
|||
##
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False):
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
self.pagenum = pagenum
|
||||
self.pagepad = pagepad
|
||||
|
@ -124,16 +124,19 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
||||
self.pagenum = pagenum
|
||||
if cluster_margin == None:
|
||||
cluster_margin = 0.5
|
||||
self.cluster_margin = cluster_margin
|
||||
self.hyphenation = hyphenation
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_pageobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.pagenum:
|
||||
self.outfp.write('Page %d\n' % page.id)
|
||||
if self.cluster_margin:
|
||||
textobjs = get_textobjs(page)
|
||||
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
|
||||
|
@ -152,6 +155,7 @@ class TextConverter(PDFConverter):
|
|||
if isinstance(item, TextItem):
|
||||
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
self.outfp.write('\f')
|
||||
return
|
||||
|
||||
def close(self):
|
||||
|
@ -246,7 +250,7 @@ def main(argv):
|
|||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -258,7 +262,9 @@ def main(argv):
|
|||
maxpages = 0
|
||||
outtype = 'html'
|
||||
password = ''
|
||||
pagenum = True
|
||||
splitwords = False
|
||||
cluster_margin = None
|
||||
outfp = sys.stdout
|
||||
for (k, v) in opts:
|
||||
if k == '-d': debug += 1
|
||||
|
@ -268,6 +274,7 @@ def main(argv):
|
|||
elif k == '-m': maxpages = int(v)
|
||||
elif k == '-C': cmapdir = v
|
||||
elif k == '-D': cdbcmapdir = v
|
||||
elif k == '-T': cluster_margin = float(v)
|
||||
elif k == '-t': outtype = v
|
||||
elif k == '-o': outfp = file(v, 'wb')
|
||||
elif k == '-w': splitwords = True
|
||||
|
@ -283,9 +290,9 @@ def main(argv):
|
|||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
|
||||
elif outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec)
|
||||
device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue