20090330 release

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@80 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-03-29 15:14:23 +00:00
parent 68cc99379d
commit 37a6a0450d
3 changed files with 18 additions and 10 deletions

View File

@ -1,7 +1,7 @@
# Makefile for pdfminer # Makefile for pdfminer
PACKAGE=pdfminer PACKAGE=pdfminer
VERSION=20090325 VERSION=20090330
GNUTAR=tar GNUTAR=tar
SVN=svn SVN=svn
PYTHON=python PYTHON=python

View File

@ -17,7 +17,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Mar 29 19:09:46 JST 2009 Last Modified: Mon Mar 30 00:13:34 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -50,8 +50,8 @@ which could be useful for analyzing the document.
<a name="source"></a> <a name="source"></a>
<p> <p>
<strong>Download:</strong><br> <strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz"> <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090325.tar.gz http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090330.tar.gz
</a> </a>
(1.8Mbytes) (1.8Mbytes)
@ -274,6 +274,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/03/30: Text output mode added.
<li> 2009/03/25: Encoding problems fixed. Word splitting option added. <li> 2009/03/25: Encoding problems fixed. Word splitting option added.
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe. <li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries. <li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.

View File

@ -66,7 +66,7 @@ class SGMLConverter(PDFConverter):
## ##
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False): def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=None, splitwords=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords) PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
self.pagenum = pagenum self.pagenum = pagenum
self.pagepad = pagepad self.pagepad = pagepad
@ -124,16 +124,19 @@ class HTMLConverter(PDFConverter):
## ##
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True): def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=None, splitwords=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
self.pagenum = pagenum self.pagenum = pagenum
if cluster_margin == None:
cluster_margin = 0.5
self.cluster_margin = cluster_margin self.cluster_margin = cluster_margin
self.hyphenation = hyphenation
return return
def end_page(self, page): def end_page(self, page):
from cluster import cluster_pageobjs from cluster import cluster_pageobjs
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
if self.pagenum:
self.outfp.write('Page %d\n' % page.id)
if self.cluster_margin: if self.cluster_margin:
textobjs = get_textobjs(page) textobjs = get_textobjs(page)
clusters = cluster_pageobjs(textobjs, self.cluster_margin) clusters = cluster_pageobjs(textobjs, self.cluster_margin)
@ -152,6 +155,7 @@ class TextConverter(PDFConverter):
if isinstance(item, TextItem): if isinstance(item, TextItem):
self.outfp.write(item.text.encode(self.codec, 'replace')) self.outfp.write(item.text.encode(self.codec, 'replace'))
self.outfp.write('\n') self.outfp.write('\n')
self.outfp.write('\f')
return return
def close(self): def close(self):
@ -246,7 +250,7 @@ def main(argv):
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -258,7 +262,9 @@ def main(argv):
maxpages = 0 maxpages = 0
outtype = 'html' outtype = 'html'
password = '' password = ''
pagenum = True
splitwords = False splitwords = False
cluster_margin = None
outfp = sys.stdout outfp = sys.stdout
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
@ -268,6 +274,7 @@ def main(argv):
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': cdbcmapdir = v elif k == '-D': cdbcmapdir = v
elif k == '-T': cluster_margin = float(v)
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
elif k == '-w': splitwords = True elif k == '-w': splitwords = True
@ -283,9 +290,9 @@ def main(argv):
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords, cluster_margin=cluster_margin)
elif outtype == 'text': elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec) device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrc, outfp, codec=codec)
else: else: