documentation fix.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@108 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-17 06:21:08 +00:00
parent 2756223c85
commit 8cae56a555
5 changed files with 28 additions and 99 deletions

View File

@ -10,7 +10,7 @@ VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz DISTFILE=$(DISTNAME).tar.gz
CONV_CMAP=$(PYTHON) -m tools.conv_cmap CONV_CMAP=$(PYTHON) pdfminer/cmap.py
all: all:
@ -27,8 +27,7 @@ test:
cd samples && make test cd samples && make test
cdbcmap: CMap cdbcmap: CMap
-mkdir CDBCMap $(CONV_CMAP) CMap
$(CONV_CMAP) CMap/*
# Maintenance: # Maintenance:
commit: clean commit: clean

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Apr 4 09:14:28 JST 2009 Last Modified: Sat May 16 19:58:11 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -26,23 +26,22 @@ Last Modified: Sat Apr 4 09:14:28 JST 2009
<hr noshade> <hr noshade>
<h2>What's It?</h2> <h2>What's It?</h2>
<p> <p>
PDFMiner is a suite of programs that aims to help PDFMiner is a suite of programs that help
analyzing text data from PDF documents. extracting and analyzing text data of PDF documents.
It includes a PDF parser, a PDF renderer
(though only rendering text is supported for now),
and a couple of nice tools to extract texts.
Unlike other PDF-related tools, it allows to obtain Unlike other PDF-related tools, it allows to obtain
the exact location of texts in a page, as well as the exact location of texts in a page, as well as
other layout information such as font size or font name, other extra information such as font information or ruled lines.
which could be useful for analyzing the document. It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
PDF parser that can be used for purposes other than text analysis.
<p> <p>
<strong>Features:</strong> <strong>Features:</strong>
<ul> <ul>
<li> Written entirely in Python. (for version 2.5 or newer) <li> Written entirely in Python. (version 2.4 or newer required)
<li> PDF-1.7 specification support. <li> PDF-1.7 specification support. (well, almost)
<li> Non-ASCII languages and vertical writing scripts support. <li> Non-ASCII languages and vertical writing scripts support.
<li> Various font types (Type1, TrueType, Type3, and CID) support. <li> Various font types (Type1, TrueType, Type3, and CID) support.
<li> Basic encryption (RC4). <li> Basic encryption (RC4) support.
<li> PDF to HTML conversion (with a sample converter web app). <li> PDF to HTML conversion (with a sample converter web app).
<li> Outline (TOC) extraction. <li> Outline (TOC) extraction.
<li> Tagged contents extraction. <li> Tagged contents extraction.
@ -78,33 +77,28 @@ http://pdf2html.tabesugi.net:8080/
<a name="install"></a> <a name="install"></a>
<hr noshade> <hr noshade>
<h2>How to Install</h2> <h2>How to Install</h2>
<p>
<strong>Note:</strong>
This software is not yet out-of-the-box.
You have to download and unpack it manually,
and spend some time to make it work.
<strong>Your will</strong> is needed!
I do not support easy_install or setup.py or any automated installation until
this is very matured to the point that it really should be widely distributed.
(For now, it's not yet up to the standard, IMO.)
<ol> <ol>
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer. <li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
<li> Download the <a href="#source">PDFMiner source</a>. <li> Download the <a href="#source">PDFMiner source</a>.
<li> Extract it. <li> Extract it.
<li> Go to the <code>pdfminer</code> directory. <li> Run <code>setup.py</code> to install:<br>
<blockquote><pre>
# <strong>python setup.py install</strong>
</pre></blockquote>
<li> Do the following test:<br> <li> Do the following test:<br>
<blockquote><pre> <blockquote><pre>
$ <strong>python pdflib/pdf2txt.py samples/simple1.pdf</strong> $ <strong>pdf2txt.py samples/simple1.pdf</strong>
&lt;html&gt;&lt;head&gt;&lt;meta http-equiv="Content-Type" content="text/html; charset=ascii"&gt; &lt;html&gt;&lt;head&gt;
&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
&lt;/head&gt;&lt;body&gt; &lt;/head&gt;&lt;body&gt;
&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;&lt;span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt; &lt;span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; &lt;/span&gt; &lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:106px; top:224px; font-size:22px;"&gt;Hello &lt;/span&gt; &lt;span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"&gt; World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:168px; top:224px; font-size:22px;"&gt;World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt; &lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt; &lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; Hello &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt; &lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt;
&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
&lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt; &lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt;
&lt;/body&gt;&lt;/html&gt; &lt;/body&gt;&lt;/html&gt;
</pre></blockquote> </pre></blockquote>

View File

@ -2,8 +2,7 @@
PYTHON=python PYTHON=python
CMAPDIR=../CMap CMAPDIR=../CMap
CDBCMAPDIR=../CDBCMap PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) -D$(CDBCMAPDIR)
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \

View File

@ -1,61 +0,0 @@
#!/usr/bin/env python
import sys, os.path
stderr = sys.stderr
def dumpcdb(cmap, cdbfile, verbose=1):
    """Dump the contents of a CMap object into a cdb file.

    Three groups of records are written, in this order:

      * '/' + attribute name  -> repr() of the attribute value
      * 'c' + code            -> CID packed with struct format '>L'
      * 'i' + packed CID      -> code

    cmap:    object providing getall_attrs(), getall_code2cid() and
             getall_cid2code(), each yielding pairs.
    cdbfile: path of the cdb file to create. cdbfile+'.tmp' is handed to
             cdbmake as its second argument (presumably the temporary
             file name -- confirm against the cdb module).
    verbose: when true, a progress line is printed to stderr.
    """
    from struct import pack
    # Prefer the native cdb module; fall back to the bundled implementation.
    try:
        import cdb
    except ImportError:
        import pdfminer.pycdb as cdb
    pack_cid = lambda cid: pack('>L', cid)
    maker = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
    if verbose:
        print >>stderr, 'Writing: %r...' % cdbfile
    # CMap attributes.
    for (name, value) in cmap.getall_attrs():
        maker.add('/'+name, repr(value))
    # Forward mapping: character code -> CID.
    for (code, cid) in cmap.getall_code2cid():
        maker.add('c'+code, pack_cid(cid))
    # Reverse mapping: CID -> character code.
    for (cid, code) in cmap.getall_cid2code():
        maker.add('i'+pack_cid(cid), code)
    maker.finish()
    return
def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
    """Convert CMap files into cdb files under `cdbcmapdir`.

    files:      paths of the CMap files; names ending in '.upr' are ignored.
    cmapdir:    directory passed to CMapDB.initialize().
    cdbcmapdir: directory receiving '<basename>.cmap.cdb' for each input.
    force:      when false, an input whose output file already exists is
                skipped; when true, it is always regenerated.

    Progress messages are printed to stderr.
    """
    from pdfminer.cmap import CMapDB
    CMapDB.initialize(cmapdir)
    for path in files:
        if path.endswith('.upr'):
            continue
        name = os.path.basename(path)
        target = os.path.join(cdbcmapdir, name+'.cmap.cdb')
        if force or not os.path.exists(target):
            print >>stderr, 'Reading: %r...' % path
            dumpcdb(CMapDB.get_cmap(name), target)
        else:
            # Output already exists and regeneration was not requested.
            print >>stderr, 'Skipping: %r' % target
    return
def main(argv):
    """Command line entry point: convert CMap files into cdb files.

    Usage: prog [-c cmapdir] [-C cdbcmapdir] [-f] file ...

      -c cmapdir     source CMap directory (default: 'CMap')
      -C cdbcmapdir  output directory for cdb files (default: 'CDBCMap')
      -f             force conversion even if the output file exists

    Returns 100 after printing the usage message when the options are
    invalid or no file is given; otherwise returns the result of
    convert_cmap(). Raises ValueError if either directory does not exist.
    """
    import getopt
    def usage():
        # A single parenthesized argument prints identically under the
        # Python 2 print statement.
        print('usage: %s [-c cmapdir] [-C cdbcmapdir] [-f] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:C:f')
    except getopt.GetoptError:
        return usage()
    # Stop here when no input file is given; previously the usage message
    # was printed but execution continued.
    if not args:
        return usage()
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    force = False
    for (k, v) in opts:
        if k == '-f':
            force = True
        elif k == '-c':
            cmapdir = v
        elif k == '-C':
            cdbcmapdir = v
    # Both directories must already exist; nothing is created here.
    if not os.path.isdir(cmapdir):
        raise ValueError('not directory: %r' % cmapdir)
    if not os.path.isdir(cdbcmapdir):
        raise ValueError('not directory: %r' % cdbcmapdir)
    return convert_cmap(args, cmapdir, cdbcmapdir, force=force)
# When run as a script, call main() with the command line and use its
# return value as the process exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB from pdfminer.cmap import CMapDB, find_cmap_path
# main # main
def main(argv): def main(argv):
@ -20,8 +20,7 @@ def main(argv):
# debug option # debug option
debug = 0 debug = 0
# path option # path option
cmapdir = 'CMap' cmapdir = find_cmap_path()
cdbcmapdir = 'CDBCMap'
# input option # input option
password = '' password = ''
pagenos = set() pagenos = set()
@ -37,7 +36,6 @@ def main(argv):
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': cdbcmapdir = v
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
@ -54,7 +52,7 @@ def main(argv):
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug PDFDevice.debug = debug
# #
CMapDB.initialize(cmapdir, cdbcmapdir) CMapDB.initialize(cmapdir)
rsrc = PDFResourceManager() rsrc = PDFResourceManager()
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)