documentation fix.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@108 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-05-17 06:21:08 +00:00 · 2009-05-17 06:21:08 +00:00 · 8cae56a555
parent 2756223c85
commit 8cae56a555
5 changed files with 28 additions and 99 deletions
--- a/5
+++ b/5
@ -10,7 +10,7 @@ VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
 DISTNAME=$(PACKAGE)-dist-$(VERSION)
 DISTFILE=$(DISTNAME).tar.gz
-CONV_CMAP=$(PYTHON) -m tools.conv_cmap
+CONV_CMAP=$(PYTHON) pdfminer/cmap.py
 all:
@ -27,8 +27,7 @@ test:
 	cd samples && make test
 cdbcmap: CMap
-	-mkdir CDBCMap
+	$(CONV_CMAP) CMap
 	$(CONV_CMAP) CMap/*
 # Maintenance:
 commit: clean
--- a/README.html
+++ b/README.html
@ -18,7 +18,7 @@ Python PDF parser and analyzer
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sat Apr  4 09:14:28 JST 2009
+Last Modified: Sat May 16 19:58:11 JST 2009
 <!-- hhmts end -->
 </div>
@ -26,23 +26,22 @@ Last Modified: Sat Apr  4 09:14:28 JST 2009
 <hr noshade>
 <h2>What's It?</h2>
 <p>
-PDFMiner is a suite of programs that aims to help
+PDFMiner is a suite of programs that help
-analyzing text data from PDF documents.
+extract and analyze text data from PDF documents.
 It includes a PDF parser, a PDF renderer
 (though only rendering text is supported for now),
 and a couple of nice tools to extract texts.
 Unlike other PDF-related tools, it allows you to obtain
 the exact location of texts in a page, as well as 
-other layout information such as font size or font name,
+other extra information such as font information or ruled lines.
-which could be useful for analyzing the document.
+It includes a PDF converter that can transform PDF files
 into other text formats (such as HTML). It has an extensible
 PDF parser that can be used for purposes other than text analysis.
 <p>
 <strong>Features:</strong>
 <ul>
-<li> Written entirely in Python. (for version 2.5 or newer)
+<li> Written entirely in Python. (version 2.4 or newer required)
-<li> PDF-1.7 specification support.
+<li> PDF-1.7 specification support. (well, almost)
 <li> Non-ASCII languages and vertical writing scripts support.
 <li> Various font types (Type1, TrueType, Type3, and CID) support.
-<li> Basic encryption (RC4).
+<li> Basic encryption (RC4) support.
 <li> PDF to HTML conversion (with a sample converter web app).
 <li> Outline (TOC) extraction.
 <li> Tagged contents extraction.
@ -78,33 +77,28 @@ http://pdf2html.tabesugi.net:8080/
 <a name="install"></a>
 <hr noshade>
 <h2>How to Install</h2>
 <p>
 <strong>Note:</strong>
 This software is not yet out-of-the-box.
 You have to download and unpack it manually, 
 and spend some time to make it work.
 <strong>Your will</strong> is needed!
 I do not support easy_install, setup.py, or any other automated installation until
 this has matured to the point that it really should be widely distributed.
 (For now, it's not yet up to the standard, IMO.)
 <ol>
 <li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
 <li> Download the <a href="#source">PDFMiner source</a>.
 <li> Extract it.
-<li> Go to the <code>pdfminer</code> directory.
+<li> Run <code>setup.py</code> to install:<br>
 <blockquote><pre>
 # <strong>python setup.py install</strong>
 </pre></blockquote>
 <li> Do the following test:<br>
 <blockquote><pre>
-$ <strong>python pdflib/pdf2txt.py samples/simple1.pdf</strong>
+$ <strong>pdf2txt.py samples/simple1.pdf</strong>
-&lt;html&gt;&lt;head&gt;&lt;meta http-equiv="Content-Type" content="text/html; charset=ascii"&gt;
+&lt;html&gt;&lt;head&gt;
 &lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
 &lt;/head&gt;&lt;body&gt;
-&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;&lt;span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
+&lt;span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"&gt;&lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; &lt;/span&gt;
+&lt;div style="position:absolute; top:50px;"&gt;&lt;a name="1"&gt;Page 1&lt;/a&gt;&lt;/div&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:106px; top:224px; font-size:22px;"&gt;Hello &lt;/span&gt;
+&lt;span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"&gt; World &lt;/span&gt;
 &lt;span style="position:absolute; writing-mode:lr-tb; left:168px; top:224px; font-size:22px;"&gt;World &lt;/span&gt;
 &lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"&gt; &lt;/span&gt;
-&lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
+&lt;span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"&gt; Hello &lt;/span&gt;
 &lt;span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;"&gt;World &lt;/span&gt;
 &lt;span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;"&gt;Hello &lt;/span&gt;
 &lt;div style="position:absolute; top:0px;"&gt;Page: &lt;a href="#1"&gt;1&lt;/a&gt;&lt;/div&gt;
 &lt;/body&gt;&lt;/html&gt;
 </pre></blockquote>
--- a/samples/Makefile
+++ b/samples/Makefile
@ -2,8 +2,7 @@
 PYTHON=python
 CMAPDIR=../CMap
-CDBCMAPDIR=../CDBCMap
+PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
 PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) -D$(CDBCMAPDIR)
 HTMLS= \
 	simple1.html \
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@ -1,61 +0,0 @@
 #!/usr/bin/env python
 import sys, os.path
 stderr = sys.stderr
 def dumpcdb(cmap, cdbfile, verbose=1):
  """Dump the attributes and code/CID mappings of a CMap into a cdb file.

  Three kinds of records are written, distinguished by a one-character
  key prefix:
    '/' + attribute name        -> repr() of the attribute value
    'c' + code                  -> CID packed as a big-endian unsigned long
    'i' + packed CID ('>L')     -> code

  cmap:    object providing getall_attrs(), getall_code2cid() and
           getall_cid2code(), each yielding 2-tuples.
  cdbfile: path handed to cdb.cdbmake() as the output file.
  verbose: when true, a 'Writing: ...' progress line goes to stderr.

  Returns None.
  """
  # NOTE(review): unpack is imported but never used in this function.
  from struct import pack, unpack
  # Prefer a module named cdb if one is importable; otherwise fall back
  # to the pdfminer.pycdb module bundled with the package.
  try:
    import cdb
  except ImportError:
    import pdfminer.pycdb as cdb
  # NOTE(review): the second argument is presumably a temporary file that
  # finish() turns into cdbfile -- confirm against the cdb API.
  m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
  if verbose:
    print >>stderr, 'Writing: %r...' % cdbfile
  # CMap attributes, stored under '/'-prefixed keys as repr() strings.
  for (k,v) in cmap.getall_attrs():
    m.add('/'+k, repr(v))
  # Forward mapping: character code -> CID.
  for (code,cid) in cmap.getall_code2cid():
    m.add('c'+code, pack('>L',cid))
  # Reverse mapping: CID -> character code.
  for (cid,code) in cmap.getall_cid2code():
    m.add('i'+pack('>L',cid), code)
  m.finish()
  return
 def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
  """Convert each named CMap file into a '<name>.cmap.cdb' file.

  files:      iterable of CMap file paths; only the base name of each
              path is used to look the CMap up and to name the output.
  cmapdir:    directory passed to CMapDB.initialize().
  cdbcmapdir: directory in which the .cmap.cdb files are created.
  force:      when false, an output file that already exists is left
              alone and reported as skipped.

  Progress messages are written to stderr. Returns None.
  """
  from pdfminer.cmap import CMapDB
  CMapDB.initialize(cmapdir)
  for fname in files:
    # Files ending in '.upr' are silently ignored (they are picked up by
    # the Makefile's CMap/* glob; presumably not CMap data -- confirm).
    if fname.endswith('.upr'): continue
    cmapname = os.path.basename(fname)
    cdbname = os.path.join(cdbcmapdir, cmapname+'.cmap.cdb')
    # Do not regenerate an existing output unless the caller forces it.
    if not force and os.path.exists(cdbname):
      print >>stderr, 'Skipping: %r' % cdbname
      continue
    print >>stderr, 'Reading: %r...' % fname
    cmap = CMapDB.get_cmap(cmapname)
    dumpcdb(cmap, cdbname)
  return
 def main(argv):
  """Command-line entry point: convert CMap files into cdb files.

  argv: full argument vector, with the program name in argv[0].

  Options:
    -c cmapdir     directory holding the source CMap files (default 'CMap')
    -C cdbcmapdir  directory receiving the cdb files (default 'CDBCMap')
    -f             regenerate outputs even when they already exist

  Returns 100 after printing the usage text when the options cannot be
  parsed or when no input file is given; otherwise returns the result of
  convert_cmap(). Raises ValueError when either directory does not exist.
  """
  import getopt
  def usage():
    # The parenthesized single-argument form prints the same text under
    # Python 2 and also parses under Python 3.
    print('usage: %s [-c cmapdir] [-C cdbcmapdir] [-f] file ...' % argv[0])
    return 100
  try:
    (opts, args) = getopt.getopt(argv[1:], 'c:C:f')
  except getopt.GetoptError:
    return usage()
  # With no input files there is nothing to convert. The usage status must
  # be returned here; previously it was discarded and execution fell
  # through to the directory checks below.
  if not args: return usage()
  cmapdir = 'CMap'
  cdbcmapdir = 'CDBCMap'
  force = False
  for (k, v) in opts:
    if k == '-f': force = True
    elif k == '-c': cmapdir = v
    elif k == '-C': cdbcmapdir = v
  # Fail early with a clear message rather than part-way through a run.
  if not os.path.isdir(cmapdir):
    raise ValueError('not directory: %r' % cmapdir)
  if not os.path.isdir(cdbcmapdir):
    raise ValueError('not directory: %r' % cdbcmapdir)
  return convert_cmap(args, cmapdir, cdbcmapdir, force=force)
 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
-from pdfminer.cmap import CMapDB
+from pdfminer.cmap import CMapDB, find_cmap_path
 # main
 def main(argv):
@ -20,8 +20,7 @@ def main(argv):
  # debug option
  debug = 0
  # path option
-  cmapdir = 'CMap'
+  cmapdir = find_cmap_path()
  cdbcmapdir = 'CDBCMap'
  # input option
  password = ''
  pagenos = set()
@ -37,7 +36,6 @@ def main(argv):
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-C': cmapdir = v
    elif k == '-D': cdbcmapdir = v
    elif k == '-P': password = v
    elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
    elif k == '-m': maxpages = int(v)
@ -54,7 +52,7 @@ def main(argv):
  PDFPageInterpreter.debug = debug
  PDFDevice.debug = debug
  #
-  CMapDB.initialize(cmapdir, cdbcmapdir)
+  CMapDB.initialize(cmapdir)
  rsrc = PDFResourceManager()
  if outtype == 'sgml':
    device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)