documentation fix.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@108 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
2756223c85
commit
8cae56a555
5
Makefile
5
Makefile
|
@ -10,7 +10,7 @@ VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
|
|||
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
||||
DISTFILE=$(DISTNAME).tar.gz
|
||||
|
||||
CONV_CMAP=$(PYTHON) -m tools.conv_cmap
|
||||
CONV_CMAP=$(PYTHON) pdfminer/cmap.py
|
||||
|
||||
all:
|
||||
|
||||
|
@ -27,8 +27,7 @@ test:
|
|||
cd samples && make test
|
||||
|
||||
cdbcmap: CMap
|
||||
-mkdir CDBCMap
|
||||
$(CONV_CMAP) CMap/*
|
||||
$(CONV_CMAP) CMap
|
||||
|
||||
# Maintenance:
|
||||
commit: clean
|
||||
|
|
50
README.html
50
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sat Apr 4 09:14:28 JST 2009
|
||||
Last Modified: Sat May 16 19:58:11 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -26,23 +26,22 @@ Last Modified: Sat Apr 4 09:14:28 JST 2009
|
|||
<hr noshade>
|
||||
<h2>What's It?</h2>
|
||||
<p>
|
||||
PDFMiner is a suite of programs that aims to help
|
||||
analyzing text data from PDF documents.
|
||||
It includes a PDF parser, a PDF renderer
|
||||
(though only rendering text is supported for now),
|
||||
and a couple of nice tools to extract texts.
|
||||
PDFMiner is a suite of programs that help
|
||||
extract and analyze text data from PDF documents.
|
||||
Unlike other PDF-related tools, it allows you to obtain
|
||||
the exact location of texts in a page, as well as
|
||||
other layout information such as font size or font name,
|
||||
which could be useful for analyzing the document.
|
||||
other extra information such as font information or ruled lines.
|
||||
It includes a PDF converter that can transform PDF files
|
||||
into other text formats (such as HTML). It has an extensible
|
||||
PDF parser that can be used for other purposes instead of text analysis.
|
||||
<p>
|
||||
<strong>Features:</strong>
|
||||
<ul>
|
||||
<li> Written entirely in Python. (for version 2.5 or newer)
|
||||
<li> PDF-1.7 specification support.
|
||||
<li> Written entirely in Python. (version 2.4 or newer required)
|
||||
<li> PDF-1.7 specification support. (well, almost)
|
||||
<li> Non-ASCII languages and vertical writing scripts support.
|
||||
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
||||
<li> Basic encryption (RC4).
|
||||
<li> Basic encryption (RC4) support.
|
||||
<li> PDF to HTML conversion (with a sample converter web app).
|
||||
<li> Outline (TOC) extraction.
|
||||
<li> Tagged contents extraction.
|
||||
|
@ -78,33 +77,28 @@ http://pdf2html.tabesugi.net:8080/
|
|||
<a name="install"></a>
|
||||
<hr noshade>
|
||||
<h2>How to Install</h2>
|
||||
<p>
|
||||
<strong>Note:</strong>
|
||||
This software is not yet out-of-the-box.
|
||||
You have to download and unpack it manually,
|
||||
and spend some time to make it work.
|
||||
<strong>Your will</strong> is needed!
|
||||
I do not support easy_install or setup.py or any automated installation until
|
||||
this is very matured to the point that it really should be widely distributed.
|
||||
(For now, it's not yet up to the standard, IMO.)
|
||||
|
||||
<ol>
|
||||
<li> Install <a href="http://www.python.org/download/">Python</a> 2.4 or newer.
|
||||
<li> Download the <a href="#source">PDFMiner source</a>.
|
||||
<li> Extract it.
|
||||
<li> Go to the <code>pdfminer</code> directory.
|
||||
<li> Run <code>setup.py</code> to install:<br>
|
||||
<blockquote><pre>
|
||||
# <strong>python setup.py install</strong>
|
||||
</pre></blockquote>
|
||||
<li> Do the following test:<br>
|
||||
<blockquote><pre>
|
||||
$ <strong>python pdflib/pdf2txt.py samples/simple1.pdf</strong>
|
||||
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ascii">
|
||||
$ <strong>pdf2txt.py samples/simple1.pdf</strong>
|
||||
<html><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
</head><body>
|
||||
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:106px; top:224px; font-size:22px;">Hello </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:168px; top:224px; font-size:22px;">World </span>
|
||||
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
|
||||
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"> World </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"> </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> Hello </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;">World </span>
|
||||
<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span>
|
||||
<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
|
||||
</body></html>
|
||||
</pre></blockquote>
|
||||
|
|
|
@ -2,8 +2,7 @@
|
|||
|
||||
PYTHON=python
|
||||
CMAPDIR=../CMap
|
||||
CDBCMAPDIR=../CDBCMap
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) -D$(CDBCMAPDIR)
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR)
|
||||
|
||||
HTMLS= \
|
||||
simple1.html \
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, os.path
|
||||
stderr = sys.stderr
|
||||
|
||||
def dumpcdb(cmap, cdbfile, verbose=1):
    """Dump the contents of *cmap* into a cdb file at *cdbfile*.

    Three kinds of records are written through a ``cdbmake`` object:

    * ``'/' + name``          -> ``repr(value)`` for each pair yielded by
      ``cmap.getall_attrs()``
    * ``'c' + code``          -> ``cid`` packed as a big-endian unsigned
      32-bit integer, for each pair from ``cmap.getall_code2cid()``
    * ``'i' + packed cid``    -> ``code``, for each pair from
      ``cmap.getall_cid2code()``

    The ``cdb`` module is used when importable; otherwise
    ``pdfminer.pycdb`` is used in its place.  When *verbose* is true a
    progress line is printed to ``stderr``.  Returns None.
    """
    from struct import pack
    try:
        import cdb
    except ImportError:
        import pdfminer.pycdb as cdb
    # The second argument is presumably a scratch file used while the
    # database is being built -- confirm against the cdbmake API.
    maker = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
    if verbose:
        print >>stderr, 'Writing: %r...' % cdbfile
    for (name, value) in cmap.getall_attrs():
        maker.add('/'+name, repr(value))
    for (code, cid) in cmap.getall_code2cid():
        maker.add('c'+code, pack('>L', cid))
    for (cid, code) in cmap.getall_cid2code():
        maker.add('i'+pack('>L', cid), code)
    maker.finish()
    return
|
||||
|
||||
def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
    """Convert each CMap named in *files* into a cdb file in *cdbcmapdir*.

    ``CMapDB`` is initialized with *cmapdir* first.  For every entry in
    *files*:

    * names ending in ``.upr`` are ignored;
    * the output path is ``<cdbcmapdir>/<basename>.cmap.cdb``;
    * an existing output file is left alone (and reported as skipped)
      unless *force* is true;
    * otherwise the CMap is looked up by its base name and written with
      ``dumpcdb``.

    Progress messages go to ``stderr``.  Returns None.
    """
    from pdfminer.cmap import CMapDB
    CMapDB.initialize(cmapdir)
    for path in files:
        if path.endswith('.upr'):
            continue
        name = os.path.basename(path)
        target = os.path.join(cdbcmapdir, name+'.cmap.cdb')
        if force or not os.path.exists(target):
            print >>stderr, 'Reading: %r...' % path
            dumpcdb(CMapDB.get_cmap(name), target)
        else:
            print >>stderr, 'Skipping: %r' % target
    return
|
||||
|
||||
def main(argv):
    """Command-line entry point: convert CMap files into cdb files.

    Options (parsed with ``getopt``):

    * ``-c cmapdir``     directory handed to ``convert_cmap`` as the CMap
      source directory (default ``'CMap'``)
    * ``-C cdbcmapdir``  directory receiving the generated cdb files
      (default ``'CDBCMap'``)
    * ``-f``             regenerate output files that already exist

    Returns 100 after printing a usage line when the options cannot be
    parsed or when no file arguments are given; otherwise returns the
    result of ``convert_cmap``.

    Raises ValueError when either directory does not exist.
    """
    import getopt

    def usage():
        # print(...) with a single argument prints the same text under
        # both the Python 2 print statement and the Python 3 function.
        print('usage: %s [-c cmapdir] [-C cdbcmapdir] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:C:f')
    except getopt.GetoptError:
        return usage()
    if not args:
        # The usage() result used to be discarded here, so execution
        # carried on to the directory checks and conversion.
        return usage()
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    force = False
    for (k, v) in opts:
        if k == '-f':
            force = True
        elif k == '-c':
            cmapdir = v
        elif k == '-C':
            cdbcmapdir = v
    if not os.path.isdir(cmapdir):
        raise ValueError('not directory: %r' % cmapdir)
    if not os.path.isdir(cdbcmapdir):
        raise ValueError('not directory: %r' % cdbcmapdir)
    return convert_cmap(args, cmapdir, cdbcmapdir, force=force)
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
|
@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser
|
|||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
|
||||
from pdfminer.cmap import CMapDB
|
||||
from pdfminer.cmap import CMapDB, find_cmap_path
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
|
@ -20,8 +20,7 @@ def main(argv):
|
|||
# debug option
|
||||
debug = 0
|
||||
# path option
|
||||
cmapdir = 'CMap'
|
||||
cdbcmapdir = 'CDBCMap'
|
||||
cmapdir = find_cmap_path()
|
||||
# input option
|
||||
password = ''
|
||||
pagenos = set()
|
||||
|
@ -37,7 +36,6 @@ def main(argv):
|
|||
for (k, v) in opts:
|
||||
if k == '-d': debug += 1
|
||||
elif k == '-C': cmapdir = v
|
||||
elif k == '-D': cdbcmapdir = v
|
||||
elif k == '-P': password = v
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
elif k == '-m': maxpages = int(v)
|
||||
|
@ -54,7 +52,7 @@ def main(argv):
|
|||
PDFPageInterpreter.debug = debug
|
||||
PDFDevice.debug = debug
|
||||
#
|
||||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||
CMapDB.initialize(cmapdir)
|
||||
rsrc = PDFResourceManager()
|
||||
if outtype == 'sgml':
|
||||
device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
|
||||
|
|
Loading…
Reference in New Issue