20090721
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@121 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
8a5bec5065
commit
9093c340af
|
@ -0,0 +1,42 @@
|
|||
TODO
|
||||
Makefile
|
||||
README.html
|
||||
setup.py
|
||||
pdfminer/Makefile
|
||||
pdfminer/__init__.py
|
||||
pdfminer/arcfour.py
|
||||
pdfminer/ascii85.py
|
||||
pdfminer/cmap.py
|
||||
pdfminer/converter.py
|
||||
pdfminer/fontmetrics.py
|
||||
pdfminer/glyphlist.py
|
||||
pdfminer/latin2ascii.py
|
||||
pdfminer/latin_enc.py
|
||||
pdfminer/layout.py
|
||||
pdfminer/lzw.py
|
||||
pdfminer/pdfcolor.py
|
||||
pdfminer/pdfdevice.py
|
||||
pdfminer/pdffont.py
|
||||
pdfminer/pdfinterp.py
|
||||
pdfminer/pdfparser.py
|
||||
pdfminer/pdftypes.py
|
||||
pdfminer/psparser.py
|
||||
pdfminer/pycdb.py
|
||||
pdfminer/rijndael.py
|
||||
pdfminer/utils.py
|
||||
tools/Makefile
|
||||
tools/dumppdf.py
|
||||
tools/pdf2txt.py
|
||||
tools/pdf2html.cgi
|
||||
tools/conv_afm.py
|
||||
tools/prof.py
|
||||
samples/Makefile
|
||||
samples/jo.pdf
|
||||
samples/simple1.pdf
|
||||
samples/simple2.pdf
|
||||
samples/dmca.pdf
|
||||
samples/f1040nr.pdf
|
||||
samples/i1040nr.pdf
|
||||
samples/kampo.pdf
|
||||
samples/naacl06-shinyama.pdf
|
||||
samples/nlp2004slides.pdf
|
34
Makefile
34
Makefile
|
@ -1,17 +1,16 @@
|
|||
# Makefile for pdfminer
|
||||
## Makefile (for maintenance purposes)
|
||||
##
|
||||
|
||||
PACKAGE=pdfminer
|
||||
PREFIX=/usr/local
|
||||
|
||||
SVN=svn
|
||||
GNUTAR=tar
|
||||
PYTHON=python
|
||||
PREFIX=/usr/local
|
||||
TMPDIR=/tmp
|
||||
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
|
||||
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
||||
DISTFILE=$(DISTNAME).tar.gz
|
||||
RM=rm -f
|
||||
CP=cp -f
|
||||
|
||||
CONV_CMAP=$(PYTHON) pdfminer/cmap.py
|
||||
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
|
||||
DISTFILE=$(PACKAGE)-$(VERSION).tar.gz
|
||||
|
||||
all:
|
||||
|
||||
|
@ -19,7 +18,8 @@ install:
|
|||
$(PYTHON) setup.py install --prefix=$(PREFIX)
|
||||
|
||||
clean:
|
||||
-rm -rf build
|
||||
-$(PYTHON) setup.py clean
|
||||
-$(RM) -r build dist
|
||||
-cd $(PACKAGE) && $(MAKE) clean
|
||||
-cd tools && $(MAKE) clean
|
||||
-cd samples && $(MAKE) clean
|
||||
|
@ -27,20 +27,16 @@ clean:
|
|||
test:
|
||||
cd samples && $(MAKE) test
|
||||
|
||||
# Maintenance:
|
||||
commit: clean
|
||||
$(SVN) commit
|
||||
|
||||
check:
|
||||
cd $(PACKAGE) && make check
|
||||
|
||||
dist: clean
|
||||
$(SVN) cleanup
|
||||
$(SVN) export . $(TMPDIR)/$(DISTNAME)
|
||||
$(GNUTAR) c -z -C$(TMPDIR) -f $(TMPDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
||||
-rm -rf $(TMPDIR)/$(DISTNAME)
|
||||
dist/$(DISTFILE): clean
|
||||
$(PYTHON) setup.py sdist
|
||||
|
||||
WEBDIR=$$HOME/Site/unixuser.org/python/pdfminer
|
||||
publish: dist
|
||||
cp $(TMPDIR)/$(DISTFILE) $(WEBDIR)
|
||||
cp README.html $(WEBDIR)/index.html
|
||||
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
|
||||
publish: dist/$(DISTFILE)
|
||||
$(CP) dist/$(DISTFILE) $(WEBDIR)
|
||||
$(CP) README.html $(WEBDIR)/index.html
|
||||
|
|
10
README.html
10
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Tue Jul 21 16:24:26 JST 2009
|
||||
Last Modified: Tue Jul 21 23:22:42 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -37,7 +37,7 @@ PDF parser that can be used for other purposes instead of text analysis.
|
|||
<p>
|
||||
<strong>Features:</strong>
|
||||
<ul>
|
||||
<li> Written entirely in Python. (version 2.4 or newer required)
|
||||
<li> Written entirely in Python. (for version 2.4 or newer)
|
||||
<li> PDF-1.7 specification support. (well, almost)
|
||||
<li> Non-ASCII languages and vertical writing scripts support.
|
||||
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
||||
|
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
|
|||
<a name="source"></a>
|
||||
<p>
|
||||
<strong>Download:</strong><br>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz
|
||||
</a>
|
||||
(1.8Mbytes)
|
||||
|
||||
|
@ -158,7 +158,7 @@ Examples:
|
|||
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
|
||||
(extract text as an HTML file whose filename is output.html)
|
||||
|
||||
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
|
||||
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -D V -o output.html</strong>
|
||||
(extract a Japanese HTML file in vertical writing, CMap is required)
|
||||
|
||||
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
|
||||
|
|
|
@ -761,7 +761,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
|||
parser = PDFParser(doc, fp)
|
||||
doc.initialize(password)
|
||||
if not doc.is_extractable:
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp.name)
|
||||
interpreter = PDFPageInterpreter(rsrc, device)
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
|
|
21
setup.py
21
setup.py
|
@ -14,18 +14,23 @@ other extra information such as font information or ruled lines.
|
|||
It includes a PDF converter that can transform PDF files
|
||||
into other text formats (such as HTML). It has an extensible
|
||||
PDF parser that can be used for other purposes instead of text analysis.''',
|
||||
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||
license='MIT/X',
|
||||
author='Yusuke Shinyama',
|
||||
author_email='yusuke at cs dot nyu dot edu',
|
||||
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
||||
packages=['pdfminer'],
|
||||
scripts=['tools/pdf2txt.py', 'tools/dumppdf.py'],
|
||||
packages=[
|
||||
'pdfminer'
|
||||
],
|
||||
scripts=[
|
||||
'tools/pdf2txt.py',
|
||||
'tools/dumppdf.py'
|
||||
],
|
||||
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||
classifiers=[
|
||||
'Development Status :: 4 - Beta',
|
||||
'Environment :: Console',
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Development Status :: 4 - Beta',
|
||||
'Environment :: Console',
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
],
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue