git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@121 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-21 14:23:23 +00:00
parent 8a5bec5065
commit 9093c340af
5 changed files with 76 additions and 33 deletions

42
MANIFEST Normal file
View File

@ -0,0 +1,42 @@
TODO
Makefile
README.html
setup.py
pdfminer/Makefile
pdfminer/__init__.py
pdfminer/arcfour.py
pdfminer/ascii85.py
pdfminer/cmap.py
pdfminer/converter.py
pdfminer/fontmetrics.py
pdfminer/glyphlist.py
pdfminer/latin2ascii.py
pdfminer/latin_enc.py
pdfminer/layout.py
pdfminer/lzw.py
pdfminer/pdfcolor.py
pdfminer/pdfdevice.py
pdfminer/pdffont.py
pdfminer/pdfinterp.py
pdfminer/pdfparser.py
pdfminer/pdftypes.py
pdfminer/psparser.py
pdfminer/pycdb.py
pdfminer/rijndael.py
pdfminer/utils.py
tools/Makefile
tools/dumppdf.py
tools/pdf2txt.py
tools/pdf2html.cgi
tools/conv_afm.py
tools/prof.py
samples/Makefile
samples/jo.pdf
samples/simple1.pdf
samples/simple2.pdf
samples/dmca.pdf
samples/f1040nr.pdf
samples/i1040nr.pdf
samples/kampo.pdf
samples/naacl06-shinyama.pdf
samples/nlp2004slides.pdf

View File

@ -1,17 +1,16 @@
# Makefile for pdfminer
## Makefile (for maintenance purposes)
##
PACKAGE=pdfminer
PREFIX=/usr/local
SVN=svn
GNUTAR=tar
PYTHON=python
PREFIX=/usr/local
TMPDIR=/tmp
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz
RM=rm -f
CP=cp -f
CONV_CMAP=$(PYTHON) pdfminer/cmap.py
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
DISTFILE=$(PACKAGE)-$(VERSION).tar.gz
all:
@ -19,7 +18,8 @@ install:
$(PYTHON) setup.py install --prefix=$(PREFIX)
clean:
-rm -rf build
-$(PYTHON) setup.py clean
-$(RM) -r build dist
-cd $(PACKAGE) && $(MAKE) clean
-cd tools && $(MAKE) clean
-cd samples && $(MAKE) clean
@ -27,20 +27,16 @@ clean:
test:
cd samples && $(MAKE) test
# Maintenance:
commit: clean
$(SVN) commit
check:
cd $(PACKAGE) && make check
dist: clean
$(SVN) cleanup
$(SVN) export . $(TMPDIR)/$(DISTNAME)
$(GNUTAR) c -z -C$(TMPDIR) -f $(TMPDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
-rm -rf $(TMPDIR)/$(DISTNAME)
dist/$(DISTFILE): clean
$(PYTHON) setup.py sdist
WEBDIR=$$HOME/Site/unixuser.org/python/pdfminer
publish: dist
cp $(TMPDIR)/$(DISTFILE) $(WEBDIR)
cp README.html $(WEBDIR)/index.html
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish: dist/$(DISTFILE)
$(CP) dist/$(DISTFILE) $(WEBDIR)
$(CP) README.html $(WEBDIR)/index.html

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Tue Jul 21 16:24:26 JST 2009
Last Modified: Tue Jul 21 23:22:42 JST 2009
<!-- hhmts end -->
</div>
@ -37,7 +37,7 @@ PDF parser that can be used for other purposes instead of text analysis.
<p>
<strong>Features:</strong>
<ul>
<li> Written entirely in Python. (version 2.4 or newer required)
<li> Written entirely in Python. (for version 2.4 or newer)
<li> PDF-1.7 specification support. (well, almost)
<li> Non-ASCII languages and vertical writing scripts support.
<li> Various font types (Type1, TrueType, Type3, and CID) support.
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<a name="source"></a>
<p>
<strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz
</a>
(1.8Mbytes)
@ -158,7 +158,7 @@ Examples:
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
(extract text as an HTML file whose filename is output.html)
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -D V -o output.html</strong>
(extract a Japanese HTML file in vertical writing; CMap is required)
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>

View File

@ -761,7 +761,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
parser = PDFParser(doc, fp)
doc.initialize(password)
if not doc.is_extractable:
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp.name)
interpreter = PDFPageInterpreter(rsrc, device)
for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue

View File

@ -14,18 +14,23 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''',
keywords=['pdf parser', 'pdf converter', 'text mining'],
license='MIT/X',
author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=['pdfminer'],
scripts=['tools/pdf2txt.py', 'tools/dumppdf.py'],
packages=[
'pdfminer'
],
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py'
],
keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Development Status :: 4 - Beta',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
],
)