git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@121 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-07-21 14:23:23 +00:00
parent 8a5bec5065
commit 9093c340af
5 changed files with 76 additions and 33 deletions

42
MANIFEST Normal file
View File

@ -0,0 +1,42 @@
TODO
Makefile
README.html
setup.py
pdfminer/Makefile
pdfminer/__init__.py
pdfminer/arcfour.py
pdfminer/ascii85.py
pdfminer/cmap.py
pdfminer/converter.py
pdfminer/fontmetrics.py
pdfminer/glyphlist.py
pdfminer/latin2ascii.py
pdfminer/latin_enc.py
pdfminer/layout.py
pdfminer/lzw.py
pdfminer/pdfcolor.py
pdfminer/pdfdevice.py
pdfminer/pdffont.py
pdfminer/pdfinterp.py
pdfminer/pdfparser.py
pdfminer/pdftypes.py
pdfminer/psparser.py
pdfminer/pycdb.py
pdfminer/rijndael.py
pdfminer/utils.py
tools/Makefile
tools/dumppdf.py
tools/pdf2txt.py
tools/pdf2html.cgi
tools/conv_afm.py
tools/prof.py
samples/Makefile
samples/jo.pdf
samples/simple1.pdf
samples/simple2.pdf
samples/dmca.pdf
samples/f1040nr.pdf
samples/i1040nr.pdf
samples/kampo.pdf
samples/naacl06-shinyama.pdf
samples/nlp2004slides.pdf

View File

@ -1,17 +1,16 @@
# Makefile for pdfminer ## Makefile (for maintenance purposes)
##
PACKAGE=pdfminer PACKAGE=pdfminer
PREFIX=/usr/local
SVN=svn SVN=svn
GNUTAR=tar
PYTHON=python PYTHON=python
PREFIX=/usr/local RM=rm -f
TMPDIR=/tmp CP=cp -f
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz
CONV_CMAP=$(PYTHON) pdfminer/cmap.py VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
DISTFILE=$(PACKAGE)-$(VERSION).tar.gz
all: all:
@ -19,7 +18,8 @@ install:
$(PYTHON) setup.py install --prefix=$(PREFIX) $(PYTHON) setup.py install --prefix=$(PREFIX)
clean: clean:
-rm -rf build -$(PYTHON) setup.py clean
-$(RM) -r build dist
-cd $(PACKAGE) && $(MAKE) clean -cd $(PACKAGE) && $(MAKE) clean
-cd tools && $(MAKE) clean -cd tools && $(MAKE) clean
-cd samples && $(MAKE) clean -cd samples && $(MAKE) clean
@ -27,20 +27,16 @@ clean:
test: test:
cd samples && $(MAKE) test cd samples && $(MAKE) test
# Maintenance:
commit: clean commit: clean
$(SVN) commit $(SVN) commit
check: check:
cd $(PACKAGE) && make check cd $(PACKAGE) && make check
dist: clean dist/$(DISTFILE): clean
$(SVN) cleanup $(PYTHON) setup.py sdist
$(SVN) export . $(TMPDIR)/$(DISTNAME)
$(GNUTAR) c -z -C$(TMPDIR) -f $(TMPDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
-rm -rf $(TMPDIR)/$(DISTNAME)
WEBDIR=$$HOME/Site/unixuser.org/python/pdfminer WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish: dist publish: dist/$(DISTFILE)
cp $(TMPDIR)/$(DISTFILE) $(WEBDIR) $(CP) dist/$(DISTFILE) $(WEBDIR)
cp README.html $(WEBDIR)/index.html $(CP) README.html $(WEBDIR)/index.html

View File

@ -18,7 +18,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Tue Jul 21 16:24:26 JST 2009 Last Modified: Tue Jul 21 23:22:42 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -37,7 +37,7 @@ PDF parser that can be used for other purposes instead of text analysis.
<p> <p>
<strong>Features:</strong> <strong>Features:</strong>
<ul> <ul>
<li> Written entirely in Python. (version 2.4 or newer required) <li> Written entirely in Python. (for version 2.4 or newer)
<li> PDF-1.7 specification support. (well, almost) <li> PDF-1.7 specification support. (well, almost)
<li> Non-ASCII languages and vertical writing scripts support. <li> Non-ASCII languages and vertical writing scripts support.
<li> Various font types (Type1, TrueType, Type3, and CID) support. <li> Various font types (Type1, TrueType, Type3, and CID) support.
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<a name="source"></a> <a name="source"></a>
<p> <p>
<strong>Download:</strong><br> <strong>Download:</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz"> <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz
</a> </a>
(1.8Mbytes) (1.8Mbytes)
@ -158,7 +158,7 @@ Examples:
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong> $ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
(extract text as an HTML file whose filename is output.html) (extract text as an HTML file whose filename is output.html)
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong> $ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -D V -o output.html</strong>
(extract a Japanese HTML file in vertical writing, CMap is required) (extract a Japanese HTML file in vertical writing, CMap is required)
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong> $ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>

View File

@ -761,7 +761,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
parser = PDFParser(doc, fp) parser = PDFParser(doc, fp)
doc.initialize(password) doc.initialize(password)
if not doc.is_extractable: if not doc.is_extractable:
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp.name)
interpreter = PDFPageInterpreter(rsrc, device) interpreter = PDFPageInterpreter(rsrc, device)
for (pageno,page) in enumerate(doc.get_pages()): for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue if pagenos and (pageno not in pagenos): continue

View File

@ -14,13 +14,18 @@ other extra information such as font information or ruled lines.
It includes a PDF converter that can transform PDF files It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.''', PDF parser that can be used for other purposes instead of text analysis.''',
keywords=['pdf parser', 'pdf converter', 'text mining'],
license='MIT/X', license='MIT/X',
author='Yusuke Shinyama', author='Yusuke Shinyama',
author_email='yusuke at cs dot nyu dot edu', author_email='yusuke at cs dot nyu dot edu',
url='http://www.unixuser.org/~euske/python/pdfminer/index.html', url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=['pdfminer'], packages=[
scripts=['tools/pdf2txt.py', 'tools/dumppdf.py'], 'pdfminer'
],
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py'
],
keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[ classifiers=[
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',
'Environment :: Console', 'Environment :: Console',