20090721
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@121 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
8a5bec5065
commit
9093c340af
|
@ -0,0 +1,42 @@
|
||||||
|
TODO
|
||||||
|
Makefile
|
||||||
|
README.html
|
||||||
|
setup.py
|
||||||
|
pdfminer/Makefile
|
||||||
|
pdfminer/__init__.py
|
||||||
|
pdfminer/arcfour.py
|
||||||
|
pdfminer/ascii85.py
|
||||||
|
pdfminer/cmap.py
|
||||||
|
pdfminer/converter.py
|
||||||
|
pdfminer/fontmetrics.py
|
||||||
|
pdfminer/glyphlist.py
|
||||||
|
pdfminer/latin2ascii.py
|
||||||
|
pdfminer/latin_enc.py
|
||||||
|
pdfminer/layout.py
|
||||||
|
pdfminer/lzw.py
|
||||||
|
pdfminer/pdfcolor.py
|
||||||
|
pdfminer/pdfdevice.py
|
||||||
|
pdfminer/pdffont.py
|
||||||
|
pdfminer/pdfinterp.py
|
||||||
|
pdfminer/pdfparser.py
|
||||||
|
pdfminer/pdftypes.py
|
||||||
|
pdfminer/psparser.py
|
||||||
|
pdfminer/pycdb.py
|
||||||
|
pdfminer/rijndael.py
|
||||||
|
pdfminer/utils.py
|
||||||
|
tools/Makefile
|
||||||
|
tools/dumppdf.py
|
||||||
|
tools/pdf2txt.py
|
||||||
|
tools/pdf2html.cgi
|
||||||
|
tools/conv_afm.py
|
||||||
|
tools/prof.py
|
||||||
|
samples/Makefile
|
||||||
|
samples/jo.pdf
|
||||||
|
samples/simple1.pdf
|
||||||
|
samples/simple2.pdf
|
||||||
|
samples/dmca.pdf
|
||||||
|
samples/f1040nr.pdf
|
||||||
|
samples/i1040nr.pdf
|
||||||
|
samples/kampo.pdf
|
||||||
|
samples/naacl06-shinyama.pdf
|
||||||
|
samples/nlp2004slides.pdf
|
34
Makefile
34
Makefile
|
@ -1,17 +1,16 @@
|
||||||
# Makefile for pdfminer
|
## Makefile (for maintenance purposes)
|
||||||
|
##
|
||||||
|
|
||||||
PACKAGE=pdfminer
|
PACKAGE=pdfminer
|
||||||
|
PREFIX=/usr/local
|
||||||
|
|
||||||
SVN=svn
|
SVN=svn
|
||||||
GNUTAR=tar
|
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
PREFIX=/usr/local
|
RM=rm -f
|
||||||
TMPDIR=/tmp
|
CP=cp -f
|
||||||
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
|
|
||||||
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
|
||||||
DISTFILE=$(DISTNAME).tar.gz
|
|
||||||
|
|
||||||
CONV_CMAP=$(PYTHON) pdfminer/cmap.py
|
VERSION=`$(PYTHON) $(PACKAGE)/__init__.py`
|
||||||
|
DISTFILE=$(PACKAGE)-$(VERSION).tar.gz
|
||||||
|
|
||||||
all:
|
all:
|
||||||
|
|
||||||
|
@ -19,7 +18,8 @@ install:
|
||||||
$(PYTHON) setup.py install --prefix=$(PREFIX)
|
$(PYTHON) setup.py install --prefix=$(PREFIX)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm -rf build
|
-$(PYTHON) setup.py clean
|
||||||
|
-$(RM) -r build dist
|
||||||
-cd $(PACKAGE) && $(MAKE) clean
|
-cd $(PACKAGE) && $(MAKE) clean
|
||||||
-cd tools && $(MAKE) clean
|
-cd tools && $(MAKE) clean
|
||||||
-cd samples && $(MAKE) clean
|
-cd samples && $(MAKE) clean
|
||||||
|
@ -27,20 +27,16 @@ clean:
|
||||||
test:
|
test:
|
||||||
cd samples && $(MAKE) test
|
cd samples && $(MAKE) test
|
||||||
|
|
||||||
# Maintenance:
|
|
||||||
commit: clean
|
commit: clean
|
||||||
$(SVN) commit
|
$(SVN) commit
|
||||||
|
|
||||||
check:
|
check:
|
||||||
cd $(PACKAGE) && make check
|
cd $(PACKAGE) && make check
|
||||||
|
|
||||||
dist: clean
|
dist/$(DISTFILE): clean
|
||||||
$(SVN) cleanup
|
$(PYTHON) setup.py sdist
|
||||||
$(SVN) export . $(TMPDIR)/$(DISTNAME)
|
|
||||||
$(GNUTAR) c -z -C$(TMPDIR) -f $(TMPDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
|
||||||
-rm -rf $(TMPDIR)/$(DISTNAME)
|
|
||||||
|
|
||||||
WEBDIR=$$HOME/Site/unixuser.org/python/pdfminer
|
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
|
||||||
publish: dist
|
publish: dist/$(DISTFILE)
|
||||||
cp $(TMPDIR)/$(DISTFILE) $(WEBDIR)
|
$(CP) dist/$(DISTFILE) $(WEBDIR)
|
||||||
cp README.html $(WEBDIR)/index.html
|
$(CP) README.html $(WEBDIR)/index.html
|
||||||
|
|
10
README.html
10
README.html
|
@ -18,7 +18,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Tue Jul 21 16:24:26 JST 2009
|
Last Modified: Tue Jul 21 23:22:42 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -37,7 +37,7 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<p>
|
<p>
|
||||||
<strong>Features:</strong>
|
<strong>Features:</strong>
|
||||||
<ul>
|
<ul>
|
||||||
<li> Written entirely in Python. (version 2.4 or newer required)
|
<li> Written entirely in Python. (for version 2.4 or newer)
|
||||||
<li> PDF-1.7 specification support. (well, almost)
|
<li> PDF-1.7 specification support. (well, almost)
|
||||||
<li> Non-ASCII languages and vertical writing scripts support.
|
<li> Non-ASCII languages and vertical writing scripts support.
|
||||||
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
<li> Various font types (Type1, TrueType, Type3, and CID) support.
|
||||||
|
@ -51,8 +51,8 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<a name="source"></a>
|
<a name="source"></a>
|
||||||
<p>
|
<p>
|
||||||
<strong>Download:</strong><br>
|
<strong>Download:</strong><br>
|
||||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz">
|
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz">
|
||||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090711.tar.gz
|
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-20090721.tar.gz
|
||||||
</a>
|
</a>
|
||||||
(1.8Mbytes)
|
(1.8Mbytes)
|
||||||
|
|
||||||
|
@ -158,7 +158,7 @@ Examples:
|
||||||
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
|
$ <strong>pdf2txt.py samples/naacl06-shinyama.pdf -o output.html</strong>
|
||||||
(extract text as an HTML file whose filename is output.html)
|
(extract text as an HTML file whose filename is output.html)
|
||||||
|
|
||||||
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -o output.html</strong>
|
$ <strong>pdf2txt.py -c euc-jp samples/jo.pdf -D V -o output.html</strong>
|
||||||
(extract a Japanese HTML file in vertical writing, CMap is required)
|
(extract a Japanese HTML file in vertical writing, CMap is required)
|
||||||
|
|
||||||
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
|
$ <strong>pdf2txt.py -P mypassword secret.pdf -o output.txt</strong>
|
||||||
|
|
|
@ -761,7 +761,7 @@ def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
|
||||||
parser = PDFParser(doc, fp)
|
parser = PDFParser(doc, fp)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
if not doc.is_extractable:
|
if not doc.is_extractable:
|
||||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp.name)
|
||||||
interpreter = PDFPageInterpreter(rsrc, device)
|
interpreter = PDFPageInterpreter(rsrc, device)
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
for (pageno,page) in enumerate(doc.get_pages()):
|
||||||
if pagenos and (pageno not in pagenos): continue
|
if pagenos and (pageno not in pagenos): continue
|
||||||
|
|
21
setup.py
21
setup.py
|
@ -14,18 +14,23 @@ other extra information such as font information or ruled lines.
|
||||||
It includes a PDF converter that can transform PDF files
|
It includes a PDF converter that can transform PDF files
|
||||||
into other text formats (such as HTML). It has an extensible
|
into other text formats (such as HTML). It has an extensible
|
||||||
PDF parser that can be used for other purposes instead of text analysis.''',
|
PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
|
||||||
license='MIT/X',
|
license='MIT/X',
|
||||||
author='Yusuke Shinyama',
|
author='Yusuke Shinyama',
|
||||||
author_email='yusuke at cs dot nyu dot edu',
|
author_email='yusuke at cs dot nyu dot edu',
|
||||||
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
|
||||||
packages=['pdfminer'],
|
packages=[
|
||||||
scripts=['tools/pdf2txt.py', 'tools/dumppdf.py'],
|
'pdfminer'
|
||||||
|
],
|
||||||
|
scripts=[
|
||||||
|
'tools/pdf2txt.py',
|
||||||
|
'tools/dumppdf.py'
|
||||||
|
],
|
||||||
|
keywords=['pdf parser', 'pdf converter', 'text mining'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 4 - Beta',
|
'Development Status :: 4 - Beta',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'Intended Audience :: Developers',
|
'Intended Audience :: Developers',
|
||||||
'Intended Audience :: Science/Research',
|
'Intended Audience :: Science/Research',
|
||||||
'License :: OSI Approved :: MIT License',
|
'License :: OSI Approved :: MIT License',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue