reorganize the directory structure.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@35 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-06-29 08:45:46 +00:00
parent 26e762c776
commit 24fdae38d4
26 changed files with 64 additions and 19 deletions

View File

@ -1,7 +1,7 @@
# Makefile for pdfminer # Makefile for pdfminer
PACKAGE=pdfminer PACKAGE=pdfminer
VERSION=20080429 VERSION=20080629
GNUTAR=tar GNUTAR=tar
SVN=svn SVN=svn
PYTHON=python PYTHON=python
@ -10,14 +10,21 @@ WORKDIR=/tmp
DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz DISTFILE=$(DISTNAME).tar.gz
CONV_CMAP=$(PYTHON) -m tools.conv_cmap
all: all:
cdbcmap: CMap cdbcmap: CMap
-mkdir CDBCMap -mkdir CDBCMap
$(PYTHON) conv_cmap.py CMap/* $(CONV_CMAP) CMap/*
samples:
cd samples && make
clean: clean:
-rm *.pyc *.pyo *~ cd pdfminer && make clean
cd tools && make clean
cd samples && make clean
# Maintenance: # Maintenance:

View File

@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
<h1>PDFMiner</h1> <h1>PDFMiner</h1>
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Wed Apr 30 19:15:12 JST 2008 Last Modified: Sun Jun 29 11:34:38 JST 2008
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -114,8 +114,8 @@ PDFMiner comes with two programs:
<code>pdf2txt.py</code> extracts text contents from a PDF file. <code>pdf2txt.py</code> extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programmatically. It extracts all the texts that are to be rendered programmatically.
It also extracts the corresponding locations, font names, It also extracts the corresponding locations, font names,
and font sizes for each text portion in an SGML-like format. and font sizes for each text portion. However,
It cannot extract texts embedded within images it cannot extract texts embedded within images
(i.e. it does not do optical character recognition). (i.e. it does not do optical character recognition).
You can provide a password for protected PDF documents You can provide a password for protected PDF documents
whose access is limited. whose access is limited.
@ -129,7 +129,8 @@ Unicode Standard.
<p> <p>
Examples: Examples:
<blockquote><pre> <blockquote><pre>
$ <strong>./pdf2txt.py samples/naacl06-shinyama.pdf</strong> $ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
(extract text as an HTML file whose filename is output.html)
$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong> $ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
(extract Japanese texts in vertical writing, CMap is required) (extract Japanese texts in vertical writing, CMap is required)
@ -145,15 +146,17 @@ Options:
<dd> Specifies the output file name. <dd> Specifies the output file name.
By default, it prints the extracted contents to stdout. By default, it prints the extracted contents to stdout.
<p> <p>
<dt> <code>-p <em>pageno</em></code> <dt> <code>-p <em>pageno[,pageno,...]</em></code>
<dd> Specifies the page number to be extracted. <dd> Specifies the comma-separated list of the page numbers to be extracted.
Note that page numbers start from zero. Page numbers start from zero.
Multiple <code>-p</code> options are allowed.
By default, it extracts texts from all the pages. By default, it extracts texts from all the pages.
<p> <p>
<dt> <code>-c <em>codec</em></code> <dt> <code>-c <em>codec</em></code>
<dd> Specifies the output codec for non-ASCII texts. <dd> Specifies the output codec for non-ASCII texts.
<p> <p>
<dt> <code>-H</code>
<dd> Specifies that the output is to be an HTML file.
<p>
<dt> <code>-P <em>password</em></code> <dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file. <dd> Provides the user password to open the PDF file.
<p> <p>

1
TODO
View File

@ -1,7 +1,6 @@
TODOs: TODOs:
- Documentation. - Documentation.
- Error handling for invalid type. - Error handling for invalid type.
- Simple viewer application using pygame.
- Outlines. - Outlines.
- Named Objects. (pages) - Named Objects. (pages)

6
pdfminer/Makefile Normal file
View File

@ -0,0 +1,6 @@
# Makefile for pdfminer
#
#   make        - nothing to build (placeholder default target)
#   make clean  - remove compiled Python bytecode from this directory

# Neither target names a real file; declaring them phony keeps a stray file
# called "all" or "clean" from silently disabling the rule.
.PHONY: all clean

all:

# $(RM) is make's built-in "rm -f", so the rule stays quiet and succeeds
# when there is no bytecode to delete.
clean:
	$(RM) *.pyc *.pyo

0
pdfminer/__init__.py Normal file
View File

24
samples/Makefile Normal file
View File

@ -0,0 +1,24 @@
# GNUMakefile for test
#
# Converts the sample PDF files in this directory to HTML with tools.pdf2txt.
#   make        - build every file listed in HTMLS
#   make clean  - remove the generated HTML files

# Python interpreter to use; may be overridden on the command line.
PYTHON=python

# PYTHONPATH=.. adds the parent directory to the module search path so that
# "-m tools.pdf2txt" can be resolved when make runs from this directory.
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt

# Generated outputs; each NAME.html is produced from NAME.pdf.
HTMLS= \
  simple1.html \
  simple2.html \
  dmca.html \
  f1040nr.html \
  i1040nr.html \
  jo.html \
  kampo.html \
  naacl06-shinyama.html \
  nlp2004slides.html

# "all" and "clean" are commands, not files.
.PHONY: all clean

# Remove a partially written target when its recipe fails, so that a broken
# conversion is not mistaken for an up-to-date file on the next run.
.DELETE_ON_ERROR:

all: $(HTMLS)

# $(RM) is make's built-in "rm -f": quiet and successful even when some of
# the outputs have not been built yet.
clean:
	$(RM) $(HTMLS)

# Static pattern rule: only the files listed in HTMLS are built this way,
# and a missing source PDF is reported as an error.
# -H selects HTML output, -o names the output file.
$(HTMLS): %.html: %.pdf
	$(PDF2TXT) -H -o $@ $<

6
tools/Makefile Normal file
View File

@ -0,0 +1,6 @@
# Makefile for tools
#
#   make        - nothing to build (placeholder default target)
#   make clean  - remove compiled Python bytecode from this directory

# Neither target names a real file; declaring them phony keeps a stray file
# called "all" or "clean" from silently disabling the rule.
.PHONY: all clean

all:

# $(RM) is make's built-in "rm -f", so the rule stays quiet and succeeds
# when there is no bytecode to delete.
clean:
	$(RM) *.pyc *.pyo

0
tools/__init__.py Normal file
View File

View File

@ -7,7 +7,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
try: try:
import cdb import cdb
except ImportError: except ImportError:
import pycdb as cdb import pdfminer.pycdb as cdb
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
if verbose: if verbose:
print >>stderr, 'Writing: %r...' % cdbfile print >>stderr, 'Writing: %r...' % cdbfile
@ -21,7 +21,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
return return
def convert_cmap(files, cmapdir, cdbcmapdir, force=False): def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
from cmap import CMapDB from pdfminer.cmap import CMapDB
CMapDB.initialize(cmapdir) CMapDB.initialize(cmapdir)
for fname in files: for fname in files:
if fname.endswith('.upr'): continue if fname.endswith('.upr'): continue

View File

@ -7,7 +7,7 @@
# -i objid : object id # -i objid : object id
# #
import sys, re import sys, re
from pdfparser import PDFDocument, PDFParser, PDFStream, \ from pdfminer.pdfparser import PDFDocument, PDFParser, PDFStream, \
PDFObjRef, PSKeyword, PSLiteral PDFObjRef, PSKeyword, PSLiteral
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr

View File

@ -2,11 +2,11 @@
import sys import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFDevice, PDFResourceManager, \ from pdfminer.pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \ PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix mult_matrix, apply_matrix
from cmap import CMapDB from pdfminer.cmap import CMapDB
def enc(x, codec): def enc(x, codec):

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
from sgml import PDFSGMLParser, Document from pdfminer.sgml import PDFSGMLParser, Document
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
try: try: