reorganize the directory structure.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@35 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-06-29 08:45:46 +00:00
parent 26e762c776
commit 24fdae38d4
26 changed files with 64 additions and 19 deletions

View File

@ -1,7 +1,7 @@
# Makefile for pdfminer
PACKAGE=pdfminer
VERSION=20080429
VERSION=20080629
GNUTAR=tar
SVN=svn
PYTHON=python
@ -10,14 +10,21 @@ WORKDIR=/tmp
DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz
CONV_CMAP=$(PYTHON) -m tools.conv_cmap
all:
cdbcmap: CMap
-mkdir CDBCMap
$(PYTHON) conv_cmap.py CMap/*
$(CONV_CMAP) CMap/*
samples:
cd samples && make
clean:
-rm *.pyc *.pyo *~
cd pdfminer && make clean
cd tools && make clean
cd samples && make clean
# Maintenance:

View File

@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
<h1>PDFMiner</h1>
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Wed Apr 30 19:15:12 JST 2008
Last Modified: Sun Jun 29 11:34:38 JST 2008
<!-- hhmts end -->
</div>
@ -114,8 +114,8 @@ PDFMiner comes with two programs:
<code>pdf2txt.py</code> extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programmatically.
It also extracts the corresponding locations, font names,
and font sizes for each text portion in an SGML-like format.
It cannot extract texts embedded within images
and font sizes for each text portion. However,
it cannot extract texts embedded within images
(i.e. it does not do optical character recognition).
You can provide a password for protected PDF documents
whose access is limited.
@ -129,7 +129,8 @@ Unicode Standard.
<p>
Examples:
<blockquote><pre>
$ <strong>./pdf2txt.py samples/naacl06-shinyama.pdf</strong>
$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
(extract text as an HTML file whose filename is output.html)
$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
(extract Japanese texts in vertical writing, CMap is required)
@ -145,15 +146,17 @@ Options:
<dd> Specifies the output file name.
By default, it prints the extracted contents to stdout.
<p>
<dt> <code>-p <em>pageno</em></code>
<dd> Specifies the page number to be extracted.
Note that page numbers start from zero.
Multiple <code>-p</code> options are allowed.
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
<dd> Specifies a comma-separated list of the page numbers to be extracted.
Page numbers start from zero.
By default, it extracts texts from all the pages.
<p>
<dt> <code>-c <em>codec</em></code>
<dd> Specifies the output codec for non-ASCII texts.
<p>
<dt> <code>-H</code>
<dd> Specifies that the output should be an HTML file.
<p>
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to open the PDF file.
<p>

1
TODO
View File

@ -1,7 +1,6 @@
TODOs:
- Documentation.
- Error handling for invalid type.
- Simple viewer application using pygame.
- Outlines.
- Named Objects. (pages)

6
pdfminer/Makefile Normal file
View File

@ -0,0 +1,6 @@
# Makefile for pdfminer
# 'all' is declared with no prerequisites and no recipe, so it does nothing here.
all:
# 'clean' deletes compiled Python bytecode files (*.pyc, *.pyo) in this directory.
# The leading '-' makes make ignore a failing rm (presumably when no such files exist).
clean:
-rm *.pyc *.pyo

0
pdfminer/__init__.py Normal file
View File

24
samples/Makefile Normal file
View File

@ -0,0 +1,24 @@
# GNUMakefile for test
# Converts the sample PDF files in this directory to HTML using tools.pdf2txt.
PYTHON=python
# pdf2txt is run as a module (-m tools.pdf2txt) with PYTHONPATH set to the
# parent directory so that the 'tools' package located there can be imported.
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
# HTML files to generate; each one is built from the .pdf with the same
# base name by the .pdf.html suffix rule at the bottom of this file.
HTMLS= \
simple1.html \
simple2.html \
dmca.html \
f1040nr.html \
i1040nr.html \
jo.html \
kampo.html \
naacl06-shinyama.html \
nlp2004slides.html
# Default target: build every file listed in HTMLS.
all: $(HTMLS)
# Remove the generated HTML files; the leading '-' makes make ignore a failing rm.
clean:
-rm $(HTMLS)
# Declare .pdf and .html as known suffixes so the suffix rule below is recognized.
.SUFFIXES: .pdf .html
# Suffix rule: build foo.html from foo.pdf.
# -H requests HTML output, -o names the output file ($@), and $< is the source PDF.
.pdf.html:
$(PDF2TXT) -H -o $@ $<

6
tools/Makefile Normal file
View File

@ -0,0 +1,6 @@
# Makefile for tools
# 'all' is declared with no prerequisites and no recipe, so it does nothing here.
all:
# 'clean' deletes compiled Python bytecode files (*.pyc, *.pyo) in this directory.
# The leading '-' makes make ignore a failing rm (presumably when no such files exist).
clean:
-rm *.pyc *.pyo

0
tools/__init__.py Normal file
View File

View File

@ -7,7 +7,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
try:
import cdb
except ImportError:
import pycdb as cdb
import pdfminer.pycdb as cdb
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
if verbose:
print >>stderr, 'Writing: %r...' % cdbfile
@ -21,7 +21,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
return
def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
from cmap import CMapDB
from pdfminer.cmap import CMapDB
CMapDB.initialize(cmapdir)
for fname in files:
if fname.endswith('.upr'): continue

View File

@ -7,7 +7,7 @@
# -i objid : object id
#
import sys, re
from pdfparser import PDFDocument, PDFParser, PDFStream, \
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFStream, \
PDFObjRef, PSKeyword, PSLiteral
stdout = sys.stdout
stderr = sys.stderr

View File

@ -2,11 +2,11 @@
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFDevice, PDFResourceManager, \
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfminer.pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix
from cmap import CMapDB
from pdfminer.cmap import CMapDB
def enc(x, codec):

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
import sys
from sgml import PDFSGMLParser, Document
from pdfminer.sgml import PDFSGMLParser, Document
stdout = sys.stdout
stderr = sys.stderr
try: