reorganize the directory structure.
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@35 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
26e762c776
commit
24fdae38d4
13
Makefile
13
Makefile
|
@ -1,7 +1,7 @@
|
|||
# Makefile for pdfminer
|
||||
|
||||
PACKAGE=pdfminer
|
||||
VERSION=20080429
|
||||
VERSION=20080629
|
||||
GNUTAR=tar
|
||||
SVN=svn
|
||||
PYTHON=python
|
||||
|
@ -10,14 +10,21 @@ WORKDIR=/tmp
|
|||
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
||||
DISTFILE=$(DISTNAME).tar.gz
|
||||
|
||||
CONV_CMAP=$(PYTHON) -m tools.conv_cmap
|
||||
|
||||
all:
|
||||
|
||||
cdbcmap: CMap
|
||||
-mkdir CDBCMap
|
||||
$(PYTHON) conv_cmap.py CMap/*
|
||||
$(CONV_CMAP) CMap/*
|
||||
|
||||
samples:
|
||||
cd samples && make
|
||||
|
||||
clean:
|
||||
-rm *.pyc *.pyo *~
|
||||
cd pdfminer && make clean
|
||||
cd tools && make clean
|
||||
cd samples && make clean
|
||||
|
||||
# Maintenance:
|
||||
|
||||
|
|
19
README.html
19
README.html
|
@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
|
|||
<h1>PDFMiner</h1>
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Wed Apr 30 19:15:12 JST 2008
|
||||
Last Modified: Sun Jun 29 11:34:38 JST 2008
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -114,8 +114,8 @@ PDFMiner comes with two programs:
|
|||
<code>pdf2txt.py</code> extracts text contents from a PDF file.
|
||||
It extracts all the texts that are to be rendered programmatically.
|
||||
It also extracts the corresponding locations, font names,
|
||||
and font sizes for each text portion in an SGML-like format.
|
||||
It cannot extract texts embedded within images
|
||||
and font sizes for each text portion. However,
|
||||
it cannot extract texts embedded within images
|
||||
(i.e. it does not do optical character recognition).
|
||||
You can provide a password for protected PDF documents
|
||||
whose access is limited.
|
||||
|
@ -129,7 +129,8 @@ Unicode Standard.
|
|||
<p>
|
||||
Examples:
|
||||
<blockquote><pre>
|
||||
$ <strong>./pdf2txt.py samples/naacl06-shinyama.pdf</strong>
|
||||
$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
|
||||
(extract text as an HTML file whose filename is output.html)
|
||||
|
||||
$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
|
||||
(extract Japanese texts in vertical writing, CMap is required)
|
||||
|
@ -145,15 +146,17 @@ Options:
|
|||
<dd> Specifies the output file name.
|
||||
By default, it prints the extracted contents to stdout.
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno</em></code>
|
||||
<dd> Specifies the page number to be extracted.
|
||||
Note that page numbers start from zero.
|
||||
Multiple <code>-p</code> options are allowed.
|
||||
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
||||
<dd> Specifies the comma-separated list of the page numbers to be extracted.
|
||||
Page numbers start from zero.
|
||||
By default, it extracts texts from all the pages.
|
||||
<p>
|
||||
<dt> <code>-c <em>codec</em></code>
|
||||
<dd> Specifies the output codec for non-ASCII texts.
|
||||
<p>
|
||||
<dt> <code>-H</code>
|
||||
<dd> Specifies that the output should be an HTML file.
|
||||
<p>
|
||||
<dt> <code>-P <em>password</em></code>
|
||||
<dd> Provides the user password to open the PDF file.
|
||||
<p>
|
||||
|
|
1
TODO
1
TODO
|
@ -1,7 +1,6 @@
|
|||
TODOs:
|
||||
- Documentation.
|
||||
- Error handling for invalid type.
|
||||
- Simple viewer application using pygame.
|
||||
|
||||
- Outlines.
|
||||
- Named Objects. (pages)
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
# Makefile for pdfminer
|
||||
|
||||
all:
|
||||
|
||||
clean:
|
||||
-rm *.pyc *.pyo
|
|
@ -0,0 +1,24 @@
|
|||
# GNUMakefile for test
|
||||
|
||||
PYTHON=python
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
|
||||
|
||||
HTMLS= \
|
||||
simple1.html \
|
||||
simple2.html \
|
||||
dmca.html \
|
||||
f1040nr.html \
|
||||
i1040nr.html \
|
||||
jo.html \
|
||||
kampo.html \
|
||||
naacl06-shinyama.html \
|
||||
nlp2004slides.html
|
||||
|
||||
all: $(HTMLS)
|
||||
|
||||
clean:
|
||||
-rm $(HTMLS)
|
||||
|
||||
.SUFFIXES: .pdf .html
|
||||
.pdf.html:
|
||||
$(PDF2TXT) -H -o $@ $<
|
|
@ -0,0 +1,6 @@
|
|||
# Makefile for tools
|
||||
|
||||
all:
|
||||
|
||||
clean:
|
||||
-rm *.pyc *.pyo
|
|
@ -7,7 +7,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
|
|||
try:
|
||||
import cdb
|
||||
except ImportError:
|
||||
import pycdb as cdb
|
||||
import pdfminer.pycdb as cdb
|
||||
m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
|
||||
if verbose:
|
||||
print >>stderr, 'Writing: %r...' % cdbfile
|
||||
|
@ -21,7 +21,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
|
|||
return
|
||||
|
||||
def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
|
||||
from cmap import CMapDB
|
||||
from pdfminer.cmap import CMapDB
|
||||
CMapDB.initialize(cmapdir)
|
||||
for fname in files:
|
||||
if fname.endswith('.upr'): continue
|
|
@ -7,7 +7,7 @@
|
|||
# -i objid : object id
|
||||
#
|
||||
import sys, re
|
||||
from pdfparser import PDFDocument, PDFParser, PDFStream, \
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFStream, \
|
||||
PDFObjRef, PSKeyword, PSLiteral
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
|
@ -2,11 +2,11 @@
|
|||
import sys
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFDevice, PDFResourceManager, \
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfminer.pdfinterp import PDFDevice, PDFResourceManager, \
|
||||
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
||||
mult_matrix, apply_matrix
|
||||
from cmap import CMapDB
|
||||
from pdfminer.cmap import CMapDB
|
||||
|
||||
|
||||
def enc(x, codec):
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from sgml import PDFSGMLParser, Document
|
||||
from pdfminer.sgml import PDFSGMLParser, Document
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
try:
|
Loading…
Reference in New Issue