reorganize the directory structure.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@35 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-06-29 08:45:46 +00:00 · 2008-06-29 08:45:46 +00:00 · 24fdae38d4
parent 26e762c776
commit 24fdae38d4
26 changed files with 64 additions and 19 deletions
--- a/13
+++ b/13
@ -1,7 +1,7 @@
 # Makefile for pdfminer

 PACKAGE=pdfminer
-VERSION=20080429
+VERSION=20080629
 GNUTAR=tar
 SVN=svn
 PYTHON=python
@ -10,14 +10,21 @@ WORKDIR=/tmp
 DISTNAME=$(PACKAGE)-dist-$(VERSION)
 DISTFILE=$(DISTNAME).tar.gz

+CONV_CMAP=$(PYTHON) -m tools.conv_cmap
+
 all:

 cdbcmap: CMap
 	-mkdir CDBCMap
-	$(PYTHON) conv_cmap.py CMap/*
+	$(CONV_CMAP) CMap/*
+
+samples:
+	cd samples && make

 clean:
-	-rm *.pyc *.pyo *~
+	cd pdfminer && make clean
+	cd tools && make clean
+	cd samples && make clean

 # Maintainance:

--- a/README.html
+++ b/README.html
@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
 <h1>PDFMiner</h1>
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Wed Apr 30 19:15:12 JST 2008
+Last Modified: Sun Jun 29 11:34:38 JST 2008
 <!-- hhmts end -->
 </div>

@ -114,8 +114,8 @@ PDFMiner comes with two programs:
 <code>pdf2txt.py</code> extracts text contents from a PDF file.
 It extracts all the texts that are to be rendered programatically.
 It also extracts the corresponding locations, font names,
-and font sizes for each text portion in an SGML-like format.
-It cannot extract texts embedded within images
+and font sizes for each text portion. However,
+it cannot extract texts embedded within images
 (i.e. it does not do optical character recognition).
 You can provide a password for protected PDF documents 
 whose access is limited.
@ -129,7 +129,8 @@ Unicode Standard.
 <p>
 Examples:
 <blockquote><pre>
-$ <strong>./pdf2txt.py samples/naacl06-shinyama.pdf</strong>
+$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
+(extract text as an HTML file whose filename is output.html)

 $ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
 (extract Japanese texts in vertical writing, CMap is required)
@ -145,15 +146,17 @@ Options:
 <dd> Speficies the output file name.
 By default, it prints the extracted contents to stdout.
 <p>
-<dt> <code>-p <em>pageno</em></code> 
-<dd> Speficies the page number to be extracted. 
-Note that page numbers start from zero.
-Multiple <code>-p</code> options are allowed.
+<dt> <code>-p <em>pageno[,pageno,...]</em></code> 
+<dd> Specifies a comma-separated list of the page numbers to be extracted. 
+Page numbers start from zero.
 By default, it extracts texts from all the pages.
 <p>
 <dt> <code>-c <em>codec</em></code> 
 <dd> Speficies the output codec for non-ASCII texts.
 <p>
+<dt> <code>-H</code> 
+<dd> Specifies that the output should be an HTML file.
+<p>
 <dt> <code>-P <em>password</em></code> 
 <dd> Provides the user password to open the PDF file.
 <p>
--- a/1
+++ b/1
@ -1,7 +1,6 @@
 TODOs:
  - Documentation.
  - Error handling for invalid type.
-  - Simple viewer application using pygame.

  - Outlines.
  - Named Objects. (pages)
--- a/pdfminer/Makefile
+++ b/pdfminer/Makefile
@ -0,0 +1,6 @@
+# Makefile for pdfminer
+
+all:
+
+clean:
+	-rm *.pyc *.pyo
--- a/pdfminer/init.py
+++ b/pdfminer/init.py
--- a/pdfminer/arcfour.py
+++ b/pdfminer/arcfour.py
--- a/pdfminer/cmap.py
+++ b/pdfminer/cmap.py
--- a/pdfminer/fontmetrics.py
+++ b/pdfminer/fontmetrics.py
--- a/pdfminer/glyphlist.py
+++ b/pdfminer/glyphlist.py
--- a/pdfminer/latin_enc.py
+++ b/pdfminer/latin_enc.py
--- a/pdfminer/lzw.py
+++ b/pdfminer/lzw.py
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
--- a/pdfminer/pycdb.py
+++ b/pdfminer/pycdb.py
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
--- a/samples/Makefile
+++ b/samples/Makefile
@ -0,0 +1,24 @@
+# GNUMakefile for test
+
+PYTHON=python
+PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
+
+HTMLS= \
+	simple1.html \
+	simple2.html \
+	dmca.html \
+	f1040nr.html \
+	i1040nr.html \
+	jo.html \
+	kampo.html \
+	naacl06-shinyama.html \
+	nlp2004slides.html
+
+all: $(HTMLS)
+
+clean:
+	-rm $(HTMLS)
+
+.SUFFIXES: .pdf .html
+.pdf.html:
+	$(PDF2TXT) -H -o $@ $<
--- a/tools/Makefile
+++ b/tools/Makefile
@ -0,0 +1,6 @@
+# Makefile for tools
+
+all:
+
+clean:
+	-rm *.pyc *.pyo
--- a/tools/init.py
+++ b/tools/init.py
--- a/tools/conv_afm.py
+++ b/tools/conv_afm.py
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@ -7,7 +7,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
  try:
    import cdb
  except ImportError:
-    import pycdb as cdb
+    import pdfminer.pycdb as cdb
  m = cdb.cdbmake(cdbfile, cdbfile+'.tmp')
  if verbose:
    print >>stderr, 'Writing: %r...' % cdbfile
@ -21,7 +21,7 @@ def dumpcdb(cmap, cdbfile, verbose=1):
  return

 def convert_cmap(files, cmapdir, cdbcmapdir, force=False):
-  from cmap import CMapDB
+  from pdfminer.cmap import CMapDB
  CMapDB.initialize(cmapdir)
  for fname in files:
    if fname.endswith('.upr'): continue
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -7,7 +7,7 @@
 #    -i objid : object id
 #
 import sys, re
-from pdfparser import PDFDocument, PDFParser, PDFStream, \
+from pdfminer.pdfparser import PDFDocument, PDFParser, PDFStream, \
     PDFObjRef, PSKeyword, PSLiteral
 stdout = sys.stdout
 stderr = sys.stderr
--- a/tools/extent.py
+++ b/tools/extent.py
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -2,11 +2,11 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdfinterp import PDFDevice, PDFResourceManager, \
+from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
+from pdfminer.pdfinterp import PDFDevice, PDFResourceManager, \
     PDFPageInterpreter, PDFUnicodeNotDefined, \
     mult_matrix, apply_matrix
-from cmap import CMapDB
+from pdfminer.cmap import CMapDB


 def enc(x, codec):
--- a/tools/sgml.py
+++ b/tools/sgml.py
--- a/tools/viewpdf.py
+++ b/tools/viewpdf.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python
 import sys
-from sgml import PDFSGMLParser, Document
+from pdfminer.sgml import PDFSGMLParser, Document
 stdout = sys.stdout
 stderr = sys.stderr
 try: