From 24fdae38d412b8cf166fe1d91cb02a0d9f05f910 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 29 Jun 2008 08:45:46 +0000 Subject: [PATCH] reorganize the directory structure. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@35 1aa58f4a-7d42-0410-adbc-911cccaed67c --- Makefile | 13 +++++++++--- README.html | 19 ++++++++++-------- TODO | 1 - pdfminer/Makefile | 6 ++++++ pdfminer/__init__.py | 0 arcfour.py => pdfminer/arcfour.py | 0 cmap.py => pdfminer/cmap.py | 0 fontmetrics.py => pdfminer/fontmetrics.py | 0 glyphlist.py => pdfminer/glyphlist.py | 0 latin_enc.py => pdfminer/latin_enc.py | 0 lzw.py => pdfminer/lzw.py | 0 pdfinterp.py => pdfminer/pdfinterp.py | 0 pdfparser.py => pdfminer/pdfparser.py | 0 psparser.py => pdfminer/psparser.py | 0 pycdb.py => pdfminer/pycdb.py | 0 utils.py => pdfminer/utils.py | 0 samples/Makefile | 24 +++++++++++++++++++++++ tools/Makefile | 6 ++++++ tools/__init__.py | 0 conv_afm.py => tools/conv_afm.py | 0 conv_cmap.py => tools/conv_cmap.py | 4 ++-- dumppdf.py => tools/dumppdf.py | 2 +- extent.py => tools/extent.py | 0 pdf2txt.py => tools/pdf2txt.py | 6 +++--- sgml.py => tools/sgml.py | 0 viewpdf.py => tools/viewpdf.py | 2 +- 26 files changed, 64 insertions(+), 19 deletions(-) create mode 100644 pdfminer/Makefile create mode 100644 pdfminer/__init__.py rename arcfour.py => pdfminer/arcfour.py (100%) rename cmap.py => pdfminer/cmap.py (100%) rename fontmetrics.py => pdfminer/fontmetrics.py (100%) rename glyphlist.py => pdfminer/glyphlist.py (100%) rename latin_enc.py => pdfminer/latin_enc.py (100%) rename lzw.py => pdfminer/lzw.py (100%) rename pdfinterp.py => pdfminer/pdfinterp.py (100%) rename pdfparser.py => pdfminer/pdfparser.py (100%) rename psparser.py => pdfminer/psparser.py (100%) rename pycdb.py => pdfminer/pycdb.py (100%) rename utils.py => pdfminer/utils.py (100%) create mode 100644 samples/Makefile create mode 100644 tools/Makefile create mode 100644 tools/__init__.py rename conv_afm.py => 
tools/conv_afm.py (100%) rename conv_cmap.py => tools/conv_cmap.py (96%) rename dumppdf.py => tools/dumppdf.py (98%) rename extent.py => tools/extent.py (100%) rename pdf2txt.py => tools/pdf2txt.py (97%) rename sgml.py => tools/sgml.py (100%) rename viewpdf.py => tools/viewpdf.py (98%) diff --git a/Makefile b/Makefile index 0e793c5..4e20d5d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile for pdfminer PACKAGE=pdfminer -VERSION=20080429 +VERSION=20080629 GNUTAR=tar SVN=svn PYTHON=python @@ -10,14 +10,21 @@ WORKDIR=/tmp DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTFILE=$(DISTNAME).tar.gz +CONV_CMAP=$(PYTHON) -m tools.conv_cmap + all: cdbcmap: CMap -mkdir CDBCMap - $(PYTHON) conv_cmap.py CMap/* + $(CONV_CMAP) CMap/* + +samples: + cd samples && make clean: - -rm *.pyc *.pyo *~ + cd pdfminer && make clean + cd tools && make clean + cd samples && make clean # Maintainance: diff --git a/README.html b/README.html index 24669d3..5f9cec9 100644 --- a/README.html +++ b/README.html @@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }

PDFMiner

-Last Modified: Wed Apr 30 19:15:12 JST 2008 +Last Modified: Sun Jun 29 11:34:38 JST 2008
@@ -114,8 +114,8 @@ PDFMiner comes with two programs: pdf2txt.py extracts text contents from a PDF file. It extracts all the texts that are to be rendered programmatically. It also extracts the corresponding locations, font names, -and font sizes for each text portion in an SGML-like format. -It cannot extract texts embedded within images +and font sizes for each text portion. However, +it cannot extract texts embedded within images (i.e. it does not do optical character recognition). You can provide a password for protected PDF documents whose access is limited. @@ -129,7 +129,8 @@ Unicode Standard.

Examples:

-$ ./pdf2txt.py samples/naacl06-shinyama.pdf
+$ ./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf
+(extract text as an HTML file whose filename is output.html)
 
 $ ./pdf2txt.py -c euc-jp samples/jo.pdf
 (extract Japanese texts in vertical writing, CMap is required)
@@ -145,15 +146,17 @@ Options:
 
Specifies the output file name. By default, it prints the extracted contents to stdout.

-

-p pageno -
Speficies the page number to be extracted. -Note that page numbers start from zero. -Multiple -p options are allowed. +
-p pageno[,pageno,...] +
Specifies the comma-separated list of the page numbers to be extracted. +Page numbers start from zero. By default, it extracts texts from all the pages.

-c codec
Specifies the output codec for non-ASCII texts.

+

-H +
Specifies that the output is an HTML file. +

-P password
Provides the user password to open the PDF file.

diff --git a/TODO b/TODO index db6c611..9b12f11 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,6 @@ TODOs: - Documentation. - Error handling for invalid type. - - Simple viewer application using pygame. - Outlines. - Named Objects. (pages) diff --git a/pdfminer/Makefile b/pdfminer/Makefile new file mode 100644 index 0000000..efee7fc --- /dev/null +++ b/pdfminer/Makefile @@ -0,0 +1,6 @@ +# Makefile for pdfminer + +all: + +clean: + -rm *.pyc *.pyo diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arcfour.py b/pdfminer/arcfour.py similarity index 100% rename from arcfour.py rename to pdfminer/arcfour.py diff --git a/cmap.py b/pdfminer/cmap.py similarity index 100% rename from cmap.py rename to pdfminer/cmap.py diff --git a/fontmetrics.py b/pdfminer/fontmetrics.py similarity index 100% rename from fontmetrics.py rename to pdfminer/fontmetrics.py diff --git a/glyphlist.py b/pdfminer/glyphlist.py similarity index 100% rename from glyphlist.py rename to pdfminer/glyphlist.py diff --git a/latin_enc.py b/pdfminer/latin_enc.py similarity index 100% rename from latin_enc.py rename to pdfminer/latin_enc.py diff --git a/lzw.py b/pdfminer/lzw.py similarity index 100% rename from lzw.py rename to pdfminer/lzw.py diff --git a/pdfinterp.py b/pdfminer/pdfinterp.py similarity index 100% rename from pdfinterp.py rename to pdfminer/pdfinterp.py diff --git a/pdfparser.py b/pdfminer/pdfparser.py similarity index 100% rename from pdfparser.py rename to pdfminer/pdfparser.py diff --git a/psparser.py b/pdfminer/psparser.py similarity index 100% rename from psparser.py rename to pdfminer/psparser.py diff --git a/pycdb.py b/pdfminer/pycdb.py similarity index 100% rename from pycdb.py rename to pdfminer/pycdb.py diff --git a/utils.py b/pdfminer/utils.py similarity index 100% rename from utils.py rename to pdfminer/utils.py diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 0000000..bde7570 --- /dev/null +++ 
b/samples/Makefile @@ -0,0 +1,24 @@ +# GNUMakefile for test + +PYTHON=python +PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt + +HTMLS= \ + simple1.html \ + simple2.html \ + dmca.html \ + f1040nr.html \ + i1040nr.html \ + jo.html \ + kampo.html \ + naacl06-shinyama.html \ + nlp2004slides.html + +all: $(HTMLS) + +clean: + -rm $(HTMLS) + +.SUFFIXES: .pdf .html +.pdf.html: + $(PDF2TXT) -H -o $@ $< diff --git a/tools/Makefile b/tools/Makefile new file mode 100644 index 0000000..8299398 --- /dev/null +++ b/tools/Makefile @@ -0,0 +1,6 @@ +# Makefile for tools + +all: + +clean: + -rm *.pyc *.pyo diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/conv_afm.py b/tools/conv_afm.py similarity index 100% rename from conv_afm.py rename to tools/conv_afm.py diff --git a/conv_cmap.py b/tools/conv_cmap.py similarity index 96% rename from conv_cmap.py rename to tools/conv_cmap.py index d8a8022..e5fc4ec 100755 --- a/conv_cmap.py +++ b/tools/conv_cmap.py @@ -7,7 +7,7 @@ def dumpcdb(cmap, cdbfile, verbose=1): try: import cdb except ImportError: - import pycdb as cdb + import pdfminer.pycdb as cdb m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') if verbose: print >>stderr, 'Writing: %r...' 
% cdbfile @@ -21,7 +21,7 @@ def dumpcdb(cmap, cdbfile, verbose=1): return def convert_cmap(files, cmapdir, cdbcmapdir, force=False): - from cmap import CMapDB + from pdfminer.cmap import CMapDB CMapDB.initialize(cmapdir) for fname in files: if fname.endswith('.upr'): continue diff --git a/dumppdf.py b/tools/dumppdf.py similarity index 98% rename from dumppdf.py rename to tools/dumppdf.py index c86cf3a..0568d1a 100755 --- a/dumppdf.py +++ b/tools/dumppdf.py @@ -7,7 +7,7 @@ # -i objid : object id # import sys, re -from pdfparser import PDFDocument, PDFParser, PDFStream, \ +from pdfminer.pdfparser import PDFDocument, PDFParser, PDFStream, \ PDFObjRef, PSKeyword, PSLiteral stdout = sys.stdout stderr = sys.stderr diff --git a/extent.py b/tools/extent.py similarity index 100% rename from extent.py rename to tools/extent.py diff --git a/pdf2txt.py b/tools/pdf2txt.py similarity index 97% rename from pdf2txt.py rename to tools/pdf2txt.py index 8b2736f..071e8e3 100755 --- a/pdf2txt.py +++ b/tools/pdf2txt.py @@ -2,11 +2,11 @@ import sys stdout = sys.stdout stderr = sys.stderr -from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect -from pdfinterp import PDFDevice, PDFResourceManager, \ +from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect +from pdfminer.pdfinterp import PDFDevice, PDFResourceManager, \ PDFPageInterpreter, PDFUnicodeNotDefined, \ mult_matrix, apply_matrix -from cmap import CMapDB +from pdfminer.cmap import CMapDB def enc(x, codec): diff --git a/sgml.py b/tools/sgml.py similarity index 100% rename from sgml.py rename to tools/sgml.py diff --git a/viewpdf.py b/tools/viewpdf.py similarity index 98% rename from viewpdf.py rename to tools/viewpdf.py index ec25d12..6721036 100755 --- a/viewpdf.py +++ b/tools/viewpdf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import sys -from sgml import PDFSGMLParser, Document +from pdfminer.sgml import PDFSGMLParser, Document stdout = sys.stdout stderr = sys.stderr try: