diff --git a/Makefile b/Makefile index c09b472..01de158 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ VERSION=`$(PYTHON) $(PACKAGE)/__init__.py` DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTFILE=$(DISTNAME).tar.gz -CONV_CMAP=$(PYTHON) -m tools.conv_cmap +CONV_CMAP=$(PYTHON) pdfminer/cmap.py all: @@ -27,8 +27,7 @@ test: cd samples && make test cdbcmap: CMap - -mkdir CDBCMap - $(CONV_CMAP) CMap/* + $(CONV_CMAP) CMap # Maintainance: commit: clean diff --git a/README.html b/README.html index bce814f..72e5bba 100644 --- a/README.html +++ b/README.html @@ -18,7 +18,7 @@ Python PDF parser and analyzer
-PDFMiner is a suite of programs that aims to help -analyzing text data from PDF documents. -It includes a PDF parser, a PDF renderer -(though only rendering text is supported for now), -and a couple of nice tools to extract texts. +PDFMiner is a suite of programs that help +extracting and analyzing text data of PDF documents. Unlike other PDF-related tools, it allows to obtain the exact location of texts in a page, as well as -other layout information such as font size or font name, -which could be useful for analyzing the document. +other extra information such as font information or ruled lines. +It includes a PDF converter that can transform PDF files +into other text formats (such as HTML). It has an extensible +PDF parser that can be used for other purposes instead of text analysis.
Features:
-Note: -This software is not yet out-of-the-box. -You have to download and unpack it manually, -and spend some time to make it work. -Your will is needed! -I do not support easy_install or setup.py or any automated installation until -this is very matured to the point that it really should be widely distributed. -(For now, it's not yet up to the standard, IMO.)
pdfminer
directory.
+setup.py
to install:+# python setup.py install +
diff --git a/samples/Makefile b/samples/Makefile index fa5c728..da56fc6 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -2,8 +2,7 @@ PYTHON=python CMAPDIR=../CMap -CDBCMAPDIR=../CDBCMap -PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) -D$(CDBCMAPDIR) +PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) HTMLS= \ simple1.html \ diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py deleted file mode 100755 index e5fc4ec..0000000 --- a/tools/conv_cmap.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python -import sys, os.path -stderr = sys.stderr - -def dumpcdb(cmap, cdbfile, verbose=1): - from struct import pack, unpack - try: - import cdb - except ImportError: - import pdfminer.pycdb as cdb - m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') - if verbose: - print >>stderr, 'Writing: %r...' % cdbfile - for (k,v) in cmap.getall_attrs(): - m.add('/'+k, repr(v)) - for (code,cid) in cmap.getall_code2cid(): - m.add('c'+code, pack('>L',cid)) - for (cid,code) in cmap.getall_cid2code(): - m.add('i'+pack('>L',cid), code) - m.finish() - return - -def convert_cmap(files, cmapdir, cdbcmapdir, force=False): - from pdfminer.cmap import CMapDB - CMapDB.initialize(cmapdir) - for fname in files: - if fname.endswith('.upr'): continue - cmapname = os.path.basename(fname) - cdbname = os.path.join(cdbcmapdir, cmapname+'.cmap.cdb') - if not force and os.path.exists(cdbname): - print >>stderr, 'Skipping: %r' % cdbname - continue - print >>stderr, 'Reading: %r...' % fname - cmap = CMapDB.get_cmap(cmapname) - dumpcdb(cmap, cdbname) - return - -def main(argv): - import getopt - def usage(): - print 'usage: %s [-c cmapdir] [-C cdbcmapdir] [-f] file ...' 
% argv[0] - return 100 - try: - (opts, args) = getopt.getopt(argv[1:], 'c:C:f') - except getopt.GetoptError: - return usage() - if not args: usage() - cmapdir = 'CMap' - cdbcmapdir = 'CDBCMap' - force = False - for (k, v) in opts: - if k == '-f': force = True - elif k == '-c': cmapdir = v - elif k == '-C': cdbcmapdir = v - if not os.path.isdir(cmapdir): - raise ValueError('not directory: %r' % cmapdir) - if not os.path.isdir(cdbcmapdir): - raise ValueError('not directory: %r' % cdbcmapdir) - return convert_cmap(args, cmapdir, cdbcmapdir, force=force) - -if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 2e8ed18..39d283d 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -4,7 +4,7 @@ from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfdevice import PDFDevice from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor -from pdfminer.cmap import CMapDB +from pdfminer.cmap import CMapDB, find_cmap_path # main def main(argv): @@ -20,8 +20,7 @@ def main(argv): # debug option debug = 0 # path option - cmapdir = 'CMap' - cdbcmapdir = 'CDBCMap' + cmapdir = find_cmap_path() # input option password = '' pagenos = set() @@ -37,7 +36,6 @@ def main(argv): for (k, v) in opts: if k == '-d': debug += 1 elif k == '-C': cmapdir = v - elif k == '-D': cdbcmapdir = v elif k == '-P': password = v elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-m': maxpages = int(v) @@ -54,7 +52,7 @@ def main(argv): PDFPageInterpreter.debug = debug PDFDevice.debug = debug # - CMapDB.initialize(cmapdir, cdbcmapdir) + CMapDB.initialize(cmapdir) rsrc = PDFResourceManager() if outtype == 'sgml': device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)-$ python pdflib/pdf2txt.py samples/simple1.pdf -<html><head><meta http-equiv="Content-Type" content="text/html; 
charset=ascii"> +$ pdf2txt.py samples/simple1.pdf +<html><head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> </head><body> -<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div><span style="position:absolute; border: 1px solid gray; left:0px; top:50px; width:612px; height:792px;"></span> -<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> </span> -<span style="position:absolute; writing-mode:lr-tb; left:106px; top:224px; font-size:22px;">Hello </span> -<span style="position:absolute; writing-mode:lr-tb; left:168px; top:224px; font-size:22px;">World </span> +<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span> +<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div> +<span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"> World </span> <span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"> </span> -<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span> +<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> Hello </span> <span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;">World </span> +<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span> <div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div> </body></html>