diff --git a/pdflib/pdf2txt.py b/pdflib/converter.py old mode 100755 new mode 100644 similarity index 83% rename from pdflib/pdf2txt.py rename to pdflib/converter.py index a0d1d55..678dec6 --- a/pdflib/pdf2txt.py +++ b/pdflib/converter.py @@ -1,13 +1,8 @@ #!/usr/bin/env python -import sys from pdfdevice import PDFDevice from pdffont import PDFUnicodeNotDefined from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox from utils import mult_matrix, translate_matrix, enc -from pdfparser import PDFDocument, PDFParser -from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf -from cmap import CMapDB - ## PDFPageAggregator @@ -322,73 +317,3 @@ class TextConverter(PDFConverter): render(page) self.outfp.write('\f') return - - def close(self): - return - - -# main -def main(argv): - import getopt - def usage(): - print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] - return 100 - try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w') - except getopt.GetoptError: - return usage() - if not args: return usage() - # debug option - debug = 0 - # path option - cmapdir = 'CMap' - cdbcmapdir = 'CDBCMap' - # input option - password = '' - pagenos = set() - maxpages = 0 - # output option - outtype = 'html' - codec = 'utf-8' - outfp = sys.stdout - cluster_margin = None - pageno = 1 - scale = 1 - showpageno = True - for (k, v) in opts: - if k == '-d': debug += 1 - elif k == '-C': cmapdir = v - elif k == '-D': cdbcmapdir = v - elif k == '-P': password = v - elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) - elif k == '-m': maxpages = int(v) - elif k == '-t': outtype = v - elif k == '-c': codec = v - elif k == '-o': outfp = file(v, 'wb') - elif k == '-s': scale = float(v) - elif k == '-T': cluster_margin = float(v) - # - CMapDB.debug = debug - PDFResourceManager.debug = debug - PDFDocument.debug = debug - PDFParser.debug = debug - PDFPageInterpreter.debug = debug - # - CMapDB.initialize(cmapdir, cdbcmapdir) - rsrc = PDFResourceManager() - if outtype == 'sgml': - device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) - elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale) - elif outtype == 'text': - device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) - elif outtype == 'tag': - device = TagExtractor(rsrc, outfp, codec=codec) - else: - return usage() - for fname in args: - process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password) - device.close() - return - -if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/samples/Makefile b/samples/Makefile index e6091aa..08be431 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,8 +1,9 @@ # GNUMakefile for test PYTHON=python +CMAPDIR=../CMap CDBCMAPDIR=../CDBCMap -PDF2TXT=$(PYTHON) ../pdflib/pdf2txt.py +PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) -D$(CDBCMAPDIR) HTMLS= \ simple1.html \ @@ -20,6 +21,10 @@ all: $(HTMLS) clean: -rm $(HTMLS) -.SUFFIXES: .pdf .html +.SUFFIXES: .pdf .html .sgml .txt .pdf.html: - $(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $< + $(PDF2TXT) -t html -o $@ $< +.pdf.sgml: + $(PDF2TXT) -t sgml -o $@ $< +.pdf.txt: + $(PDF2TXT) -t text -o $@ $< diff --git a/tools/dumppdf.py b/tools/dumppdf.py index 34802e6..a3396ad 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -9,8 +9,6 @@ import sys, re from pdflib.pdfparser import PDFDocument, PDFParser from pdflib.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolve1 -stdout = sys.stdout -stderr = sys.stderr ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]') @@ -163,7 +161,7 @@ def main(argv): password = '' dumpall = False proc = dumppdf - outfp = stdout + outfp = sys.stdout for (k, v) in opts: if k == '-d': debug += 1 elif k == '-i': objids.extend( int(x) for x in v.split(',') ) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py new file mode 100755 index 0000000..4adb809 --- /dev/null +++ b/tools/pdf2txt.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +import sys +from pdflib.pdfparser import PDFDocument, PDFParser +from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf +from pdflib.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor +from pdflib.cmap import CMapDB + +# main +def main(argv): + import getopt + def usage(): + print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] + return 100 + try: + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:w') + except getopt.GetoptError: + return usage() + if not args: return usage() + # debug option + debug = 0 + # path option + cmapdir = 'CMap' + cdbcmapdir = 'CDBCMap' + # input option + password = '' + pagenos = set() + maxpages = 0 + # output option + outtype = 'html' + codec = 'utf-8' + outfp = sys.stdout + cluster_margin = None + pageno = 1 + scale = 1 + showpageno = True + for (k, v) in opts: + if k == '-d': debug += 1 + elif k == '-C': cmapdir = v + elif k == '-D': cdbcmapdir = v + elif k == '-P': password = v + elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) + elif k == '-m': maxpages = int(v) + elif k == '-t': outtype = v + elif k == '-c': codec = v + elif k == '-o': outfp = file(v, 'wb') + elif k == '-s': scale = float(v) + elif k == '-T': cluster_margin = float(v) + # + CMapDB.debug = debug + PDFResourceManager.debug = debug + PDFDocument.debug = debug + PDFParser.debug = debug + PDFPageInterpreter.debug = debug + # + CMapDB.initialize(cmapdir, cdbcmapdir) + rsrc = PDFResourceManager() + if outtype == 'sgml': + device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + elif outtype == 'html': + device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale) + elif outtype == 'text': + device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin) + elif outtype == 'tag': + device = TagExtractor(rsrc, outfp, codec=codec) + else: + return usage() + for fname in args: + process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password) + device.close() + return + +if __name__ == '__main__': sys.exit(main(sys.argv))