git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@101 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2009-05-15 14:34:53 +00:00
parent e93059480b
commit f628c0d3fe
4 changed files with 81 additions and 81 deletions

75
pdflib/pdf2txt.py → pdflib/converter.py Executable file → Normal file
View File

@ -1,13 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys
from pdfdevice import PDFDevice from pdfdevice import PDFDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox
from utils import mult_matrix, translate_matrix, enc from utils import mult_matrix, translate_matrix, enc
from pdfparser import PDFDocument, PDFParser
from pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from cmap import CMapDB
## PDFPageAggregator ## PDFPageAggregator
@ -322,73 +317,3 @@ class TextConverter(PDFConverter):
render(page) render(page)
self.outfp.write('\f') self.outfp.write('\f')
return return
def close(self):
    """Do nothing and return None; this implementation has no cleanup step."""
    return None
# main
def main(argv):
    """Command-line driver: run each input PDF through the chosen converter.

    ``argv`` is the full argument vector with the program name at
    ``argv[0]``.  Returns 100 after printing the usage line when an option
    is unknown, no input file is given, or the ``-t`` value is not one of
    the supported output types; returns None once every file was processed.

    Options handled here:
      -d           increase the debug level (may be repeated)
      -C dir       first argument to CMapDB.initialize (default 'CMap')
      -D dir       second argument to CMapDB.initialize (default 'CDBCMap')
      -P password  password passed to process_pdf (default '')
      -p pagenos   comma-separated page numbers; each is stored minus one
      -m maxpages  passed to process_pdf as maxpages (default 0)
      -t type      text, html, sgml or tag (default 'html')
      -c codec     codec passed to the converter (default 'utf-8')
      -o output    output file, opened in binary mode (default sys.stdout)
      -s scale     scale passed to HTMLConverter (default 1)
      -T margin    cluster_margin passed to the SGML/HTML/text converters
      -w           accepted by the parser but has no effect in this function
    """
    import getopt

    def usage():
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print('usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0])
        return 100

    try:
        # 's:' must be declared here, otherwise the '-s' branch below can
        # never run and '-s' is rejected as an unknown option.
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:s:w')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # path option
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = 'html'
    codec = 'utf-8'
    outfp = sys.stdout
    cluster_margin = None
    scale = 1

    try:
        for (k, v) in opts:
            if k == '-d':
                debug += 1
            elif k == '-C':
                cmapdir = v
            elif k == '-D':
                cdbcmapdir = v
            elif k == '-P':
                password = v
            elif k == '-p':
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-m':
                maxpages = int(v)
            elif k == '-t':
                outtype = v
            elif k == '-c':
                codec = v
            elif k == '-o':
                # A repeated -o replaces the target; release the earlier file.
                if outfp is not sys.stdout:
                    outfp.close()
                outfp = open(v, 'wb')
            elif k == '-s':
                scale = float(v)
            elif k == '-T':
                cluster_margin = float(v)

        # Propagate the debug level to every component that exposes one.
        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFDocument.debug = debug
        PDFParser.debug = debug
        PDFPageInterpreter.debug = debug

        CMapDB.initialize(cmapdir, cdbcmapdir)
        rsrc = PDFResourceManager()
        if outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
        elif outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        elif outtype == 'tag':
            device = TagExtractor(rsrc, outfp, codec=codec)
        else:
            return usage()
        for fname in args:
            process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
        device.close()
    finally:
        # Close a file opened for -o on every exit path; never close stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
# Script entry point: the value returned by main() is passed to sys.exit().
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -1,8 +1,9 @@
# GNUMakefile for test # GNUMakefile for test
PYTHON=python PYTHON=python
CMAPDIR=../CMap
CDBCMAPDIR=../CDBCMap CDBCMAPDIR=../CDBCMap
PDF2TXT=$(PYTHON) ../pdflib/pdf2txt.py PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -C$(CMAPDIR) -D$(CDBCMAPDIR)
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \
@ -20,6 +21,10 @@ all: $(HTMLS)
clean: clean:
-rm $(HTMLS) -rm $(HTMLS)
.SUFFIXES: .pdf .html .SUFFIXES: .pdf .html .sgml .txt
.pdf.html: .pdf.html:
$(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $< $(PDF2TXT) -t html -o $@ $<
.pdf.sgml:
$(PDF2TXT) -t sgml -o $@ $<
.pdf.txt:
$(PDF2TXT) -t text -o $@ $<

View File

@ -9,8 +9,6 @@
import sys, re import sys, re
from pdflib.pdfparser import PDFDocument, PDFParser from pdflib.pdfparser import PDFDocument, PDFParser
from pdflib.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolve1 from pdflib.pdftypes import PDFStream, PDFObjRef, PSKeyword, PSLiteral, resolve1
stdout = sys.stdout
stderr = sys.stderr
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]') ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
@ -163,7 +161,7 @@ def main(argv):
password = '' password = ''
dumpall = False dumpall = False
proc = dumppdf proc = dumppdf
outfp = stdout outfp = sys.stdout
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )

72
tools/pdf2txt.py Executable file
View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
import sys
from pdflib.pdfparser import PDFDocument, PDFParser
from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdflib.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdflib.cmap import CMapDB
# main
def main(argv):
    """Command-line driver: run each input PDF through the chosen converter.

    ``argv`` is the full argument vector with the program name at
    ``argv[0]``.  Returns 100 after printing the usage line when an option
    is unknown, no input file is given, or the ``-t`` value is not one of
    the supported output types; returns None once every file was processed.

    Options handled here:
      -d           increase the debug level (may be repeated)
      -C dir       first argument to CMapDB.initialize (default 'CMap')
      -D dir       second argument to CMapDB.initialize (default 'CDBCMap')
      -P password  password passed to process_pdf (default '')
      -p pagenos   comma-separated page numbers; each is stored minus one
      -m maxpages  passed to process_pdf as maxpages (default 0)
      -t type      text, html, sgml or tag (default 'html')
      -c codec     codec passed to the converter (default 'utf-8')
      -o output    output file, opened in binary mode (default sys.stdout)
      -s scale     scale passed to HTMLConverter (default 1)
      -T margin    cluster_margin passed to the SGML/HTML/text converters
      -w           accepted by the parser but has no effect in this function
    """
    import getopt

    def usage():
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print('usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0])
        return 100

    try:
        # 's:' must be declared here, otherwise the '-s' branch below can
        # never run and '-s' is rejected as an unknown option.
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:t:o:C:D:m:s:w')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # path option
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = 'html'
    codec = 'utf-8'
    outfp = sys.stdout
    cluster_margin = None
    scale = 1

    try:
        for (k, v) in opts:
            if k == '-d':
                debug += 1
            elif k == '-C':
                cmapdir = v
            elif k == '-D':
                cdbcmapdir = v
            elif k == '-P':
                password = v
            elif k == '-p':
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-m':
                maxpages = int(v)
            elif k == '-t':
                outtype = v
            elif k == '-c':
                codec = v
            elif k == '-o':
                # A repeated -o replaces the target; release the earlier file.
                if outfp is not sys.stdout:
                    outfp.close()
                outfp = open(v, 'wb')
            elif k == '-s':
                scale = float(v)
            elif k == '-T':
                cluster_margin = float(v)

        # Propagate the debug level to every component that exposes one.
        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFDocument.debug = debug
        PDFParser.debug = debug
        PDFPageInterpreter.debug = debug

        CMapDB.initialize(cmapdir, cdbcmapdir)
        rsrc = PDFResourceManager()
        if outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, scale=scale)
        elif outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin)
        elif outtype == 'tag':
            device = TagExtractor(rsrc, outfp, codec=codec)
        else:
            return usage()
        for fname in args:
            process_pdf(rsrc, device, fname, pagenos, maxpages=maxpages, password=password)
        device.close()
    finally:
        # Close a file opened for -o on every exit path; never close stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
# Script entry point: the value returned by main() is passed to sys.exit().
if __name__ == '__main__': sys.exit(main(sys.argv))