Add splitwords option (-w) to split rendered text into separate items at spaces.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@72 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-03-20 11:00:14 +00:00
parent b432a3f4ae
commit 435d0553fa
2 changed files with 31 additions and 26 deletions

View File

@@ -21,8 +21,8 @@ def encprops(props, codec):
## TextConverter
class TextConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii'):
PDFPageAggregator.__init__(self, rsrc)
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
self.outfp = outfp
self.codec = codec
return
@@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
##
class HTMLConverter(TextConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
TextConverter.__init__(self, rsrc, outfp, codec=codec)
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, splitwords=False):
TextConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
self.pagenum = pagenum
self.pagepad = pagepad
self.scale = scale
@@ -190,10 +190,10 @@ def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0]
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t html|sgml|tag] [-o output] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
except getopt.GetoptError:
return usage()
if not args: return usage()
@@ -205,6 +205,7 @@ def main(argv):
maxpages = 0
outtype = 'html'
password = ''
splitwords = False
outfp = stdout
for (k, v) in opts:
if k == '-d': debug += 1
@@ -216,6 +217,7 @@ def main(argv):
elif k == '-D': cdbcmapdir = v
elif k == '-t': outtype = v
elif k == '-o': outfp = file(v, 'wb')
elif k == '-w': splitwords = True
#
CMapDB.debug = debug
PDFResourceManager.debug = debug
@@ -226,11 +228,11 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager()
if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec)
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec)
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec)
device = TagExtractor(rsrc, outfp, codec=codec, splitwords=splitwords)
else:
return usage()
for fname in args:

View File

@@ -145,9 +145,10 @@ class TextItem(object):
##
class PDFPageAggregator(PDFDevice):
def __init__(self, rsrc, pageno=1):
def __init__(self, rsrc, pageno=1, splitwords=False):
PDFDevice.__init__(self, rsrc)
self.pageno = pageno
self.splitwords = splitwords
self.stack = []
return
@@ -180,29 +181,31 @@ class PDFPageAggregator(PDFDevice):
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
text = []
textmatrix = mult_matrix(textmatrix, self.ctm)
chars = []
for x in seq:
if isinstance(x, int) or isinstance(x, float):
text.append((None, None, x))
chars.append((None, None, x))
else:
chars = font.decode(x)
for cid in chars:
for cid in font.decode(x):
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid)
text.append((char, cid, font.char_disp(cid)))
if cid == 32 and not font.is_multibyte():
if text:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
text = []
if text:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
chars.append((char, cid, font.char_disp(cid)))
textmatrix = mult_matrix(textmatrix, self.ctm)
word = []
for (char, cid, disp) in chars:
word.append((char,cid,disp))
if self.splitwords and cid == 32 and not font.is_multibyte():
if word:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
word = []
if word:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
self.cur_item.add(item)
return