add splitwords option.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@72 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
b432a3f4ae
commit
435d0553fa
|
@ -21,8 +21,8 @@ def encprops(props, codec):
|
||||||
## TextConverter
|
## TextConverter
|
||||||
class TextConverter(PDFPageAggregator):
|
class TextConverter(PDFPageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='ascii'):
|
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
|
||||||
PDFPageAggregator.__init__(self, rsrc)
|
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(TextConverter):
|
class HTMLConverter(TextConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, splitwords=False):
|
||||||
TextConverter.__init__(self, rsrc, outfp, codec=codec)
|
TextConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||||
self.pagenum = pagenum
|
self.pagenum = pagenum
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -190,10 +190,10 @@ def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0]
|
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t html|sgml|tag] [-o output] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -205,6 +205,7 @@ def main(argv):
|
||||||
maxpages = 0
|
maxpages = 0
|
||||||
outtype = 'html'
|
outtype = 'html'
|
||||||
password = ''
|
password = ''
|
||||||
|
splitwords = False
|
||||||
outfp = stdout
|
outfp = stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
|
@ -216,6 +217,7 @@ def main(argv):
|
||||||
elif k == '-D': cdbcmapdir = v
|
elif k == '-D': cdbcmapdir = v
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
|
elif k == '-w': splitwords = True
|
||||||
#
|
#
|
||||||
CMapDB.debug = debug
|
CMapDB.debug = debug
|
||||||
PDFResourceManager.debug = debug
|
PDFResourceManager.debug = debug
|
||||||
|
@ -226,11 +228,11 @@ def main(argv):
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||||
rsrc = PDFResourceManager()
|
rsrc = PDFResourceManager()
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec)
|
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec)
|
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec)
|
device = TagExtractor(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
for fname in args:
|
for fname in args:
|
||||||
|
|
|
@ -145,9 +145,10 @@ class TextItem(object):
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFDevice):
|
class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1):
|
def __init__(self, rsrc, pageno=1, splitwords=False):
|
||||||
PDFDevice.__init__(self, rsrc)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
|
self.splitwords = splitwords
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -180,29 +181,31 @@ class PDFPageAggregator(PDFDevice):
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
text = []
|
chars = []
|
||||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
|
||||||
for x in seq:
|
for x in seq:
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
if isinstance(x, int) or isinstance(x, float):
|
||||||
text.append((None, None, x))
|
chars.append((None, None, x))
|
||||||
else:
|
else:
|
||||||
chars = font.decode(x)
|
for cid in font.decode(x):
|
||||||
for cid in chars:
|
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unicode(cid)
|
||||||
except PDFUnicodeNotDefined, e:
|
except PDFUnicodeNotDefined, e:
|
||||||
(cidcoding, cid) = e.args
|
(cidcoding, cid) = e.args
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
text.append((char, cid, font.char_disp(cid)))
|
chars.append((char, cid, font.char_disp(cid)))
|
||||||
if cid == 32 and not font.is_multibyte():
|
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||||
if text:
|
word = []
|
||||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
for (char, cid, disp) in chars:
|
||||||
|
word.append((char,cid,disp))
|
||||||
|
if self.splitwords and cid == 32 and not font.is_multibyte():
|
||||||
|
if word:
|
||||||
|
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
(dx,dy) = item.adv
|
(dx,dy) = item.adv
|
||||||
dx += textstate.wordspace * textstate.scaling * .01
|
dx += textstate.wordspace * textstate.scaling * .01
|
||||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
text = []
|
word = []
|
||||||
if text:
|
if word:
|
||||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, word)
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue