move TagExtractor

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@212 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-04-24 13:31:11 +00:00
parent 32d65b70f8
commit 833f859449
2 changed files with 58 additions and 58 deletions

View File

@ -343,61 +343,3 @@ class XMLConverter(PDFConverter):
def close(self): def close(self):
self.outfp.write('</pages>\n') self.outfp.write('</pages>\n')
return return
## TagExtractor
##
class TagExtractor(PDFDevice):
    """Device that writes page markers, tags and decoded text as markup.

    Every page is wrapped in a ``<page ...>`` element, every tag passed to
    begin_tag()/end_tag() becomes an element named after ``tag.name``, and
    every string handed to render_string() is decoded and written out.

    Open tags are tracked on a stack so that nested begin_tag()/end_tag()
    pairs close in the right order.  ``self.tag`` always refers to the
    innermost tag that is still open (or None when no tag is open).
    """

    def __init__(self, rsrcmgr, outfp, codec='utf-8'):
        """Create an extractor.

        rsrcmgr -- forwarded unchanged to PDFDevice.__init__().
        outfp   -- object whose write() method receives all output.
        codec   -- codec name passed to enc() when text is written.
        """
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Innermost open tag; mirrors the top of self._stack.
        self.tag = None
        # Tags opened by begin_tag() and not yet closed, outermost first.
        self._stack = []
        return

    def render_string(self, textstate, seq):
        """Decode the str items of seq with the current font and write them.

        Items of seq that are not str instances are ignored.  Character
        ids for which the font raises PDFUnicodeNotDefined are dropped.
        """
        font = textstate.font
        pieces = []
        for obj in seq:
            if not isinstance(obj, str):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: a character without a
                    # Unicode mapping is left out of the output.
                    pass
        self.outfp.write(enc(''.join(pieces), self.codec))
        return

    def begin_page(self, page, ctm):
        """Write the opening ``<page>`` element for page (ctm is unused)."""
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
                         (self.pageno, bbox2str(page.mediabox), page.rotate))
        return

    def end_page(self, page):
        """Write the closing ``</page>`` line and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        """Write an opening element for tag and remember it as open.

        props, when given and non-empty, is a mapping whose items are
        written as attributes in sorted order.
        """
        attrs = ''
        if props:
            # items() behaves the same as iteritems() under sorted() and
            # does not tie the code to one dict implementation.
            attrs = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                            for (k, v) in sorted(props.items()))
        self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
        self._stack.append(tag)
        self.tag = tag
        return

    def end_tag(self):
        """Write the closing element for the innermost open tag.

        Fails an assertion when no tag is open.
        """
        assert self._stack
        tag = self._pop_tag()
        self.outfp.write('</%s>' % enc(tag.name))
        return

    def do_tag(self, tag, props=None):
        """Write an opening element for tag without leaving it open.

        No closing element is written; the tag is only removed from the
        set of open tags so that an enclosing tag stays current.
        """
        self.begin_tag(tag, props)
        self._pop_tag()
        return

    def _pop_tag(self):
        """Remove and return the innermost open tag, refreshing self.tag."""
        tag = self._stack.pop()
        if self._stack:
            self.tag = self._stack[-1]
        else:
            self.tag = None
        return tag

View File

@ -115,3 +115,61 @@ class PDFTextDevice(PDFDevice):
def render_char(self, matrix, font, fontsize, scaling, cid): def render_char(self, matrix, font, fontsize, scaling, cid):
return 0 return 0
## TagExtractor
##
class TagExtractor(PDFDevice):
    """Device that writes page markers, tags and decoded text as markup.

    Every page is wrapped in a ``<page ...>`` element, every tag passed to
    begin_tag()/end_tag() becomes an element named after ``tag.name``, and
    every string handed to render_string() is decoded and written out.

    Open tags are tracked on a stack so that nested begin_tag()/end_tag()
    pairs close in the right order.  ``self.tag`` always refers to the
    innermost tag that is still open (or None when no tag is open).
    """

    def __init__(self, rsrcmgr, outfp, codec='utf-8'):
        """Create an extractor.

        rsrcmgr -- forwarded unchanged to PDFDevice.__init__().
        outfp   -- object whose write() method receives all output.
        codec   -- codec name passed to enc() when text is written.
        """
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        # Innermost open tag; mirrors the top of self._stack.
        self.tag = None
        # Tags opened by begin_tag() and not yet closed, outermost first.
        self._stack = []
        return

    def render_string(self, textstate, seq):
        """Decode the str items of seq with the current font and write them.

        Items of seq that are not str instances are ignored.  Character
        ids for which the font raises PDFUnicodeNotDefined are dropped.
        """
        font = textstate.font
        pieces = []
        for obj in seq:
            if not isinstance(obj, str):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: a character without a
                    # Unicode mapping is left out of the output.
                    pass
        self.outfp.write(enc(''.join(pieces), self.codec))
        return

    def begin_page(self, page, ctm):
        """Write the opening ``<page>`` element for page (ctm is unused)."""
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
                         (self.pageno, bbox2str(page.mediabox), page.rotate))
        return

    def end_page(self, page):
        """Write the closing ``</page>`` line and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        """Write an opening element for tag and remember it as open.

        props, when given and non-empty, is a mapping whose items are
        written as attributes in sorted order.
        """
        attrs = ''
        if props:
            # items() behaves the same as iteritems() under sorted() and
            # does not tie the code to one dict implementation.
            attrs = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                            for (k, v) in sorted(props.items()))
        self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
        self._stack.append(tag)
        self.tag = tag
        return

    def end_tag(self):
        """Write the closing element for the innermost open tag.

        Fails an assertion when no tag is open.
        """
        assert self._stack
        tag = self._pop_tag()
        self.outfp.write('</%s>' % enc(tag.name))
        return

    def do_tag(self, tag, props=None):
        """Write an opening element for tag without leaving it open.

        No closing element is written; the tag is only removed from the
        set of open tags so that an enclosing tag stays current.
        """
        self.begin_tag(tag, props)
        self._pop_tag()
        return

    def _pop_tag(self):
        """Remove and return the innermost open tag, refreshing self.tag."""
        tag = self._stack.pop()
        if self._stack:
            self.tag = self._stack[-1]
        else:
            self.tag = None
        return tag