change PDFPageAggregator -> PDFLayoutAnalyzer

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@213 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-04-24 13:31:21 +00:00
parent 833f859449
commit eb535d4106
2 changed files with 50 additions and 32 deletions

View File

@ -10,9 +10,9 @@ from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str, create_bmp from utils import enc, bbox2str, create_bmp
## PDFPageAggregator ## PDFLayoutAnalyzer
## ##
class PDFPageAggregator(PDFTextDevice): class PDFLayoutAnalyzer(PDFTextDevice):
def __init__(self, rsrcmgr, pageno=1, laparams=None): def __init__(self, rsrcmgr, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrcmgr) PDFTextDevice.__init__(self, rsrcmgr)
@ -29,13 +29,14 @@ class PDFPageAggregator(PDFTextDevice):
self.cur_item = LTPage(self.pageno, mediabox) self.cur_item = LTPage(self.pageno, mediabox)
return return
def end_page(self, _): def end_page(self, page):
assert not self.stack assert not self.stack
assert isinstance(self.cur_item, LTPage) assert isinstance(self.cur_item, LTPage)
self.cur_item.fixate() self.cur_item.fixate()
self.cur_item.analyze(self.laparams) self.cur_item.analyze(self.laparams)
self.pageno += 1 self.pageno += 1
return self.cur_item self.receive_layout(self.cur_item)
return
def begin_figure(self, name, bbox, matrix): def begin_figure(self, name, bbox, matrix):
self.stack.append(self.cur_item) self.stack.append(self.cur_item)
@ -95,13 +96,33 @@ class PDFPageAggregator(PDFTextDevice):
self.cur_item.add(item) self.cur_item.add(item)
return item.adv return item.adv
def receive_layout(self, ltpage):
return
## PDFPageAggregator
##
class PDFPageAggregator(PDFLayoutAnalyzer):
def __init__(self, rsrcmgr, pageno=1, laparams=None):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.result = None
return
def receive_layout(self, ltpage):
self.result = ltpage
return
def get_result(self):
return self.result
## PDFConverter ## PDFConverter
## ##
class PDFConverter(PDFPageAggregator): class PDFConverter(PDFLayoutAnalyzer):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
@ -148,7 +169,7 @@ class TextConverter(PDFConverter):
self.outfp.write(text.encode(self.codec, 'ignore')) self.outfp.write(text.encode(self.codec, 'ignore'))
return return
def end_page(self, page): def receive_layout(self, ltpage):
def render(item): def render(item):
if isinstance(item, LTText): if isinstance(item, LTText):
self.write(item.text) self.write(item.text)
@ -157,10 +178,9 @@ class TextConverter(PDFConverter):
render(child) render(child)
if isinstance(item, LTTextBox): if isinstance(item, LTTextBox):
self.write('\n') self.write('\n')
page = PDFConverter.end_page(self, page)
if self.showpageno: if self.showpageno:
self.write('Page %s\n' % page.pageid) self.write('Page %s\n' % ltpage.pageid)
render(page) render(ltpage)
self.write('\f') self.write('\f')
return return
@ -198,7 +218,7 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
return return
def end_page(self, page): def receive_layout(self, ltpage):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.yoffset += item.y1 self.yoffset += item.y1
@ -206,7 +226,7 @@ class HTMLConverter(PDFConverter):
if self.showpageno: if self.showpageno:
self.outfp.write('<div style="position:absolute; top:%dpx;">' % self.outfp.write('<div style="position:absolute; top:%dpx;">' %
((self.yoffset-item.y1)*self.scale)) ((self.yoffset-item.y1)*self.scale))
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.pageid, page.pageid)) self.outfp.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
@ -237,16 +257,15 @@ class HTMLConverter(PDFConverter):
item.x0*self.scale, (self.yoffset-item.y1)*self.scale, item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.width*self.scale, item.height*self.scale)) item.width*self.scale, item.height*self.scale))
return return
page = PDFConverter.end_page(self, page) render(ltpage)
render(page) if self.debug and ltpage.layout:
if self.debug and page.layout:
def show_layout(item): def show_layout(item):
if isinstance(item, LTTextGroup): if isinstance(item, LTTextGroup):
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
for child in item: for child in item:
show_layout(child) show_layout(child)
return return
show_layout(page.layout) show_layout(ltpage.layout)
self.yoffset += self.pagepad self.yoffset += self.pagepad
return return
@ -268,7 +287,7 @@ class XMLConverter(PDFConverter):
self.outfp.write('<pages>\n') self.outfp.write('<pages>\n')
return return
def end_page(self, page): def receive_layout(self, ltpage):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
@ -323,20 +342,19 @@ class XMLConverter(PDFConverter):
else: else:
assert 0, item assert 0, item
return return
page = PDFConverter.end_page(self, page) def show_layout(item):
render(page) if isinstance(item, LTTextBox):
if page.layout: self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox)))
def show_layout(item): elif isinstance(item, LTTextGroup):
if isinstance(item, LTTextBox): self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox))) for child in item:
elif isinstance(item, LTTextGroup): show_layout(child)
self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox)) self.outfp.write('</textgroup>\n')
for child in item: return
show_layout(child) render(ltpage)
self.outfp.write('</textgroup>\n') if ltpage.layout:
return
self.outfp.write('<layout>\n') self.outfp.write('<layout>\n')
show_layout(page.layout) show_layout(ltpage.layout)
self.outfp.write('</layout>\n') self.outfp.write('</layout>\n')
return return

View File

@ -2,8 +2,8 @@
import sys import sys
from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams from pdfminer.layout import LAParams