change PDFPageAggregator -> PDFLayoutAnalyzer
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@213 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
833f859449
commit
eb535d4106
|
@ -10,9 +10,9 @@ from utils import apply_matrix_pt, mult_matrix
|
||||||
from utils import enc, bbox2str, create_bmp
|
from utils import enc, bbox2str, create_bmp
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
## PDFLayoutAnalyzer
|
||||||
##
|
##
|
||||||
class PDFPageAggregator(PDFTextDevice):
|
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||||
PDFTextDevice.__init__(self, rsrcmgr)
|
PDFTextDevice.__init__(self, rsrcmgr)
|
||||||
|
@ -29,13 +29,14 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
self.cur_item = LTPage(self.pageno, mediabox)
|
self.cur_item = LTPage(self.pageno, mediabox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, _):
|
def end_page(self, page):
|
||||||
assert not self.stack
|
assert not self.stack
|
||||||
assert isinstance(self.cur_item, LTPage)
|
assert isinstance(self.cur_item, LTPage)
|
||||||
self.cur_item.fixate()
|
self.cur_item.fixate()
|
||||||
self.cur_item.analyze(self.laparams)
|
self.cur_item.analyze(self.laparams)
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
return self.cur_item
|
self.receive_layout(self.cur_item)
|
||||||
|
return
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
def begin_figure(self, name, bbox, matrix):
|
||||||
self.stack.append(self.cur_item)
|
self.stack.append(self.cur_item)
|
||||||
|
@ -95,13 +96,33 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return item.adv
|
return item.adv
|
||||||
|
|
||||||
|
def receive_layout(self, ltpage):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## PDFPageAggregator
|
||||||
|
##
|
||||||
|
class PDFPageAggregator(PDFLayoutAnalyzer):
|
||||||
|
|
||||||
|
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||||
|
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||||
|
self.result = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def receive_layout(self, ltpage):
|
||||||
|
self.result = ltpage
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_result(self):
|
||||||
|
return self.result
|
||||||
|
|
||||||
|
|
||||||
## PDFConverter
|
## PDFConverter
|
||||||
##
|
##
|
||||||
class PDFConverter(PDFPageAggregator):
|
class PDFConverter(PDFLayoutAnalyzer):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||||
PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
@ -148,7 +169,7 @@ class TextConverter(PDFConverter):
|
||||||
self.outfp.write(text.encode(self.codec, 'ignore'))
|
self.outfp.write(text.encode(self.codec, 'ignore'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def receive_layout(self, ltpage):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTText):
|
if isinstance(item, LTText):
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
|
@ -157,10 +178,9 @@ class TextConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write('\n')
|
self.write('\n')
|
||||||
page = PDFConverter.end_page(self, page)
|
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.write('Page %s\n' % page.pageid)
|
self.write('Page %s\n' % ltpage.pageid)
|
||||||
render(page)
|
render(ltpage)
|
||||||
self.write('\f')
|
self.write('\f')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -198,7 +218,7 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def receive_layout(self, ltpage):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.yoffset += item.y1
|
self.yoffset += item.y1
|
||||||
|
@ -206,7 +226,7 @@ class HTMLConverter(PDFConverter):
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
self.outfp.write('<div style="position:absolute; top:%dpx;">' %
|
||||||
((self.yoffset-item.y1)*self.scale))
|
((self.yoffset-item.y1)*self.scale))
|
||||||
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (page.pageid, page.pageid))
|
self.outfp.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
|
@ -237,16 +257,15 @@ class HTMLConverter(PDFConverter):
|
||||||
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
||||||
item.width*self.scale, item.height*self.scale))
|
item.width*self.scale, item.height*self.scale))
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
render(ltpage)
|
||||||
render(page)
|
if self.debug and ltpage.layout:
|
||||||
if self.debug and page.layout:
|
|
||||||
def show_layout(item):
|
def show_layout(item):
|
||||||
if isinstance(item, LTTextGroup):
|
if isinstance(item, LTTextGroup):
|
||||||
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
|
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
show_layout(child)
|
show_layout(child)
|
||||||
return
|
return
|
||||||
show_layout(page.layout)
|
show_layout(ltpage.layout)
|
||||||
self.yoffset += self.pagepad
|
self.yoffset += self.pagepad
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -268,7 +287,7 @@ class XMLConverter(PDFConverter):
|
||||||
self.outfp.write('<pages>\n')
|
self.outfp.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def receive_layout(self, ltpage):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||||
|
@ -323,20 +342,19 @@ class XMLConverter(PDFConverter):
|
||||||
else:
|
else:
|
||||||
assert 0, item
|
assert 0, item
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
def show_layout(item):
|
||||||
render(page)
|
if isinstance(item, LTTextBox):
|
||||||
if page.layout:
|
self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox)))
|
||||||
def show_layout(item):
|
elif isinstance(item, LTTextGroup):
|
||||||
if isinstance(item, LTTextBox):
|
self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
|
||||||
self.outfp.write('<textbox id="%d" bbox="%s" />\n' % (item.index, bbox2str(item.bbox)))
|
for child in item:
|
||||||
elif isinstance(item, LTTextGroup):
|
show_layout(child)
|
||||||
self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
|
self.outfp.write('</textgroup>\n')
|
||||||
for child in item:
|
return
|
||||||
show_layout(child)
|
render(ltpage)
|
||||||
self.outfp.write('</textgroup>\n')
|
if ltpage.layout:
|
||||||
return
|
|
||||||
self.outfp.write('<layout>\n')
|
self.outfp.write('<layout>\n')
|
||||||
show_layout(page.layout)
|
show_layout(ltpage.layout)
|
||||||
self.outfp.write('</layout>\n')
|
self.outfp.write('</layout>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
import sys
|
import sys
|
||||||
from pdfminer.pdfparser import PDFDocument, PDFParser
|
from pdfminer.pdfparser import PDFDocument, PDFParser
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice, TagExtractor
|
||||||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
|
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
||||||
from pdfminer.cmapdb import CMapDB
|
from pdfminer.cmapdb import CMapDB
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue