Added LTText.get_text(); the .text property is no longer accessible.

pull/1/head
Yusuke Shinyama 2011-05-14 15:45:08 +09:00
parent 5004e4b28d
commit 038ce4cd0c
2 changed files with 83 additions and 69 deletions

View File

@ -183,7 +183,7 @@ class TextConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.write_text(item.text) self.write_text(item.get_text())
if isinstance(item, LTTextBox): if isinstance(item, LTTextBox):
self.write_text('\n') self.write_text('\n')
if self.showpageno: if self.showpageno:
@ -368,7 +368,7 @@ class HTMLConverter(PDFConverter):
render(child) render(child)
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.place_border('char', 1, item) self.place_border('char', 1, item)
self.place_text('char', item.text, item.x0, item.y1, item.size) self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
else: else:
if isinstance(item, LTTextLine): if isinstance(item, LTTextLine):
for child in item: for child in item:
@ -382,9 +382,9 @@ class HTMLConverter(PDFConverter):
render(child) render(child)
self.end_textbox('textbox') self.end_textbox('textbox')
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.put_text(item.text, item.fontname, item.size) self.put_text(item.get_text(), item.fontname, item.size)
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.write_text(item.text) self.write_text(item.get_text())
return return
render(ltpage) render(ltpage)
self._yoffset += self.pagemargin self._yoffset += self.pagemargin
@ -472,10 +472,10 @@ class XMLConverter(PDFConverter):
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' % self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
(enc(item.fontname), bbox2str(item.bbox), item.size)) (enc(item.fontname), bbox2str(item.bbox), item.size))
self.write_text(item.text) self.write_text(item.get_text())
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text) self.outfp.write('<text>%s</text>\n' % item.get_text())
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
if self.outdir: if self.outdir:
name = self.write_image(item) name = self.write_image(item)

View File

@ -34,7 +34,29 @@ class LAParams(object):
## ##
class LTItem(object): class LTItem(object):
def analyze(self, laparams):
"""Perform the layout analysis."""
return
## LTText
##
class LTText(object):
def __repr__(self):
return ('<%s %r>' %
(self.__class__.__name__, self.get_text()))
def get_text(self):
raise NotImplementedError
## LTComponent
##
class LTComponent(LTItem):
def __init__(self, bbox): def __init__(self, bbox):
LTItem.__init__(self)
self.set_bbox(bbox) self.set_bbox(bbox)
return return
@ -56,54 +78,50 @@ class LTItem(object):
return self.width <= 0 or self.height <= 0 return self.width <= 0 or self.height <= 0
def is_hoverlap(self, obj): def is_hoverlap(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTComponent)
return obj.x0 <= self.x1 and self.x0 <= obj.x1 return obj.x0 <= self.x1 and self.x0 <= obj.x1
def hdistance(self, obj): def hdistance(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTComponent)
if self.is_hoverlap(obj): if self.is_hoverlap(obj):
return 0 return 0
else: else:
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
def hoverlap(self, obj): def hoverlap(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTComponent)
if self.is_hoverlap(obj): if self.is_hoverlap(obj):
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
else: else:
return 0 return 0
def is_voverlap(self, obj): def is_voverlap(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTComponent)
return obj.y0 <= self.y1 and self.y0 <= obj.y1 return obj.y0 <= self.y1 and self.y0 <= obj.y1
def vdistance(self, obj): def vdistance(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTComponent)
if self.is_voverlap(obj): if self.is_voverlap(obj):
return 0 return 0
else: else:
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
def voverlap(self, obj): def voverlap(self, obj):
assert isinstance(obj, LTItem) assert isinstance(obj, LTComponent)
if self.is_voverlap(obj): if self.is_voverlap(obj):
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0)) return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
else: else:
return 0 return 0
def analyze(self, laparams):
"""Perform the layout analysis."""
return
## LTCurve ## LTCurve
## ##
class LTCurve(LTItem): class LTCurve(LTComponent):
def __init__(self, linewidth, pts): def __init__(self, linewidth, pts):
LTComponent.__init__(self, get_bound(pts))
self.pts = pts self.pts = pts
self.linewidth = linewidth self.linewidth = linewidth
LTItem.__init__(self, get_bound(pts))
return return
def get_pts(self): def get_pts(self):
@ -130,10 +148,10 @@ class LTRect(LTCurve):
## LTImage ## LTImage
## ##
class LTImage(LTItem): class LTImage(LTComponent):
def __init__(self, name, stream, bbox): def __init__(self, name, stream, bbox):
LTItem.__init__(self, bbox) LTComponent.__init__(self, bbox)
self.name = name self.name = name
self.stream = stream self.stream = stream
self.srcsize = (stream.get_any(('W', 'Width')), self.srcsize = (stream.get_any(('W', 'Width')),
@ -146,41 +164,30 @@ class LTImage(LTItem):
return return
def __repr__(self): def __repr__(self):
(w,h) = self.srcsize return ('<%s(%s) %s %r>' %
return ('<%s(%s) %s %dx%d>' %
(self.__class__.__name__, self.name, (self.__class__.__name__, self.name,
bbox2str(self.bbox), w, h)) bbox2str(self.bbox), self.srcsize))
## LTText
##
class LTText(object):
def __init__(self, text):
self.text = text
return
def __repr__(self):
return ('<%s %r>' %
(self.__class__.__name__, self.text))
## LTAnon ## LTAnon
## ##
class LTAnon(LTText): class LTAnon(LTItem, LTText):
def analyze(self, laparams): def __init__(self, text):
self._text = text
return return
def get_text(self):
return self._text
## LTChar ## LTChar
## ##
class LTChar(LTItem, LTText): class LTChar(LTComponent, LTText):
debug = 0
def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp): def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp):
LTText.__init__(self, text) LTText.__init__(self)
self._text = text
self.matrix = matrix self.matrix = matrix
self.fontname = font.fontname self.fontname = font.fontname
self.adv = textwidth * fontsize * scaling self.adv = textwidth * fontsize * scaling
@ -213,7 +220,7 @@ class LTChar(LTItem, LTText):
(x0,x1) = (x1,x0) (x0,x1) = (x1,x0)
if y1 < y0: if y1 < y0:
(y0,y1) = (y1,y0) (y0,y1) = (y1,y0)
LTItem.__init__(self, (x0,y0,x1,y1)) LTComponent.__init__(self, (x0,y0,x1,y1))
if font.is_vertical(): if font.is_vertical():
self.size = self.width self.size = self.width
else: else:
@ -221,13 +228,13 @@ class LTChar(LTItem, LTText):
return return
def __repr__(self): def __repr__(self):
if self.debug: return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' % (self.__class__.__name__, bbox2str(self.bbox),
(self.__class__.__name__, bbox2str(self.bbox), matrix2str(self.matrix), self.fontname, self.adv,
matrix2str(self.matrix), self.fontname, self.get_text()))
self.adv, self.text))
else: def get_text(self):
return '<char %r>' % self.text return self._text
def is_compatible(self, obj): def is_compatible(self, obj):
"""Returns True if two characters can coexist in the same line.""" """Returns True if two characters can coexist in the same line."""
@ -236,10 +243,10 @@ class LTChar(LTItem, LTText):
## LTContainer ## LTContainer
## ##
class LTContainer(LTItem): class LTContainer(LTComponent):
def __init__(self, bbox): def __init__(self, bbox):
LTItem.__init__(self, bbox) LTComponent.__init__(self, bbox)
self._objs = [] self._objs = []
return return
@ -279,23 +286,36 @@ class LTExpandableContainer(LTContainer):
return return
## LTTextContainer
##
class LTTextContainer(LTExpandableContainer, LTText):
def __init__(self):
LTText.__init__(self)
LTExpandableContainer.__init__(self)
return
def get_text(self):
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
## LTTextLine ## LTTextLine
## ##
class LTTextLine(LTExpandableContainer, LTText): class LTTextLine(LTTextContainer):
def __init__(self, word_margin): def __init__(self, word_margin):
LTExpandableContainer.__init__(self) LTTextContainer.__init__(self)
self.word_margin = word_margin self.word_margin = word_margin
return return
def __repr__(self): def __repr__(self):
return ('<%s %s %r>' % return ('<%s %s %r>' %
(self.__class__.__name__, bbox2str(self.bbox), self.text)) (self.__class__.__name__, bbox2str(self.bbox),
self.get_text()))
def analyze(self, laparams): def analyze(self, laparams):
LTExpandableContainer.analyze(self, laparams) LTTextContainer.analyze(self, laparams)
LTContainer.add(self, LTAnon('\n')) LTContainer.add(self, LTAnon('\n'))
self.text = ''.join( obj.text for obj in self if isinstance(obj, LTText) )
return return
def find_neighbors(self, plane, ratio): def find_neighbors(self, plane, ratio):
@ -349,23 +369,17 @@ class LTTextLineVertical(LTTextLine):
## A set of text objects that are grouped within ## A set of text objects that are grouped within
## a certain rectangular area. ## a certain rectangular area.
## ##
class LTTextBox(LTExpandableContainer): class LTTextBox(LTTextContainer):
def __init__(self): def __init__(self):
LTExpandableContainer.__init__(self) LTTextContainer.__init__(self)
self.index = None self.index = None
self.text = None
return return
def __repr__(self): def __repr__(self):
return ('<%s(%s) %s %r>' % return ('<%s(%s) %s %r>' %
(self.__class__.__name__, self.index, (self.__class__.__name__,
bbox2str(self.bbox), self.text)) self.index, bbox2str(self.bbox), self.get_text()))
def analyze(self, laparams):
LTExpandableContainer.analyze(self, laparams)
self.text = ''.join( obj.text for obj in self if isinstance(obj, LTTextLine) )
return
class LTTextBoxHorizontal(LTTextBox): class LTTextBoxHorizontal(LTTextBox):
@ -390,10 +404,10 @@ class LTTextBoxVertical(LTTextBox):
## LTTextGroup ## LTTextGroup
## ##
class LTTextGroup(LTExpandableContainer): class LTTextGroup(LTTextContainer):
def __init__(self, objs): def __init__(self, objs):
LTExpandableContainer.__init__(self) LTTextContainer.__init__(self)
self.extend(objs) self.extend(objs)
return return