Added LTText.get_text(); the .text property is no longer accessible.
parent
5004e4b28d
commit
038ce4cd0c
|
@ -183,7 +183,7 @@ class TextConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.write_text(item.text)
|
self.write_text(item.get_text())
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write_text('\n')
|
self.write_text('\n')
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
|
@ -368,7 +368,7 @@ class HTMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.place_border('char', 1, item)
|
self.place_border('char', 1, item)
|
||||||
self.place_text('char', item.text, item.x0, item.y1, item.size)
|
self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
|
||||||
else:
|
else:
|
||||||
if isinstance(item, LTTextLine):
|
if isinstance(item, LTTextLine):
|
||||||
for child in item:
|
for child in item:
|
||||||
|
@ -382,9 +382,9 @@ class HTMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
self.end_textbox('textbox')
|
self.end_textbox('textbox')
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.put_text(item.text, item.fontname, item.size)
|
self.put_text(item.get_text(), item.fontname, item.size)
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.write_text(item.text)
|
self.write_text(item.get_text())
|
||||||
return
|
return
|
||||||
render(ltpage)
|
render(ltpage)
|
||||||
self._yoffset += self.pagemargin
|
self._yoffset += self.pagemargin
|
||||||
|
@ -472,10 +472,10 @@ class XMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
|
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
|
||||||
(enc(item.fontname), bbox2str(item.bbox), item.size))
|
(enc(item.fontname), bbox2str(item.bbox), item.size))
|
||||||
self.write_text(item.text)
|
self.write_text(item.get_text())
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
self.outfp.write('<text>%s</text>\n' % item.get_text())
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
if self.outdir:
|
if self.outdir:
|
||||||
name = self.write_image(item)
|
name = self.write_image(item)
|
||||||
|
|
|
@ -34,7 +34,29 @@ class LAParams(object):
|
||||||
##
|
##
|
||||||
class LTItem(object):
|
class LTItem(object):
|
||||||
|
|
||||||
|
def analyze(self, laparams):
|
||||||
|
"""Perform the layout analysis."""
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTText
|
||||||
|
##
|
||||||
|
class LTText(object):
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<%s %r>' %
|
||||||
|
(self.__class__.__name__, self.get_text()))
|
||||||
|
|
||||||
|
def get_text(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
## LTComponent
|
||||||
|
##
|
||||||
|
class LTComponent(LTItem):
|
||||||
|
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox):
|
||||||
|
LTItem.__init__(self)
|
||||||
self.set_bbox(bbox)
|
self.set_bbox(bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -56,54 +78,50 @@ class LTItem(object):
|
||||||
return self.width <= 0 or self.height <= 0
|
return self.width <= 0 or self.height <= 0
|
||||||
|
|
||||||
def is_hoverlap(self, obj):
|
def is_hoverlap(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTComponent)
|
||||||
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
return obj.x0 <= self.x1 and self.x0 <= obj.x1
|
||||||
|
|
||||||
def hdistance(self, obj):
|
def hdistance(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTComponent)
|
||||||
if self.is_hoverlap(obj):
|
if self.is_hoverlap(obj):
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
|
|
||||||
def hoverlap(self, obj):
|
def hoverlap(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTComponent)
|
||||||
if self.is_hoverlap(obj):
|
if self.is_hoverlap(obj):
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def is_voverlap(self, obj):
|
def is_voverlap(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTComponent)
|
||||||
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
return obj.y0 <= self.y1 and self.y0 <= obj.y1
|
||||||
|
|
||||||
def vdistance(self, obj):
|
def vdistance(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTComponent)
|
||||||
if self.is_voverlap(obj):
|
if self.is_voverlap(obj):
|
||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
|
|
||||||
def voverlap(self, obj):
|
def voverlap(self, obj):
|
||||||
assert isinstance(obj, LTItem)
|
assert isinstance(obj, LTComponent)
|
||||||
if self.is_voverlap(obj):
|
if self.is_voverlap(obj):
|
||||||
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def analyze(self, laparams):
|
|
||||||
"""Perform the layout analysis."""
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## LTCurve
|
## LTCurve
|
||||||
##
|
##
|
||||||
class LTCurve(LTItem):
|
class LTCurve(LTComponent):
|
||||||
|
|
||||||
def __init__(self, linewidth, pts):
|
def __init__(self, linewidth, pts):
|
||||||
|
LTComponent.__init__(self, get_bound(pts))
|
||||||
self.pts = pts
|
self.pts = pts
|
||||||
self.linewidth = linewidth
|
self.linewidth = linewidth
|
||||||
LTItem.__init__(self, get_bound(pts))
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_pts(self):
|
def get_pts(self):
|
||||||
|
@ -130,10 +148,10 @@ class LTRect(LTCurve):
|
||||||
|
|
||||||
## LTImage
|
## LTImage
|
||||||
##
|
##
|
||||||
class LTImage(LTItem):
|
class LTImage(LTComponent):
|
||||||
|
|
||||||
def __init__(self, name, stream, bbox):
|
def __init__(self, name, stream, bbox):
|
||||||
LTItem.__init__(self, bbox)
|
LTComponent.__init__(self, bbox)
|
||||||
self.name = name
|
self.name = name
|
||||||
self.stream = stream
|
self.stream = stream
|
||||||
self.srcsize = (stream.get_any(('W', 'Width')),
|
self.srcsize = (stream.get_any(('W', 'Width')),
|
||||||
|
@ -146,41 +164,30 @@ class LTImage(LTItem):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
(w,h) = self.srcsize
|
return ('<%s(%s) %s %r>' %
|
||||||
return ('<%s(%s) %s %dx%d>' %
|
|
||||||
(self.__class__.__name__, self.name,
|
(self.__class__.__name__, self.name,
|
||||||
bbox2str(self.bbox), w, h))
|
bbox2str(self.bbox), self.srcsize))
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
|
||||||
##
|
|
||||||
class LTText(object):
|
|
||||||
|
|
||||||
def __init__(self, text):
|
|
||||||
self.text = text
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<%s %r>' %
|
|
||||||
(self.__class__.__name__, self.text))
|
|
||||||
|
|
||||||
|
|
||||||
## LTAnon
|
## LTAnon
|
||||||
##
|
##
|
||||||
class LTAnon(LTText):
|
class LTAnon(LTItem, LTText):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def __init__(self, text):
|
||||||
|
self._text = text
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def get_text(self):
|
||||||
|
return self._text
|
||||||
|
|
||||||
|
|
||||||
## LTChar
|
## LTChar
|
||||||
##
|
##
|
||||||
class LTChar(LTItem, LTText):
|
class LTChar(LTComponent, LTText):
|
||||||
|
|
||||||
debug = 0
|
|
||||||
|
|
||||||
def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp):
|
def __init__(self, matrix, font, fontsize, scaling, rise, text, textwidth, textdisp):
|
||||||
LTText.__init__(self, text)
|
LTText.__init__(self)
|
||||||
|
self._text = text
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.fontname = font.fontname
|
self.fontname = font.fontname
|
||||||
self.adv = textwidth * fontsize * scaling
|
self.adv = textwidth * fontsize * scaling
|
||||||
|
@ -213,7 +220,7 @@ class LTChar(LTItem, LTText):
|
||||||
(x0,x1) = (x1,x0)
|
(x0,x1) = (x1,x0)
|
||||||
if y1 < y0:
|
if y1 < y0:
|
||||||
(y0,y1) = (y1,y0)
|
(y0,y1) = (y1,y0)
|
||||||
LTItem.__init__(self, (x0,y0,x1,y1))
|
LTComponent.__init__(self, (x0,y0,x1,y1))
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
self.size = self.width
|
self.size = self.width
|
||||||
else:
|
else:
|
||||||
|
@ -221,13 +228,13 @@ class LTChar(LTItem, LTText):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
if self.debug:
|
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
||||||
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
(self.__class__.__name__, bbox2str(self.bbox),
|
matrix2str(self.matrix), self.fontname, self.adv,
|
||||||
matrix2str(self.matrix), self.fontname,
|
self.get_text()))
|
||||||
self.adv, self.text))
|
|
||||||
else:
|
def get_text(self):
|
||||||
return '<char %r>' % self.text
|
return self._text
|
||||||
|
|
||||||
def is_compatible(self, obj):
|
def is_compatible(self, obj):
|
||||||
"""Returns True if two characters can coexist in the same line."""
|
"""Returns True if two characters can coexist in the same line."""
|
||||||
|
@ -236,10 +243,10 @@ class LTChar(LTItem, LTText):
|
||||||
|
|
||||||
## LTContainer
|
## LTContainer
|
||||||
##
|
##
|
||||||
class LTContainer(LTItem):
|
class LTContainer(LTComponent):
|
||||||
|
|
||||||
def __init__(self, bbox):
|
def __init__(self, bbox):
|
||||||
LTItem.__init__(self, bbox)
|
LTComponent.__init__(self, bbox)
|
||||||
self._objs = []
|
self._objs = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -279,23 +286,36 @@ class LTExpandableContainer(LTContainer):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTTextContainer
|
||||||
|
##
|
||||||
|
class LTTextContainer(LTExpandableContainer, LTText):
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
LTText.__init__(self)
|
||||||
|
LTExpandableContainer.__init__(self)
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_text(self):
|
||||||
|
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
|
||||||
|
|
||||||
|
|
||||||
## LTTextLine
|
## LTTextLine
|
||||||
##
|
##
|
||||||
class LTTextLine(LTExpandableContainer, LTText):
|
class LTTextLine(LTTextContainer):
|
||||||
|
|
||||||
def __init__(self, word_margin):
|
def __init__(self, word_margin):
|
||||||
LTExpandableContainer.__init__(self)
|
LTTextContainer.__init__(self)
|
||||||
self.word_margin = word_margin
|
self.word_margin = word_margin
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<%s %s %r>' %
|
return ('<%s %s %r>' %
|
||||||
(self.__class__.__name__, bbox2str(self.bbox), self.text))
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
|
self.get_text()))
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
LTExpandableContainer.analyze(self, laparams)
|
LTTextContainer.analyze(self, laparams)
|
||||||
LTContainer.add(self, LTAnon('\n'))
|
LTContainer.add(self, LTAnon('\n'))
|
||||||
self.text = ''.join( obj.text for obj in self if isinstance(obj, LTText) )
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
|
@ -349,23 +369,17 @@ class LTTextLineVertical(LTTextLine):
|
||||||
## A set of text objects that are grouped within
|
## A set of text objects that are grouped within
|
||||||
## a certain rectangular area.
|
## a certain rectangular area.
|
||||||
##
|
##
|
||||||
class LTTextBox(LTExpandableContainer):
|
class LTTextBox(LTTextContainer):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
LTExpandableContainer.__init__(self)
|
LTTextContainer.__init__(self)
|
||||||
self.index = None
|
self.index = None
|
||||||
self.text = None
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<%s(%s) %s %r>' %
|
return ('<%s(%s) %s %r>' %
|
||||||
(self.__class__.__name__, self.index,
|
(self.__class__.__name__,
|
||||||
bbox2str(self.bbox), self.text))
|
self.index, bbox2str(self.bbox), self.get_text()))
|
||||||
|
|
||||||
def analyze(self, laparams):
|
|
||||||
LTExpandableContainer.analyze(self, laparams)
|
|
||||||
self.text = ''.join( obj.text for obj in self if isinstance(obj, LTTextLine) )
|
|
||||||
return
|
|
||||||
|
|
||||||
class LTTextBoxHorizontal(LTTextBox):
|
class LTTextBoxHorizontal(LTTextBox):
|
||||||
|
|
||||||
|
@ -390,10 +404,10 @@ class LTTextBoxVertical(LTTextBox):
|
||||||
|
|
||||||
## LTTextGroup
|
## LTTextGroup
|
||||||
##
|
##
|
||||||
class LTTextGroup(LTExpandableContainer):
|
class LTTextGroup(LTTextContainer):
|
||||||
|
|
||||||
def __init__(self, objs):
|
def __init__(self, objs):
|
||||||
LTExpandableContainer.__init__(self)
|
LTTextContainer.__init__(self)
|
||||||
self.extend(objs)
|
self.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue