-A (all_texts) option added for layout analysis
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@205 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
836eb37b47
commit
e77a6ba997
|
@ -32,7 +32,8 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
assert not self.stack
|
assert not self.stack
|
||||||
assert isinstance(self.cur_item, LTPage)
|
assert isinstance(self.cur_item, LTPage)
|
||||||
self.cur_item.fixate(self.laparams)
|
self.cur_item.fixate()
|
||||||
|
self.cur_item.analyze(self.laparams)
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
return self.cur_item
|
return self.cur_item
|
||||||
|
|
||||||
|
@ -43,7 +44,9 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
|
|
||||||
def end_figure(self, _):
|
def end_figure(self, _):
|
||||||
fig = self.cur_item
|
fig = self.cur_item
|
||||||
|
assert isinstance(self.cur_item, LTFigure)
|
||||||
self.cur_item.fixate()
|
self.cur_item.fixate()
|
||||||
|
self.cur_item.analyze(self.laparams)
|
||||||
self.cur_item = self.stack.pop()
|
self.cur_item = self.stack.pop()
|
||||||
self.cur_item.add(fig)
|
self.cur_item.add(fig)
|
||||||
return
|
return
|
||||||
|
@ -226,14 +229,13 @@ class HTMLConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
name = ''
|
|
||||||
if self.outdir:
|
if self.outdir:
|
||||||
name = self.write_image(item)
|
name = self.write_image(item)
|
||||||
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||||
'width="%d" height="%d" />\n' %
|
'width="%d" height="%d" />\n' %
|
||||||
(enc(name),
|
(enc(name),
|
||||||
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
||||||
item.width*self.scale, item.height*self.scale))
|
item.width*self.scale, item.height*self.scale))
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
|
@ -311,11 +313,13 @@ class XMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
name = ''
|
|
||||||
if self.outdir:
|
if self.outdir:
|
||||||
name = self.write_image(item)
|
name = self.write_image(item)
|
||||||
self.outfp.write('<image name="%s" width="%d" height="%d" />\n' %
|
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
|
||||||
(enc(name), item.width, item.height))
|
(enc(name), item.width, item.height))
|
||||||
|
else:
|
||||||
|
self.outfp.write('<image width="%d" height="%d" />\n' %
|
||||||
|
(item.width, item.height))
|
||||||
else:
|
else:
|
||||||
assert 0, item
|
assert 0, item
|
||||||
return
|
return
|
||||||
|
|
|
@ -168,7 +168,7 @@ class LTImage(LTItem):
|
||||||
self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
|
self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
|
||||||
self.colorspace = stream.get_any(('CS', 'ColorSpace'))
|
self.colorspace = stream.get_any(('CS', 'ColorSpace'))
|
||||||
if not isinstance(self.colorspace, list):
|
if not isinstance(self.colorspace, list):
|
||||||
self.colorspace = [colorspace]
|
self.colorspace = [self.colorspace]
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -550,41 +550,12 @@ def group_boxes(groupfunc, objs, distfunc, debug=0):
|
||||||
return objs.pop()
|
return objs.pop()
|
||||||
|
|
||||||
|
|
||||||
## LTFigure
|
## LTAnalyzer
|
||||||
##
|
##
|
||||||
class LTFigure(LTContainer):
|
class LTAnalyzer(LTContainer):
|
||||||
|
|
||||||
def __init__(self, name, bbox, matrix):
|
def analyze(self, laparams):
|
||||||
(x,y,w,h) = bbox
|
|
||||||
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
|
||||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
|
||||||
self.name = name
|
|
||||||
self.matrix = matrix
|
|
||||||
LTContainer.__init__(self, bbox)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<figure %r bbox=%s matrix=%s>' %
|
|
||||||
(self.name, bbox2str(self.bbox), matrix2str(self.matrix)))
|
|
||||||
|
|
||||||
|
|
||||||
## LTPage
|
|
||||||
##
|
|
||||||
class LTPage(LTContainer):
|
|
||||||
|
|
||||||
def __init__(self, pageid, bbox, rotate=0):
|
|
||||||
LTContainer.__init__(self, bbox)
|
|
||||||
self.pageid = pageid
|
|
||||||
self.rotate = rotate
|
|
||||||
self.layout = None
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))
|
|
||||||
|
|
||||||
def fixate(self, laparams):
|
|
||||||
"""Perform the layout analysis."""
|
"""Perform the layout analysis."""
|
||||||
LTContainer.fixate(self)
|
|
||||||
(textobjs, otherobjs) = self.get_textobjs()
|
(textobjs, otherobjs) = self.get_textobjs()
|
||||||
if not laparams or not textobjs: return
|
if not laparams or not textobjs: return
|
||||||
if laparams.writing_mode not in ('lr-tb', 'tb-rl'):
|
if laparams.writing_mode not in ('lr-tb', 'tb-rl'):
|
||||||
|
@ -694,3 +665,41 @@ class LTPage(LTContainer):
|
||||||
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
|
||||||
(obj1.width*obj1.height + obj2.width*obj2.height))
|
(obj1.width*obj1.height + obj2.width*obj2.height))
|
||||||
return group_boxes(LTTextGroupTBRL, boxes, dist)
|
return group_boxes(LTTextGroupTBRL, boxes, dist)
|
||||||
|
|
||||||
|
|
||||||
|
## LTFigure
|
||||||
|
##
|
||||||
|
class LTFigure(LTAnalyzer):
|
||||||
|
|
||||||
|
def __init__(self, name, bbox, matrix):
|
||||||
|
(x,y,w,h) = bbox
|
||||||
|
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
||||||
|
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||||
|
LTAnalyzer.__init__(self, bbox)
|
||||||
|
self.name = name
|
||||||
|
self.matrix = matrix
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<figure %r bbox=%s matrix=%s>' %
|
||||||
|
(self.name, bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||||
|
|
||||||
|
def analyze(self, laparams):
|
||||||
|
if laparams.all_texts:
|
||||||
|
LTAnalyzer.analyze(self, laparams)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTPage
|
||||||
|
##
|
||||||
|
class LTPage(LTAnalyzer):
|
||||||
|
|
||||||
|
def __init__(self, pageid, bbox, rotate=0):
|
||||||
|
LTAnalyzer.__init__(self, bbox)
|
||||||
|
self.pageid = pageid
|
||||||
|
self.rotate = rotate
|
||||||
|
self.layout = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))
|
||||||
|
|
|
@ -12,11 +12,11 @@ def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
||||||
'[-n] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
|
'[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||||
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nD:M:L:W:O:t:c:s:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -42,6 +42,7 @@ def main(argv):
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
elif k == '-n': laparams = None
|
elif k == '-n': laparams = None
|
||||||
|
elif k == '-A': laparams.all_texts = True
|
||||||
elif k == '-D': laparams.writing_mode = v
|
elif k == '-D': laparams.writing_mode = v
|
||||||
elif k == '-M': laparams.char_margin = float(v)
|
elif k == '-M': laparams.char_margin = float(v)
|
||||||
elif k == '-L': laparams.line_margin = float(v)
|
elif k == '-L': laparams.line_margin = float(v)
|
||||||
|
|
Loading…
Reference in New Issue