diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 598df47..f8ed681 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -32,7 +32,8 @@ class PDFPageAggregator(PDFTextDevice): def end_page(self, _): assert not self.stack assert isinstance(self.cur_item, LTPage) - self.cur_item.fixate(self.laparams) + self.cur_item.fixate() + self.cur_item.analyze(self.laparams) self.pageno += 1 return self.cur_item @@ -43,7 +44,9 @@ class PDFPageAggregator(PDFTextDevice): def end_figure(self, _): fig = self.cur_item + assert isinstance(self.cur_item, LTFigure) self.cur_item.fixate() + self.cur_item.analyze(self.laparams) self.cur_item = self.stack.pop() self.cur_item.add(fig) return @@ -226,14 +229,13 @@ class HTMLConverter(PDFConverter): for child in item: render(child) elif isinstance(item, LTImage): - name = '' if self.outdir: name = self.write_image(item) - self.outfp.write('\n' % - (enc(name), - item.x0*self.scale, (self.yoffset-item.y1)*self.scale, - item.width*self.scale, item.height*self.scale)) + self.outfp.write('\n' % + (enc(name), + item.x0*self.scale, (self.yoffset-item.y1)*self.scale, + item.width*self.scale, item.height*self.scale)) return page = PDFConverter.end_page(self, page) render(page) @@ -311,11 +313,13 @@ class XMLConverter(PDFConverter): elif isinstance(item, LTText): self.outfp.write('%s\n' % item.text) elif isinstance(item, LTImage): - name = '' if self.outdir: name = self.write_image(item) - self.outfp.write('\n' % - (enc(name), item.width, item.height)) + self.outfp.write('\n' % + (enc(name), item.width, item.height)) + else: + self.outfp.write('\n' % + (item.width, item.height)) else: assert 0, item return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 8c4ab91..6c33978 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -168,7 +168,7 @@ class LTImage(LTItem): self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1) self.colorspace = stream.get_any(('CS', 'ColorSpace')) if not isinstance(self.colorspace, list): - self.colorspace = [colorspace] + self.colorspace = [self.colorspace] return def __repr__(self): @@ -550,41 +550,12 @@ def group_boxes(groupfunc, objs, distfunc, debug=0): return objs.pop() -## LTFigure +## LTAnalyzer ## -class LTFigure(LTContainer): +class LTAnalyzer(LTContainer): - def __init__(self, name, bbox, matrix): - (x,y,w,h) = bbox - bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) - for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) - self.name = name - self.matrix = matrix - LTContainer.__init__(self, bbox) - return - - def __repr__(self): - return ('
' % - (self.name, bbox2str(self.bbox), matrix2str(self.matrix))) - - -## LTPage -## -class LTPage(LTContainer): - - def __init__(self, pageid, bbox, rotate=0): - LTContainer.__init__(self, bbox) - self.pageid = pageid - self.rotate = rotate - self.layout = None - return - - def __repr__(self): - return ('' % (self.pageid, bbox2str(self.bbox), self.rotate)) - - def fixate(self, laparams): + def analyze(self, laparams): """Perform the layout analysis.""" - LTContainer.fixate(self) (textobjs, otherobjs) = self.get_textobjs() if not laparams or not textobjs: return if laparams.writing_mode not in ('lr-tb', 'tb-rl'): @@ -694,3 +665,41 @@ class LTPage(LTContainer): (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (obj1.width*obj1.height + obj2.width*obj2.height)) return group_boxes(LTTextGroupTBRL, boxes, dist) + + +## LTFigure +## +class LTFigure(LTAnalyzer): + + def __init__(self, name, bbox, matrix): + (x,y,w,h) = bbox + bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) + for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) + LTAnalyzer.__init__(self, bbox) + self.name = name + self.matrix = matrix + return + + def __repr__(self): + return ('
' % + (self.name, bbox2str(self.bbox), matrix2str(self.matrix))) + + def analyze(self, laparams): + if laparams.all_texts: + LTAnalyzer.analyze(self, laparams) + return + + +## LTPage +## +class LTPage(LTAnalyzer): + + def __init__(self, pageid, bbox, rotate=0): + LTAnalyzer.__init__(self, bbox) + self.pageid = pageid + self.rotate = rotate + self.layout = None + return + + def __repr__(self): + return ('' % (self.pageid, bbox2str(self.bbox), self.rotate)) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index ca7506d..13b4758 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -12,11 +12,11 @@ def main(argv): import getopt def usage(): print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] ' - '[-n] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] ' + '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nD:M:L:W:O:t:c:s:') + (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:') except getopt.GetoptError: return usage() if not args: return usage() @@ -42,6 +42,7 @@ def main(argv): elif k == '-P': password = v elif k == '-o': outfile = v elif k == '-n': laparams = None + elif k == '-A': laparams.all_texts = True elif k == '-D': laparams.writing_mode = v elif k == '-M': laparams.char_margin = float(v) elif k == '-L': laparams.line_margin = float(v)