diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 84345ea..08bfc9e 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -23,19 +23,19 @@ def csort(objs, key): class LAParams(object): def __init__(self, - writing_mode='lr-tb', line_overlap=0.5, char_margin=2.0, line_margin=0.5, word_margin=0.1, boxes_flow=0, + detect_vertical=False, all_texts=False): - self.writing_mode = writing_mode self.line_overlap = line_overlap self.char_margin = char_margin self.line_margin = line_margin self.word_margin = word_margin self.boxes_flow = boxes_flow + self.detect_vertical = detect_vertical self.all_texts = all_texts return @@ -480,7 +480,8 @@ class LTLayoutContainer(LTContainer): # |<--->| # (char_margin) k |= 1 - if (obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and + if (laparams.detect_vertical and + obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin): # obj0 and obj1 is vertically aligned: diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 5f4bb31..2aa121d 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -12,11 +12,11 @@ def main(argv): import getopt def usage(): print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] ' - '[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] ' + '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] ' '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:F:Y:O:t:c:s:') + (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAVM:L:W:F:Y:O:t:c:s:') except getopt.GetoptError: return usage() if not args: return usage() @@ -44,6 +44,7 @@ def main(argv): elif k == '-o': outfile = v elif k == '-n': laparams = None elif k == '-A': laparams.all_texts = True + elif k == '-V': laparams.detect_vertical = True elif k == '-M': laparams.char_margin = float(v) elif k == '-L': laparams.line_margin = float(v) elif k == '-W': laparams.word_margin = float(v)