From a24c452ba2a4a328f775a20f904e23fb6f5f747e Mon Sep 17 00:00:00 2001
From: Yusuke Shinyama
Date: Sun, 26 Dec 2010 17:26:39 +0900
Subject: [PATCH] boxes_flow patch by Daniel Gerber

Add a boxes_flow layout parameter (default 0) to LAParams and use it to
weight the horizontal and vertical terms of the sort keys in
LTTextGroupLRTB.analyze and LTTextGroupTBRL.analyze. With boxes_flow=0
both keys reduce to the previous expressions.

pdf2txt.py accepts a new -F option that sets laparams.boxes_flow.

---
NOTE(review): indentation and blank context lines in this copy were
reconstructed from a whitespace-collapsed source; confirm the context
lines against the target tree before applying.
The usage text gains the space that was missing between
'[-F boxes_flow]' and '[-Y layout_mode]'.

 pdfminer/layout.py |   10 ++++++++--
 tools/pdf2txt.py   |    7 ++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 8eefa22..84345ea 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -28,12 +28,14 @@ class LAParams(object):
                  char_margin=2.0,
                  line_margin=0.5,
                  word_margin=0.1,
+                 boxes_flow=0,
                  all_texts=False):
         self.writing_mode = writing_mode
         self.line_overlap = line_overlap
         self.char_margin = char_margin
         self.line_margin = line_margin
         self.word_margin = word_margin
+        self.boxes_flow = boxes_flow
         self.all_texts = all_texts
         return
 
@@ -399,14 +401,18 @@ class LTTextGroupLRTB(LTTextGroup):
 
     def analyze(self, laparams):
         # reorder the objects from top-left to bottom-right.
-        self._objs = csort(self._objs, key=lambda obj: obj.x0+obj.x1-(obj.y0+obj.y1))
+        self._objs = csort(self._objs, key=lambda obj:
+                           (1-laparams.boxes_flow)*(obj.x0+obj.x1) -
+                           (1+laparams.boxes_flow)*(obj.y0+obj.y1))
         return LTTextGroup.analyze(self, laparams)
 
 class LTTextGroupTBRL(LTTextGroup):
 
     def analyze(self, laparams):
         # reorder the objects from top-right to bottom-left.
-        self._objs = csort(self._objs, key=lambda obj: -(obj.x0+obj.x1)-(obj.y0+obj.y1))
+        self._objs = csort(self._objs, key=lambda obj:
+                           -(1+laparams.boxes_flow)*(obj.x0+obj.x1)
+                           -(1-laparams.boxes_flow)*(obj.y0+obj.y1))
         return LTTextGroup.analyze(self, laparams)
 
 
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 1704ffa..e997454 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -12,11 +12,11 @@ def main(argv):
     import getopt
     def usage():
         print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
-               '[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-Y layout_mode] '
-               '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
+               '[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
+               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
         return 100
     try:
-        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:Y:O:t:c:s:')
+        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:F:Y:O:t:c:s:')
     except getopt.GetoptError:
         return usage()
     if not args: return usage()
@@ -47,6 +47,7 @@ def main(argv):
         elif k == '-M': laparams.char_margin = float(v)
         elif k == '-L': laparams.line_margin = float(v)
         elif k == '-W': laparams.word_margin = float(v)
+        elif k == '-F': laparams.boxes_flow = float(v)
         elif k == '-Y': layoutmode = v
         elif k == '-O': outdir = v
         elif k == '-t': outtype = v