From f5aff374fcd22d50a68dfc2d44ce5f8acfde91ea Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 19 Jun 2010 03:56:50 +0000 Subject: [PATCH] some wordings and documentations git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@229 1aa58f4a-7d42-0410-adbc-911cccaed67c --- cmaprsrc/README.txt | 2 +- pdfminer/layout.py | 19 ++++++++++++++++++- samples/README | 7 ++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/cmaprsrc/README.txt b/cmaprsrc/README.txt index 909ff33..3cfb23e 100644 --- a/cmaprsrc/README.txt +++ b/cmaprsrc/README.txt @@ -1,7 +1,7 @@ README.txt for cmaprsrc This directory contains Adobe CMap resources. CMaps are required -to decode text data written in Chinese, Japanese or Korean language. +to decode text data written in CJK (Chinese, Japanese, Korean) languages. CMap resources are now available freely from Adobe web site: http://opensource.adobe.com/wiki/display/cmap/CMap+Resources diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 1b756bb..983e907 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -559,15 +559,21 @@ class LTAnalyzer(LTContainer): def analyze(self, laparams): """Perform the layout analysis.""" (textobjs, otherobjs) = self.get_textobjs() + # textobjs is a list of LTChar objects, i.e. + # it has all the individual characters in the page. if not laparams or not textobjs: return if laparams.writing_mode not in ('lr-tb', 'tb-rl'): laparams.writing_mode = guess_wmode(textobjs) if (laparams.writing_mode.startswith('tb-') or laparams.writing_mode.startswith('bt-')): + # assemble them into vertical rows of text. textboxes = self.build_textbox_vertical(textobjs, laparams) + # turn them into a tree. top = self.group_textbox_tb_rl(textboxes, laparams) else: + # assemble them into horizontal rows of text. textboxes = self.build_textbox_horizontal(textobjs, laparams) + # turn them into a tree. 
top = self.group_textbox_lr_tb(textboxes, laparams) def assign_index(obj, i): if isinstance(obj, LTTextBox): @@ -635,7 +641,7 @@ class LTAnalyzer(LTContainer): # | | # +------+ # - # |<--->| + # |<-->| # (line_overlap) return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and (obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin)) @@ -656,6 +662,17 @@ class LTAnalyzer(LTContainer): def group_textbox_lr_tb(self, boxes, laparams): def dist(obj1, obj2): + """A distance function between two TextBoxes. + + Consider the bounding rectangle for obj1 and obj2. + Return its area less the areas of obj1 and obj2, + shown as 'www' below. This value may be negative. + +------+..........+ + | obj1 |wwwwwwwwww: + +------+www+------+ + :wwwwwwwwww| obj2 | + +..........+------+ + """ return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (obj1.width*obj1.height + obj2.width*obj2.height)) diff --git a/samples/README b/samples/README index e9bae3e..bece0db 100644 --- a/samples/README +++ b/samples/README @@ -1,7 +1,8 @@ This directory contains sample PDF files. -The files in nonfree/ subdirectory can be distributed freely -but does not come with explicit licensing terms or source files. +These files (including ones in nonfree/ subdirectory) can be +distributed freely but do not come with explicit licensing +terms or source files. Here are the credits of the original files: @@ -16,7 +17,7 @@ simple2.pdf: jo.pdf: Kenji Miyazawa (1896-1933, copyright expired) Preface of "Haru to Shura" - (File generated by LaTeX and dvi2pdfm) + (File generated from jo.tex by LaTeX and dvi2pdfm) -- nonfree/dmca.pdf: