some wordings and documentations

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@229 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-06-19 03:56:50 +00:00
parent a0dd46bd8e
commit f5aff374fc
3 changed files with 23 additions and 5 deletions

View File

@ -1,7 +1,7 @@
README.txt for cmaprsrc README.txt for cmaprsrc
This directory contains Adobe CMap resources. CMaps are required This directory contains Adobe CMap resources. CMaps are required
to decode text data written in Chinese, Japanese or Korean languages. to decode text data written in CJK (Chinese, Japanese, Korean) languages.
CMap resources are now available freely from Adobe web site: CMap resources are now available freely from Adobe web site:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources http://opensource.adobe.com/wiki/display/cmap/CMap+Resources

View File

@ -559,15 +559,21 @@ class LTAnalyzer(LTContainer):
def analyze(self, laparams): def analyze(self, laparams):
"""Perform the layout analysis.""" """Perform the layout analysis."""
(textobjs, otherobjs) = self.get_textobjs() (textobjs, otherobjs) = self.get_textobjs()
# textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page.
if not laparams or not textobjs: return if not laparams or not textobjs: return
if laparams.writing_mode not in ('lr-tb', 'tb-rl'): if laparams.writing_mode not in ('lr-tb', 'tb-rl'):
laparams.writing_mode = guess_wmode(textobjs) laparams.writing_mode = guess_wmode(textobjs)
if (laparams.writing_mode.startswith('tb-') or if (laparams.writing_mode.startswith('tb-') or
laparams.writing_mode.startswith('bt-')): laparams.writing_mode.startswith('bt-')):
# assemble them into vertical rows of text.
textboxes = self.build_textbox_vertical(textobjs, laparams) textboxes = self.build_textbox_vertical(textobjs, laparams)
# turn them into a tree.
top = self.group_textbox_tb_rl(textboxes, laparams) top = self.group_textbox_tb_rl(textboxes, laparams)
else: else:
# assemble them into horizontal rows of text.
textboxes = self.build_textbox_horizontal(textobjs, laparams) textboxes = self.build_textbox_horizontal(textobjs, laparams)
# turn them into a tree.
top = self.group_textbox_lr_tb(textboxes, laparams) top = self.group_textbox_lr_tb(textboxes, laparams)
def assign_index(obj, i): def assign_index(obj, i):
if isinstance(obj, LTTextBox): if isinstance(obj, LTTextBox):
@ -635,7 +641,7 @@ class LTAnalyzer(LTContainer):
# | | # | |
# +------+ # +------+
# #
# |<--->| # |<-->|
# (line_overlap) # (line_overlap)
return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and return ((min(obj1.width, obj2.width) * laparams.line_overlap < obj1.hoverlap(obj2)) and
(obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin)) (obj1.vdistance(obj2) < min(obj1.height, obj2.height) * laparams.char_margin))
@ -656,6 +662,17 @@ class LTAnalyzer(LTContainer):
def group_textbox_lr_tb(self, boxes, laparams): def group_textbox_lr_tb(self, boxes, laparams):
def dist(obj1, obj2): def dist(obj1, obj2):
"""A distance function between two TextBoxes.
Consider the bounding rectangle for obj1 and obj2.
Return its area less the areas of obj1 and obj2,
shown as 'www' below. This value may be negative.
+------+..........+
| obj1 |wwwwwwwwww:
+------+www+------+
:wwwwwwwwww| obj2 |
+..........+------+
"""
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
(obj1.width*obj1.height + obj2.width*obj2.height)) (obj1.width*obj1.height + obj2.width*obj2.height))

View File

@ -1,7 +1,8 @@
This directory contains sample PDF files. This directory contains sample PDF files.
The files in the nonfree/ subdirectory can be distributed freely These files (including ones in the nonfree/ subdirectory) can be
but do not come with explicit licensing terms or source files. distributed freely but do not come with explicit licensing
terms or source files.
Here are the credits of the original files: Here are the credits of the original files:
@ -16,7 +17,7 @@ simple2.pdf:
jo.pdf: jo.pdf:
Kenji Miyazawa (1896-1933, copyright expired) Kenji Miyazawa (1896-1933, copyright expired)
Preface of "Haru to Shura" Preface of "Haru to Shura"
(File generated by LaTeX and dvi2pdfm) (File generated from jo.tex by LaTeX and dvi2pdfm)
-- --
nonfree/dmca.pdf: nonfree/dmca.pdf: