writing mode detection

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@196 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-03-25 11:38:47 +00:00
parent 510f6ae5f9
commit 2e5b92c18a
10 changed files with 7780 additions and 25476 deletions

View File

@ -43,12 +43,12 @@ def is_uniq(objs):
class LAParams(object):
def __init__(self,
direction=None,
writing_mode='lr-tb',
line_overlap=0.5,
char_margin=3.0,
line_margin=0.5,
word_margin=0.1):
self.direction = direction
self.writing_mode = writing_mode
self.line_overlap = line_overlap
self.char_margin = char_margin
self.line_margin = line_margin
@ -56,8 +56,8 @@ class LAParams(object):
return
def __repr__(self):
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
(self.direction, self.char_margin, self.line_margin, self.word_margin))
return ('<LAParams: writing_mode=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
(self.writing_mode, self.char_margin, self.line_margin, self.word_margin))
## LayoutItem
@ -149,7 +149,7 @@ class LayoutContainer(LayoutItem):
self.objs.extend(container.objs)
return
# fixate(): determines its boundery and writing direction.
# fixate(): determines its boundery.
def fixate(self):
if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
@ -413,7 +413,7 @@ class LTTextGroup(LayoutContainer):
LayoutContainer.fixate(self)
return
class LTTextGroupHorizontal(LTTextGroup):
class LTTextGroupLRTB(LTTextGroup):
def __init__(self, objs):
LTTextGroup.__init__(self, objs)
@ -421,7 +421,7 @@ class LTTextGroupHorizontal(LTTextGroup):
self.objs = csort(self.objs, key=lambda obj: obj.x0-obj.y1)
return
class LTTextGroupVertical(LTTextGroup):
class LTTextGroupTBRL(LTTextGroup):
def __init__(self, objs):
LTTextGroup.__init__(self, objs)
@ -470,9 +470,47 @@ class Plane(object):
return sorted(xobjs, key=lambda obj: self.idxs[obj])
## guess_wmode
##
def guess_wmode(objs):
"""Guess the writing mode by looking at the order of text objects."""
xy = tb = lr = 0
obj0 = None
for obj1 in objs:
if obj0 is not None:
dx = obj1.x0+obj1.x1-(obj0.x0+obj0.x1)
dy = obj1.y0+obj1.y1-(obj0.y0+obj0.y1)
if abs(dy) < abs(dx):
xy += 1
else:
xy -= 1
if 0 < dx:
lr += 1
else:
lr -= 1
if dy < 0:
tb += 1
else:
tb -= 1
obj0 = obj1
if 0 < lr:
lr = 'lr'
else:
lr = 'rl'
if 0 < tb:
tb = 'tb'
else:
tb = 'bt'
if 0 < xy:
return lr+'-'+tb
else:
return tb+'-'+lr
## group_lines
##
def group_lines(groupfunc, objs, *args):
"""Group LTTextLine objects to form a LTTextBox."""
plane = Plane(objs)
groups = {}
for obj in objs:
@ -538,12 +576,15 @@ class LTPage(LayoutContainer):
LayoutContainer.fixate(self)
(textobjs, otherobjs) = self.get_textobjs()
if not laparams or not textobjs: return
if laparams.direction == 'V':
if laparams.writing_mode not in ('lr-tb', 'tb-rl'):
laparams.writing_mode = guess_wmode(textobjs)
if (laparams.writing_mode.startswith('tb-') or
laparams.writing_mode.startswith('bt-')):
textboxes = self.build_textbox_vertical(textobjs, laparams)
top = self.group_textbox_vertical(textboxes, laparams)
top = self.group_textbox_tb_rl(textboxes, laparams)
else:
textboxes = self.build_textbox_horizontal(textobjs, laparams)
top = self.group_textbox_horizontal(textboxes, laparams)
top = self.group_textbox_lr_tb(textboxes, laparams)
def assign_index(obj, i):
if isinstance(obj, LTTextBox):
obj.index = i
@ -627,16 +668,16 @@ class LTPage(LayoutContainer):
lines.append(LTTextLineVertical(line, laparams.word_margin))
return group_lines(LTTextBoxVertical, lines, laparams.line_margin)
def group_textbox_horizontal(self, boxes, laparams):
def group_textbox_lr_tb(self, boxes, laparams):
def dist(obj1, obj2):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
obj1.width*obj1.height - obj2.width*obj2.height)
return group_boxes(LTTextGroupHorizontal, boxes, dist)
return group_boxes(LTTextGroupLRTB, boxes, dist)
def group_textbox_vertical(self, boxes, laparams):
def group_textbox_tb_rl(self, boxes, laparams):
def dist(obj1, obj2):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
obj1.width*obj1.height - obj2.width*obj2.height)
return group_boxes(LTTextGroupVertical, boxes, dist)
return group_boxes(LTTextGroupTBRL, boxes, dist)

View File

@ -455,7 +455,8 @@ class PDFDocument(object):
if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid]
else:
parser = PDFObjStrmParser(stream.get_data(), self)
parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = []
try:
while 1:
@ -732,20 +733,18 @@ class PDFParser(PSStackParser):
return xrefs
## PDFObjStrmParser
## PDFStreamParser
##
class PDFObjStrmParser(PDFParser):
class PDFStreamParser(PDFParser):
def __init__(self, data, doc):
PSStackParser.__init__(self, StringIO(data))
self.doc = doc
def __init__(self, data):
PDFParser.__init__(self, StringIO(data))
return
def flush(self):
self.add_results(*self.popall())
return
KEYWORD_R = KWD('R')
def do_keyword(self, pos, token):
if token is self.KEYWORD_R:
# reference to indirect object

View File

@ -4,7 +4,7 @@ RM=rm -f
#CMP=cmp
CMP=:
PYTHON=python
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -Dauto -p1
HTMLS= \
simple1.html \

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -12,7 +12,7 @@ def main(argv):
import getopt
def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-n] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
return 100
try:
@ -42,7 +42,7 @@ def main(argv):
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v
elif k == '-D': laparams.writing_mode = v
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)