writing mode detection

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@196 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-03-25 11:38:47 +00:00
parent 510f6ae5f9
commit 2e5b92c18a
10 changed files with 7780 additions and 25476 deletions

View File

@ -43,12 +43,12 @@ def is_uniq(objs):
class LAParams(object): class LAParams(object):
def __init__(self, def __init__(self,
direction=None, writing_mode='lr-tb',
line_overlap=0.5, line_overlap=0.5,
char_margin=3.0, char_margin=3.0,
line_margin=0.5, line_margin=0.5,
word_margin=0.1): word_margin=0.1):
self.direction = direction self.writing_mode = writing_mode
self.line_overlap = line_overlap self.line_overlap = line_overlap
self.char_margin = char_margin self.char_margin = char_margin
self.line_margin = line_margin self.line_margin = line_margin
@ -56,8 +56,8 @@ class LAParams(object):
return return
def __repr__(self): def __repr__(self):
return ('<LAParams: direction=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' % return ('<LAParams: writing_mode=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f>' %
(self.direction, self.char_margin, self.line_margin, self.word_margin)) (self.writing_mode, self.char_margin, self.line_margin, self.word_margin))
## LayoutItem ## LayoutItem
@ -149,7 +149,7 @@ class LayoutContainer(LayoutItem):
self.objs.extend(container.objs) self.objs.extend(container.objs)
return return
# fixate(): determines its boundary and writing direction. # fixate(): determines its boundary.
def fixate(self): def fixate(self):
if not self.width and self.objs: if not self.width and self.objs:
(bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF)
@ -413,7 +413,7 @@ class LTTextGroup(LayoutContainer):
LayoutContainer.fixate(self) LayoutContainer.fixate(self)
return return
class LTTextGroupHorizontal(LTTextGroup): class LTTextGroupLRTB(LTTextGroup):
def __init__(self, objs): def __init__(self, objs):
LTTextGroup.__init__(self, objs) LTTextGroup.__init__(self, objs)
@ -421,7 +421,7 @@ class LTTextGroupHorizontal(LTTextGroup):
self.objs = csort(self.objs, key=lambda obj: obj.x0-obj.y1) self.objs = csort(self.objs, key=lambda obj: obj.x0-obj.y1)
return return
class LTTextGroupVertical(LTTextGroup): class LTTextGroupTBRL(LTTextGroup):
def __init__(self, objs): def __init__(self, objs):
LTTextGroup.__init__(self, objs) LTTextGroup.__init__(self, objs)
@ -470,9 +470,47 @@ class Plane(object):
return sorted(xobjs, key=lambda obj: self.idxs[obj]) return sorted(xobjs, key=lambda obj: self.idxs[obj])
## guess_wmode
##
def guess_wmode(objs):
    """Guess the writing mode by looking at the order of text objects.

    Each pair of consecutive objects casts up to three votes based on the
    movement between them.  The sums x0+x1 and y0+y1 are compared, i.e.
    twice the center coordinates, which preserves both the sign and the
    relative magnitude of the movement:

      * mostly horizontal (|dy| < |dx|) versus mostly vertical,
      * x increasing ('lr') versus x decreasing ('rl'),
      * y decreasing ('tb') versus y increasing ('bt').

    A zero delta (or |dx| == |dy|) carries no information and casts no
    vote.  Counting it as a negative vote would make plain horizontal
    text, whose characters share the same y within a line, come out as
    bottom-to-top.

    objs: an iterable of objects having x0, x1, y0 and y1 attributes.

    Returns a string '<primary>-<secondary>' such as 'lr-tb' or 'tb-rl',
    where the primary part is the direction within a line.  When the
    votes are tied (including empty or single-object input) the result
    falls back to 'lr', 'tb' and horizontal, i.e. 'lr-tb'.
    """
    xy = tb = lr = 0
    obj0 = None
    for obj1 in objs:
        if obj0 is not None:
            dx = obj1.x0+obj1.x1-(obj0.x0+obj0.x1)
            dy = obj1.y0+obj1.y1-(obj0.y0+obj0.y1)
            # Horizontal vs. vertical movement; an exact tie is skipped.
            if abs(dy) < abs(dx):
                xy += 1
            elif abs(dx) < abs(dy):
                xy -= 1
            # Left-to-right vs. right-to-left; no vote when x is unchanged.
            if 0 < dx:
                lr += 1
            elif dx < 0:
                lr -= 1
            # Top-to-bottom (y decreasing) vs. bottom-to-top; no vote
            # when y is unchanged.
            if dy < 0:
                tb += 1
            elif 0 < dy:
                tb -= 1
        obj0 = obj1
    # Ties resolve to the conventional 'lr' / 'tb' directions.
    if 0 <= lr:
        horizontal = 'lr'
    else:
        horizontal = 'rl'
    if 0 <= tb:
        vertical = 'tb'
    else:
        vertical = 'bt'
    if 0 <= xy:
        return horizontal+'-'+vertical
    return vertical+'-'+horizontal
## group_lines ## group_lines
## ##
def group_lines(groupfunc, objs, *args): def group_lines(groupfunc, objs, *args):
"""Group LTTextLine objects to form a LTTextBox."""
plane = Plane(objs) plane = Plane(objs)
groups = {} groups = {}
for obj in objs: for obj in objs:
@ -538,12 +576,15 @@ class LTPage(LayoutContainer):
LayoutContainer.fixate(self) LayoutContainer.fixate(self)
(textobjs, otherobjs) = self.get_textobjs() (textobjs, otherobjs) = self.get_textobjs()
if not laparams or not textobjs: return if not laparams or not textobjs: return
if laparams.direction == 'V': if laparams.writing_mode not in ('lr-tb', 'tb-rl'):
laparams.writing_mode = guess_wmode(textobjs)
if (laparams.writing_mode.startswith('tb-') or
laparams.writing_mode.startswith('bt-')):
textboxes = self.build_textbox_vertical(textobjs, laparams) textboxes = self.build_textbox_vertical(textobjs, laparams)
top = self.group_textbox_vertical(textboxes, laparams) top = self.group_textbox_tb_rl(textboxes, laparams)
else: else:
textboxes = self.build_textbox_horizontal(textobjs, laparams) textboxes = self.build_textbox_horizontal(textobjs, laparams)
top = self.group_textbox_horizontal(textboxes, laparams) top = self.group_textbox_lr_tb(textboxes, laparams)
def assign_index(obj, i): def assign_index(obj, i):
if isinstance(obj, LTTextBox): if isinstance(obj, LTTextBox):
obj.index = i obj.index = i
@ -627,16 +668,16 @@ class LTPage(LayoutContainer):
lines.append(LTTextLineVertical(line, laparams.word_margin)) lines.append(LTTextLineVertical(line, laparams.word_margin))
return group_lines(LTTextBoxVertical, lines, laparams.line_margin) return group_lines(LTTextBoxVertical, lines, laparams.line_margin)
def group_textbox_horizontal(self, boxes, laparams): def group_textbox_lr_tb(self, boxes, laparams):
def dist(obj1, obj2): def dist(obj1, obj2):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
obj1.width*obj1.height - obj2.width*obj2.height) obj1.width*obj1.height - obj2.width*obj2.height)
return group_boxes(LTTextGroupHorizontal, boxes, dist) return group_boxes(LTTextGroupLRTB, boxes, dist)
def group_textbox_vertical(self, boxes, laparams): def group_textbox_tb_rl(self, boxes, laparams):
def dist(obj1, obj2): def dist(obj1, obj2):
return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) *
(max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) -
obj1.width*obj1.height - obj2.width*obj2.height) obj1.width*obj1.height - obj2.width*obj2.height)
return group_boxes(LTTextGroupVertical, boxes, dist) return group_boxes(LTTextGroupTBRL, boxes, dist)

View File

@ -455,7 +455,8 @@ class PDFDocument(object):
if strmid in self.parsed_objs: if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid] objs = self.parsed_objs[strmid]
else: else:
parser = PDFObjStrmParser(stream.get_data(), self) parser = PDFStreamParser(stream.get_data())
parser.set_document(self)
objs = [] objs = []
try: try:
while 1: while 1:
@ -732,20 +733,18 @@ class PDFParser(PSStackParser):
return xrefs return xrefs
## PDFObjStrmParser ## PDFStreamParser
## ##
class PDFObjStrmParser(PDFParser): class PDFStreamParser(PDFParser):
def __init__(self, data, doc): def __init__(self, data):
PSStackParser.__init__(self, StringIO(data)) PDFParser.__init__(self, StringIO(data))
self.doc = doc
return return
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
return return
KEYWORD_R = KWD('R')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token is self.KEYWORD_R: if token is self.KEYWORD_R:
# reference to indirect object # reference to indirect object

View File

@ -4,7 +4,7 @@ RM=rm -f
#CMP=cmp #CMP=cmp
CMP=: CMP=:
PYTHON=python PYTHON=python
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -Dauto -p1
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -12,7 +12,7 @@ def main(argv):
import getopt import getopt
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] ' print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-n] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
return 100 return 100
try: try:
@ -42,7 +42,7 @@ def main(argv):
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-n': laparams = None elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v elif k == '-D': laparams.writing_mode = v
elif k == '-M': laparams.char_margin = float(v) elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v) elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v) elif k == '-W': laparams.word_margin = float(v)