git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@124 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2009-07-23 14:03:58 +00:00
parent 11885cc15e
commit 585dd59b70
4 changed files with 155 additions and 147 deletions

View File

@ -1,131 +1,11 @@
#!/usr/bin/env python
import sys
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdevice import PDFDevice, PDFTextDevice
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
from pdfminer.utils import apply_matrix_pt, enc
## PDFPageAggregator
##
class PDFPageAggregator(PDFDevice):
    """Device that gathers what is drawn on a page into layout items.

    begin_page() starts a fresh LTPage; text runs, ruled lines, rectangles
    and figures are added to the current container while the page is being
    rendered, and end_page() hands back the finished LTPage.
    """

    def __init__(self, rsrc, pageno=1, laparams=None):
        """Set up the aggregator.

        rsrc is passed to PDFDevice.__init__ unchanged.
        pageno is the number given to the first LTPage; end_page() bumps it.
        laparams, when truthy, is handed to analyze_layout() on every page.
        """
        PDFDevice.__init__(self, rsrc)
        self.laparams = laparams
        # Substitute returned by handle_undefined_char().
        self.undefined_char = '?'
        self.pageno = pageno
        # Containers suspended while a nested figure is being collected.
        self.stack = []
        return

    def begin_page(self, page):
        """Make a new LTPage (built from page.mediabox and page.rotate) current."""
        self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
        return

    def end_page(self, _):
        """Finish the current page and return its LTPage.

        The page is fixated, laid out when laparams was supplied, and the
        page counter is advanced for the next begin_page().
        """
        # Every begin_figure() must have been closed by now.
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
        self.cur_item.fixate()
        if self.laparams:
            self.cur_item.analyze_layout(self.laparams)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox, matrix):
        """Suspend the current container and start collecting into a new LTFigure."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, matrix)
        return

    def end_figure(self, _):
        """Fixate the current figure and add it to the container it interrupted."""
        fig = self.cur_item
        self.cur_item.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)
        return

    def handle_undefined_char(self, cidcoding, cid):
        """Return the placeholder for an unmapped character.

        When self.debug is set the offending (cidcoding, cid) pair is also
        reported on stderr.
        """
        if self.debug:
            print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
        return self.undefined_char

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Record axis-aligned lines and rectangles; ignore every other path.

        The first element of each path entry is its operator code; the
        concatenated codes select the shape.  Points are mapped through
        self.ctm before they are compared.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':  # horizontal/vertical line
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            if y0 == y1:
                # horizontal ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0, y0, x1, y1)))
            elif x0 == x1:
                # vertical ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0, y0, x1, y1)))
        elif shape == 'mlllh':
            # rectangle
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (_, x2, y2) = path[2]
            (_, x3, y3) = path[3]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
            # The fourth corner must be mapped from its own coordinates.  The
            # previous code passed (x3, y2), mixing the raw x3 with the
            # already-transformed y2 of the third corner.
            (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
            # Accept the outline only if its sides are axis-aligned, in
            # either winding order.
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
        return

    def render_chars(self, textmatrix, textstate, chars):
        """Add one LTTextItem for chars and return its advance (item.adv).

        Nothing is added and (0, 0) is returned when chars is empty.
        """
        if not chars:
            return (0, 0)
        item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
                          textstate.charspace, textstate.scaling, chars)
        self.cur_item.add(item)
        return item.adv

    def render_string(self, textstate, textmatrix, seq):
        """Render a sequence of strings and numeric adjustments.

        Characters are buffered as (unicode, cid) pairs and flushed through
        render_chars() whenever the pen has to move: at a numeric entry of
        seq, after a word-spaced blank, and at the end of the sequence.
        """
        font = textstate.font
        textmatrix = mult_matrix(textmatrix, self.ctm)
        scaling = textstate.scaling * .01
        dxscale = scaling / (font.hscale*1000) * .01
        wordspace = textstate.wordspace * scaling
        chars = []
        for x in seq:
            if isinstance(x, (int, float)):
                # Numeric entry: flush, then shift by the run's advance minus
                # the scaled adjustment.
                (dx, dy) = self.render_chars(textmatrix, textstate, chars)
                textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy))
                chars = []
            else:
                for cid in font.decode(x):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined as e:
                        (cidcoding, cid) = e.args
                        char = self.handle_undefined_char(cidcoding, cid)
                    chars.append((char, cid))
                    # Word spacing is applied after cid 32, and only for
                    # fonts that are not multibyte.
                    if cid == 32 and textstate.wordspace and not font.is_multibyte():
                        (dx, dy) = self.render_chars(textmatrix, textstate, chars)
                        textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy))
                        chars = []
        self.render_chars(textmatrix, textstate, chars)
        return
## PDFConverter
##
class PDFConverter(PDFPageAggregator):
    """Page aggregator that also owns an output stream.

    Text given to write() is run through enc() with the configured codec
    and then written to outfp.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        """Initialise the aggregator part, then remember the stream and codec."""
        PDFPageAggregator.__init__(self, rsrc, laparams=laparams, pageno=pageno)
        self.outfp, self.codec = outfp, codec

    def write(self, text):
        """Encode text with self.codec and write it to self.outfp."""
        emit = self.outfp.write
        emit(enc(text, self.codec))
## TagExtractor
##
class TagExtractor(PDFDevice):
@ -138,12 +18,12 @@ class TagExtractor(PDFDevice):
self.tag = None
return
def render_string(self, textstate, textmatrix, seq):
def render_string(self, textstate, seq):
font = textstate.font
text = ''
for x in seq:
if not isinstance(x, str): continue
chars = font.decode(x)
for obj in seq:
if not isinstance(obj, str): continue
chars = font.decode(obj)
for cid in chars:
try:
char = font.to_unicode(cid)
@ -186,6 +66,92 @@ class TagExtractor(PDFDevice):
return
## PDFPageAggregator
##
class PDFPageAggregator(PDFTextDevice):
    """Text device that gathers what is drawn on a page into layout items.

    begin_page() starts a fresh LTPage; text runs, ruled lines, rectangles
    and figures are added to the current container while the page is being
    rendered, and end_page() hands back the finished LTPage.
    """

    def __init__(self, rsrc, pageno=1, laparams=None):
        """Set up the aggregator.

        rsrc is passed to PDFTextDevice.__init__ unchanged.
        pageno is the number given to the first LTPage; end_page() bumps it.
        laparams, when truthy, is handed to analyze_layout() on every page.
        """
        PDFTextDevice.__init__(self, rsrc)
        self.laparams = laparams
        self.pageno = pageno
        # Containers suspended while a nested figure is being collected.
        self.stack = []
        return

    def begin_page(self, page):
        """Make a new LTPage (built from page.mediabox and page.rotate) current."""
        self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
        return

    def end_page(self, _):
        """Finish the current page and return its LTPage.

        The page is fixated, laid out when laparams was supplied, and the
        page counter is advanced for the next begin_page().
        """
        # Every begin_figure() must have been closed by now.
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
        self.cur_item.fixate()
        if self.laparams:
            self.cur_item.analyze_layout(self.laparams)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox, matrix):
        """Suspend the current container and start collecting into a new LTFigure."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, matrix)
        return

    def end_figure(self, _):
        """Fixate the current figure and add it to the container it interrupted."""
        fig = self.cur_item
        self.cur_item.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)
        return

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Record axis-aligned lines and rectangles; ignore every other path.

        The first element of each path entry is its operator code; the
        concatenated codes select the shape.  Points are mapped through
        self.ctm before they are compared.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':  # horizontal/vertical line
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            if y0 == y1:
                # horizontal ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0, y0, x1, y1)))
            elif x0 == x1:
                # vertical ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0, y0, x1, y1)))
        elif shape == 'mlllh':
            # rectangle
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (_, x2, y2) = path[2]
            (_, x3, y3) = path[3]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
            # The fourth corner must be mapped from its own coordinates.  The
            # previous code passed (x3, y2), mixing the raw x3 with the
            # already-transformed y2 of the third corner.
            (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
            # Accept the outline only if its sides are axis-aligned, in
            # either winding order.
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
        return

    def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
        """Add one LTTextItem for chars and return its advance (item.adv).

        Nothing is added and (0, 0) is returned when chars is empty.
        """
        if not chars:
            return (0, 0)
        item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
        self.cur_item.add(item)
        return item.adv
## PDFConverter
##
class PDFConverter(PDFPageAggregator):
    """Aggregating device bound to an output file object.

    write() encodes its argument with enc() using the stored codec and
    sends the result to outfp.
    """

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        """Run the aggregator initialiser, then store the output target and codec."""
        PDFPageAggregator.__init__(self, rsrc, laparams=laparams, pageno=pageno)
        self.outfp, self.codec = outfp, codec

    def write(self, text):
        """Write text to self.outfp after encoding it with self.codec."""
        sink = self.outfp.write
        sink(enc(text, self.codec))
## SGMLConverter
##
class SGMLConverter(PDFConverter):

View File

@ -270,7 +270,7 @@ class LTTextItem(LayoutItem, LTText):
self.vertical = self.font.is_vertical()
self.text = ''.join( char for (char,_) in chars )
adv = sum( font.char_width(cid) for (_,cid) in chars )
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
adv = (adv * fontsize + len(chars)*charspace) * scaling
size = (font.get_ascent() - font.get_descent()) * fontsize
if not self.vertical:
# horizontal text
@ -410,6 +410,7 @@ def tsort(objs, f):
go = dict( (obj,[]) for obj in objs )
for obj1 in objs:
for obj2 in objs:
if obj1 is obj2: continue
if f(obj1, obj2): # obj1 -> obj2
go[obj1].append(obj2)
gi[obj2].append(obj1)
@ -478,7 +479,7 @@ class LTPage(LayoutContainer):
elif obj1.voverlap(obj2):
return obj1.x1 < obj2.x0
else:
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
return obj1.x0 < obj2.x0 and obj2.y1 < obj1.y1
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
hline)

View File

@ -1,5 +1,8 @@
#!/usr/bin/env python
from pdfminer.utils import mult_matrix, translate_matrix
## PDFDevice
##
class PDFDevice(object):
@ -39,7 +42,59 @@ class PDFDevice(object):
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_string(self, textstate, textmatrix, seq):
return
def render_image(self, stream, size):
return
def render_string(self, textstate, seq):
return
## PDFTextDevice
##
class PDFTextDevice(PDFDevice):
    """PDFDevice that breaks a shown string into runs of characters.

    render_string() walks the operand sequence, tracks the pen offset and
    calls render_chars() once per run.  Subclasses override render_chars()
    to consume the runs; the version here draws nothing.
    """

    def handle_undefined_char(self, cidcoding, cid):
        """Return '?' for an unmapped character.

        When self.debug is set the offending (cidcoding, cid) pair is also
        reported on stderr.
        """
        if self.debug:
            # Imported here because the header of this module, as visible in
            # this change, does not import sys; the print would otherwise
            # raise NameError.
            import sys
            print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
        return '?'

    def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
        """Render one run of (unicode, cid) pairs and return its (dx, dy) advance.

        This default implementation renders nothing and reports no advance.
        """
        return (0, 0)

    def render_string(self, textstate, seq):
        """Render seq and leave the final pen offset in textstate.linematrix.

        seq mixes strings with numbers.  Strings are decoded with the current
        font into (unicode, cid) pairs and buffered.  The buffer is flushed
        through render_chars() at every numeric entry, after every
        word-spaced blank, and at the end of the sequence.
        """
        # Imported here because the header of this module, as visible in this
        # change, pulls in only mult_matrix and translate_matrix.  Without it
        # the except clause below raises NameError instead of handling the
        # unmapped character.
        from pdfminer.pdffont import PDFUnicodeNotDefined
        matrix = mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        charspace = textstate.charspace
        scaling = textstate.scaling * .01
        wordspace = textstate.wordspace * scaling
        dxscale = scaling / (font.hscale*1000) * .01
        chars = []
        # Pen offset, applied to matrix for each run.
        (x, y) = textstate.linematrix
        for obj in seq:
            if isinstance(obj, (int, float)):
                # Numeric entry: flush, then move by the run's advance minus
                # the scaled adjustment.
                (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font,
                                             fontsize, charspace, scaling, chars)
                x += dx-obj*dxscale
                y += dy
                chars = []
            else:
                for cid in font.decode(obj):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined as e:
                        (cidcoding, cid) = e.args
                        char = self.handle_undefined_char(cidcoding, cid)
                    chars.append((char, cid))
                    # Word spacing is applied after cid 32, and only for
                    # fonts that are not multibyte.
                    if cid == 32 and textstate.wordspace and not font.is_multibyte():
                        (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font,
                                                     fontsize, charspace, scaling, chars)
                        x += dx + wordspace
                        y += dy
                        chars = []
        if chars:
            (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font,
                                         fontsize, charspace, scaling, chars)
            x += dx
            y += dy
        # Persist the pen offset so the next call starts from here.
        textstate.linematrix = (x, y)
        return

View File

@ -49,6 +49,8 @@ class PDFTextState(object):
self.render = 0
self.rise = 0
self.reset()
# self.matrix is set
# self.linematrix is set
return
def __repr__(self):
@ -630,23 +632,7 @@ class PDFPageInterpreter(object):
# show-pos
def do_TJ(self, seq):
# Handler for the "show-pos" operator named in the comment above this
# method.  seq mixes str pieces with non-str (numeric) entries, as the
# isinstance tests below show.
# NOTE(review): this span contains two render_string calls with different
# arities, (textstate, textmatrix, seq) and (self.textstate, seq).
# Presumably the first call and the linematrix bookkeeping that follows it
# are the pre-change lines of this hunk and the final two-argument call is
# their replacement -- confirm against the diff markers before treating
# this as a single method body.
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
textstate = self.textstate
# Text matrix shifted by the current line offset.
textmatrix = translate_matrix(textstate.matrix, textstate.linematrix)
self.device.render_string(textstate, textmatrix, seq)
font = textstate.font
# All string pieces joined together; numeric entries are left out.
s = ''.join( x for x in seq if isinstance(x, str) )
# Advance of the whole operand: the font's width for s minus the summed
# numeric entries scaled by .001, times the font size, plus one charspace
# per element of s.
w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
len(s) * textstate.charspace)
(lx,ly) = textstate.linematrix
if font.is_vertical():
# advance vertically
ly += w * (textstate.scaling * .01)
else:
# advance horizontally
# Word spacing is counted per ' ' in s, and only for fonts that are not
# multibyte.
if not font.is_multibyte():
w += s.count(' ')*textstate.wordspace
lx += w * (textstate.scaling * .01)
textstate.linematrix = (lx,ly)
# Two-argument form: the device receives only the text state and the operand.
self.device.render_string(self.textstate, seq)
return
# show
def do_Tj(self, s):