git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@124 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
11885cc15e
commit
585dd59b70
|
@ -1,129 +1,9 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from pdfminer.pdfdevice import PDFDevice
|
from pdfminer.pdfdevice import PDFDevice, PDFTextDevice
|
||||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
|
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
from pdfminer.utils import apply_matrix_pt, enc
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
|
||||||
##
|
|
||||||
class PDFPageAggregator(PDFDevice):
|
|
||||||
|
|
||||||
def __init__(self, rsrc, pageno=1, laparams=None):
|
|
||||||
PDFDevice.__init__(self, rsrc)
|
|
||||||
self.laparams = laparams
|
|
||||||
self.undefined_char = '?'
|
|
||||||
self.pageno = pageno
|
|
||||||
self.stack = []
|
|
||||||
return
|
|
||||||
|
|
||||||
def begin_page(self, page):
|
|
||||||
self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_page(self, _):
|
|
||||||
assert not self.stack
|
|
||||||
assert isinstance(self.cur_item, LTPage)
|
|
||||||
self.cur_item.fixate()
|
|
||||||
if self.laparams:
|
|
||||||
self.cur_item.analyze_layout(self.laparams)
|
|
||||||
self.pageno += 1
|
|
||||||
return self.cur_item
|
|
||||||
|
|
||||||
def begin_figure(self, name, bbox, matrix):
|
|
||||||
self.stack.append(self.cur_item)
|
|
||||||
self.cur_item = LTFigure(name, bbox, matrix)
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_figure(self, _):
|
|
||||||
fig = self.cur_item
|
|
||||||
self.cur_item.fixate()
|
|
||||||
self.cur_item = self.stack.pop()
|
|
||||||
self.cur_item.add(fig)
|
|
||||||
return
|
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
|
||||||
if self.debug:
|
|
||||||
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
|
||||||
return self.undefined_char
|
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
|
||||||
shape = ''.join(x[0] for x in path)
|
|
||||||
if shape == 'ml': # horizontal/vertical line
|
|
||||||
(_,x0,y0) = path[0]
|
|
||||||
(_,x1,y1) = path[1]
|
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
|
||||||
if y0 == y1:
|
|
||||||
# horizontal ruler
|
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
|
||||||
elif x0 == x1:
|
|
||||||
# vertical ruler
|
|
||||||
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
|
||||||
elif shape == 'mlllh':
|
|
||||||
# rectangle
|
|
||||||
(_,x0,y0) = path[0]
|
|
||||||
(_,x1,y1) = path[1]
|
|
||||||
(_,x2,y2) = path[2]
|
|
||||||
(_,x3,y3) = path[3]
|
|
||||||
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
|
||||||
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
|
||||||
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
|
||||||
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y2))
|
|
||||||
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
|
||||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
|
||||||
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
|
||||||
return
|
|
||||||
|
|
||||||
def render_chars(self, textmatrix, textstate, chars):
|
|
||||||
if not chars: return (0, 0)
|
|
||||||
item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
|
|
||||||
textstate.charspace, textstate.scaling, chars)
|
|
||||||
self.cur_item.add(item)
|
|
||||||
return item.adv
|
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
|
||||||
font = textstate.font
|
|
||||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
|
||||||
scaling = textstate.scaling * .01
|
|
||||||
dxscale = scaling / (font.hscale*1000) * .01
|
|
||||||
wordspace = textstate.wordspace * scaling
|
|
||||||
chars = []
|
|
||||||
for x in seq:
|
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
|
||||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
|
||||||
textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy))
|
|
||||||
chars = []
|
|
||||||
else:
|
|
||||||
for cid in font.decode(x):
|
|
||||||
try:
|
|
||||||
char = font.to_unicode(cid)
|
|
||||||
except PDFUnicodeNotDefined, e:
|
|
||||||
(cidcoding, cid) = e.args
|
|
||||||
char = self.handle_undefined_char(cidcoding, cid)
|
|
||||||
chars.append((char, cid))
|
|
||||||
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
|
||||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
|
||||||
textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy))
|
|
||||||
chars = []
|
|
||||||
self.render_chars(textmatrix, textstate, chars)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## PDFConverter
|
|
||||||
##
|
|
||||||
class PDFConverter(PDFPageAggregator):
|
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
|
||||||
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
|
||||||
self.outfp = outfp
|
|
||||||
self.codec = codec
|
|
||||||
return
|
|
||||||
|
|
||||||
def write(self, text):
|
|
||||||
self.outfp.write(enc(text, self.codec))
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
## TagExtractor
|
## TagExtractor
|
||||||
|
@ -138,12 +18,12 @@ class TagExtractor(PDFDevice):
|
||||||
self.tag = None
|
self.tag = None
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
def render_string(self, textstate, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
text = ''
|
text = ''
|
||||||
for x in seq:
|
for obj in seq:
|
||||||
if not isinstance(x, str): continue
|
if not isinstance(obj, str): continue
|
||||||
chars = font.decode(x)
|
chars = font.decode(obj)
|
||||||
for cid in chars:
|
for cid in chars:
|
||||||
try:
|
try:
|
||||||
char = font.to_unicode(cid)
|
char = font.to_unicode(cid)
|
||||||
|
@ -186,6 +66,92 @@ class TagExtractor(PDFDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## PDFPageAggregator
|
||||||
|
##
|
||||||
|
class PDFPageAggregator(PDFTextDevice):
|
||||||
|
|
||||||
|
def __init__(self, rsrc, pageno=1, laparams=None):
|
||||||
|
PDFTextDevice.__init__(self, rsrc)
|
||||||
|
self.laparams = laparams
|
||||||
|
self.pageno = pageno
|
||||||
|
self.stack = []
|
||||||
|
return
|
||||||
|
|
||||||
|
def begin_page(self, page):
|
||||||
|
self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
|
||||||
|
return
|
||||||
|
|
||||||
|
def end_page(self, _):
|
||||||
|
assert not self.stack
|
||||||
|
assert isinstance(self.cur_item, LTPage)
|
||||||
|
self.cur_item.fixate()
|
||||||
|
if self.laparams:
|
||||||
|
self.cur_item.analyze_layout(self.laparams)
|
||||||
|
self.pageno += 1
|
||||||
|
return self.cur_item
|
||||||
|
|
||||||
|
def begin_figure(self, name, bbox, matrix):
|
||||||
|
self.stack.append(self.cur_item)
|
||||||
|
self.cur_item = LTFigure(name, bbox, matrix)
|
||||||
|
return
|
||||||
|
|
||||||
|
def end_figure(self, _):
|
||||||
|
fig = self.cur_item
|
||||||
|
self.cur_item.fixate()
|
||||||
|
self.cur_item = self.stack.pop()
|
||||||
|
self.cur_item.add(fig)
|
||||||
|
return
|
||||||
|
|
||||||
|
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||||
|
shape = ''.join(x[0] for x in path)
|
||||||
|
if shape == 'ml': # horizontal/vertical line
|
||||||
|
(_,x0,y0) = path[0]
|
||||||
|
(_,x1,y1) = path[1]
|
||||||
|
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||||
|
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||||
|
if y0 == y1:
|
||||||
|
# horizontal ruler
|
||||||
|
self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1)))
|
||||||
|
elif x0 == x1:
|
||||||
|
# vertical ruler
|
||||||
|
self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1)))
|
||||||
|
elif shape == 'mlllh':
|
||||||
|
# rectangle
|
||||||
|
(_,x0,y0) = path[0]
|
||||||
|
(_,x1,y1) = path[1]
|
||||||
|
(_,x2,y2) = path[2]
|
||||||
|
(_,x3,y3) = path[3]
|
||||||
|
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
|
||||||
|
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
|
||||||
|
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
|
||||||
|
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y2))
|
||||||
|
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
|
||||||
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
|
||||||
|
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
|
||||||
|
return
|
||||||
|
|
||||||
|
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
|
if not chars: return (0, 0)
|
||||||
|
item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
|
||||||
|
self.cur_item.add(item)
|
||||||
|
return item.adv
|
||||||
|
|
||||||
|
|
||||||
|
## PDFConverter
|
||||||
|
##
|
||||||
|
class PDFConverter(PDFPageAggregator):
|
||||||
|
|
||||||
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
||||||
|
PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
|
||||||
|
self.outfp = outfp
|
||||||
|
self.codec = codec
|
||||||
|
return
|
||||||
|
|
||||||
|
def write(self, text):
|
||||||
|
self.outfp.write(enc(text, self.codec))
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
## SGMLConverter
|
## SGMLConverter
|
||||||
##
|
##
|
||||||
class SGMLConverter(PDFConverter):
|
class SGMLConverter(PDFConverter):
|
||||||
|
|
|
@ -270,7 +270,7 @@ class LTTextItem(LayoutItem, LTText):
|
||||||
self.vertical = self.font.is_vertical()
|
self.vertical = self.font.is_vertical()
|
||||||
self.text = ''.join( char for (char,_) in chars )
|
self.text = ''.join( char for (char,_) in chars )
|
||||||
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
||||||
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
|
adv = (adv * fontsize + len(chars)*charspace) * scaling
|
||||||
size = (font.get_ascent() - font.get_descent()) * fontsize
|
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||||
if not self.vertical:
|
if not self.vertical:
|
||||||
# horizontal text
|
# horizontal text
|
||||||
|
@ -410,6 +410,7 @@ def tsort(objs, f):
|
||||||
go = dict( (obj,[]) for obj in objs )
|
go = dict( (obj,[]) for obj in objs )
|
||||||
for obj1 in objs:
|
for obj1 in objs:
|
||||||
for obj2 in objs:
|
for obj2 in objs:
|
||||||
|
if obj1 is obj2: continue
|
||||||
if f(obj1, obj2): # obj1 -> obj2
|
if f(obj1, obj2): # obj1 -> obj2
|
||||||
go[obj1].append(obj2)
|
go[obj1].append(obj2)
|
||||||
gi[obj2].append(obj1)
|
gi[obj2].append(obj1)
|
||||||
|
@ -478,7 +479,7 @@ class LTPage(LayoutContainer):
|
||||||
elif obj1.voverlap(obj2):
|
elif obj1.voverlap(obj2):
|
||||||
return obj1.x1 < obj2.x0
|
return obj1.x1 < obj2.x0
|
||||||
else:
|
else:
|
||||||
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
|
return obj1.x0 < obj2.x0 and obj2.y1 < obj1.y1
|
||||||
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
||||||
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
||||||
hline)
|
hline)
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from pdfminer.utils import mult_matrix, translate_matrix
|
||||||
|
|
||||||
|
|
||||||
## PDFDevice
|
## PDFDevice
|
||||||
##
|
##
|
||||||
class PDFDevice(object):
|
class PDFDevice(object):
|
||||||
|
@ -39,7 +42,59 @@ class PDFDevice(object):
|
||||||
|
|
||||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||||
return
|
return
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
|
||||||
return
|
|
||||||
def render_image(self, stream, size):
|
def render_image(self, stream, size):
|
||||||
return
|
return
|
||||||
|
def render_string(self, textstate, seq):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## PDFTextDevice
|
||||||
|
##
|
||||||
|
class PDFTextDevice(PDFDevice):
|
||||||
|
|
||||||
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
|
if self.debug:
|
||||||
|
print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
|
return '?'
|
||||||
|
|
||||||
|
def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||||
|
return (0, 0)
|
||||||
|
|
||||||
|
def render_string(self, textstate, seq):
|
||||||
|
matrix = mult_matrix(textstate.matrix, self.ctm)
|
||||||
|
font = textstate.font
|
||||||
|
fontsize = textstate.fontsize
|
||||||
|
charspace = textstate.charspace
|
||||||
|
scaling = textstate.scaling * .01
|
||||||
|
wordspace = textstate.wordspace * scaling
|
||||||
|
dxscale = scaling / (font.hscale*1000) * .01
|
||||||
|
chars = []
|
||||||
|
(x,y) = textstate.linematrix
|
||||||
|
for obj in seq:
|
||||||
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
|
fontsize, charspace, scaling, chars)
|
||||||
|
x += dx-obj*dxscale
|
||||||
|
y += dy
|
||||||
|
chars = []
|
||||||
|
else:
|
||||||
|
for cid in font.decode(obj):
|
||||||
|
try:
|
||||||
|
char = font.to_unicode(cid)
|
||||||
|
except PDFUnicodeNotDefined, e:
|
||||||
|
(cidcoding, cid) = e.args
|
||||||
|
char = self.handle_undefined_char(cidcoding, cid)
|
||||||
|
chars.append((char, cid))
|
||||||
|
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
||||||
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
|
fontsize, charspace, scaling, chars)
|
||||||
|
x += dx + wordspace
|
||||||
|
y += dy
|
||||||
|
chars = []
|
||||||
|
if chars:
|
||||||
|
(dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font,
|
||||||
|
fontsize, charspace, scaling, chars)
|
||||||
|
x += dx
|
||||||
|
y += dy
|
||||||
|
textstate.linematrix = (x,y)
|
||||||
|
return
|
||||||
|
|
|
@ -49,6 +49,8 @@ class PDFTextState(object):
|
||||||
self.render = 0
|
self.render = 0
|
||||||
self.rise = 0
|
self.rise = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
# self.matrix is set
|
||||||
|
# self.linematrix is set
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -630,23 +632,7 @@ class PDFPageInterpreter(object):
|
||||||
# show-pos
|
# show-pos
|
||||||
def do_TJ(self, seq):
|
def do_TJ(self, seq):
|
||||||
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
||||||
textstate = self.textstate
|
self.device.render_string(self.textstate, seq)
|
||||||
textmatrix = translate_matrix(textstate.matrix, textstate.linematrix)
|
|
||||||
self.device.render_string(textstate, textmatrix, seq)
|
|
||||||
font = textstate.font
|
|
||||||
s = ''.join( x for x in seq if isinstance(x, str) )
|
|
||||||
w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
|
|
||||||
len(s) * textstate.charspace)
|
|
||||||
(lx,ly) = textstate.linematrix
|
|
||||||
if font.is_vertical():
|
|
||||||
# advance vertically
|
|
||||||
ly += w * (textstate.scaling * .01)
|
|
||||||
else:
|
|
||||||
# advance horizontally
|
|
||||||
if not font.is_multibyte():
|
|
||||||
w += s.count(' ')*textstate.wordspace
|
|
||||||
lx += w * (textstate.scaling * .01)
|
|
||||||
textstate.linematrix = (lx,ly)
|
|
||||||
return
|
return
|
||||||
# show
|
# show
|
||||||
def do_Tj(self, s):
|
def do_Tj(self, s):
|
||||||
|
|
Loading…
Reference in New Issue