git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@124 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
11885cc15e
commit
585dd59b70
|
@ -1,131 +1,11 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.pdfdevice import PDFDevice, PDFTextDevice
|
||||
from pdfminer.pdffont import PDFUnicodeNotDefined
|
||||
from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine
|
||||
from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc
|
||||
from pdfminer.utils import apply_matrix_pt, enc
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFDevice):
    """Device that gathers what is drawn on a page into layout objects.

    Between begin_page() and end_page() every line, rectangle, figure and
    run of text reported by the interpreter is added to ``self.cur_item``
    (an LTPage, or an LTFigure while inside a figure).  end_page() returns
    the finished LTPage.

    ``self.ctm`` and ``self.debug`` are read but not set in this class;
    presumably they are provided by PDFDevice -- confirm against that class.
    """

    def __init__(self, rsrc, pageno=1, laparams=None):
        """Create an aggregator.

        rsrc     -- passed straight through to PDFDevice.__init__.
        pageno   -- number given to the first LTPage; incremented per page.
        laparams -- if true, layout analysis is run in end_page().
        """
        PDFDevice.__init__(self, rsrc)
        self.laparams = laparams
        # Replacement text returned by handle_undefined_char().
        self.undefined_char = '?'
        self.pageno = pageno
        # Enclosing containers saved while a figure is being collected.
        self.stack = []
        return

    def begin_page(self, page):
        """Start a fresh LTPage built from the page's mediabox and rotation."""
        self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
        return

    def end_page(self, _):
        """Finish the current page and return its LTPage."""
        # Every begin_figure() must have been matched by end_figure().
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
        self.cur_item.fixate()
        if self.laparams:
            self.cur_item.analyze_layout(self.laparams)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox, matrix):
        """Open a nested LTFigure; the current container is pushed aside."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, matrix)
        return

    def end_figure(self, _):
        """Close the current figure and add it to the enclosing container."""
        fig = self.cur_item
        fig.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)
        return

    def handle_undefined_char(self, cidcoding, cid):
        """Return the replacement text for a character with no Unicode mapping.

        When ``self.debug`` is true the offending (cidcoding, cid) pair is
        reported on stderr.
        """
        if self.debug:
            # Same output as the former ``print >>sys.stderr`` statement, but
            # this spelling is valid on both Python 2 and Python 3.
            sys.stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
        return self.undefined_char

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Record simple paths as LTLine / LTRect objects.

        Only two shapes are recognised: a single segment ('ml') that is
        horizontal or vertical after applying ``self.ctm``, and a closed
        four-segment path ('mlllh') whose transformed edges are axis-aligned.
        Every other path is ignored.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':
            # horizontal/vertical line
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            if y0 == y1:
                # horizontal ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0, y0, x1, y1)))
            elif x0 == x1:
                # vertical ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0, y0, x1, y1)))
        elif shape == 'mlllh':
            # rectangle: transform all four corners with their OWN
            # coordinates.  (The fourth corner used to be transformed as
            # (x3, y2), which broke the axis-alignment test below.)
            ((x0, y0), (x1, y1), (x2, y2), (x3, y3)) = [
                apply_matrix_pt(self.ctm, (x, y)) for (_, x, y) in path[:4]
            ]
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                    (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
        return

    def render_chars(self, textmatrix, textstate, chars):
        """Add one LTTextItem for ``chars`` and return its advance.

        chars is a list of (char, cid) pairs.  An empty list adds nothing
        and yields a zero advance.
        """
        if not chars:
            return (0, 0)
        item = LTTextItem(textmatrix, textstate.font, textstate.fontsize,
                          textstate.charspace, textstate.scaling, chars)
        self.cur_item.add(item)
        return item.adv

    def render_string(self, textstate, textmatrix, seq):
        """Render the operand sequence of a text-showing operator.

        seq mixes strings and numbers.  A number flushes the characters
        collected so far and shifts the text matrix by the flushed advance
        minus ``number * dxscale``.  A string is decoded into CIDs; after a
        CID of 32, when word spacing is non-zero and the font is not
        multibyte, the pending characters are flushed and ``wordspace`` is
        added to the advance.
        """
        font = textstate.font
        textmatrix = mult_matrix(textmatrix, self.ctm)
        scaling = textstate.scaling * .01
        dxscale = scaling / (font.hscale*1000) * .01
        wordspace = textstate.wordspace * scaling
        chars = []
        for x in seq:
            if isinstance(x, (int, float)):
                (dx, dy) = self.render_chars(textmatrix, textstate, chars)
                textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy))
                chars = []
            else:
                for cid in font.decode(x):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined:
                        # sys.exc_info() instead of ``except X, e`` so the
                        # block parses on both Python 2 and Python 3.
                        (cidcoding, cid) = sys.exc_info()[1].args
                        char = self.handle_undefined_char(cidcoding, cid)
                    chars.append((char, cid))
                    if cid == 32 and textstate.wordspace and not font.is_multibyte():
                        (dx, dy) = self.render_chars(textmatrix, textstate, chars)
                        textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy))
                        chars = []
        self.render_chars(textmatrix, textstate, chars)
        return
|
||||
|
||||
|
||||
## PDFConverter
|
||||
##
|
||||
class PDFConverter(PDFPageAggregator):
    """Page aggregator that also carries an output stream and a codec."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        """Remember the output file object and the codec used by write()."""
        PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
        self.outfp, self.codec = outfp, codec
        return

    def write(self, text):
        """Encode ``text`` via enc() with the stored codec and write it out."""
        encoded = enc(text, self.codec)
        self.outfp.write(encoded)
        return
|
||||
|
||||
|
||||
## TagExtractor
|
||||
##
|
||||
class TagExtractor(PDFDevice):
|
||||
|
@ -138,12 +18,12 @@ class TagExtractor(PDFDevice):
|
|||
self.tag = None
|
||||
return
|
||||
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
def render_string(self, textstate, seq):
|
||||
font = textstate.font
|
||||
text = ''
|
||||
for x in seq:
|
||||
if not isinstance(x, str): continue
|
||||
chars = font.decode(x)
|
||||
for obj in seq:
|
||||
if not isinstance(obj, str): continue
|
||||
chars = font.decode(obj)
|
||||
for cid in chars:
|
||||
try:
|
||||
char = font.to_unicode(cid)
|
||||
|
@ -186,6 +66,92 @@ class TagExtractor(PDFDevice):
|
|||
return
|
||||
|
||||
|
||||
## PDFPageAggregator
|
||||
##
|
||||
class PDFPageAggregator(PDFTextDevice):
    """Text device that gathers what is drawn on a page into layout objects.

    Between begin_page() and end_page() every line, rectangle, figure and
    run of text is added to ``self.cur_item`` (an LTPage, or an LTFigure
    while inside a figure).  end_page() returns the finished LTPage.

    ``self.ctm`` is read but not set in this class; presumably it is
    maintained by the base device -- confirm against PDFDevice.
    """

    def __init__(self, rsrc, pageno=1, laparams=None):
        """Create an aggregator.

        rsrc     -- passed straight through to PDFTextDevice.__init__.
        pageno   -- number given to the first LTPage; incremented per page.
        laparams -- if true, layout analysis is run in end_page().
        """
        PDFTextDevice.__init__(self, rsrc)
        self.laparams = laparams
        self.pageno = pageno
        # Enclosing containers saved while a figure is being collected.
        self.stack = []
        return

    def begin_page(self, page):
        """Start a fresh LTPage built from the page's mediabox and rotation."""
        self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate)
        return

    def end_page(self, _):
        """Finish the current page and return its LTPage."""
        # Every begin_figure() must have been matched by end_figure().
        assert not self.stack
        assert isinstance(self.cur_item, LTPage)
        self.cur_item.fixate()
        if self.laparams:
            self.cur_item.analyze_layout(self.laparams)
        self.pageno += 1
        return self.cur_item

    def begin_figure(self, name, bbox, matrix):
        """Open a nested LTFigure; the current container is pushed aside."""
        self.stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, matrix)
        return

    def end_figure(self, _):
        """Close the current figure and add it to the enclosing container."""
        fig = self.cur_item
        fig.fixate()
        self.cur_item = self.stack.pop()
        self.cur_item.add(fig)
        return

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Record simple paths as LTLine / LTRect objects.

        Only two shapes are recognised: a single segment ('ml') that is
        horizontal or vertical after applying ``self.ctm``, and a closed
        four-segment path ('mlllh') whose transformed edges are axis-aligned.
        Every other path is ignored.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':
            # horizontal/vertical line
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            if y0 == y1:
                # horizontal ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0, y0, x1, y1)))
            elif x0 == x1:
                # vertical ruler
                self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0, y0, x1, y1)))
        elif shape == 'mlllh':
            # rectangle: transform all four corners with their OWN
            # coordinates.  (The fourth corner used to be transformed as
            # (x3, y2), which broke the axis-alignment test below.)
            ((x0, y0), (x1, y1), (x2, y2), (x3, y3)) = [
                apply_matrix_pt(self.ctm, (x, y)) for (_, x, y) in path[:4]
            ]
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                    (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
        return

    def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
        """Add one LTTextItem for ``chars`` and return its advance.

        chars is a list of (char, cid) pairs.  An empty list adds nothing
        and yields a zero advance.
        """
        if not chars:
            return (0, 0)
        item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars)
        self.cur_item.add(item)
        return item.adv
|
||||
|
||||
|
||||
## PDFConverter
|
||||
##
|
||||
class PDFConverter(PDFPageAggregator):
    """Aggregator base for converters that write encoded text to a stream."""

    def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
        """Initialise the aggregator, then store the output stream and codec."""
        PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams)
        self.outfp, self.codec = outfp, codec
        return

    def write(self, text):
        """Write ``text`` to the output stream after encoding it with enc()."""
        data = enc(text, self.codec)
        self.outfp.write(data)
        return
|
||||
|
||||
|
||||
## SGMLConverter
|
||||
##
|
||||
class SGMLConverter(PDFConverter):
|
||||
|
|
|
@ -270,7 +270,7 @@ class LTTextItem(LayoutItem, LTText):
|
|||
self.vertical = self.font.is_vertical()
|
||||
self.text = ''.join( char for (char,_) in chars )
|
||||
adv = sum( font.char_width(cid) for (_,cid) in chars )
|
||||
adv = (adv * fontsize + len(chars)*charspace) * scaling * .01
|
||||
adv = (adv * fontsize + len(chars)*charspace) * scaling
|
||||
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||
if not self.vertical:
|
||||
# horizontal text
|
||||
|
@ -410,6 +410,7 @@ def tsort(objs, f):
|
|||
go = dict( (obj,[]) for obj in objs )
|
||||
for obj1 in objs:
|
||||
for obj2 in objs:
|
||||
if obj1 is obj2: continue
|
||||
if f(obj1, obj2): # obj1 -> obj2
|
||||
go[obj1].append(obj2)
|
||||
gi[obj2].append(obj1)
|
||||
|
@ -478,7 +479,7 @@ class LTPage(LayoutContainer):
|
|||
elif obj1.voverlap(obj2):
|
||||
return obj1.x1 < obj2.x0
|
||||
else:
|
||||
return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0
|
||||
return obj1.x0 < obj2.x0 and obj2.y1 < obj1.y1
|
||||
lines = ClusterSet.build(textobjs, laparams.char_margin, 0,
|
||||
(lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)),
|
||||
hline)
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from pdfminer.utils import mult_matrix, translate_matrix
|
||||
|
||||
|
||||
## PDFDevice
|
||||
##
|
||||
class PDFDevice(object):
|
||||
|
@ -39,7 +42,59 @@ class PDFDevice(object):
|
|||
|
||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
    """Receive a painted path; this default implementation ignores it."""
    return None
|
||||
def render_string(self, textstate, textmatrix, seq):
    """Receive a text-showing sequence; this default implementation ignores it."""
    return None
|
||||
def render_image(self, stream, size):
    """Receive an image stream and its size; this default implementation ignores both."""
    return None
|
||||
def render_string(self, textstate, seq):
    """Receive a text-showing sequence; this default implementation ignores it."""
    return None
|
||||
|
||||
|
||||
## PDFTextDevice
|
||||
##
|
||||
class PDFTextDevice(PDFDevice):
    """Device base class that turns text-showing operands into character runs.

    render_string() walks the operand sequence, keeps track of the running
    text position and hands each run of characters to render_chars(), which
    subclasses override to do something with it.

    ``self.ctm`` and ``self.debug`` are read but not set in this class;
    presumably they are provided by PDFDevice -- confirm against that class.
    """

    def handle_undefined_char(self, cidcoding, cid):
        """Return '?' for a character that has no Unicode mapping.

        When ``self.debug`` is true the offending (cidcoding, cid) pair is
        reported on stderr.
        """
        if self.debug:
            # This module does not import sys at file level, so bring it in
            # here; the bare name used to raise NameError in debug mode.
            import sys
            # Same output as the former ``print >>sys.stderr`` statement, but
            # this spelling is valid on both Python 2 and Python 3.
            sys.stderr.write('undefined: %r, %r\n' % (cidcoding, cid))
        return '?'

    def render_chars(self, matrix, font, fontsize, charspace, scaling, chars):
        """Render one run of (char, cid) pairs and return its (dx, dy) advance.

        The default does nothing and reports a zero advance.
        """
        return (0, 0)

    def render_string(self, textstate, seq):
        """Render the operand sequence of a text-showing operator.

        seq mixes strings and numbers.  A number flushes the characters
        collected so far and moves the position by the flushed advance minus
        ``number * dxscale``.  A string is decoded into CIDs; after a CID of
        32, when word spacing is non-zero and the font is not multibyte, the
        pending characters are flushed and ``wordspace`` is added to the
        advance.  On return ``textstate.linematrix`` holds the final (x, y)
        position, so callers must not advance it again themselves.
        """
        # Function-scope imports: neither name is imported at the top of this
        # module, so the except clause below used to raise NameError.
        import sys
        from pdfminer.pdffont import PDFUnicodeNotDefined

        matrix = mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        charspace = textstate.charspace
        scaling = textstate.scaling * .01
        wordspace = textstate.wordspace * scaling
        dxscale = scaling / (font.hscale*1000) * .01
        chars = []
        (x, y) = textstate.linematrix
        for obj in seq:
            if isinstance(obj, (int, float)):
                (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font,
                                             fontsize, charspace, scaling, chars)
                x += dx-obj*dxscale
                y += dy
                chars = []
            else:
                for cid in font.decode(obj):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined:
                        # sys.exc_info() instead of ``except X, e`` so the
                        # block parses on both Python 2 and Python 3.
                        (cidcoding, cid) = sys.exc_info()[1].args
                        char = self.handle_undefined_char(cidcoding, cid)
                    chars.append((char, cid))
                    if cid == 32 and textstate.wordspace and not font.is_multibyte():
                        (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font,
                                                     fontsize, charspace, scaling, chars)
                        x += dx + wordspace
                        y += dy
                        chars = []
        if chars:
            (dx, dy) = self.render_chars(translate_matrix(matrix, (x, y)), font,
                                         fontsize, charspace, scaling, chars)
            x += dx
            y += dy
        textstate.linematrix = (x, y)
        return
|
||||
|
|
|
@ -49,6 +49,8 @@ class PDFTextState(object):
|
|||
self.render = 0
|
||||
self.rise = 0
|
||||
self.reset()
|
||||
# self.matrix is set
|
||||
# self.linematrix is set
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -630,23 +632,7 @@ class PDFPageInterpreter(object):
|
|||
# show-pos
|
||||
def do_TJ(self, seq):
    """Show text with individual glyph positioning (the TJ operator).

    seq is the operand array, mixing strings and numeric adjustments.  It is
    handed to the device exactly once, together with the current text state.
    The device's render_string() is responsible for advancing
    ``textstate.linematrix``; doing it here as well would render the text
    twice and advance the position twice.
    """
    self.device.render_string(self.textstate, seq)
    return
|
||||
# show
|
||||
def do_Tj(self, s):
|
||||
|
|
Loading…
Reference in New Issue