git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@14 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2008-01-09 14:40:04 +00:00
parent da778dee6f
commit dc77b838f7
4 changed files with 121 additions and 22 deletions

89
extent.py Executable file
View File

@ -0,0 +1,89 @@
#!/usr/bin/env python
import sys
# Stand-in for "infinity" in rectangle coordinates.  sys.maxint is used when
# the interpreter provides it; otherwise fall back to sys.maxsize so that the
# module can still be imported.
try:
    INF = sys.maxint
except AttributeError:
    INF = sys.maxsize


##  Rect
##
class Rect:
    """Axis-aligned rectangle stored as the corner pairs (x0, y0) and (x1, y1).

    Attributes:
      x0, y0: lower corner, as passed to the constructor.
      x1, y1: upper corner, computed as x0+w and y0+h.
    """

    def __init__(self, x0=-INF, y0=-INF, w=None, h=None):
        """Build a rectangle from its lower corner and its size.

        x0, y0 -- lower corner; both default to -INF.
        w, h   -- width and height.  When one of them is None the matching
                  upper bound is set to INF regardless of the lower corner,
                  so Rect() with no arguments spans -INF..INF on both axes.
        """
        self.x0 = x0
        self.y0 = y0
        # Identity test: only a missing size means "unbounded"; a size of 0
        # is a legitimate (empty) extent.
        self.x1 = INF if w is None else x0 + w
        self.y1 = INF if h is None else y0 + h
        return

    def overlap(self, rect):
        """Return True when this rectangle and `rect` share some area.

        The comparisons are inclusive (<=), so rectangles that merely touch
        along an edge or at a corner are reported as not overlapping.
        """
        return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
                    rect.y1 <= self.y0 or self.y1 <= rect.y0)
## ExtSet
##
class ExtSet:
    """One-dimensional bucket index over extents [x0, x1).

    The axis is cut into cells of width `gridsize`; cell i covers
    [i*gridsize, (i+1)*gridsize).  An object is stored in every cell its
    extent reaches, so a lookup only has to inspect the cells touched by the
    query extent.
    """

    def __init__(self, gridsize):
        """Create an empty index whose cells are `gridsize` units wide."""
        self.gridsize = gridsize
        # Sparse mapping: cell index -> list of objects registered there.
        # A dict (rather than a list) lets negative and far-apart cell
        # indices be used without pre-allocating anything.
        self.grid = {}
        return

    def cells(self, x0, x1):
        """Yield, in increasing order, the indices of the cells that the
        extent [x0, x1) touches.  Nothing is yielded when x1 <= the start of
        the cell containing x0.
        """
        # Floor division (not truncation) so that negative coordinates,
        # including negative floats, land in the cell that contains them.
        i = int(x0 // self.gridsize)
        x = i * self.gridsize
        while x < x1:
            yield i
            x += self.gridsize
            i += 1
        return

    def add(self, x0, x1, obj):
        """Register `obj` in every cell touched by [x0, x1).

        NOTE(review): the work is proportional to (x1-x0)/gridsize, so an
        unbounded extent would enumerate an impractical number of cells.
        """
        for i in self.cells(x0, x1):
            self.grid.setdefault(i, []).append(obj)
        return

    def get(self, x0, x1):
        """Return the set of objects registered in any cell touched by
        [x0, x1).  Cells that were never written to contribute nothing.
        """
        objs = set()
        for i in self.cells(x0, x1):
            objs.update(self.grid.get(i, ()))
        return objs
def test_extset():
    """Check ExtSet.cells() against hand-computed cell indices for a grid
    whose cells are 10 units wide.
    """
    extset = ExtSet(10)
    # ((x0, x1), expected cell indices), checked in this order.
    expectations = [
        ((-1, 1), [-1, 0]),
        ((0, 1), [0]),
        ((0, 10), [0]),
        ((0, 11), [0, 1]),
        ((1, 11), [0, 1]),
        ((10, 11), [1]),
        ((0, 20), [0, 1]),
        ((10, 20), [1]),
        ((1, 21), [0, 1, 2]),
        ((11, 21), [1, 2]),
    ]
    for ((lo, hi), indices) in expectations:
        assert list(extset.cells(lo, hi)) == indices
    return
## ExtGrid
##
class ExtGrid:
    """Two-dimensional bucket index for rectangle-like objects.

    Each object is indexed twice: by its horizontal extent (x0..x1) in
    `hext` and by its vertical extent (y0..y1) in `vext`.
    """

    def __init__(self, gridsize):
        """Create an empty grid; both axes use cells `gridsize` units wide."""
        self.hext = ExtSet(gridsize)
        self.vext = ExtSet(gridsize)
        return

    def add(self, rect):
        """Register `rect` (anything exposing x0, x1, y0, y1) on both axes."""
        self.hext.add(rect.x0, rect.x1, rect)
        self.vext.add(rect.y0, rect.y1, rect)
        return

    def get(self, rect):
        """Return the set of registered objects that share at least one
        horizontal cell AND at least one vertical cell with `rect`.

        This is a cell-level candidate filter: a returned object is near
        `rect` but is not guaranteed to actually overlap it.
        """
        rects = self.hext.get(rect.x0, rect.x1)
        # Keep only the candidates that also match on the vertical axis.
        rects.intersection_update(self.vext.get(rect.y0, rect.y1))
        return rects

View File

@ -23,13 +23,20 @@ class TextConverter(PDFDevice):
self.outfp.write('\n')
return
def begin_block(self, name, (x0,y0,x1,y1)):
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
# Write the opening <page> tag to self.outfp.  `name` is interpolated with
# %s and the four bounding-box values (unpacked from the second argument)
# with %d, i.e. formatted as integers.
# NOTE(review): `name` is inserted into the attribute without any escaping.
def begin_page(self, name, (x0,y0,x1,y1)):
self.outfp.write('<page name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
(name,x0,y0,x1,y1))
return
def end_block(self):
self.outfp.write('</block>\n')
# Write the closing </page> tag to self.outfp.  The single argument is
# accepted but ignored.
def end_page(self, _):
self.outfp.write('</page>\n')
return
# Write the opening <figure> tag to self.outfp.  `name` is interpolated with
# %s and the four bounding-box values (unpacked from the second argument)
# with %d, i.e. formatted as integers.
# NOTE(review): `name` is inserted into the attribute without any escaping.
def begin_figure(self, name, (x0,y0,x1,y1)):
self.outfp.write('<figure name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
(name,x0,y0,x1,y1))
return
# Write the closing </figure> tag to self.outfp.  The single argument is
# accepted but ignored.
def end_figure(self, _):
self.outfp.write('</figure>\n')
return
def handle_undefined_char(self, cidcoding, cid):
@ -73,6 +80,7 @@ class TextConverter(PDFDevice):
# pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
device = TextConverter(outfp, rsrc, codec)
outfp.write('<document>')
doc = PDFDocument(debug=debug)
fp = file(fname)
parser = PDFParser(doc, fp, debug=debug)
@ -81,6 +89,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
if pages and (i not in pages): continue
interpreter.process_page(page)
fp.close()
outfp.write('</document>')
device.close()
return

View File

@ -189,7 +189,8 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
descriptor = {'FontName':spec.get('Name'),
'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec)
return
@ -442,9 +443,13 @@ class PDFDevice:
self.ctm = ctm
return
def begin_block(self, name, bbox):
# Default page-start hook: accepts a name and a bounding box and does nothing.
def begin_page(self, name, bbox):
return
def end_block(self):
# Default page-end hook: accepts a name and does nothing.
def end_page(self, name):
return
# Default figure-start hook: accepts a name and a bounding box and does
# nothing.
def begin_figure(self, name, bbox):
return
# Default figure-end hook: accepts a name and does nothing.
def end_figure(self, name):
return
def render_string(self, textstate, textmatrix, size, seq):
@ -820,26 +825,23 @@ class PDFPageInterpreter:
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
(x0,y0) = apply_matrix(ctm, (x0,y0))
(x1,y1) = apply_matrix(ctm, (x1,y1))
interpreter.render_contents(xobjid,
(x0,y0,x1,y1),
xobj.dic.get('Resources'),
[xobj],
ctm=ctm)
bbox = (x0,y0,x1,y1)
self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(xobj.dic.get('Resources'),
[xobj], ctm=ctm)
self.device.end_figure(xobjid)
return
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
self.render_contents('page-%d' % page.pageid,
page.mediabox,
page.resources,
page.contents)
self.device.begin_page(page.pageid, page.mediabox)
self.render_contents(page.resources, page.contents)
self.device.end_page(page.pageid)
return
def render_contents(self, contid, mediabox, resources, contents,
ctm=MATRIX_IDENTITY):
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
self.initpage(ctm)
self.device.begin_block(contid, mediabox)
# Handle resource declarations.
def get_colorspace(spec):
if isinstance(spec, list):
@ -874,7 +876,6 @@ class PDFPageInterpreter:
data = ''.join( stream_value(stream).get_data()
for stream in list_value(contents) )
self.execute(data)
self.device.end_block()
return
def execute(self, data):

View File

@ -117,7 +117,7 @@ def str_value(x):
def list_value(x):
x = resolve1(x)
if not isinstance(x, list):
if not (isinstance(x, list) or isinstance(x, tuple)):
raise PDFTypeError('list required: %r' % x)
return x