git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@14 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
da778dee6f
commit
dc77b838f7
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
# Sentinel for an "unbounded" coordinate.
# sys.maxint only exists on Python 2; sys.maxsize is the closest
# equivalent on interpreters that no longer provide it.
try:
    INF = sys.maxint
except AttributeError:
    INF = sys.maxsize
|
||||
|
||||
|
||||
## Rect
|
||||
##
|
||||
class Rect:

    """Axis-aligned rectangle stored as the corners (x0, y0) and (x1, y1).

    The constructor takes an origin plus an optional width and height.
    An omitted width (or height) leaves the far edge at INF, and the
    default origin is -INF on both axes.
    """

    def __init__(self, x0=-INF, y0=-INF, w=None, h=None):
        """Create a rectangle from its origin and optional extent.

        x0, y0 -- coordinates of the near corner (default: -INF).
        w, h   -- width and height; None means the far edge is INF.
        """
        self.x0 = x0
        self.y0 = y0
        # Identity test: None is a singleton, and "== None" could be
        # fooled by an argument with a custom __eq__.
        if w is None:
            self.x1 = INF
        else:
            self.x1 = x0+w
        if h is None:
            self.y1 = INF
        else:
            self.y1 = y0+h
        return

    def overlap(self, rect):
        """Return True if this rectangle and rect share some area.

        The comparisons are non-strict, so rectangles that only touch
        along an edge or at a corner are reported as not overlapping.
        """
        return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
                    rect.y1 <= self.y0 or self.y1 <= rect.y0)
|
||||
|
||||
|
||||
## ExtSet
|
||||
##
|
||||
class ExtSet:

    """One-dimensional bucket index for objects that occupy an interval.

    The axis is cut into cells of width gridsize; cell i covers
    [i*gridsize, (i+1)*gridsize).  Each object is recorded in every cell
    that its interval touches, so a lookup only has to visit the cells
    touched by the queried interval.
    """

    def __init__(self, gridsize):
        """Create an empty index whose cells are gridsize units wide.

        Raises ValueError if gridsize is not positive, because cells()
        could never terminate (or would divide by zero) otherwise.
        """
        if gridsize <= 0:
            raise ValueError('gridsize must be positive: %r' % (gridsize,))
        self.gridsize = gridsize
        # Maps cell index -> list of objects.  A dict is used because
        # cell indices are sparse and may be negative; a plain list
        # cannot be indexed by them.
        self.grid = {}
        return

    def cells(self, x0, x1):
        """Yield, in increasing order, the indices of the cells touched
        by the interval starting at x0 and ending before x1.

        The cell containing x0 is found by flooring, and every following
        cell whose start lies below x1 is included.  The number of cells
        produced grows with (x1-x0)/gridsize.
        """
        size = self.gridsize
        # Floor division rounds toward negative infinity, so negative
        # coordinates land in the correct cell for ints and floats alike.
        i = int(x0 // size)
        # The cell start is recomputed from the index each time so that
        # floating point error does not accumulate.
        while i * size < x1:
            yield i
            i += 1
        return

    def add(self, x0, x1, obj):
        """Record obj in every cell touched by the interval x0..x1."""
        for i in self.cells(x0, x1):
            self.grid.setdefault(i, []).append(obj)
        return

    def get(self, x0, x1):
        """Return the set of objects recorded in any cell touched by the
        interval x0..x1.

        The result is cell-granular: an object is returned when it shares
        a cell with the interval, even if the two do not actually
        intersect.  Objects must be hashable.
        """
        objs = set()
        for i in self.cells(x0, x1):
            # Cells that never received an object contribute nothing.
            objs.update(self.grid.get(i, ()))
        return objs
|
||||
|
||||
def test_extset():
    """Check ExtSet.cells() against hand-computed cell indices for a
    grid of size 10."""
    grid = ExtSet(10)
    # ((x0, x1), expected cell indices), verified in this order.
    expectations = (
        ((-1, 1), [-1, 0]),
        ((0, 1), [0]),
        ((0, 10), [0]),
        ((0, 11), [0, 1]),
        ((1, 11), [0, 1]),
        ((10, 11), [1]),
        ((0, 20), [0, 1]),
        ((10, 20), [1]),
        ((1, 21), [0, 1, 2]),
        ((11, 21), [1, 2]),
    )
    for ((lo, hi), wanted) in expectations:
        assert list(grid.cells(lo, hi)) == wanted
    return
|
||||
|
||||
|
||||
## ExtGrid
|
||||
##
|
||||
class ExtGrid:

    """Two-dimensional bucket index for rectangles.

    Keeps one ExtSet per axis: hext indexes the horizontal extent
    (x0..x1) and vext the vertical extent (y0..y1) of each rectangle.
    """

    def __init__(self, gridsize):
        """Create an empty index; both axes use cells of width gridsize."""
        self.hext = ExtSet(gridsize)
        self.vext = ExtSet(gridsize)
        return

    def add(self, rect):
        """Register rect under both its horizontal and vertical extent.

        rect must provide the attributes x0, x1, y0 and y1.
        """
        self.hext.add(rect.x0, rect.x1, rect)
        self.vext.add(rect.y0, rect.y1, rect)
        return

    def get(self, rect):
        """Return the set of registered rectangles that are candidates
        for intersecting rect.

        A rectangle qualifies when it is returned by the lookup on both
        axes.  The result is only as precise as those lookups, so callers
        needing an exact answer should still test each candidate.
        """
        rects = self.hext.get(rect.x0, rect.x1)
        # Keep only rectangles found on both axes.  The set method is
        # intersection_update; "update_intersect" does not exist and
        # raised AttributeError on every call.
        rects.intersection_update(self.vext.get(rect.y0, rect.y1))
        return rects
|
17
pdf2txt.py
17
pdf2txt.py
|
@ -23,13 +23,20 @@ class TextConverter(PDFDevice):
|
|||
self.outfp.write('\n')
|
||||
return
|
||||
|
||||
def begin_block(self, name, (x0,y0,x1,y1)):
|
||||
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||
def begin_page(self, name, (x0,y0,x1,y1)):
|
||||
self.outfp.write('<page name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||
(name,x0,y0,x1,y1))
|
||||
return
|
||||
def end_page(self, _):
|
||||
self.outfp.write('</page>\n')
|
||||
return
|
||||
|
||||
def end_block(self):
|
||||
self.outfp.write('</block>\n')
|
||||
def begin_figure(self, name, (x0,y0,x1,y1)):
|
||||
self.outfp.write('<figure name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||
(name,x0,y0,x1,y1))
|
||||
return
|
||||
def end_figure(self, _):
|
||||
self.outfp.write('</figure>\n')
|
||||
return
|
||||
|
||||
def handle_undefined_char(self, cidcoding, cid):
|
||||
|
@ -73,6 +80,7 @@ class TextConverter(PDFDevice):
|
|||
# pdf2txt
|
||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||
device = TextConverter(outfp, rsrc, codec)
|
||||
outfp.write('<document>')
|
||||
doc = PDFDocument(debug=debug)
|
||||
fp = file(fname)
|
||||
parser = PDFParser(doc, fp, debug=debug)
|
||||
|
@ -81,6 +89,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
|||
if pages and (i not in pages): continue
|
||||
interpreter.process_page(page)
|
||||
fp.close()
|
||||
outfp.write('</document>')
|
||||
device.close()
|
||||
return
|
||||
|
||||
|
|
33
pdfinterp.py
33
pdfinterp.py
|
@ -189,7 +189,8 @@ class PDFType3Font(PDFSimpleFont):
|
|||
if 'FontDescriptor' in spec:
|
||||
descriptor = dict_value(spec['FontDescriptor'])
|
||||
else:
|
||||
descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
|
||||
descriptor = {'FontName':spec.get('Name'),
|
||||
'Ascent':0, 'Descent':0,
|
||||
'FontBBox':spec['FontBBox']}
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
return
|
||||
|
@ -442,9 +443,13 @@ class PDFDevice:
|
|||
self.ctm = ctm
|
||||
return
|
||||
|
||||
def begin_block(self, name, bbox):
|
||||
def begin_page(self, name, bbox):
|
||||
return
|
||||
def end_block(self):
|
||||
def end_page(self, name):
|
||||
return
|
||||
def begin_figure(self, name, bbox):
|
||||
return
|
||||
def end_figure(self, name):
|
||||
return
|
||||
|
||||
def render_string(self, textstate, textmatrix, size, seq):
|
||||
|
@ -820,26 +825,23 @@ class PDFPageInterpreter:
|
|||
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
||||
(x0,y0) = apply_matrix(ctm, (x0,y0))
|
||||
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||
interpreter.render_contents(xobjid,
|
||||
(x0,y0,x1,y1),
|
||||
xobj.dic.get('Resources'),
|
||||
[xobj],
|
||||
ctm=ctm)
|
||||
bbox = (x0,y0,x1,y1)
|
||||
self.device.begin_figure(xobjid, bbox)
|
||||
interpreter.render_contents(xobj.dic.get('Resources'),
|
||||
[xobj], ctm=ctm)
|
||||
self.device.end_figure(xobjid)
|
||||
return
|
||||
|
||||
def process_page(self, page):
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Processing page: %r' % page
|
||||
self.render_contents('page-%d' % page.pageid,
|
||||
page.mediabox,
|
||||
page.resources,
|
||||
page.contents)
|
||||
self.device.begin_page(page.pageid, page.mediabox)
|
||||
self.render_contents(page.resources, page.contents)
|
||||
self.device.end_page(page.pageid)
|
||||
return
|
||||
|
||||
def render_contents(self, contid, mediabox, resources, contents,
|
||||
ctm=MATRIX_IDENTITY):
|
||||
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
||||
self.initpage(ctm)
|
||||
self.device.begin_block(contid, mediabox)
|
||||
# Handle resource declarations.
|
||||
def get_colorspace(spec):
|
||||
if isinstance(spec, list):
|
||||
|
@ -874,7 +876,6 @@ class PDFPageInterpreter:
|
|||
data = ''.join( stream_value(stream).get_data()
|
||||
for stream in list_value(contents) )
|
||||
self.execute(data)
|
||||
self.device.end_block()
|
||||
return
|
||||
|
||||
def execute(self, data):
|
||||
|
|
|
@ -117,7 +117,7 @@ def str_value(x):
|
|||
|
||||
def list_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, list):
|
||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||
raise PDFTypeError('list required: %r' % x)
|
||||
return x
|
||||
|
||||
|
|
Loading…
Reference in New Issue