git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@14 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
da778dee6f
commit
dc77b838f7
|
@ -0,0 +1,89 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
# Largest native integer, used below as a stand-in for "infinity".
# sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere
# so the module can still be imported.
try:
    INF = sys.maxint
except AttributeError:
    INF = sys.maxsize


##  Rect
##
class Rect:
    """Axis-aligned rectangle stored as two corners.

    Attributes:
      x0, y0: the corner given by the caller (default: -INF each).
      x1, y1: the opposite corner, computed as x0+w and y0+h. When w
        (or h) is omitted the rectangle is unbounded on that side and
        the coordinate is set to INF.
    """

    def __init__(self, x0=-INF, y0=-INF, w=None, h=None):
        """Build a rectangle from an origin and an optional width/height.

        Args:
          x0, y0: origin corner.
          w: width, or None for "extends to INF".
          h: height, or None for "extends to INF".
        """
        self.x0 = x0
        self.y0 = y0
        # Identity test on purpose: 0 is a legitimate width/height and
        # objects with a custom __eq__ must not be mistaken for None.
        if w is None:
            self.x1 = INF
        else:
            self.x1 = x0+w
        if h is None:
            self.y1 = INF
        else:
            self.y1 = y0+h
        return

    def __repr__(self):
        return '<Rect: (%r,%r)-(%r,%r)>' % (self.x0, self.y0, self.x1, self.y1)

    def overlap(self, rect):
        """Return True if this rectangle and `rect` share interior area.

        Rectangles that merely touch along an edge or at a corner are
        NOT considered overlapping (the comparisons are `<=`).
        """
        return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
                    rect.y1 <= self.y0 or self.y1 <= rect.y0)
|
||||||
|
|
||||||
|
|
||||||
|
## ExtSet
|
||||||
|
##
|
||||||
|
class ExtSet:
    """One-dimensional bucket index over extents (x0, x1).

    The axis is divided into cells of width `gridsize`; cell `i` covers
    [i*gridsize, (i+1)*gridsize). An object added with an extent is
    stored in every cell that extent touches, so a later lookup only
    has to inspect the cells of the queried extent.
    """

    def __init__(self, gridsize):
        """Create an empty index whose cells are `gridsize` wide."""
        self.gridsize = gridsize
        # Sparse mapping {cell index: [objects]}. A dict rather than a
        # list because cell indices are arbitrary (possibly negative)
        # integers and most cells are never populated.
        self.grid = {}
        return

    def cells(self, x0, x1):
        """Yield the indices of the cells touched by the extent [x0, x1).

        The first cell is the one containing x0; cells are produced
        while their left edge is strictly below x1.
        """
        # Floor division (not truncation) so that negative coordinates,
        # including negative floats, map to the cell on their left.
        i = int(x0 // self.gridsize)
        x = i * self.gridsize
        while x < x1:
            yield i
            x += self.gridsize
            i += 1
        return

    def add(self, x0, x1, obj):
        """Register `obj` in every cell touched by the extent [x0, x1)."""
        for i in self.cells(x0, x1):
            self.grid.setdefault(i, []).append(obj)
        return

    def get(self, x0, x1):
        """Return the set of objects stored in any cell touched by [x0, x1).

        Cells that were never populated contribute nothing.
        """
        objs = set()
        for i in self.cells(x0, x1):
            objs.update(self.grid.get(i, ()))
        return objs
|
||||||
|
|
||||||
|
def test_extset():
    """Check ExtSet.cells() against hand-computed cell indices (gridsize 10)."""
    extset = ExtSet(10)
    # Each entry: ((x0, x1) passed to cells(), expected list of cell indices).
    expectations = [
        ((-1, 1), [-1,0]),
        ((0, 1), [0]),
        ((0, 10), [0]),
        ((0, 11), [0,1]),
        ((1, 11), [0,1]),
        ((10, 11), [1]),
        ((0, 20), [0,1]),
        ((10, 20), [1]),
        ((1, 21), [0,1,2]),
        ((11, 21), [1,2]),
    ]
    for (span, expected) in expectations:
        (lo, hi) = span
        assert list(extset.cells(lo, hi)) == expected
    return
|
||||||
|
|
||||||
|
|
||||||
|
## ExtGrid
|
||||||
|
##
|
||||||
|
class ExtGrid:
    """Two-dimensional lookup of rectangles built from two ExtSet indexes.

    `hext` indexes rectangles by their horizontal extent (x0, x1) and
    `vext` by their vertical extent (y0, y1).
    """

    def __init__(self, gridsize):
        """Create an empty grid; both axes use cells of size `gridsize`."""
        self.hext = ExtSet(gridsize)
        self.vext = ExtSet(gridsize)
        return

    def add(self, rect):
        """Register `rect` (an object with x0, x1, y0, y1) on both axes."""
        self.hext.add(rect.x0, rect.x1, rect)
        self.vext.add(rect.y0, rect.y1, rect)
        return

    def get(self, rect):
        """Return the set of registered objects found on BOTH axes for `rect`.

        The result is the intersection of the horizontal lookup for
        (rect.x0, rect.x1) and the vertical lookup for (rect.y0, rect.y1).
        """
        rects = self.hext.get(rect.x0, rect.x1)
        # set has no `update_intersect`; the in-place intersection is
        # spelled `intersection_update`.
        rects.intersection_update(self.vext.get(rect.y0, rect.y1))
        return rects
|
19
pdf2txt.py
19
pdf2txt.py
|
@ -23,13 +23,20 @@ class TextConverter(PDFDevice):
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_block(self, name, (x0,y0,x1,y1)):
|
def begin_page(self, name, (x0,y0,x1,y1)):
|
||||||
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
self.outfp.write('<page name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||||
(name,x0,y0,x1,y1))
|
(name,x0,y0,x1,y1))
|
||||||
return
|
return
|
||||||
|
def end_page(self, _):
|
||||||
def end_block(self):
|
self.outfp.write('</page>\n')
|
||||||
self.outfp.write('</block>\n')
|
return
|
||||||
|
|
||||||
|
def begin_figure(self, name, (x0,y0,x1,y1)):
|
||||||
|
self.outfp.write('<figure name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||||
|
(name,x0,y0,x1,y1))
|
||||||
|
return
|
||||||
|
def end_figure(self, _):
|
||||||
|
self.outfp.write('</figure>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
|
@ -73,6 +80,7 @@ class TextConverter(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
device = TextConverter(outfp, rsrc, codec)
|
device = TextConverter(outfp, rsrc, codec)
|
||||||
|
outfp.write('<document>')
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
|
@ -81,6 +89,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
if pages and (i not in pages): continue
|
if pages and (i not in pages): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
fp.close()
|
fp.close()
|
||||||
|
outfp.write('</document>')
|
||||||
device.close()
|
device.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
33
pdfinterp.py
33
pdfinterp.py
|
@ -189,7 +189,8 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
if 'FontDescriptor' in spec:
|
if 'FontDescriptor' in spec:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
else:
|
else:
|
||||||
descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
|
descriptor = {'FontName':spec.get('Name'),
|
||||||
|
'Ascent':0, 'Descent':0,
|
||||||
'FontBBox':spec['FontBBox']}
|
'FontBBox':spec['FontBBox']}
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
return
|
return
|
||||||
|
@ -442,9 +443,13 @@ class PDFDevice:
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_block(self, name, bbox):
|
def begin_page(self, name, bbox):
|
||||||
return
|
return
|
||||||
def end_block(self):
|
def end_page(self, name):
|
||||||
|
return
|
||||||
|
def begin_figure(self, name, bbox):
|
||||||
|
return
|
||||||
|
def end_figure(self, name):
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq):
|
def render_string(self, textstate, textmatrix, size, seq):
|
||||||
|
@ -820,26 +825,23 @@ class PDFPageInterpreter:
|
||||||
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
||||||
(x0,y0) = apply_matrix(ctm, (x0,y0))
|
(x0,y0) = apply_matrix(ctm, (x0,y0))
|
||||||
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||||
interpreter.render_contents(xobjid,
|
bbox = (x0,y0,x1,y1)
|
||||||
(x0,y0,x1,y1),
|
self.device.begin_figure(xobjid, bbox)
|
||||||
xobj.dic.get('Resources'),
|
interpreter.render_contents(xobj.dic.get('Resources'),
|
||||||
[xobj],
|
[xobj], ctm=ctm)
|
||||||
ctm=ctm)
|
self.device.end_figure(xobjid)
|
||||||
return
|
return
|
||||||
|
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing page: %r' % page
|
print >>stderr, 'Processing page: %r' % page
|
||||||
self.render_contents('page-%d' % page.pageid,
|
self.device.begin_page(page.pageid, page.mediabox)
|
||||||
page.mediabox,
|
self.render_contents(page.resources, page.contents)
|
||||||
page.resources,
|
self.device.end_page(page.pageid)
|
||||||
page.contents)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_contents(self, contid, mediabox, resources, contents,
|
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
||||||
ctm=MATRIX_IDENTITY):
|
|
||||||
self.initpage(ctm)
|
self.initpage(ctm)
|
||||||
self.device.begin_block(contid, mediabox)
|
|
||||||
# Handle resource declarations.
|
# Handle resource declarations.
|
||||||
def get_colorspace(spec):
|
def get_colorspace(spec):
|
||||||
if isinstance(spec, list):
|
if isinstance(spec, list):
|
||||||
|
@ -874,7 +876,6 @@ class PDFPageInterpreter:
|
||||||
data = ''.join( stream_value(stream).get_data()
|
data = ''.join( stream_value(stream).get_data()
|
||||||
for stream in list_value(contents) )
|
for stream in list_value(contents) )
|
||||||
self.execute(data)
|
self.execute(data)
|
||||||
self.device.end_block()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def execute(self, data):
|
def execute(self, data):
|
||||||
|
|
|
@ -117,7 +117,7 @@ def str_value(x):
|
||||||
|
|
||||||
def list_value(x):
|
def list_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, list):
|
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||||
raise PDFTypeError('list required: %r' % x)
|
raise PDFTypeError('list required: %r' % x)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue