diff --git a/extent.py b/extent.py
new file mode 100755
index 0000000..67e005d
--- /dev/null
+++ b/extent.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+import sys
+INF = sys.maxint  # stand-in for an unbounded coordinate (sys.maxint is Python 2 only)
+
+
+## Rect
+##
+class Rect:
+    """Axis-aligned rectangle with corners (x0, y0) and (x1, y1).
+
+    Sides left at their defaults extend to -INF / INF.
+    """
+
+    def __init__(self, x0=-INF, y0=-INF, w=None, h=None):
+        """Store origin (x0, y0); x1/y1 are origin + w/h, or INF when None."""
+        self.x0 = x0
+        self.y0 = y0
+        # Identity test: `== None` would dispatch to a custom __eq__ on w/h.
+        self.x1 = INF if w is None else x0 + w
+        self.y1 = INF if h is None else y0 + h
+
+    def overlap(self, rect):
+        """Return True if the interiors intersect; touching edges do not count."""
+        return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
+                    rect.y1 <= self.y0 or self.y1 <= rect.y0)
+
+
+## ExtSet
+##
+class ExtSet:
+    """1-D grid index: each object is filed under every cell its extent touches."""
+
+    def __init__(self, gridsize):
+        self.gridsize = gridsize  # cell width; must be > 0 (cells() steps by it)
+        self.grid = {}  # sparse {cell index: [objects]}; indices may be negative
+
+    def cells(self, x0, x1):
+        """Yield the index of each cell overlapping the half-open range [x0, x1)."""
+        # Floor (not truncate) so negative or float coordinates pick the right cell.
+        i = int(x0 // self.gridsize)
+        x = i * self.gridsize
+        while x < x1:  # NOTE(review): huge loop if a bound is +/-sys.maxint
+            yield i
+            x += self.gridsize
+            i += 1
+
+    def add(self, x0, x1, obj):
+        """File `obj` under every cell overlapping [x0, x1)."""
+        for i in self.cells(x0, x1):
+            self.grid.setdefault(i, []).append(obj)
+
+    def get(self, x0, x1):
+        """Return the set of objects filed under any cell overlapping [x0, x1)."""
+        cells = self.cells(x0, x1)
+        return set(o for i in cells for o in self.grid.get(i, ()))
+
+def test_extset():
+    """Check ExtSet.cells() against hand-computed indices for gridsize 10."""
+    grid = ExtSet(10)
+    # (x0, x1) -> cell indices expected to overlap [x0, x1)
+    expected = [
+        ((-1, 1), [-1, 0]), ((0, 1), [0]), ((0, 10), [0]),
+        ((0, 11), [0, 1]), ((1, 11), [0, 1]), ((10, 11), [1]),
+        ((0, 20), [0, 1]), ((10, 20), [1]),
+        ((1, 21), [0, 1, 2]), ((11, 21), [1, 2]),
+    ]
+    for (x0, x1), cells in expected:
+        assert list(grid.cells(x0, x1)) == cells
+    return
+
+
+## ExtGrid
+##
+class ExtGrid:
+    """2-D index of rects: one ExtSet per axis, intersected at query time."""
+
+    def __init__(self, gridsize):
+        self.hext = ExtSet(gridsize)  # buckets rects by their x extent
+        self.vext = ExtSet(gridsize)  # buckets rects by their y extent
+
+    def add(self, rect):
+        """Index `rect` (needs x0/x1/y0/y1; must be hashable) on both axes."""
+        self.hext.add(rect.x0, rect.x1, rect)
+        self.vext.add(rect.y0, rect.y1, rect)
+
+    def get(self, rect):
+        """Return the set of rects sharing a grid cell with `rect` on both axes."""
+        columns = self.hext.get(rect.x0, rect.x1)
+        return columns & self.vext.get(rect.y0, rect.y1)
diff --git a/pdf2txt.py b/pdf2txt.py
index 5bb24a7..34c4a8c 100755
--- a/pdf2txt.py
+++ b/pdf2txt.py
@@ -23,13 +23,20 @@ class TextConverter(PDFDevice):
self.outfp.write('\n')
return
- def begin_block(self, name, (x0,y0,x1,y1)):
- self.outfp.write('\n' %
+ def begin_page(self, name, (x0,y0,x1,y1)):
+ self.outfp.write('\n' %
(name,x0,y0,x1,y1))
return
-
- def end_block(self):
- self.outfp.write('\n')
+ def end_page(self, _):
+ self.outfp.write('\n')
+ return
+
+ def begin_figure(self, name, (x0,y0,x1,y1)):
+ self.outfp.write('\n')
return
def handle_undefined_char(self, cidcoding, cid):
@@ -73,6 +80,7 @@ class TextConverter(PDFDevice):
# pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
device = TextConverter(outfp, rsrc, codec)
+ outfp.write('')
doc = PDFDocument(debug=debug)
fp = file(fname)
parser = PDFParser(doc, fp, debug=debug)
@@ -81,6 +89,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
if pages and (i not in pages): continue
interpreter.process_page(page)
fp.close()
+ outfp.write('')
device.close()
return
diff --git a/pdfinterp.py b/pdfinterp.py
index 2e48349..952412e 100644
--- a/pdfinterp.py
+++ b/pdfinterp.py
@@ -189,7 +189,8 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
- descriptor = {'FontName':None, 'Ascent':0, 'Descent':0,
+ descriptor = {'FontName':spec.get('Name'),
+ 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec)
return
@@ -442,9 +443,13 @@ class PDFDevice:
self.ctm = ctm
return
- def begin_block(self, name, bbox):
+ def begin_page(self, name, bbox):
return
- def end_block(self):
+ def end_page(self, name):
+ return
+ def begin_figure(self, name, bbox):
+ return
+ def end_figure(self, name):
return
def render_string(self, textstate, textmatrix, size, seq):
@@ -820,26 +825,23 @@ class PDFPageInterpreter:
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
(x0,y0) = apply_matrix(ctm, (x0,y0))
(x1,y1) = apply_matrix(ctm, (x1,y1))
- interpreter.render_contents(xobjid,
- (x0,y0,x1,y1),
- xobj.dic.get('Resources'),
- [xobj],
- ctm=ctm)
+ bbox = (x0,y0,x1,y1)
+ self.device.begin_figure(xobjid, bbox)
+ interpreter.render_contents(xobj.dic.get('Resources'),
+ [xobj], ctm=ctm)
+ self.device.end_figure(xobjid)
return
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
- self.render_contents('page-%d' % page.pageid,
- page.mediabox,
- page.resources,
- page.contents)
+ self.device.begin_page(page.pageid, page.mediabox)
+ self.render_contents(page.resources, page.contents)
+ self.device.end_page(page.pageid)
return
- def render_contents(self, contid, mediabox, resources, contents,
- ctm=MATRIX_IDENTITY):
+ def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
self.initpage(ctm)
- self.device.begin_block(contid, mediabox)
# Handle resource declarations.
def get_colorspace(spec):
if isinstance(spec, list):
@@ -874,7 +876,6 @@ class PDFPageInterpreter:
data = ''.join( stream_value(stream).get_data()
for stream in list_value(contents) )
self.execute(data)
- self.device.end_block()
return
def execute(self, data):
diff --git a/pdfparser.py b/pdfparser.py
index 1c6a7cb..ea0f11c 100755
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -117,7 +117,7 @@ def str_value(x):
def list_value(x):
x = resolve1(x)
- if not isinstance(x, list):
+ if not (isinstance(x, list) or isinstance(x, tuple)):
raise PDFTypeError('list required: %r' % x)
return x