#!/usr/bin/env python
# NOTE: this module is the extent.py payload of a whitespace-collapsed patch.
# Patch header it was carried under:
#   diff --git a/extent.py b/extent.py
#   new file mode 100755
#   index 0000000..67e005d
#   --- /dev/null
#   +++ b/extent.py
#   @@ -0,0 +1,89 @@
import sys

# Integer stand-in for "unbounded".  sys.maxint only exists on Python 2;
# fall back to sys.maxsize where it is missing.
INF = getattr(sys, 'maxint', sys.maxsize)


##  Rect
##
class Rect:

    """Axis-aligned rectangle described by its corners (x0,y0)-(x1,y1).

    x0, y0 -- lower corner; default to -INF.
    w, h   -- width and height; when None the matching upper edge
              (x1 or y1) is set to INF instead of x0+w / y0+h.
    """

    def __init__(self, x0=-INF, y0=-INF, w=None, h=None):
        self.x0 = x0
        self.y0 = y0
        # Identity test: a width/height of 0 is a real value, only None
        # means "not given".
        if w is None:
            self.x1 = INF
        else:
            self.x1 = x0+w
        if h is None:
            self.y1 = INF
        else:
            self.y1 = y0+h

    def overlap(self, rect):
        """Return True if this rectangle and rect overlap.

        Rectangles that merely touch along an edge are not overlapping.
        """
        return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
                    rect.y1 <= self.y0 or self.y1 <= rect.y0)


##  ExtSet
##
class ExtSet:

    """One-dimensional index mapping extents to fixed-size cells.

    Cell i covers the coordinates [i*gridsize, (i+1)*gridsize).
    Extents are walked cell by cell, so indexing an extent that reaches
    INF is impractical.
    """

    def __init__(self, gridsize):
        """gridsize -- width of one cell; must be positive.

        Raises ValueError for a zero or negative gridsize, which would
        otherwise divide by zero or never terminate in cells().
        """
        if gridsize <= 0:
            raise ValueError('gridsize must be positive: %r' % (gridsize,))
        self.gridsize = gridsize
        # cell index -> list of objects.  A dict (not a list) so that
        # arbitrary and negative cell indices work without pre-sizing.
        self.grid = {}

    def cells(self, x0, x1):
        """Yield the indices of every cell touched by the extent x0..x1."""
        # Floor division so that negative and float coordinates land in
        # the cell that actually contains them (int() truncates toward 0).
        i = int(x0 // self.gridsize)
        x = i * self.gridsize
        while x < x1:
            yield i
            x += self.gridsize
            i += 1

    def add(self, x0, x1, obj):
        """Register obj in every cell touched by the extent x0..x1."""
        for i in self.cells(x0, x1):
            self.grid.setdefault(i, []).append(obj)

    def get(self, x0, x1):
        """Return the set of objects registered in cells touched by x0..x1."""
        objs = set()
        for i in self.cells(x0, x1):
            objs.update(self.grid.get(i, ()))
        return objs


def test_extset():
    e = ExtSet(10)
    assert list(e.cells(-1, 1)) == [-1, 0]
    assert list(e.cells(0, 1)) == [0]
    assert list(e.cells(0, 10)) == [0]
    assert list(e.cells(0, 11)) == [0, 1]
    assert list(e.cells(1, 11)) == [0, 1]
    assert list(e.cells(10, 11)) == [1]
    assert list(e.cells(0, 20)) == [0, 1]
    assert list(e.cells(10, 20)) == [1]
    assert list(e.cells(1, 21)) == [0, 1, 2]
    assert list(e.cells(11, 21)) == [1, 2]


##  ExtGrid
##
class ExtGrid:

    """Two-dimensional index built from one ExtSet per axis."""

    def __init__(self, gridsize):
        self.hext = ExtSet(gridsize)
        self.vext = ExtSet(gridsize)

    def add(self, rect):
        """Index rect by its horizontal and its vertical extent."""
        self.hext.add(rect.x0, rect.x1, rect)
        self.vext.add(rect.y0, rect.y1, rect)

    def get(self, rect):
        """Return the set of indexed rects sharing cells with rect on both axes.

        The result is a candidate set at cell granularity; this method
        does not call Rect.overlap() on the candidates.
        """
        rects = self.hext.get(rect.x0, rect.x1)
        # Keep only those that also share a vertical cell.
        rects.intersection_update(self.vext.get(rect.y0, rect.y1))
        return rects


# Remainder of the original patch line (start of the next file's hunk,
# which continues on the following line):
#   diff --git a/pdf2txt.py b/pdf2txt.py
#   index 5bb24a7..34c4a8c 100755
#   --- a/pdf2txt.py
#   +++ b/pdf2txt.py
#   @@ -23,13 +23,20 @@ class
TextConverter(PDFDevice): self.outfp.write('\n') return - def begin_block(self, name, (x0,y0,x1,y1)): - self.outfp.write('\n' % + def begin_page(self, name, (x0,y0,x1,y1)): + self.outfp.write('\n' % (name,x0,y0,x1,y1)) return - - def end_block(self): - self.outfp.write('\n') + def end_page(self, _): + self.outfp.write('\n') + return + + def begin_figure(self, name, (x0,y0,x1,y1)): + self.outfp.write('
\n' % + (name,x0,y0,x1,y1)) + return + def end_figure(self, _): + self.outfp.write('
\n') return def handle_undefined_char(self, cidcoding, cid): @@ -73,6 +80,7 @@ class TextConverter(PDFDevice): # pdf2txt def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): device = TextConverter(outfp, rsrc, codec) + outfp.write('') doc = PDFDocument(debug=debug) fp = file(fname) parser = PDFParser(doc, fp, debug=debug) @@ -81,6 +89,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): if pages and (i not in pages): continue interpreter.process_page(page) fp.close() + outfp.write('') device.close() return diff --git a/pdfinterp.py b/pdfinterp.py index 2e48349..952412e 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -189,7 +189,8 @@ class PDFType3Font(PDFSimpleFont): if 'FontDescriptor' in spec: descriptor = dict_value(spec['FontDescriptor']) else: - descriptor = {'FontName':None, 'Ascent':0, 'Descent':0, + descriptor = {'FontName':spec.get('Name'), + 'Ascent':0, 'Descent':0, 'FontBBox':spec['FontBBox']} PDFSimpleFont.__init__(self, descriptor, widths, spec) return @@ -442,9 +443,13 @@ class PDFDevice: self.ctm = ctm return - def begin_block(self, name, bbox): + def begin_page(self, name, bbox): return - def end_block(self): + def end_page(self, name): + return + def begin_figure(self, name, bbox): + return + def end_figure(self, name): return def render_string(self, textstate, textmatrix, size, seq): @@ -820,26 +825,23 @@ class PDFPageInterpreter: ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm) (x0,y0) = apply_matrix(ctm, (x0,y0)) (x1,y1) = apply_matrix(ctm, (x1,y1)) - interpreter.render_contents(xobjid, - (x0,y0,x1,y1), - xobj.dic.get('Resources'), - [xobj], - ctm=ctm) + bbox = (x0,y0,x1,y1) + self.device.begin_figure(xobjid, bbox) + interpreter.render_contents(xobj.dic.get('Resources'), + [xobj], ctm=ctm) + self.device.end_figure(xobjid) return def process_page(self, page): if 1 <= self.debug: print >>stderr, 'Processing page: %r' % page - self.render_contents('page-%d' % page.pageid, - page.mediabox, - page.resources, - 
page.contents) + self.device.begin_page(page.pageid, page.mediabox) + self.render_contents(page.resources, page.contents) + self.device.end_page(page.pageid) return - def render_contents(self, contid, mediabox, resources, contents, - ctm=MATRIX_IDENTITY): + def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY): self.initpage(ctm) - self.device.begin_block(contid, mediabox) # Handle resource declarations. def get_colorspace(spec): if isinstance(spec, list): @@ -874,7 +876,6 @@ class PDFPageInterpreter: data = ''.join( stream_value(stream).get_data() for stream in list_value(contents) ) self.execute(data) - self.device.end_block() return def execute(self, data): diff --git a/pdfparser.py b/pdfparser.py index 1c6a7cb..ea0f11c 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -117,7 +117,7 @@ def str_value(x): def list_value(x): x = resolve1(x) - if not isinstance(x, list): + if not (isinstance(x, list) or isinstance(x, tuple)): raise PDFTypeError('list required: %r' % x) return x