diff --git a/cmap.py b/cmap.py index 5ef99fc..9251497 100644 --- a/cmap.py +++ b/cmap.py @@ -241,7 +241,7 @@ class CMapParser(PSStackParser): if name == 'def': try: ((_,k),(_,v)) = self.pop(2) - self.cmap.attrs[str(k)] = v + self.cmap.attrs[literal_name(k)] = v except PSSyntaxError: pass return diff --git a/extent.py b/extent.py index 74d08b7..5df7f0f 100755 --- a/extent.py +++ b/extent.py @@ -7,35 +7,38 @@ INF = sys.maxint ## class Rect: - def __init__(self, x0=-INF, y0=-INF, w=None, h=None): - self.x0 = x0 - self.y0 = y0 - if w == None: + def __init__(self, x=-INF, y=-INF, width=None, height=None): + self.x0 = x + self.y0 = y + if width == None: self.x1 = INF else: - self.x1 = x0+w - if h == None: + self.x1 = x+width + if height == None: self.y1 = INF else: - self.y1 = y0+h + self.y1 = y+height return def __repr__(self): - return '' % (self.x0, self.y0, self.x1, self.y1) + return '' % (self.x0, self.y0, self.x1-self.x0, self.y1-self.y0) def overlap(self, rect): return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or rect.y1 <= self.y0 or self.y1 <= rect.y0) -## ExtSet +## ExtGrid ## -class ExtSet: +class ExtGrid: def __init__(self, gridsize): self.gridsize = gridsize - self.grid = {} + self.gridy = {} return + + def __repr__(self): + return '' % (self.gridsize, self.gridy) def cells(self, x0, x1): i = int(x0 / self.gridsize) @@ -46,25 +49,41 @@ class ExtSet: i += 1 return - def add(self, x0, x1, obj): - for i in self.cells(x0, x1): - if i not in self.grid: - a = [] - self.grid[i] = a + def add(self, rect, obj): + if isinstance(rect, tuple): rect = Rect(*rect) + xcells = list(self.cells(rect.x0, rect.x1)) + for y in self.cells(rect.y0, rect.y1): + if y not in self.gridy: + gridx = {} + self.gridy[y] = gridx else: - a = self.grid[i] - a.append(obj) + gridx = self.gridy[y] + for x in xcells: + assert isinstance(gridx, dict), gridx + if x not in gridx: + objs = [] + gridx[x] = objs + else: + objs = gridx[x] + objs.append((rect, obj)) + assert isinstance(gridx, dict), gridx return - def get(self, x0, x1): + def get(self, rect): + if isinstance(rect, tuple): rect = Rect(*rect) objs = set() - for i in self.cells(x0, x1): - if i in self.grid: - objs.update(self.grid[i]) + xcells = list(self.cells(rect.x0, rect.x1)) + for y in self.cells(rect.y0, rect.y1): + if y not in self.gridy: continue + gridx = self.gridy[y] + for x in xcells: + if x not in gridx: continue + objs.update( obj for (r,obj) in gridx[x] if rect.overlap(r) ) return objs -def test_extset(): - e=ExtSet(10) + +if __name__ == '__main__': + e = ExtGrid(10) assert list(e.cells(-1, 1)) == [-1,0] assert list(e.cells(0, 1)) == [0] assert list(e.cells(0, 10)) == [0] @@ -75,25 +94,10 @@ def test_extset(): assert list(e.cells(10, 20)) == [1] assert list(e.cells(1,21)) == [0,1,2] assert list(e.cells(11,21)) == [1,2] - return - - -## ExtGrid -## -class ExtGrid: - - def __init__(self, gridsize): - self.hext = ExtSet(gridsize) - self.vext = ExtSet(gridsize) - return - - def add(self, rect, obj): - self.hext.add(rect.x0, rect.x1, obj) - self.vext.add(rect.y0, rect.y1, obj) - return - - def get(self, rect, getrect): - objs = self.hext.get(rect.x0, rect.x1) - objs.intersection_update(self.vext.get(rect.y0, rect.y1)) - objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ] - return objs + e.add((0,0,10,10), 'a') + e.add((10,10,10,10), 'b') + e.add((5,5,5,10), 'c') + assert sorted(e.get((0,0,1,1))) == ['a'] + assert sorted(e.get((10,10,1,1))) == ['b'] + assert sorted(e.get((5,10,10,10))) == ['b','c'] + assert sorted(e.get((5,5,10,10))) == ['a','b','c'] diff --git a/pdf2txt.py b/pdf2txt.py index a29bee8..66dffd2 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -28,7 +28,7 @@ class PageItem: return def dump(self, outfp, codec): - bbox = '%d,%d,%d,%d' % self.bbox + bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox outfp.write('\n' % (self.id, bbox, self.rotate)) for obj in self.objs: @@ -45,7 +45,7 @@ class FigureItem(PageItem): return ('
' % (self.id, self.bbox)) def dump(self, outfp, codec): - bbox = '%d,%d,%d,%d' % self.bbox + bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox outfp.write('
\n' % (self.id, bbox)) for obj in self.objs: obj.dump(outfp, codec) @@ -86,9 +86,9 @@ class TextItem: def e(x): x = x.replace('&','&').replace('>','>').replace('<','<') return x.encode(codec, 'xmlcharrefreplace') - (a,b,c,d,tx,ty) = self.matrix - outfp.write('' % - (tx, ty, e(self.font.fontname), self.size, self.width)) + bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox + outfp.write('' % + (e(self.font.fontname), self.direction, bbox, self.size)) outfp.write(e(self.text)) outfp.write('\n') return diff --git a/sgml.py b/sgml.py new file mode 100755 index 0000000..321c88d --- /dev/null +++ b/sgml.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +import sys, sgmllib +__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ] + +def fixed(x): + return int(float(x)*1000) +def getbbox(s): + (a,b,c,d) = s.split(',') + return (fixed(a),fixed(b),fixed(c),fixed(d)) + + +## Document +## +class Document: + + def __init__(self): + self.pages = [] + return + + def __repr__(self): + return '' % self.pages + + def get_pages(self): + return self.pages + + def add_page(self, page): + self.pages.append(page) + return + + def add_text(self, text): + self.pages[-1].add_text(text) + return + + +## Page +## +class Page: + + def __init__(self, pageid, bbox, rotate): + self.pageid = pageid + self.bbox = bbox + self.rotate = rotate + self.texts = [] + return + + def __repr__(self): + return '' % (self.pageid, self.texts) + + def get_texts(self): + return self.texts + + def add_text(self, text): + self.texts.append(text) + return + + +## Text +## +class Text: + + def __init__(self, font, direction, bbox, size): + self.font = font + self.direction = direction + self.bbox = bbox + self.size = size + self.data = '' + return + + def __repr__(self): + return '' % (self.data) + + def add_data(self, data): + self.data += data + return + + +## PDFSGMLParser +## +class PDFSGMLParser(sgmllib.SGMLParser): + + def __init__(self, doc): + sgmllib.SGMLParser.__init__(self) + self.doc = doc + self.curtext = None + return + + def start_document(self, attrs): + return + def end_document(self): + return + + def start_page(self, attrs): + attrs = dict(attrs) + pageid = attrs['id'] + bbox = getbbox(attrs['bbox']) + rotate = int(attrs['rotate']) + page = Page(pageid, bbox, rotate) + self.doc.add_page(page) + return + def end_page(self): + return + + def start_text(self, attrs): + attrs = dict(attrs) + font = attrs['font'] + direction = attrs['direction'] + bbox = getbbox(attrs['bbox']) + size = fixed(attrs['size']) + text = Text(font, direction, bbox, size) + self.curtext = text + return + def end_text(self): + assert self.curtext + self.doc.add_text(self.curtext) + self.curtext = None + return + + def handle_data(self, data): + if not self.curtext: return + self.curtext.add_data(data) + return + + def feedfile(self, fp, encoding='utf-8'): + for line in fp: + line = unicode(line, encoding, 'ignore') + self.feed(line) + return + + +# main +def main(argv): + import getopt + def usage(): + print 'usage: %s [-d] [-c encoding] [file ...]' % argv[0] + return 100 + try: + (opts, args) = getopt.getopt(argv[1:], 'dc:') + except getopt.GetoptError: + return usage() + encoding = 'utf-8' + for (k, v) in opts: + if k == '-d': debug += 1 + elif k == '-c': encoding = v + for fname in args: + doc = Document() + parser = PDFSGMLParser(doc) + parser.feedfile(fname, encoding) + parser.close() + print doc + return 0 + +if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/viewpdf.py b/viewpdf.py new file mode 100755 index 0000000..ea69864 --- /dev/null +++ b/viewpdf.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +import sys +from sgml import PDFSGMLParser, Document +stdout = sys.stdout +stderr = sys.stderr +try: + import pygame + from pygame.locals import * +except ImportError: + print >>stderr, 'you need pygame' + sys.exit(111) + + +def scale(x): + return int(x*0.002) + + +## FontManager +## +class FontManager: + + fonts = {} + default_font = '/Library/Fonts/Vera.ttf' + #default_font = '/Library/Fonts/ipag.ttf' + + @classmethod + def get_font(klass, path, size): + if not path: + path = klass.default_font + size = int(size) + k = (path,size) + if k not in klass.fonts: + font = pygame.font.Font(path, size) + klass.fonts[k] = font + else: + font = klass.fonts[k] + return font + + +## PDFViewer +## +class PDFViewer: + + BGCOLOR = (255,255,255) + FGCOLOR = (0,0,0) + + def __init__(self, display, doc): + self.display = display + self.buf = None + self.pages = doc.get_pages() + self.render_page(0) + return + + def render_page(self, pageno): + print >>stderr, 'rendering: page=%d...' % pageno + page = self.pages[pageno] + (x,y,w,h) = page.bbox + self.width = scale(w) + self.height = scale(h) + self.buf = pygame.Surface((self.width, self.height)) + self.buf.fill(self.BGCOLOR) + for text in page.get_texts(): + font = FontManager.get_font(None, scale(text.size*0.7)) + (x,y,w,h) = text.bbox + r = font.render(text.data, 1, self.FGCOLOR) + self.buf.blit(r, (scale(x), self.height-scale(y))) + self.pageno = pageno + self.pos = (0,0) + self.refresh() + return + + def refresh(self): + size = self.display.get_size() + self.display.blit(self.buf, (0,0), (self.pos, size)) + pygame.display.flip() + return + + STEP = 8 + def run(self): + loop = True + key = None + (w,h) = self.display.get_size() + xmax = self.width - w + ymax = self.height - h + while loop: + for e in pygame.event.get(): + if e.type == VIDEOEXPOSE: + self.refresh() + elif e.type == KEYDOWN: + if e.key in (K_ESCAPE, K_RETURN, K_q): + loop = False + break + elif e.key == K_SPACE: + if self.pageno < len(self.pages)-1: + self.render_page(self.pageno+1) + elif e.key == K_b: + if 0 < self.pageno: + self.render_page(self.pageno-1) + else: + key = e.key + elif e.type == KEYUP: + key = None + if key: + (x,y) = self.pos + if key in (K_h, K_LEFT, K_KP4): + x = max(0, x-self.STEP) + elif key in (K_l, K_RIGHT, K_KP6): + x = min(xmax, x+self.STEP) + elif key in (K_k, K_UP, K_KP8): + y = max(0, y-self.STEP) + elif key in (K_j, K_DOWN, K_KP2): + y = min(ymax, y+self.STEP) + self.pos = (x,y) + self.refresh() + return + +# main +def main(argv): + import getopt + def usage(): + print 'usage: %s [-d] [-c encoding] file' % argv[0] + return 100 + try: + (opts, args) = getopt.getopt(argv[1:], 'dc:P:') + except getopt.GetoptError: + return usage() + if not args: return usage() + debug = 0 + encoding = 'utf-8' + cmapdir = 'CMap' + cdbcmapdir = 'CDBCMap' + password = '' + for (k, v) in opts: + if k == '-d': debug += 1 + elif k == '-c': encoding = v + elif k == '-P': password = v + # + fname = args.pop(0) + if fname.endswith('.pdf'): + # convert .pdf to sgml + import tempfile + from pdf2txt import CMapDB, PDFResourceManager, pdf2txt + print >>stderr, 'reading %r...' % fname + CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) + rsrc = PDFResourceManager(debug=debug) + fp = tempfile.TemporaryFile() + pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug) + fp.seek(0) + else: + fp = file(fname, 'rb') + doc = Document() + parser = PDFSGMLParser(doc) + parser.feedfile(fp, encoding) + parser.close() + fp.close() + # + pygame.init() + pygame.display.set_mode((640,480)) + PDFViewer(pygame.display.get_surface(), doc).run() + return + +if __name__ == '__main__': sys.exit(main(sys.argv))