Add a simplistic PDF viewer.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@29 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-05-06 10:57:41 +00:00
parent 77d7c9ae55
commit 3dba71b7d2
5 changed files with 370 additions and 52 deletions

View File

@ -241,7 +241,7 @@ class CMapParser(PSStackParser):
if name == 'def': if name == 'def':
try: try:
((_,k),(_,v)) = self.pop(2) ((_,k),(_,v)) = self.pop(2)
self.cmap.attrs[str(k)] = v self.cmap.attrs[literal_name(k)] = v
except PSSyntaxError: except PSSyntaxError:
pass pass
return return

View File

@ -7,36 +7,39 @@ INF = sys.maxint
## ##
class Rect: class Rect:
def __init__(self, x0=-INF, y0=-INF, w=None, h=None): def __init__(self, x=-INF, y=-INF, width=None, height=None):
self.x0 = x0 self.x0 = x
self.y0 = y0 self.y0 = y
if w == None: if width == None:
self.x1 = INF self.x1 = INF
else: else:
self.x1 = x0+w self.x1 = x+width
if h == None: if height == None:
self.y1 = INF self.y1 = INF
else: else:
self.y1 = y0+h self.y1 = y+height
return return
def __repr__(self): def __repr__(self):
return '<Rect: (%d,%d)-(%d,%d)>' % (self.x0, self.y0, self.x1, self.y1) return '<Rect: (%d,%d) (%dx%d)>' % (self.x0, self.y0, self.x1-self.x0, self.y1-self.y0)
def overlap(self, rect): def overlap(self, rect):
return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
rect.y1 <= self.y0 or self.y1 <= rect.y0) rect.y1 <= self.y0 or self.y1 <= rect.y0)
## ExtSet ## ExtGrid
## ##
class ExtSet: class ExtGrid:
def __init__(self, gridsize): def __init__(self, gridsize):
self.gridsize = gridsize self.gridsize = gridsize
self.grid = {} self.gridy = {}
return return
def __repr__(self):
return '<ExtGrid(size=%d): %r>' % (self.gridsize, self.gridy)
def cells(self, x0, x1): def cells(self, x0, x1):
i = int(x0 / self.gridsize) i = int(x0 / self.gridsize)
x = i * self.gridsize x = i * self.gridsize
@ -46,25 +49,41 @@ class ExtSet:
i += 1 i += 1
return return
def add(self, x0, x1, obj): def add(self, rect, obj):
for i in self.cells(x0, x1): if isinstance(rect, tuple): rect = Rect(*rect)
if i not in self.grid: xcells = list(self.cells(rect.x0, rect.x1))
a = [] for y in self.cells(rect.y0, rect.y1):
self.grid[i] = a if y not in self.gridy:
gridx = {}
self.gridy[y] = gridx
else: else:
a = self.grid[i] gridx = self.gridy[y]
a.append(obj) for x in xcells:
assert isinstance(gridx, dict), gridx
if x not in gridx:
objs = []
gridx[x] = objs
else:
objs = gridx[x]
objs.append((rect, obj))
assert isinstance(gridx, dict), gridx
return return
def get(self, x0, x1): def get(self, rect):
if isinstance(rect, tuple): rect = Rect(*rect)
objs = set() objs = set()
for i in self.cells(x0, x1): xcells = list(self.cells(rect.x0, rect.x1))
if i in self.grid: for y in self.cells(rect.y0, rect.y1):
objs.update(self.grid[i]) if y not in self.gridy: continue
gridx = self.gridy[y]
for x in xcells:
if x not in gridx: continue
objs.update( obj for (r,obj) in gridx[x] if rect.overlap(r) )
return objs return objs
def test_extset():
e=ExtSet(10) if __name__ == '__main__':
e = ExtGrid(10)
assert list(e.cells(-1, 1)) == [-1,0] assert list(e.cells(-1, 1)) == [-1,0]
assert list(e.cells(0, 1)) == [0] assert list(e.cells(0, 1)) == [0]
assert list(e.cells(0, 10)) == [0] assert list(e.cells(0, 10)) == [0]
@ -75,25 +94,10 @@ def test_extset():
assert list(e.cells(10, 20)) == [1] assert list(e.cells(10, 20)) == [1]
assert list(e.cells(1,21)) == [0,1,2] assert list(e.cells(1,21)) == [0,1,2]
assert list(e.cells(11,21)) == [1,2] assert list(e.cells(11,21)) == [1,2]
return e.add((0,0,10,10), 'a')
e.add((10,10,10,10), 'b')
e.add((5,5,5,10), 'c')
## ExtGrid assert sorted(e.get((0,0,1,1))) == ['a']
## assert sorted(e.get((10,10,1,1))) == ['b']
class ExtGrid: assert sorted(e.get((5,10,10,10))) == ['b','c']
assert sorted(e.get((5,5,10,10))) == ['a','b','c']
def __init__(self, gridsize):
self.hext = ExtSet(gridsize)
self.vext = ExtSet(gridsize)
return
def add(self, rect, obj):
self.hext.add(rect.x0, rect.x1, obj)
self.vext.add(rect.y0, rect.y1, obj)
return
def get(self, rect, getrect):
objs = self.hext.get(rect.x0, rect.x1)
objs.intersection_update(self.vext.get(rect.y0, rect.y1))
objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ]
return objs

View File

@ -28,7 +28,7 @@ class PageItem:
return return
def dump(self, outfp, codec): def dump(self, outfp, codec):
bbox = '%d,%d,%d,%d' % self.bbox bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox
outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(self.id, bbox, self.rotate)) (self.id, bbox, self.rotate))
for obj in self.objs: for obj in self.objs:
@ -45,7 +45,7 @@ class FigureItem(PageItem):
return ('<figure id=%r bbox=%r>' % (self.id, self.bbox)) return ('<figure id=%r bbox=%r>' % (self.id, self.bbox))
def dump(self, outfp, codec): def dump(self, outfp, codec):
bbox = '%d,%d,%d,%d' % self.bbox bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox
outfp.write('<figure id="%s" bbox="%s">\n' % (self.id, bbox)) outfp.write('<figure id="%s" bbox="%s">\n' % (self.id, bbox))
for obj in self.objs: for obj in self.objs:
obj.dump(outfp, codec) obj.dump(outfp, codec)
@ -86,9 +86,9 @@ class TextItem:
def e(x): def e(x):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;') x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
(a,b,c,d,tx,ty) = self.matrix bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox
outfp.write('<text x="%.3f" y="%.3f" font="%s" size="%.3f" width="%.3f">' % outfp.write('<text font="%s" direction="%s" bbox="%s" size="%.3f">' %
(tx, ty, e(self.font.fontname), self.size, self.width)) (e(self.font.fontname), self.direction, bbox, self.size))
outfp.write(e(self.text)) outfp.write(e(self.text))
outfp.write('</text>\n') outfp.write('</text>\n')
return return

152
sgml.py Executable file
View File

@ -0,0 +1,152 @@
#!/usr/bin/env python
import sys, sgmllib
__all__ = [ 'Document', 'Page', 'Text', 'PDFSGMLParser' ]
def fixed(x):
    """Convert a decimal number (or its string form) to integer thousandths.

    The scaled value is rounded to the nearest integer instead of being
    truncated: the binary float product can land just below the intended
    value (1.001*1000 evaluates to 1000.9999999999999), and truncation
    would then lose one unit.

    Raises ValueError if x cannot be converted by float().
    """
    return int(round(float(x) * 1000))
def getbbox(s):
    """Parse a comma-separated bbox attribute into a 4-tuple of fixed() values.

    Raises ValueError unless the string splits into exactly four fields.
    """
    (first, second, third, fourth) = s.split(',')
    return tuple([fixed(field) for field in (first, second, third, fourth)])
## Document
##
class Document:

    """Container for the pages collected while parsing, in arrival order."""

    def __init__(self):
        # Pages are appended by add_page().
        self.pages = []

    def __repr__(self):
        return '<Document: pages=%r>' % self.pages

    def get_pages(self):
        """Return the list of pages (the internal list itself, not a copy)."""
        return self.pages

    def add_page(self, page):
        """Append page to the end of the document."""
        pages = self.pages
        pages.append(page)

    def add_text(self, text):
        """Hand text to the most recently added page.

        Raises IndexError when no page has been added yet.
        """
        current = self.pages[-1]
        current.add_text(text)
## Page
##
class Page:

    """One page: its id, bbox, rotation and the texts added to it."""

    def __init__(self, pageid, bbox, rotate):
        (self.pageid, self.bbox, self.rotate) = (pageid, bbox, rotate)
        # Texts are appended by add_text().
        self.texts = []

    def __repr__(self):
        described = (self.pageid, self.texts)
        return '<Page(%s): texts=%r>' % described

    def get_texts(self):
        """Return the list of texts (the internal list itself, not a copy)."""
        return self.texts

    def add_text(self, text):
        """Append text to this page."""
        texts = self.texts
        texts.append(text)
## Text
##
class Text:

    """A run of text with its font, direction, bbox and size attributes."""

    def __init__(self, font, direction, bbox, size):
        (self.font, self.direction) = (font, direction)
        (self.bbox, self.size) = (bbox, size)
        # Character data accumulates here through add_data().
        self.data = ''

    def __repr__(self):
        return '<Text: %r>' % self.data

    def add_data(self, data):
        """Append data to the text collected so far."""
        self.data = self.data + data
## PDFSGMLParser
##
class PDFSGMLParser(sgmllib.SGMLParser):

    """SGML parser that fills a Document from <page> and <text> elements."""

    def __init__(self, doc):
        sgmllib.SGMLParser.__init__(self)
        self.doc = doc
        # Text currently being collected; None outside a <text> element.
        self.curtext = None

    def start_document(self, attrs):
        # <document> carries nothing that is recorded.
        pass

    def end_document(self):
        pass

    def start_page(self, attrs):
        """Create a Page from the id, bbox and rotate attributes and add it."""
        attrmap = dict(attrs)
        self.doc.add_page(Page(attrmap['id'],
                               getbbox(attrmap['bbox']),
                               int(attrmap['rotate'])))

    def end_page(self):
        pass

    def start_text(self, attrs):
        """Begin a new Text from the font, direction, bbox and size attributes."""
        attrmap = dict(attrs)
        self.curtext = Text(attrmap['font'],
                            attrmap['direction'],
                            getbbox(attrmap['bbox']),
                            fixed(attrmap['size']))

    def end_text(self):
        """Attach the finished Text to the document's latest page."""
        finished = self.curtext
        assert finished
        self.doc.add_text(finished)
        # Cleared only after the text has been stored.
        self.curtext = None

    def handle_data(self, data):
        """Collect character data; data outside a <text> element is dropped."""
        if self.curtext:
            self.curtext.add_data(data)

    def feedfile(self, fp, encoding='utf-8'):
        """Decode fp line by line (undecodable bytes are ignored) and feed it."""
        for raw in fp:
            self.feed(unicode(raw, encoding, 'ignore'))
# main
def main(argv):
    """Parse each SGML file named on the command line and print its Document.

    Options:
      -d           increase the debug level (counted but not otherwise used here)
      -c encoding  encoding used to decode the input (default 'utf-8')

    Returns 0 on success, or 100 after printing the usage message when the
    options cannot be parsed.
    """
    import getopt
    def usage():
        print('usage: %s [-d] [-c encoding] [file ...]' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:')
    except getopt.GetoptError:
        return usage()
    # debug has to exist before the option loop, because '-d' increments it.
    debug = 0
    encoding = 'utf-8'
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
    for fname in args:
        doc = Document()
        parser = PDFSGMLParser(doc)
        # feedfile() iterates over its argument line by line, so it needs an
        # open file object rather than the file name.
        fp = open(fname, 'rb')
        try:
            parser.feedfile(fp, encoding)
        finally:
            fp.close()
        parser.close()
        print(doc)
    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))

162
viewpdf.py Executable file
View File

@ -0,0 +1,162 @@
#!/usr/bin/env python
import sys
from sgml import PDFSGMLParser, Document
stdout = sys.stdout
stderr = sys.stderr
try:
import pygame
from pygame.locals import *
except ImportError:
print >>stderr, 'you need pygame'
sys.exit(111)
def scale(x):
    """Multiply x by 0.002 and truncate the result to an int."""
    factor = 0.002
    scaled = x * factor
    return int(scaled)
## FontManager
##
class FontManager:

    """Class-level cache of pygame fonts keyed by (path, size)."""

    fonts = {}
    default_font = '/Library/Fonts/Vera.ttf'
    #default_font = '/Library/Fonts/ipag.ttf'

    @classmethod
    def get_font(klass, path, size):
        """Return the font for (path, int(size)), loading it on first use.

        A false path (None or '') selects default_font.
        """
        key = (path or klass.default_font, int(size))
        try:
            return klass.fonts[key]
        except KeyError:
            pass
        (fontpath, fontsize) = key
        loaded = pygame.font.Font(fontpath, fontsize)
        klass.fonts[key] = loaded
        return loaded
## PDFViewer
##
class PDFViewer:

    """Keyboard-driven viewer that draws parsed page text on a pygame display.

    Each page is rendered into an off-screen surface (self.buf); the display
    shows the part of that surface that starts at self.pos.
    """

    BGCOLOR = (255,255,255)
    FGCOLOR = (0,0,0)
    # Pixels scrolled per pass of the main loop while a scroll key is held.
    STEP = 8

    def __init__(self, display, doc):
        """Keep the display surface and render the first page of doc."""
        self.display = display
        self.buf = None
        self.pages = doc.get_pages()
        self.render_page(0)
        return

    def render_page(self, pageno):
        """Render page number pageno into a new buffer and show it from (0,0)."""
        stderr.write('rendering: page=%d...\n' % pageno)
        page = self.pages[pageno]
        # NOTE(review): the last two bbox fields are used as the page width
        # and height here -- confirm that the producer writes them that way.
        (x,y,w,h) = page.bbox
        self.width = scale(w)
        self.height = scale(h)
        self.buf = pygame.Surface((self.width, self.height))
        self.buf.fill(self.BGCOLOR)
        for text in page.get_texts():
            font = FontManager.get_font(None, scale(text.size*0.7))
            (x,y,w,h) = text.bbox
            r = font.render(text.data, 1, self.FGCOLOR)
            # The y coordinate is flipped against the page height.
            self.buf.blit(r, (scale(x), self.height-scale(y)))
        self.pageno = pageno
        self.pos = (0,0)
        self.refresh()
        return

    def refresh(self):
        """Copy the visible part of the page buffer to the display and flip."""
        size = self.display.get_size()
        self.display.blit(self.buf, (0,0), (self.pos, size))
        pygame.display.flip()
        return

    def run(self):
        """Process events until the user quits.

        Escape, Return, q or closing the window ends the loop; Space and b
        move to the next and previous page; h/j/k/l, the arrow keys and
        keypad 4/2/8/6 scroll while held.
        """
        loop = True
        key = None
        (w,h) = self.display.get_size()
        while loop:
            for e in pygame.event.get():
                if e.type == QUIT:
                    # Closing the window ends the viewer as well.
                    loop = False
                    break
                elif e.type == VIDEOEXPOSE:
                    self.refresh()
                elif e.type == KEYDOWN:
                    if e.key in (K_ESCAPE, K_RETURN, K_q):
                        loop = False
                        break
                    elif e.key == K_SPACE:
                        if self.pageno < len(self.pages)-1:
                            self.render_page(self.pageno+1)
                    elif e.key == K_b:
                        if 0 < self.pageno:
                            self.render_page(self.pageno-1)
                    else:
                        key = e.key
                elif e.type == KEYUP:
                    key = None
            if key:
                # The limits depend on the page currently rendered, so they
                # are derived here rather than once before the loop, and are
                # kept at 0 or above for pages smaller than the window.
                xmax = max(0, self.width - w)
                ymax = max(0, self.height - h)
                (x,y) = self.pos
                if key in (K_h, K_LEFT, K_KP4):
                    x = max(0, x-self.STEP)
                elif key in (K_l, K_RIGHT, K_KP6):
                    x = min(xmax, x+self.STEP)
                elif key in (K_k, K_UP, K_KP8):
                    y = max(0, y-self.STEP)
                elif key in (K_j, K_DOWN, K_KP2):
                    y = min(ymax, y+self.STEP)
                self.pos = (x,y)
                self.refresh()
        return
# main
def main(argv):
    """Show a document in the pygame viewer.

    A file whose name ends in '.pdf' is first converted with pdf2txt into a
    temporary file; any other file is read directly.  The result is parsed
    with PDFSGMLParser and displayed in a 640x480 window.

    Options:
      -d           increase the debug level
      -c encoding  encoding of the SGML data (default 'utf-8')
      -P password  password handed to pdf2txt

    Returns 100 after printing the usage message when the options are
    invalid or no file is given; otherwise returns None once the viewer
    exits.
    """
    import getopt
    def usage():
        print('usage: %s [-d] [-c encoding] [-P password] file' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:P:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = 0
    encoding = 'utf-8'
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    password = ''
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-c': encoding = v
        elif k == '-P': password = v
    #
    fname = args.pop(0)
    is_pdf = fname.endswith('.pdf')
    if is_pdf:
        # convert .pdf to sgml
        import tempfile
        from pdf2txt import CMapDB, PDFResourceManager, pdf2txt
        stderr.write('reading %r...\n' % fname)
        CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
        rsrc = PDFResourceManager(debug=debug)
        fp = tempfile.TemporaryFile()
    else:
        fp = open(fname, 'rb')
    # Close the input even when conversion or parsing raises.
    try:
        if is_pdf:
            pdf2txt(fp, rsrc, fname, None, encoding, password=password, debug=debug)
            fp.seek(0)
        doc = Document()
        parser = PDFSGMLParser(doc)
        parser.feedfile(fp, encoding)
        parser.close()
    finally:
        fp.close()
    #
    pygame.init()
    pygame.display.set_mode((640,480))
    PDFViewer(pygame.display.get_surface(), doc).run()
    return

if __name__ == '__main__': sys.exit(main(sys.argv))