yum-yum!
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@9 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
a8bae61c25
commit
196ece7913
2
Makefile
2
Makefile
|
@ -6,7 +6,7 @@ TAR=tar
|
||||||
SVN=svn
|
SVN=svn
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
|
|
||||||
WORKDIR=..
|
WORKDIR=/tmp
|
||||||
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
DISTNAME=$(PACKAGE)-dist-$(VERSION)
|
||||||
DISTFILE=$(DISTNAME).tar.gz
|
DISTFILE=$(DISTNAME).tar.gz
|
||||||
|
|
||||||
|
|
11
README.html
11
README.html
|
@ -21,6 +21,7 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
|
||||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz">
|
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz">
|
||||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
|
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
|
||||||
</a>
|
</a>
|
||||||
|
(220kbytes)
|
||||||
|
|
||||||
<P>
|
<P>
|
||||||
<strong>Svn repository:</strong><br>
|
<strong>Svn repository:</strong><br>
|
||||||
|
@ -46,7 +47,7 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2
|
||||||
<p>
|
<p>
|
||||||
<strong>Dump the contents:</strong>
|
<strong>Dump the contents:</strong>
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ ./dumppdf.py foo.pdf
|
$ ./dumppdf.py -a foo.pdf
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
@ -56,6 +57,14 @@ $ ./pdf2txt.py samples/naacl06-shinyama.pdf
|
||||||
$ ./pdf2txt.py -c euc-jp samples/jo.pdf
|
$ ./pdf2txt.py -c euc-jp samples/jo.pdf
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<h2>Similar Projects</h2>
|
||||||
|
<ul>
|
||||||
|
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
|
||||||
|
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
<h2>Terms and conditions</h2>
|
<h2>Terms and conditions</h2>
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -83,8 +83,8 @@ def dumptrailers(out, doc):
|
||||||
out.write('\n</trailer>\n\n')
|
out.write('\n</trailer>\n\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
# dumpall
|
# dumpallobjs
|
||||||
def dumpall(out, doc):
|
def dumpallobjs(out, doc):
|
||||||
out.write('<pdf>')
|
out.write('<pdf>')
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
for objid in xrange(xref.objid0, xref.objid1+1):
|
for objid in xrange(xref.objid0, xref.objid1+1):
|
||||||
|
@ -93,7 +93,7 @@ def dumpall(out, doc):
|
||||||
out.write('<object id="%d">\n' % objid)
|
out.write('<object id="%d">\n' % objid)
|
||||||
dumpxml(out, obj)
|
dumpxml(out, obj)
|
||||||
out.write('\n</object>\n\n')
|
out.write('\n</object>\n\n')
|
||||||
except PDFValueError:
|
except:
|
||||||
pass
|
pass
|
||||||
dumptrailers(out, doc)
|
dumptrailers(out, doc)
|
||||||
out.write('</pdf>')
|
out.write('</pdf>')
|
||||||
|
@ -117,7 +117,7 @@ def dumppdf(outfp, fname, objids, pageids,
|
||||||
if page.pageid in pageids:
|
if page.pageid in pageids:
|
||||||
dumpxml(outfp, page.attrs)
|
dumpxml(outfp, page.attrs)
|
||||||
if dumpall:
|
if dumpall:
|
||||||
dumpall(outfp, doc)
|
dumpallobjs(outfp, doc)
|
||||||
if (not objids) and (not pageids) and (not dumpall):
|
if (not objids) and (not pageids) and (not dumpall):
|
||||||
dumptrailers(outfp, doc)
|
dumptrailers(outfp, doc)
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
|
@ -23,8 +23,9 @@ class TextConverter(PDFDevice):
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_block(self, name):
|
def begin_block(self, name, (x0,y0,x1,y1)):
|
||||||
self.outfp.write('<block name="%s">\n' % name)
|
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
||||||
|
(name,x0,y0,x1,y1))
|
||||||
return
|
return
|
||||||
def end_block(self):
|
def end_block(self):
|
||||||
self.outfp.write('</block>\n')
|
self.outfp.write('</block>\n')
|
||||||
|
@ -83,10 +84,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0]
|
print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:c:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
|
61
pdfinterp.py
61
pdfinterp.py
|
@ -57,17 +57,18 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||||
|
|
||||||
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
'''Applies a matrix to a coordination.'''
|
'''Applies a matrix to coordinates.'''
|
||||||
return (a*x+c*y+e, b*x+d*y+f)
|
return (a*x+c*y+e, b*x+d*y+f)
|
||||||
|
|
||||||
def cs_params(cs):
|
def cs_params(cs):
|
||||||
|
'''Returns a number of components for a given colorspace.'''
|
||||||
t = cs[0]
|
t = cs[0]
|
||||||
if t == LITERAL_ICC_BASED:
|
if t == LITERAL_ICC_BASED:
|
||||||
return stream_value(cs[1]).dic['N']
|
return stream_value(cs[1]).dic['N']
|
||||||
elif t == LITERAL_DEVICE_N:
|
elif t == LITERAL_DEVICE_N:
|
||||||
return len(list_value(cs[1]))
|
return len(list_value(cs[1]))
|
||||||
else:
|
else:
|
||||||
return CS_COMPONENTS[t]
|
return CS_COMPONENTS.get(t, 0)
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
## Fonts
|
||||||
|
@ -438,7 +439,7 @@ class PDFDevice:
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_block(self, name):
|
def begin_block(self, name, bbox):
|
||||||
return
|
return
|
||||||
def end_block(self):
|
def end_block(self):
|
||||||
return
|
return
|
||||||
|
@ -589,11 +590,11 @@ class PDFPageInterpreter:
|
||||||
|
|
||||||
# setcolorspace-stroking
|
# setcolorspace-stroking
|
||||||
def do_CS(self, name):
|
def do_CS(self, name):
|
||||||
self.scs = self.csmap.get(literal_name(name), None)
|
self.scs = self.csmap.get(literal_name(name), [name])
|
||||||
return
|
return
|
||||||
# setcolorspace-non-stroking
|
# setcolorspace-non-stroking
|
||||||
def do_cs(self, name):
|
def do_cs(self, name):
|
||||||
self.ncs = self.csmap.get(literal_name(name), None)
|
self.ncs = self.csmap.get(literal_name(name), [name])
|
||||||
return
|
return
|
||||||
# setgray-stroking
|
# setgray-stroking
|
||||||
def do_G(self, gray):
|
def do_G(self, gray):
|
||||||
|
@ -770,34 +771,46 @@ class PDFPageInterpreter:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing xobj: %r' % xobj
|
print >>stderr, 'Processing xobj: %r' % xobj
|
||||||
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
||||||
interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],
|
(x0,y0,x1,y1) = xobj.dic['BBox']
|
||||||
xobj.dic.get('Matrix', MATRIX_IDENTITY))
|
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
||||||
|
(x0,y0) = apply_matrix(ctm, (x0,y0))
|
||||||
|
(x1,y1) = apply_matrix(ctm, (x1,y1))
|
||||||
|
interpreter.render_contents(xobjid,
|
||||||
|
(x0,y0,x1,y1),
|
||||||
|
xobj.dic.get('Resources'),
|
||||||
|
[xobj],
|
||||||
|
ctm=ctm)
|
||||||
return
|
return
|
||||||
|
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing page: %r' % page
|
print >>stderr, 'Processing page: %r' % page
|
||||||
self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
|
self.render_contents('page-%d' % page.pageid,
|
||||||
|
page.mediabox,
|
||||||
|
page.resources,
|
||||||
|
page.contents)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
|
def render_contents(self, contid, mediabox, resources, contents,
|
||||||
|
ctm=MATRIX_IDENTITY):
|
||||||
self.initpage(ctm)
|
self.initpage(ctm)
|
||||||
self.device.begin_block(contid)
|
self.device.begin_block(contid, mediabox)
|
||||||
# Handle resource declarations.
|
# Handle resource declarations.
|
||||||
for (k,v) in dict_value(resources).iteritems():
|
if resources:
|
||||||
if 1 <= self.debug:
|
for (k,v) in dict_value(resources).iteritems():
|
||||||
print >>stderr, 'Resource: %r: %r' % (k,v)
|
if 1 <= self.debug:
|
||||||
if k == 'Font':
|
print >>stderr, 'Resource: %r: %r' % (k,v)
|
||||||
for (fontid,fontrsrc) in dict_value(v).iteritems():
|
if k == 'Font':
|
||||||
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
|
for (fontid,fontrsrc) in dict_value(v).iteritems():
|
||||||
elif k == 'ColorSpace':
|
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
|
||||||
for (csid,csspec) in dict_value(v).iteritems():
|
elif k == 'ColorSpace':
|
||||||
self.csmap[csid] = list_value(csspec)
|
for (csid,csspec) in dict_value(v).iteritems():
|
||||||
elif k == 'ProcSet':
|
self.csmap[csid] = list_value(csspec)
|
||||||
self.rsrc.get_procset(list_value(v))
|
elif k == 'ProcSet':
|
||||||
elif k == 'XObject':
|
self.rsrc.get_procset(list_value(v))
|
||||||
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
elif k == 'XObject':
|
||||||
self.xobjmap[xobjid] = xobjstrm
|
for (xobjid,xobjstrm) in dict_value(v).iteritems():
|
||||||
|
self.xobjmap[xobjid] = xobjstrm
|
||||||
for stream in list_value(contents):
|
for stream in list_value(contents):
|
||||||
self.execute(stream_value(stream))
|
self.execute(stream_value(stream))
|
||||||
self.device.end_block()
|
self.device.end_block()
|
||||||
|
|
60
pdfparser.py
60
pdfparser.py
|
@ -18,8 +18,8 @@ import sys, re
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||||
PSLiteral, PSKeyword, \
|
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
literal_name, keyword_name, \
|
||||||
PSStackParser
|
PSStackParser
|
||||||
|
|
||||||
|
|
||||||
|
@ -76,8 +76,7 @@ def resolveall(x):
|
||||||
'''
|
'''
|
||||||
Recursively resolve X and all the internals.
|
Recursively resolve X and all the internals.
|
||||||
Make sure there is no indirect reference within the nested object.
|
Make sure there is no indirect reference within the nested object.
|
||||||
This procedure might be slow. Do not use it unless
|
This procedure might be slow.
|
||||||
you really need it.
|
|
||||||
'''
|
'''
|
||||||
while isinstance(x, PDFObjRef):
|
while isinstance(x, PDFObjRef):
|
||||||
x = x.resolve()
|
x = x.resolve()
|
||||||
|
@ -209,13 +208,12 @@ class PDFStream:
|
||||||
##
|
##
|
||||||
class PDFPage:
|
class PDFPage:
|
||||||
|
|
||||||
def __init__(self, doc, pageidx, attrs, parent_attrs):
|
def __init__(self, doc, pageidx, attrs):
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.pageid = pageidx
|
self.pageid = pageidx
|
||||||
self.attrs = dict_value(attrs)
|
self.attrs = dict_value(attrs)
|
||||||
self.parent_attrs = parent_attrs
|
self.resources = resolve1(self.attrs['Resources'])
|
||||||
self.resources = self.get_attr('Resources')
|
self.mediabox = resolve1(self.attrs['MediaBox'])
|
||||||
self.mediabox = self.get_attr('MediaBox')
|
|
||||||
contents = resolve1(self.attrs['Contents'])
|
contents = resolve1(self.attrs['Contents'])
|
||||||
if not isinstance(contents, list):
|
if not isinstance(contents, list):
|
||||||
contents = [ contents ]
|
contents = [ contents ]
|
||||||
|
@ -224,11 +222,6 @@ class PDFPage:
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||||
|
|
||||||
def get_attr(self, k):
|
|
||||||
if k in self.attrs:
|
|
||||||
return resolve1(self.attrs[k])
|
|
||||||
return self.parent_attrs.get(k)
|
|
||||||
|
|
||||||
|
|
||||||
## XRefs
|
## XRefs
|
||||||
|
@ -239,7 +232,7 @@ class PDFXRef:
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self, parser):
|
||||||
while 1:
|
while 1:
|
||||||
line = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
if not line:
|
if not line:
|
||||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
@ -253,7 +246,7 @@ class PDFXRef:
|
||||||
self.objid1 = start+nobjs
|
self.objid1 = start+nobjs
|
||||||
self.offsets = []
|
self.offsets = []
|
||||||
for objid in xrange(start, start+nobjs):
|
for objid in xrange(start, start+nobjs):
|
||||||
line = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
f = line.strip().split(' ')
|
f = line.strip().split(' ')
|
||||||
if len(f) != 3:
|
if len(f) != 3:
|
||||||
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
|
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
|
||||||
|
@ -361,13 +354,12 @@ class PDFDocument:
|
||||||
self.parsed_objs[stream] = objs
|
self.parsed_objs[stream] = objs
|
||||||
obj = objs[stream.dic['N']*2+index]
|
obj = objs[stream.dic['N']*2+index]
|
||||||
else:
|
else:
|
||||||
pos0 = self.parser.linepos
|
prevpos = self.parser.seek(index)
|
||||||
self.parser.seek(index)
|
|
||||||
seq = list_value(self.parser.parse())
|
seq = list_value(self.parser.parse())
|
||||||
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
||||||
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
||||||
obj = seq[3]
|
obj = seq[3]
|
||||||
self.parser.seek(pos0)
|
self.parser.seek(prevpos)
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
|
||||||
self.objs[objid] = obj
|
self.objs[objid] = obj
|
||||||
|
@ -376,7 +368,10 @@ class PDFDocument:
|
||||||
def get_pages(self, debug=0):
|
def get_pages(self, debug=0):
|
||||||
assert self.xrefs
|
assert self.xrefs
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
tree = dict_value(obj)
|
tree = dict_value(obj).copy()
|
||||||
|
for (k,v) in parent.iteritems():
|
||||||
|
if k not in tree:
|
||||||
|
tree[k] = v
|
||||||
if tree['Type'] == LITERAL_PAGES:
|
if tree['Type'] == LITERAL_PAGES:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
||||||
|
@ -386,9 +381,9 @@ class PDFDocument:
|
||||||
elif tree['Type'] == LITERAL_PAGE:
|
elif tree['Type'] == LITERAL_PAGE:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>stderr, 'Page: %r' % tree
|
print >>stderr, 'Page: %r' % tree
|
||||||
yield (tree, parent)
|
yield tree
|
||||||
for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
||||||
yield PDFPage(self, i, tree, parent)
|
yield PDFPage(self, i, tree)
|
||||||
return
|
return
|
||||||
|
|
||||||
def set_root(self, root):
|
def set_root(self, root):
|
||||||
|
@ -440,19 +435,19 @@ class PDFParser(PSStackParser):
|
||||||
raise PDFValueError('/Length is undefined: %r' % dic)
|
raise PDFValueError('/Length is undefined: %r' % dic)
|
||||||
objlen = int_value(dic['Length'])
|
objlen = int_value(dic['Length'])
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
line = self.nextline() # 'stream'
|
(_, line) = self.nextline() # 'stream'
|
||||||
self.fp.seek(pos+len(line))
|
self.fp.seek(pos+len(line))
|
||||||
data = self.fp.read(objlen)
|
data = self.fp.read(objlen)
|
||||||
self.seek(pos+len(line)+objlen)
|
self.seek(pos+len(line)+objlen)
|
||||||
while 1:
|
while 1:
|
||||||
line = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not line:
|
if not line:
|
||||||
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
|
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
|
||||||
(self.linepos, line))
|
(linepos, line))
|
||||||
if line.strip():
|
if line.strip():
|
||||||
if not line.startswith('endstream'):
|
if not line.startswith('endstream'):
|
||||||
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
|
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
|
||||||
(self.linepos, line))
|
(linepos, line))
|
||||||
break
|
break
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||||
|
@ -510,17 +505,16 @@ class PDFParser(PSStackParser):
|
||||||
self.find_xref()
|
self.find_xref()
|
||||||
while 1:
|
while 1:
|
||||||
# read xref table
|
# read xref table
|
||||||
pos0 = self.linepos
|
(linepos, line) = self.nextline()
|
||||||
line = self.nextline()
|
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'line: %r' % line
|
print >>stderr, 'line: %r' % line
|
||||||
if line[0].isdigit():
|
if line[0].isdigit():
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
self.seek(pos0)
|
self.seek(linepos)
|
||||||
xref = PDFXRefStream(self)
|
xref = PDFXRefStream(self)
|
||||||
elif line.strip() != 'xref':
|
elif line.strip() != 'xref':
|
||||||
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
||||||
(self.linepos, line))
|
(linepos, line))
|
||||||
else:
|
else:
|
||||||
xref = PDFXRef(self)
|
xref = PDFXRef(self)
|
||||||
yield xref
|
yield xref
|
||||||
|
@ -531,10 +525,10 @@ class PDFParser(PSStackParser):
|
||||||
self.seek(int_value(trailer['XRefStm']))
|
self.seek(int_value(trailer['XRefStm']))
|
||||||
if 'Prev' in trailer:
|
if 'Prev' in trailer:
|
||||||
# find previous xref
|
# find previous xref
|
||||||
pos0 = int_value(trailer['Prev'])
|
pos = int_value(trailer['Prev'])
|
||||||
self.seek(pos0)
|
self.seek(pos)
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'prev trailer: pos=%d' % pos0
|
print >>stderr, 'prev trailer: pos=%d' % pos
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
return
|
return
|
||||||
|
|
95
psparser.py
95
psparser.py
|
@ -12,36 +12,48 @@ class PSTypeError(PSException): pass
|
||||||
class PSValueError(PSException): pass
|
class PSValueError(PSException): pass
|
||||||
|
|
||||||
|
|
||||||
## PostScript Types
|
## Basic PostScript Types
|
||||||
##
|
##
|
||||||
|
|
||||||
|
# PSLiteral
|
||||||
class PSLiteral:
|
class PSLiteral:
|
||||||
|
|
||||||
'''
|
'''
|
||||||
PS literals (e.g. "/Name").
|
PS literals (e.g. "/Name").
|
||||||
Caution: Never create these objects directly.
|
Caution: Never create these objects directly.
|
||||||
Use PSLiteralTable.intern() instead.
|
Use PSLiteralTable.intern() instead.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '/%s' % self.name
|
return '/%s' % self.name
|
||||||
|
|
||||||
|
# PSKeyword
|
||||||
class PSKeyword:
|
class PSKeyword:
|
||||||
|
|
||||||
'''
|
'''
|
||||||
PS keywords (e.g. "showpage").
|
PS keywords (e.g. "showpage").
|
||||||
Caution: Never create these objects directly.
|
Caution: Never create these objects directly.
|
||||||
Use PSKeywordTable.intern() instead.
|
Use PSKeywordTable.intern() instead.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
|
# PSSymbolTable
|
||||||
class PSSymbolTable:
|
class PSSymbolTable:
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Symbol table that stores PSLiteral or PSKeyword.
|
Symbol table that stores PSLiteral or PSKeyword.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, classe):
|
def __init__(self, classe):
|
||||||
self.dic = {}
|
self.dic = {}
|
||||||
self.classe = classe
|
self.classe = classe
|
||||||
|
@ -74,7 +86,9 @@ def keyword_name(x):
|
||||||
##
|
##
|
||||||
class PSBaseParser:
|
class PSBaseParser:
|
||||||
|
|
||||||
'''PostScript parser that performs only basic tokenization.'''
|
'''
|
||||||
|
Most basic PostScript parser that performs only basic tokenization.
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp, debug=0):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
|
@ -88,21 +102,22 @@ class PSBaseParser:
|
||||||
|
|
||||||
def seek(self, pos):
|
def seek(self, pos):
|
||||||
'''
|
'''
|
||||||
seeks to the given pos.
|
Seeks the parser to the given position.
|
||||||
'''
|
'''
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'seek:', pos
|
print >>stderr, 'seek:', pos
|
||||||
|
prevpos = self.fp.tell()
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
self.linepos = pos
|
self.linebuf = None # line buffer.
|
||||||
self.linebuf = None
|
self.curpos = 0 # current position in the buffer.
|
||||||
self.curpos = 0
|
self.linepos = pos # the beginning of the current line.
|
||||||
self.line = ''
|
self.go = False
|
||||||
return
|
return prevpos
|
||||||
|
|
||||||
EOLCHAR = re.compile(r'[\r\n]')
|
EOLCHAR = re.compile(r'[\r\n]')
|
||||||
def nextline(self):
|
def nextline(self):
|
||||||
'''
|
'''
|
||||||
fetches the next line that ends either with \\r or \\n.
|
Fetches the next line that ends either with \\r or \\n.
|
||||||
'''
|
'''
|
||||||
line = ''
|
line = ''
|
||||||
eol = None
|
eol = None
|
||||||
|
@ -131,12 +146,14 @@ class PSBaseParser:
|
||||||
# fetch further
|
# fetch further
|
||||||
line += self.linebuf[self.curpos:]
|
line += self.linebuf[self.curpos:]
|
||||||
self.linebuf = None
|
self.linebuf = None
|
||||||
|
linepos = self.linepos
|
||||||
self.linepos += len(line)
|
self.linepos += len(line)
|
||||||
return line
|
return (linepos, line)
|
||||||
|
|
||||||
def revreadlines(self):
|
def revreadlines(self):
|
||||||
'''
|
'''
|
||||||
fetches lines backward. used to locate trailers.
|
Fetches the next line backward. This is used to locate
|
||||||
|
the trailers at the end of a file.
|
||||||
'''
|
'''
|
||||||
self.fp.seek(0, 2)
|
self.fp.seek(0, 2)
|
||||||
pos = self.fp.tell()
|
pos = self.fp.tell()
|
||||||
|
@ -156,6 +173,7 @@ class PSBaseParser:
|
||||||
buf = ''
|
buf = ''
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# regex patterns for basic lexical scanning.
|
||||||
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
|
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
|
||||||
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
|
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
|
||||||
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
|
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
|
||||||
|
@ -167,38 +185,39 @@ class PSBaseParser:
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
'''
|
'''
|
||||||
Yields a list of basic tokens: keywords, literals, strings,
|
Yields a list of tuples (pos, token) of the following:
|
||||||
numbers and parentheses. Comments are skipped.
|
keywords, literals, strings, numbers and parentheses.
|
||||||
Nested objects (i.e. arrays and dictionaries) are not handled.
|
Comments are skipped.
|
||||||
|
Nested objects (i.e. arrays and dictionaries) are not handled here.
|
||||||
'''
|
'''
|
||||||
while 1:
|
while 1:
|
||||||
# do not strip line! we need to distinguish last '\n' or '\r'
|
# do not strip line! we need to distinguish last '\n' or '\r'
|
||||||
linepos0 = self.linepos
|
(linepos, line) = self.nextline()
|
||||||
self.line = self.nextline()
|
if not line: break
|
||||||
if not self.line: break
|
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
|
print >>stderr, 'line: (%d) %r' % (linepos, line)
|
||||||
# do this before removing comment
|
# do this before removing comment
|
||||||
if self.line.startswith('%%EOF'): break
|
if line.startswith('%%EOF'): break
|
||||||
charpos = 0
|
charpos = 0
|
||||||
|
|
||||||
# tokenize
|
# tokenize
|
||||||
while 1:
|
self.go = True
|
||||||
m = self.TOKEN.search(self.line, charpos)
|
while self.go:
|
||||||
|
m = self.TOKEN.search(line, charpos)
|
||||||
if not m: break
|
if not m: break
|
||||||
t = m.group(0)
|
t = m.group(0)
|
||||||
pos = linepos0 + m.start(0)
|
pos = linepos + m.start(0)
|
||||||
charpos = m.end(0)
|
charpos = m.end(0)
|
||||||
|
|
||||||
if t == '%':
|
if t == '%':
|
||||||
# skip comment
|
# skip comment
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'comment: %r' % self.line[charpos:]
|
print >>stderr, 'comment: %r' % line[charpos:]
|
||||||
break
|
break
|
||||||
|
|
||||||
elif t == '/':
|
elif t == '/':
|
||||||
# literal object
|
# literal object
|
||||||
mn = self.LITERAL.match(self.line, m.start(0)+1)
|
mn = self.LITERAL.match(line, m.start(0)+1)
|
||||||
lit = PSLiteralTable.intern(mn.group(0))
|
lit = PSLiteralTable.intern(mn.group(0))
|
||||||
yield (pos, lit)
|
yield (pos, lit)
|
||||||
charpos = mn.end(0)
|
charpos = mn.end(0)
|
||||||
|
@ -209,30 +228,30 @@ class PSBaseParser:
|
||||||
# normal string object
|
# normal string object
|
||||||
s = ''
|
s = ''
|
||||||
while 1:
|
while 1:
|
||||||
ms = self.STRING_NORM.match(self.line, charpos)
|
ms = self.STRING_NORM.match(line, charpos)
|
||||||
if not ms: break
|
if not ms: break
|
||||||
s1 = ms.group(0)
|
s1 = ms.group(0)
|
||||||
charpos = ms.end(0)
|
charpos = ms.end(0)
|
||||||
if len(s1) == 1 and s1[-1] == '\\':
|
if len(s1) == 1 and s1[-1] == '\\':
|
||||||
s += s1[-1:]
|
s += s1[-1:]
|
||||||
self.line = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not self.line:
|
if not line:
|
||||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||||
(self.linepos, self.line))
|
(linepos, line))
|
||||||
charpos = 0
|
charpos = 0
|
||||||
elif charpos == len(self.line):
|
elif charpos == len(line):
|
||||||
s += s1
|
s += s1
|
||||||
self.line = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not self.line:
|
if not line:
|
||||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||||
(self.linepos, self.line))
|
(linepos, line))
|
||||||
charpos = 0
|
charpos = 0
|
||||||
else:
|
else:
|
||||||
s += s1
|
s += s1
|
||||||
break
|
break
|
||||||
if self.line[charpos] != ')':
|
if line[charpos] != ')':
|
||||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||||
(self.linepos, self.line))
|
(linepos, line))
|
||||||
charpos += 1
|
charpos += 1
|
||||||
def convesc(m):
|
def convesc(m):
|
||||||
x = m.group(0)
|
x = m.group(0)
|
||||||
|
@ -247,11 +266,11 @@ class PSBaseParser:
|
||||||
|
|
||||||
elif t == '<':
|
elif t == '<':
|
||||||
# hex string object
|
# hex string object
|
||||||
ms = self.STRING_HEX.match(self.line, charpos)
|
ms = self.STRING_HEX.match(line, charpos)
|
||||||
charpos = ms.end(0)
|
charpos = ms.end(0)
|
||||||
if self.line[charpos] != '>':
|
if line[charpos] != '>':
|
||||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||||
(self.linepos, self.line))
|
(linepos, line))
|
||||||
charpos += 1
|
charpos += 1
|
||||||
def convhex(m1):
|
def convhex(m1):
|
||||||
return chr(int(m1.group(0), 16))
|
return chr(int(m1.group(0), 16))
|
||||||
|
@ -270,7 +289,7 @@ class PSBaseParser:
|
||||||
print >>stderr, 'number: %r' % n
|
print >>stderr, 'number: %r' % n
|
||||||
yield (pos, n)
|
yield (pos, n)
|
||||||
|
|
||||||
elif t in ('true','false'):
|
elif t in ('true', 'false'):
|
||||||
# boolean
|
# boolean
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'boolean: %r' % t
|
print >>stderr, 'boolean: %r' % t
|
||||||
|
|
Loading…
Reference in New Issue