diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..46c6acb
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,32 @@
+# Makefile for pdfminer
+
+PACKAGE=pdfminer
+VERSION=20071231
+TAR=tar
+SVN=svn
+
+WORKDIR=..
+DISTNAME=$(PACKAGE)-dist-$(VERSION)
+DISTFILE=$(DISTNAME).tar.gz
+
+# None of these targets names a file; keep them runnable even if a file
+# called "clean", "pack", ... appears in this directory.
+.PHONY: all clean pack pychecker commit
+
+all:
+
+clean:
+	rm -f *.pyc *.pyo *~
+
+# Maintenance:
+
+pack: clean
+	$(SVN) cleanup
+	$(SVN) export . $(WORKDIR)/$(DISTNAME)
+	$(TAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
+	rm -rf $(WORKDIR)/$(DISTNAME)
+
+pychecker:
+	-pychecker --limit=0 *.py
+
+commit: clean
+	$(SVN) commit
diff --git a/dumppdf.py b/dumppdf.py
new file mode 100755
index 0000000..7efcdce
--- /dev/null
+++ b/dumppdf.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+#
+# dumppdf.py - dump pdf contents in XML format.
+#
+#  usage: dumppdf.py [options] [files ...]
+#  options:
+#    -d        : increase the debug level (repeatable)
+#    -a        : dump all objects and trailers
+#    -b        : with -i, write raw stream data instead of XML
+#    -p pageid : dump the attributes of the page with this id
+#    -i objid  : object id
+#    -o file   : write the output to file instead of stdout
+#
+import sys, re
+from pdfparser import PDFDocument, PDFParser, PDFStream, \
+    PDFObjRef, PSKeyword, PSLiteral, PDFValueError
+stdout = sys.stdout
+stderr = sys.stderr
+
+
+# Bytes that are not written verbatim: control characters, & < > " ' \
+# and everything from 0x7f up.
+ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]')
+def esc(s):
+    # Replace each byte matched by ESC_PAT with a literal \xNN escape.
+    return ESC_PAT.sub(lambda m:'\\x%02x' % ord(m.group(0)), s)
+
+
+# dumpxml
+def dumpxml(out, obj):
+    """Write obj to out as XML.
+
+    dicts, lists and PDFStreams are dumped recursively.  Raises TypeError
+    when obj is none of the handled types.
+    """
+    if isinstance(obj, dict):
+        out.write('<dict size="%d">\n' % len(obj))
+        for (k,v) in obj.iteritems():
+            out.write('<key>%s</key>\n' % k)
+            out.write('<value>')
+            dumpxml(out, v)
+            out.write('</value>\n')
+        out.write('</dict>')
+        return
+
+    if isinstance(obj, list):
+        out.write('<list size="%d">\n' % len(obj))
+        for v in obj:
+            dumpxml(out, v)
+            out.write('\n')
+        out.write('</list>')
+        return
+
+    if isinstance(obj, str):
+        out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
+        return
+
+    if isinstance(obj, PDFStream):
+        # Filter/DecodeParms are dropped: the data below comes from get_data().
+        props = obj.dic.copy()
+        if 'Filter' in props:
+            del props['Filter']
+        if 'DecodeParms' in props:
+            del props['DecodeParms']
+        out.write('<stream>\n<props>\n')
+        dumpxml(out, props)
+        data = obj.get_data()
+        out.write('\n</props>\n')
+        out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
+        out.write('</stream>')
+        return
+
+    if isinstance(obj, PDFObjRef):
+        out.write('<ref id="%d"/>' % obj.objid)
+        return
+
+    if isinstance(obj, PSKeyword):
+        out.write('<keyword>%s</keyword>' % obj.name)
+        return
+
+    if isinstance(obj, PSLiteral):
+        out.write('<literal>%s</literal>' % obj.name)
+        return
+
+    if isinstance(obj, int) or isinstance(obj, float):
+        out.write('<number>%s</number>' % obj)
+        return
+
+    raise TypeError(obj)
+
+# dumptrailers
+def dumptrailers(out, doc):
+    """Write the trailer of every xref in doc.xrefs to out."""
+    for xref in doc.xrefs:
+        out.write('<trailer objid0="%d" objid1="%d">\n' %
+                  (xref.objid0, xref.objid1))
+        dumpxml(out, xref.trailer)
+        out.write('\n</trailer>\n\n')
+    return
+
+# dumpallobjs
+def dumpallobjs(out, doc):
+    """Write every object listed in doc.xrefs, then the trailers, to out."""
+    out.write('<pdf>')
+    for xref in doc.xrefs:
+        for objid in xrange(xref.objid0, xref.objid1+1):
+            try:
+                obj = doc.getobj(objid)
+                out.write('<object id="%d">\n' % objid)
+                dumpxml(out, obj)
+                out.write('\n</object>\n\n')
+            except PDFValueError:
+                # skip this object id
+                pass
+    dumptrailers(out, doc)
+    out.write('</pdf>')
+    return
+
+# Old name, kept for callers.  dumppdf() itself must use dumpallobjs because
+# its `dumpall` keyword argument hides this global inside the function.
+dumpall = dumpallobjs
+
+# dumppdf
+def dumppdf(outfp, fname, objids, pageids,
+            dumpall=False, binary=False, debug=0):
+    """Dump the selected parts of the PDF file fname to outfp.
+
+    objids  -- ids of objects to dump; with binary, a stream object is
+               written as its raw get_data() bytes instead of XML
+    pageids -- ids of pages whose attrs are dumped
+    dumpall -- if true, dump every object and trailer
+    With none of these selected, only the trailers are dumped.
+    """
+    doc = PDFDocument(debug=debug)
+    fp = open(fname, 'rb')
+    try:
+        # The parser is not used directly below; it is constructed for its
+        # effect on doc (presumably it attaches itself -- confirm in pdfparser).
+        parser = PDFParser(doc, fp, debug=debug)
+        if objids:
+            for objid in objids:
+                obj = doc.getobj(objid)
+                if binary and isinstance(obj, PDFStream):
+                    outfp.write(obj.get_data())
+                else:
+                    dumpxml(outfp, obj)
+        if pageids:
+            for page in doc.get_pages():
+                if page.pageid in pageids:
+                    dumpxml(outfp, page.attrs)
+        if dumpall:
+            dumpallobjs(outfp, doc)
+        if (not objids) and (not pageids) and (not dumpall):
+            dumptrailers(outfp, doc)
+    finally:
+        # close the input even when parsing or dumping fails
+        fp.close()
+    outfp.write('\n')
+    return
+
+
+# main
+def main(argv):
+    """Command-line entry point; returns 100 on a usage error."""
+    import getopt
+    def usage():
+        print 'usage: %s [-d] [-a] [-b] [-p pageid] [-i objid] [-o output] file ...' % argv[0]
+        return 100
+    try:
+        (opts, args) = getopt.getopt(argv[1:], 'dabi:p:o:')
+    except getopt.GetoptError:
+        return usage()
+    if not args: return usage()
+    debug = 0
+    objids = []
+    pageids = set()
+    binary = False
+    dumpall = False
+    outfp = stdout
+    for (k, v) in opts:
+        if k == '-d': debug += 1
+        elif k == '-i': objids.append(int(v))
+        elif k == '-p': pageids.add(int(v))
+        elif k == '-a': dumpall = True
+        elif k == '-b': binary = True
+        elif k == '-o': outfp = open(v, 'w')
+    #
+    try:
+        for fname in args:
+            dumppdf(outfp, fname, objids, pageids,
+                    dumpall=dumpall, binary=binary, debug=debug)
+    finally:
+        if outfp is not stdout:
+            outfp.close()
+    return
+
+if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/pdfdump.py b/pdfdump.py
deleted file mode 100755
index 99e82d1..0000000
--- a/pdfdump.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python
-import sys
-from pdfparser import CMapDB, PDFDocument, PDFParser, dumpxml, PDFStream
-stdout = sys.stdout
-stderr = sys.stderr
-
-# main
-def main(argv):
- import getopt
- def usage():
- print 'usage: %s [-d] [-v] [-a] [-b] [-p pageid] [-i objid] file ...' % argv[0]
- return 100
- try:
- (opts, args) = getopt.getopt(argv[1:], 'dvabi:p:')
- except getopt.GetoptError:
- return usage()
- if not args: return usage()
- (debug, verbose) = (0, 0)
- objids = []
- pageids = set()
- binary = False
- dumpall = False
- outfp = stdout
- for (k, v) in opts:
- if k == '-d': debug += 1
- elif k == '-v': verbose += 1
- elif k == '-i': objids.append(int(v))
- elif k == '-p': pageids.add(int(v))
- elif k == '-a': dumpall = True
- elif k == '-b': binary = True
- elif k == '-o': outfp = file(v, 'w')
- #
- for fname in args:
- doc = PDFDocument(debug=debug)
- fp = file(fname)
- parser = PDFParser(doc, fp, debug=debug)
- if objids:
- for objid in objids:
- obj = doc.getobj(objid)
- if binary:
- if isinstance(obj, PDFStream):
- outfp.write(obj.get_data())
- else:
- outfp.write(repr(obj))
- else:
- dumpxml(outfp, obj)
- elif pageids:
- for page in doc.get_pages():
- if page.pageid in pageids:
- dumpxml(outfp, page.attrs)
- elif dumpall:
- doc.dumpall(outfp)
- else:
- doc.dumptrailers(outfp)
- fp.close()
- outfp.write('\n')
- return
-
-if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/pdfparser.py b/pdfparser.py
index f7c6536..ed5d289 100755
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -5,14 +5,12 @@
# ver 0.2, Dec 24 2007
# TODO:
-# - .curpos
-# - colorspace..
+# - Code Documentation.
+# - Error handling for invalid type.
-# - comments.
# - Outlines.
# - Named Objects. (pages)
# - Writers.
-# - Error handling for invalid type.
# - Linearized PDF.
# - Encryption?
@@ -143,10 +141,6 @@ LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
-LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
-LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
-LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
-LITERAL_ICCBASED = PSLiteralTable.intern('ICCBased')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_EI = PSKeywordTable.intern('EI')
@@ -184,8 +178,10 @@ class CMap:
return self
def register_cid2code(self, cid, code):
- assert isinstance(code, str)
+ from glyphlist import charname2unicode
assert isinstance(cid, int)
+ if isinstance(code, PSLiteral):
+ code = pack('>H', charname2unicode[code.name])
self.cid2code[cid] = code
return self
@@ -243,6 +239,10 @@ class CDBCMap(CMap):
if not self.db.has_key(k):
return None
return self.db[k]
+
+ def is_vertical(self):
+ return (self.db.has_key('/WMode') and
+ self.db['/WMode'] == '1')
def getall(self, c):
while 1:
@@ -252,19 +252,16 @@ class CDBCMap(CMap):
if k.startswith(c):
yield (k[1:], unpack('>L', v)[0])
return
-
- def is_vertical(self):
- return (self.db.has_key('/WMode') and
- self.db['/WMode'] == '1')
def getall_attrs(self):
while 1:
x = self.db.each()
if not x: break
(k,v) = x
- if k.startswith(c):
+ if k.startswith('/'):
yield (k[1:], eval(v)[0])
return
+
def getall_cid2code(self):
return self.getall('i')
def getall_code2cid(self):
@@ -387,6 +384,36 @@ class EncodingDB:
cid += 1
return cid2unicode
+
+## Color Spaces
+##
+LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
+LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
+LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
+LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
+LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')
+CS_COMPONENTS = {
+ PSLiteralTable.intern('CalRGB'): 3,
+ PSLiteralTable.intern('CalGray'): 1,
+ PSLiteralTable.intern('Lab'): 3,
+ PSLiteralTable.intern('DeviceRGB'): 3,
+ PSLiteralTable.intern('DeviceCMYK'): 4,
+ PSLiteralTable.intern('DeviceGray'): 1,
+ PSLiteralTable.intern('Separation'): 1,
+ PSLiteralTable.intern('Indexed'): 1,
+ PSLiteralTable.intern('Pattern'): 1,
+ }
+
+def cs_params(cs):
+ t = cs[0]
+ if t == LITERAL_ICC_BASED:
+ return stream_value(cs[1]).dic['N']
+ elif t == LITERAL_DEVICE_N:
+ return len(list_value(cs[1]))
+ else:
+ return CS_COMPONENTS[t]
+
+
## PSBaseParser
##
class PSBaseParser:
@@ -401,7 +428,7 @@ class PSBaseParser:
return
def __repr__(self):
- return '' % (self.fp, self.curpos)
+ return '' % (self.fp,)
def seek(self, pos):
'''
@@ -410,9 +437,9 @@ class PSBaseParser:
if 2 <= self.debug:
print >>stderr, 'seek:', pos
self.fp.seek(pos)
- self.curpos = pos
+ self.linepos = pos
self.linebuf = None
- self.linepos = 0
+ self.curpos = 0
self.line = ''
return
@@ -424,31 +451,31 @@ class PSBaseParser:
line = ''
eol = None
while 1:
- if not self.linebuf or len(self.linebuf) <= self.linepos:
+ if not self.linebuf or len(self.linebuf) <= self.curpos:
# fetch next chunk.
self.linebuf = self.fp.read(self.bufsize)
if not self.linebuf:
# at EOF.
break
- self.linepos = 0
+ self.curpos = 0
if eol:
- c = self.linebuf[self.linepos]
+ c = self.linebuf[self.curpos]
# handle '\r\n'
if (eol == '\r' and c == '\n'):
line += c
- self.linepos += 1
+ self.curpos += 1
break
- m = self.EOLCHAR.search(self.linebuf, self.linepos)
+ m = self.EOLCHAR.search(self.linebuf, self.curpos)
if m:
i = m.end(0)
- line += self.linebuf[self.linepos:i]
+ line += self.linebuf[self.curpos:i]
eol = self.linebuf[i-1]
- self.linepos = i
+ self.curpos = i
else:
# fetch further
- line += self.linebuf[self.linepos:]
+ line += self.linebuf[self.curpos:]
self.linebuf = None
- self.curpos += len(line)
+ self.linepos += len(line)
return line
def revreadlines(self):
@@ -490,11 +517,11 @@ class PSBaseParser:
'''
while 1:
# do not strip line! we need to distinguish last '\n' or '\r'
- basepos = self.curpos
+ linepos0 = self.linepos
self.line = self.nextline()
if not self.line: break
if 2 <= self.debug:
- print >>stderr, 'line: (%d) %r' % (self.curpos, self.line)
+ print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
# do this before removing comment
if self.line.startswith('%%EOF'): break
charpos = 0
@@ -504,7 +531,7 @@ class PSBaseParser:
m = self.TOKEN.search(self.line, charpos)
if not m: break
t = m.group(0)
- pos = basepos+m.start(0)
+ pos = linepos0 + m.start(0)
charpos = m.end(0)
if t == '%':
@@ -534,22 +561,22 @@ class PSBaseParser:
s += s1[-1:]
self.line = self.nextline()
if not self.line:
- raise PSSyntaxError('end inside string: curpos=%d, line=%r' %
- (self.curpos, self.line))
+ raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+ (self.linepos, self.line))
charpos = 0
elif charpos == len(self.line):
s += s1
self.line = self.nextline()
if not self.line:
- raise PSSyntaxError('end inside string: curpos=%d, line=%r' %
- (self.curpos, self.line))
+ raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+ (self.linepos, self.line))
charpos = 0
else:
s += s1
break
if self.line[charpos] != ')':
- raise PSSyntaxError('no close paren: curpos=%d, line=%r' %
- (self.curpos, self.line))
+ raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+ (self.linepos, self.line))
charpos += 1
def convesc(m):
x = m.group(0)
@@ -567,8 +594,8 @@ class PSBaseParser:
ms = self.STRING_HEX.match(self.line, charpos)
charpos = ms.end(0)
if self.line[charpos] != '>':
- raise PSSyntaxError('no close paren: curpos=%d, line=%r' %
- (self.curpos, self.line))
+ raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+ (self.linepos, self.line))
charpos += 1
def convhex(m1):
return chr(int(m1.group(0), 16))
@@ -801,18 +828,21 @@ class CMapParser(PSStackParser):
for (s,e,code) in choplist(3, self.partobj):
assert isinstance(s, str)
assert isinstance(e, str)
- assert isinstance(code, str)
assert len(s) == len(e)
s1 = nunpack(s)
e1 = nunpack(e)
assert s1 <= e1
- var = code[-4:]
- base = nunpack(var)
- prefix = code[:-4]
- vlen = len(var)
- for i in xrange(e1-s1+1):
- x = prefix+pack('>L',base+i)[-vlen:]
- self.cmap.register_cid2code(s1+i, x)
+ if isinstance(code, list):
+ for i in xrange(e1-s1+1):
+ self.cmap.register_cid2code(s1+i, code[i])
+ else:
+ var = code[-4:]
+ base = nunpack(var)
+ prefix = code[:-4]
+ vlen = len(var)
+ for i in xrange(e1-s1+1):
+ x = prefix+pack('>L',base+i)[-vlen:]
+ self.cmap.register_cid2code(s1+i, x)
self.popall()
return
@@ -858,6 +888,7 @@ class PDFStream:
data = self.rawdata
if self.doc.crypt:
# func DECRYPT is not implemented yet...
+ raise NotImplementedError
data = DECRYPT(self.doc.crypt, data)
if 'Filter' not in self.dic:
self.data = data
@@ -1008,63 +1039,6 @@ def stream_value(x):
return x
-# dumpxml
-def dumpxml(out, obj):
- if isinstance(obj, dict):
- out.write('\n' % len(obj))
- for (k,v) in obj.iteritems():
- out.write('%s\n' % k)
- out.write('')
- dumpxml(out, v)
- out.write('\n')
- out.write('')
- return
-
- if isinstance(obj, list):
- out.write('\n' % len(obj))
- for v in obj:
- dumpxml(out, v)
- out.write('\n')
- out.write('
')
- return
-
- if isinstance(obj, str):
- out.write('%s' % (len(obj), repr(obj)))
- return
-
- if isinstance(obj, PDFStream):
- props = obj.dic.copy()
- if 'Filter' in props:
- del props['Filter']
- if 'DecodeParms' in props:
- del props['DecodeParms']
- out.write('\n\n')
- dumpxml(out, props)
- data = obj.get_data()
- out.write('\n\n')
- out.write('%s\n' % (len(data), repr(data)))
- out.write('')
- return
-
- if isinstance(obj, PDFObjRef):
- out.write('' % obj.objid)
- return
-
- if isinstance(obj, PSKeyword):
- out.write('%s' % obj.name)
- return
-
- if isinstance(obj, PSLiteral):
- out.write('%s' % obj.name)
- return
-
- if isinstance(obj, int) or isinstance(obj, float):
- out.write('%s' % obj)
- return
-
- raise TypeError(obj)
-
-
## PDFPage
##
class PDFPage:
@@ -1176,6 +1150,7 @@ class PDFDocument:
self.parsed_objs = {}
self.crypt = None
self.root = None
+ self.catalog = None
self.parser = None
return
@@ -1187,7 +1162,6 @@ class PDFDocument:
trailer = xref.trailer
if 'Encrypt' in trailer:
self.crypt = dict_value(trailer['Encrypt'])
- raise PDFEncrypted
if 'Root' in trailer:
self.set_root(dict_value(trailer['Root']))
break
@@ -1196,6 +1170,7 @@ class PDFDocument:
return
def getobj(self, objid):
+ assert self.xrefs
if objid in self.objs:
obj = self.objs[objid]
else:
@@ -1220,9 +1195,8 @@ class PDFDocument:
self.parsed_objs[stream] = objs
obj = objs[stream.dic['N']*2+index]
else:
- pos = index
- pos0 = self.parser.curpos
- self.parser.seek(pos)
+ pos0 = self.parser.linepos
+ self.parser.seek(index)
seq = list_value(self.parser.parse())
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
raise PDFSyntaxError('invalid stream spec: %r' % seq)
@@ -1234,6 +1208,7 @@ class PDFDocument:
return obj
def get_pages(self, debug=0):
+ assert self.xrefs
def search(obj, parent):
tree = dict_value(obj)
if tree['Type'] == LITERAL_PAGES:
@@ -1244,7 +1219,7 @@ class PDFDocument:
yield x
elif tree['Type'] == LITERAL_PAGE:
if 1 <= debug:
- print >>stderr, 'Page: %r' % page1
+ print >>stderr, 'Page: %r' % tree
yield (tree, parent)
for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
yield PDFPage(self, i, tree, parent)
@@ -1258,28 +1233,6 @@ class PDFDocument:
self.outline = self.catalog.get('Outline')
return
- def dumptrailers(self, out=sys.stdout):
- for xref in self.xrefs:
- out.write('\n')
- dumpxml(out, xref.trailer)
- out.write('\n\n\n')
- return
-
- def dumpall(self, out=sys.stdout):
- out.write('')
- for xref in self.xrefs:
- for objid in xrange(xref.objid0, xref.objid1+1):
- try:
- obj = self.getobj(objid)
- out.write('\n\n')
- except PDFValueError:
- pass
- self.dumptrailers(out)
- out.write('')
- return
-
## PDFParser
##
@@ -1293,7 +1246,7 @@ class PDFParser(PSStackParser):
return
def __repr__(self):
- return '' % self.curpos
+ return '' % self.linepos
EOIPAT = re.compile(r'\nEI\W')
def do_token(self, pos, token):
@@ -1328,12 +1281,12 @@ class PDFParser(PSStackParser):
while 1:
line = self.nextline()
if not line:
- raise PDFSyntaxError('premature eof, need endstream: curpos=%d, line=%r' %
- (self.curpos, line))
+ raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
+ (self.linepos, line))
if line.strip():
if not line.startswith('endstream'):
- raise PDFSyntaxError('need endstream: curpos=%d, line=%r' %
- (self.curpos, line))
+ raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
+ (self.linepos, line))
break
if 1 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
@@ -1355,7 +1308,7 @@ class PDFParser(PSStackParser):
pos += len('ID ')
self.fp.seek(pos)
data = self.fp.read(8192)
- # XXX how do we know the real datalen other than scanning?
+ # XXX how do we know the real length other than scanning?
m = self.EOIPAT.search(data)
assert m
objlen = m.start(0)
@@ -1391,7 +1344,7 @@ class PDFParser(PSStackParser):
self.find_xref()
while 1:
# read xref table
- pos0 = self.curpos
+ pos0 = self.linepos
line = self.nextline()
if 2 <= self.debug:
print >>stderr, 'line: %r' % line
@@ -1400,8 +1353,8 @@ class PDFParser(PSStackParser):
self.seek(pos0)
xref = PDFXRefStream(self)
elif line.strip() != 'xref':
- raise PDFSyntaxError('xref not found: curpos=%d, line=%r' %
- (self.curpos, line))
+ raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
+ (self.linepos, line))
else:
xref = PDFXRef(self)
yield xref
@@ -1587,7 +1540,7 @@ class TrueTypeFont:
(firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
for (i,firstcode,entcount,delta,pos) in hdrs:
- if not c: continue
+ if not entcount: continue
first = firstcode + (firstbytes[i] << 8)
fp.seek(pos)
for c in xrange(entcount):
@@ -1911,12 +1864,10 @@ class PDFPageInterpreter:
# setcolorspace-stroking
def do_CS(self, name):
- # XXX
self.scs = self.csmap.get(literal_name(name), None)
return
# setcolorspace-non-strokine
def do_cs(self, name):
- # XXX
self.ncs = self.csmap.get(literal_name(name), None)
return
# setgray-stroking
@@ -1946,19 +1897,11 @@ class PDFPageInterpreter:
# setcolor
def do_SCN(self):
- if t == LITERAL_ICCBASED:
- n = stream_value(self.scs[1]).dic['N']
- else:
- n = 1
+ n = cs_params(self.scs)
self.pop(n)
return
def do_scn(self):
- # XXX
- t = self.ncs[0]
- if t == LITERAL_ICCBASED:
- n = stream_value(self.ncs[1]).dic['N']
- else:
- n = 1
+ n = cs_params(self.ncs)
self.pop(n)
return
def do_SC(self):
@@ -2108,14 +2051,16 @@ class PDFPageInterpreter:
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
- self.render_contents('page%d' % page.pageid, page.resources, page.contents)
+ self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
return
def render_contents(self, contid, resources, contents, ctm=(1, 0, 0, 1, 0, 0)):
self.initpage(ctm)
- self.device.begin_page(contid)
+ self.device.begin_block(contid)
# Handle resource declarations.
for (k,v) in resources.iteritems():
+ if 1 <= self.debug:
+ print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,fontrsrc) in dict_value(v).iteritems():
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
@@ -2129,7 +2074,7 @@ class PDFPageInterpreter:
self.xobjmap[xobjid] = xobjstrm
for stream in contents:
self.execute(stream_value(stream))
- self.device.end_page()
+ self.device.end_block()
return
def execute(self, stream):
@@ -2172,14 +2117,12 @@ class PDFDevice:
self.ctm = ctm
return
- def begin_page(self, page):
+ def begin_block(self, name):
+ return
+ def end_block(self):
return
- def end_page(self):
- return
-
- def render_string(self, state, matrix, size, seq):
- print "render_string: state=%r, matrix=%r, size=%r, seq=%r" % (state, matrix, size, seq)
+ def render_string(self, textstate, textmatrix, size, seq):
raise NotImplementedError
@@ -2193,19 +2136,17 @@ class TextConverter(PDFDevice):
self.codec = codec
return
- def begin_page(self, pageid):
- self.outfp.write('\n' % pageid)
+ def begin_block(self, name):
+ self.outfp.write('\n' % name)
return
- def end_page(self):
- self.outfp.write('\n')
+ def end_block(self):
+ self.outfp.write('\n')
return
- def render_string(self, textstate, matrix, size, seq):
- buf = ''
+ def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font
- (a,b,c,d,tx,ty) = mult_matrix(matrix, self.ctm)
- skewed = (b != 0 or c != 0)
spwidth = int(-font.char_width(32) * 0.6) # space width
+ buf = ''
for x in seq:
if isinstance(x, int) or isinstance(x, float):
if not font.is_vertical() and x <= spwidth:
@@ -2219,16 +2160,20 @@ class TextConverter(PDFDevice):
(cidcoding, cid) = e.args
char = u'[%s:%d]' % (cidcoding, cid)
buf += char
- def f(x): return '%.03f' % x
- s = buf.encode(self.codec, 'xmlcharrefreplace')
+ (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
+ skewed = (b != 0 or c != 0)
if font.is_vertical():
- (w,fs) = apply_matrix((a,b,c,d,0,0), (-size,textstate.fontsize))
- self.outfp.write('%s\n' %
- (font.fontname, f(fs), f(tx),f(ty),f(w),s))
+ size = -size
+ tag = 'vtext'
else:
- (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
- self.outfp.write('%s\n' %
- (font.fontname, f(fs), f(tx),f(ty),f(w),s))
+ tag = 'htext'
+ if skewed:
+ tag += ' skewed'
+ s = buf.encode(self.codec, 'xmlcharrefreplace')
+ (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
+ def f(x): return '%.03f' % x
+ self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s%s>\n' %
+ (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
return