diff --git a/Makefile b/Makefile
index 7989b79..6eecd76 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ TAR=tar
SVN=svn
PYTHON=python
-WORKDIR=..
+WORKDIR=/tmp
DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz
diff --git a/README.html b/README.html
index 8859d94..c6d4011 100644
--- a/README.html
+++ b/README.html
@@ -21,6 +21,7 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
+(220kbytes)
Svn repository:
@@ -46,7 +47,7 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2
Dump the contents:
-$ ./dumppdf.py foo.pdf
+$ ./dumppdf.py -a foo.pdf
@@ -56,6 +57,14 @@ $ ./pdf2txt.py samples/naacl06-shinyama.pdf
$ ./pdf2txt.py -c euc-jp samples/jo.pdf
+
+Similar Projects
+
+
+
Terms and conditions
diff --git a/dumppdf.py b/dumppdf.py
index 7efcdce..5f7ab75 100755
--- a/dumppdf.py
+++ b/dumppdf.py
@@ -83,8 +83,8 @@ def dumptrailers(out, doc):
out.write('\n\n\n')
return
-# dumpall
-def dumpall(out, doc):
+# dumpallobjs
+def dumpallobjs(out, doc):
out.write('')
for xref in doc.xrefs:
for objid in xrange(xref.objid0, xref.objid1+1):
@@ -93,7 +93,7 @@ def dumpall(out, doc):
out.write('\n\n')
- except PDFValueError:
+ except:
pass
dumptrailers(out, doc)
out.write('')
@@ -117,7 +117,7 @@ def dumppdf(outfp, fname, objids, pageids,
if page.pageid in pageids:
dumpxml(outfp, page.attrs)
if dumpall:
- dumpall(outfp, doc)
+ dumpallobjs(outfp, doc)
if (not objids) and (not pageids) and (not dumpall):
dumptrailers(outfp, doc)
fp.close()
diff --git a/pdf2txt.py b/pdf2txt.py
index c52daad..04f06fd 100755
--- a/pdf2txt.py
+++ b/pdf2txt.py
@@ -23,8 +23,9 @@ class TextConverter(PDFDevice):
self.outfp.write('\n')
return
- def begin_block(self, name):
- self.outfp.write('\n' % name)
+ def begin_block(self, name, (x0,y0,x1,y1)):
+ self.outfp.write('\n' %
+ (name,x0,y0,x1,y1))
return
def end_block(self):
self.outfp.write('\n')
@@ -83,10 +84,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
def main(argv):
import getopt
def usage():
- print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0]
+ print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
return 100
try:
- (opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
+ (opts, args) = getopt.getopt(argv[1:], 'dp:c:')
except getopt.GetoptError:
return usage()
if not args: return usage()
diff --git a/pdfinterp.py b/pdfinterp.py
index d24f848..4095b57 100644
--- a/pdfinterp.py
+++ b/pdfinterp.py
@@ -57,17 +57,18 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def apply_matrix((a,b,c,d,e,f), (x,y)):
- '''Applies a matrix to a coordination.'''
+ '''Applies a matrix to coordinates.'''
return (a*x+c*y+e, b*x+d*y+f)
def cs_params(cs):
+ '''Returns the number of components for a given colorspace.'''
t = cs[0]
if t == LITERAL_ICC_BASED:
return stream_value(cs[1]).dic['N']
elif t == LITERAL_DEVICE_N:
return len(list_value(cs[1]))
else:
- return CS_COMPONENTS[t]
+ return CS_COMPONENTS.get(t, 0)
## Fonts
@@ -438,7 +439,7 @@ class PDFDevice:
self.ctm = ctm
return
- def begin_block(self, name):
+ def begin_block(self, name, bbox):
return
def end_block(self):
return
@@ -589,11 +590,11 @@ class PDFPageInterpreter:
# setcolorspace-stroking
def do_CS(self, name):
- self.scs = self.csmap.get(literal_name(name), None)
+ self.scs = self.csmap.get(literal_name(name), [name])
return
# setcolorspace-non-strokine
def do_cs(self, name):
- self.ncs = self.csmap.get(literal_name(name), None)
+ self.ncs = self.csmap.get(literal_name(name), [name])
return
# setgray-stroking
def do_G(self, gray):
@@ -770,34 +771,46 @@ class PDFPageInterpreter:
if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj
interpreter = PDFPageInterpreter(self.rsrc, self.device)
- interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],
- xobj.dic.get('Matrix', MATRIX_IDENTITY))
+ (x0,y0,x1,y1) = xobj.dic['BBox']
+ ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
+ (x0,y0) = apply_matrix(ctm, (x0,y0))
+ (x1,y1) = apply_matrix(ctm, (x1,y1))
+ interpreter.render_contents(xobjid,
+ (x0,y0,x1,y1),
+ xobj.dic.get('Resources'),
+ [xobj],
+ ctm=ctm)
return
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
- self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
+ self.render_contents('page-%d' % page.pageid,
+ page.mediabox,
+ page.resources,
+ page.contents)
return
- def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
+ def render_contents(self, contid, mediabox, resources, contents,
+ ctm=MATRIX_IDENTITY):
self.initpage(ctm)
- self.device.begin_block(contid)
+ self.device.begin_block(contid, mediabox)
# Handle resource declarations.
- for (k,v) in dict_value(resources).iteritems():
- if 1 <= self.debug:
- print >>stderr, 'Resource: %r: %r' % (k,v)
- if k == 'Font':
- for (fontid,fontrsrc) in dict_value(v).iteritems():
- self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
- elif k == 'ColorSpace':
- for (csid,csspec) in dict_value(v).iteritems():
- self.csmap[csid] = list_value(csspec)
- elif k == 'ProcSet':
- self.rsrc.get_procset(list_value(v))
- elif k == 'XObject':
- for (xobjid,xobjstrm) in dict_value(v).iteritems():
- self.xobjmap[xobjid] = xobjstrm
+ if resources:
+ for (k,v) in dict_value(resources).iteritems():
+ if 1 <= self.debug:
+ print >>stderr, 'Resource: %r: %r' % (k,v)
+ if k == 'Font':
+ for (fontid,fontrsrc) in dict_value(v).iteritems():
+ self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
+ elif k == 'ColorSpace':
+ for (csid,csspec) in dict_value(v).iteritems():
+ self.csmap[csid] = list_value(csspec)
+ elif k == 'ProcSet':
+ self.rsrc.get_procset(list_value(v))
+ elif k == 'XObject':
+ for (xobjid,xobjstrm) in dict_value(v).iteritems():
+ self.xobjmap[xobjid] = xobjstrm
for stream in list_value(contents):
self.execute(stream_value(stream))
self.device.end_block()
diff --git a/pdfparser.py b/pdfparser.py
index a12c7a1..db2bdcf 100755
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -18,8 +18,8 @@ import sys, re
stderr = sys.stderr
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
- PSLiteral, PSKeyword, \
- PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
+ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
+ literal_name, keyword_name, \
PSStackParser
@@ -76,8 +76,7 @@ def resolveall(x):
'''
Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object.
- This procedure might be slow. Do not used it unless
- you really need it.
+ This procedure might be slow.
'''
while isinstance(x, PDFObjRef):
x = x.resolve()
@@ -209,13 +208,12 @@ class PDFStream:
##
class PDFPage:
- def __init__(self, doc, pageidx, attrs, parent_attrs):
+ def __init__(self, doc, pageidx, attrs):
self.doc = doc
self.pageid = pageidx
self.attrs = dict_value(attrs)
- self.parent_attrs = parent_attrs
- self.resources = self.get_attr('Resources')
- self.mediabox = self.get_attr('MediaBox')
+ self.resources = resolve1(self.attrs['Resources'])
+ self.mediabox = resolve1(self.attrs['MediaBox'])
contents = resolve1(self.attrs['Contents'])
if not isinstance(contents, list):
contents = [ contents ]
@@ -224,11 +222,6 @@ class PDFPage:
def __repr__(self):
return '' % (self.resources, self.mediabox)
-
- def get_attr(self, k):
- if k in self.attrs:
- return resolve1(self.attrs[k])
- return self.parent_attrs.get(k)
## XRefs
@@ -239,7 +232,7 @@ class PDFXRef:
def __init__(self, parser):
while 1:
- line = parser.nextline()
+ (_, line) = parser.nextline()
if not line:
raise PDFSyntaxError('premature eof: %r' % parser)
line = line.strip()
@@ -253,7 +246,7 @@ class PDFXRef:
self.objid1 = start+nobjs
self.offsets = []
for objid in xrange(start, start+nobjs):
- line = parser.nextline()
+ (_, line) = parser.nextline()
f = line.strip().split(' ')
if len(f) != 3:
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
@@ -361,13 +354,12 @@ class PDFDocument:
self.parsed_objs[stream] = objs
obj = objs[stream.dic['N']*2+index]
else:
- pos0 = self.parser.linepos
- self.parser.seek(index)
+ prevpos = self.parser.seek(index)
seq = list_value(self.parser.parse())
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
raise PDFSyntaxError('invalid stream spec: %r' % seq)
obj = seq[3]
- self.parser.seek(pos0)
+ self.parser.seek(prevpos)
if 2 <= self.debug:
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj
@@ -376,7 +368,10 @@ class PDFDocument:
def get_pages(self, debug=0):
assert self.xrefs
def search(obj, parent):
- tree = dict_value(obj)
+ tree = dict_value(obj).copy()
+ for (k,v) in parent.iteritems():
+ if k not in tree:
+ tree[k] = v
if tree['Type'] == LITERAL_PAGES:
if 1 <= debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
@@ -386,9 +381,9 @@ class PDFDocument:
elif tree['Type'] == LITERAL_PAGE:
if 1 <= debug:
print >>stderr, 'Page: %r' % tree
- yield (tree, parent)
- for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
- yield PDFPage(self, i, tree, parent)
+ yield tree
+ for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
+ yield PDFPage(self, i, tree)
return
def set_root(self, root):
@@ -440,19 +435,19 @@ class PDFParser(PSStackParser):
raise PDFValueError('/Length is undefined: %r' % dic)
objlen = int_value(dic['Length'])
self.seek(pos)
- line = self.nextline() # 'stream'
+ (_, line) = self.nextline() # 'stream'
self.fp.seek(pos+len(line))
data = self.fp.read(objlen)
self.seek(pos+len(line)+objlen)
while 1:
- line = self.nextline()
+ (linepos, line) = self.nextline()
if not line:
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
- (self.linepos, line))
+ (linepos, line))
if line.strip():
if not line.startswith('endstream'):
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
- (self.linepos, line))
+ (linepos, line))
break
if 1 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
@@ -510,17 +505,16 @@ class PDFParser(PSStackParser):
self.find_xref()
while 1:
# read xref table
- pos0 = self.linepos
- line = self.nextline()
+ (linepos, line) = self.nextline()
if 2 <= self.debug:
print >>stderr, 'line: %r' % line
if line[0].isdigit():
# XRefStream: PDF-1.5
- self.seek(pos0)
+ self.seek(linepos)
xref = PDFXRefStream(self)
elif line.strip() != 'xref':
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
- (self.linepos, line))
+ (linepos, line))
else:
xref = PDFXRef(self)
yield xref
@@ -531,10 +525,10 @@ class PDFParser(PSStackParser):
self.seek(int_value(trailer['XRefStm']))
if 'Prev' in trailer:
# find previous xref
- pos0 = int_value(trailer['Prev'])
- self.seek(pos0)
+ pos = int_value(trailer['Prev'])
+ self.seek(pos)
if 1 <= self.debug:
- print >>stderr, 'prev trailer: pos=%d' % pos0
+ print >>stderr, 'prev trailer: pos=%d' % pos
else:
break
return
diff --git a/psparser.py b/psparser.py
index 5a72d46..201a39e 100644
--- a/psparser.py
+++ b/psparser.py
@@ -12,36 +12,48 @@ class PSTypeError(PSException): pass
class PSValueError(PSException): pass
-## PostScript Types
+## Basic PostScript Types
##
+
+# PSLiteral
class PSLiteral:
+
'''
PS literals (e.g. "/Name").
Caution: Never create these objects directly.
Use PSLiteralTable.intern() instead.
'''
+
def __init__(self, name):
self.name = name
return
+
def __repr__(self):
return '/%s' % self.name
+# PSKeyword
class PSKeyword:
+
'''
PS keywords (e.g. "showpage").
Caution: Never create these objects directly.
Use PSKeywordTable.intern() instead.
'''
+
def __init__(self, name):
self.name = name
return
+
def __repr__(self):
return self.name
+# PSSymbolTable
class PSSymbolTable:
+
'''
Symbol table that stores PSLiteral or PSKeyword.
'''
+
def __init__(self, classe):
self.dic = {}
self.classe = classe
@@ -74,7 +86,9 @@ def keyword_name(x):
##
class PSBaseParser:
- '''PostScript parser that performs only basic tokenization.'''
+ '''
+ Most basic PostScript parser that performs only basic tokenization.
+ '''
def __init__(self, fp, debug=0):
self.fp = fp
@@ -88,21 +102,22 @@ class PSBaseParser:
def seek(self, pos):
'''
- seeks to the given pos.
+ Seeks the parser to the given position.
'''
if 2 <= self.debug:
print >>stderr, 'seek:', pos
+ prevpos = self.fp.tell()
self.fp.seek(pos)
- self.linepos = pos
- self.linebuf = None
- self.curpos = 0
- self.line = ''
- return
+ self.linebuf = None # line buffer.
+ self.curpos = 0 # current position in the buffer.
+ self.linepos = pos # the beginning of the current line.
+ self.go = False
+ return prevpos
EOLCHAR = re.compile(r'[\r\n]')
def nextline(self):
'''
- fetches the next line that ends either with \\r or \\n.
+ Fetches the next line that ends with either \\r or \\n.
'''
line = ''
eol = None
@@ -131,12 +146,14 @@ class PSBaseParser:
# fetch further
line += self.linebuf[self.curpos:]
self.linebuf = None
+ linepos = self.linepos
self.linepos += len(line)
- return line
+ return (linepos, line)
def revreadlines(self):
'''
- fetches lines backword. used to locate trailers.
+ Fetches lines backward. This is used to locate
+ the trailers at the end of a file.
'''
self.fp.seek(0, 2)
pos = self.fp.tell()
@@ -156,6 +173,7 @@ class PSBaseParser:
buf = ''
return
+ # regex patterns for basic lexical scanning.
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
@@ -167,38 +185,39 @@ class PSBaseParser:
def parse(self):
'''
- Yields a list of basic tokens: keywords, literals, strings,
- numbers and parentheses. Comments are skipped.
- Nested objects (i.e. arrays and dictionaries) are not handled.
+ Yields tuples (pos, token), where token is one of the following:
+ keywords, literals, strings, numbers and parentheses.
+ Comments are skipped.
+ Nested objects (i.e. arrays and dictionaries) are not handled here.
'''
while 1:
# do not strip line! we need to distinguish last '\n' or '\r'
- linepos0 = self.linepos
- self.line = self.nextline()
- if not self.line: break
+ (linepos, line) = self.nextline()
+ if not line: break
if 2 <= self.debug:
- print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
+ print >>stderr, 'line: (%d) %r' % (linepos, line)
# do this before removing comment
- if self.line.startswith('%%EOF'): break
+ if line.startswith('%%EOF'): break
charpos = 0
# tokenize
- while 1:
- m = self.TOKEN.search(self.line, charpos)
+ self.go = True
+ while self.go:
+ m = self.TOKEN.search(line, charpos)
if not m: break
t = m.group(0)
- pos = linepos0 + m.start(0)
+ pos = linepos + m.start(0)
charpos = m.end(0)
if t == '%':
# skip comment
if 2 <= self.debug:
- print >>stderr, 'comment: %r' % self.line[charpos:]
+ print >>stderr, 'comment: %r' % line[charpos:]
break
elif t == '/':
# literal object
- mn = self.LITERAL.match(self.line, m.start(0)+1)
+ mn = self.LITERAL.match(line, m.start(0)+1)
lit = PSLiteralTable.intern(mn.group(0))
yield (pos, lit)
charpos = mn.end(0)
@@ -209,30 +228,30 @@ class PSBaseParser:
# normal string object
s = ''
while 1:
- ms = self.STRING_NORM.match(self.line, charpos)
+ ms = self.STRING_NORM.match(line, charpos)
if not ms: break
s1 = ms.group(0)
charpos = ms.end(0)
if len(s1) == 1 and s1[-1] == '\\':
s += s1[-1:]
- self.line = self.nextline()
- if not self.line:
+ (linepos, line) = self.nextline()
+ if not line:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
- (self.linepos, self.line))
+ (linepos, line))
charpos = 0
- elif charpos == len(self.line):
+ elif charpos == len(line):
s += s1
- self.line = self.nextline()
- if not self.line:
+ (linepos, line) = self.nextline()
+ if not line:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
- (self.linepos, self.line))
+ (linepos, line))
charpos = 0
else:
s += s1
break
- if self.line[charpos] != ')':
+ if line[charpos] != ')':
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
- (self.linepos, self.line))
+ (linepos, line))
charpos += 1
def convesc(m):
x = m.group(0)
@@ -247,11 +266,11 @@ class PSBaseParser:
elif t == '<':
# hex string object
- ms = self.STRING_HEX.match(self.line, charpos)
+ ms = self.STRING_HEX.match(line, charpos)
charpos = ms.end(0)
- if self.line[charpos] != '>':
+ if line[charpos] != '>':
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
- (self.linepos, self.line))
+ (linepos, line))
charpos += 1
def convhex(m1):
return chr(int(m1.group(0), 16))
@@ -270,7 +289,7 @@ class PSBaseParser:
print >>stderr, 'number: %r' % n
yield (pos, n)
- elif t in ('true','false'):
+ elif t in ('true', 'false'):
# boolean
if 2 <= self.debug:
print >>stderr, 'boolean: %r' % t