git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@9 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-01-07 13:47:52 +00:00
parent a8bae61c25
commit 196ece7913
7 changed files with 141 additions and 105 deletions

View File

@ -6,7 +6,7 @@ TAR=tar
SVN=svn
PYTHON=python
WORKDIR=..
WORKDIR=/tmp
DISTNAME=$(PACKAGE)-dist-$(VERSION)
DISTFILE=$(DISTNAME).tar.gz

View File

@ -21,6 +21,7 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz
</a>
(220kbytes)
<P>
<strong>Svn repository:</strong><br>
@ -46,7 +47,7 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2
<p>
<strong>Dump the contents:</strong>
<blockquote><pre>
$ ./dumppdf.py foo.pdf
$ ./dumppdf.py -a foo.pdf
</pre></blockquote>
<p>
@ -56,6 +57,14 @@ $ ./pdf2txt.py samples/naacl06-shinyama.pdf
$ ./pdf2txt.py -c euc-jp samples/jo.pdf
</pre></blockquote>
<hr>
<h2>Similar Projects</h2>
<ul>
<li> <a href="http://www.foolabs.com/xpdf/">xpdf</a>
<li> <a href="http://www.pdfbox.org/">pdfbox</a>
</ul>
<hr>
<h2>Terms and conditions</h2>
<p>

View File

@ -83,8 +83,8 @@ def dumptrailers(out, doc):
out.write('\n</trailer>\n\n')
return
# dumpall
def dumpall(out, doc):
# dumpallobjs
def dumpallobjs(out, doc):
out.write('<pdf>')
for xref in doc.xrefs:
for objid in xrange(xref.objid0, xref.objid1+1):
@ -93,7 +93,7 @@ def dumpall(out, doc):
out.write('<object id="%d">\n' % objid)
dumpxml(out, obj)
out.write('\n</object>\n\n')
except PDFValueError:
except:
pass
dumptrailers(out, doc)
out.write('</pdf>')
@ -117,7 +117,7 @@ def dumppdf(outfp, fname, objids, pageids,
if page.pageid in pageids:
dumpxml(outfp, page.attrs)
if dumpall:
dumpall(outfp, doc)
dumpallobjs(outfp, doc)
if (not objids) and (not pageids) and (not dumpall):
dumptrailers(outfp, doc)
fp.close()

View File

@ -23,8 +23,9 @@ class TextConverter(PDFDevice):
self.outfp.write('\n')
return
def begin_block(self, name):
self.outfp.write('<block name="%s">\n' % name)
def begin_block(self, name, (x0,y0,x1,y1)):
self.outfp.write('<block name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
(name,x0,y0,x1,y1))
return
def end_block(self):
self.outfp.write('</block>\n')
@ -83,10 +84,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0]
print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
(opts, args) = getopt.getopt(argv[1:], 'dp:c:')
except getopt.GetoptError:
return usage()
if not args: return usage()

View File

@ -57,17 +57,18 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def apply_matrix((a,b,c,d,e,f), (x,y)):
'''Applies a matrix to a coordination.'''
'''Applies a matrix to coordinates.'''
return (a*x+c*y+e, b*x+d*y+f)
def cs_params(cs):
'''Returns a number of components for a given colorspace.'''
t = cs[0]
if t == LITERAL_ICC_BASED:
return stream_value(cs[1]).dic['N']
elif t == LITERAL_DEVICE_N:
return len(list_value(cs[1]))
else:
return CS_COMPONENTS[t]
return CS_COMPONENTS.get(t, 0)
## Fonts
@ -438,7 +439,7 @@ class PDFDevice:
self.ctm = ctm
return
def begin_block(self, name):
def begin_block(self, name, bbox):
return
def end_block(self):
return
@ -589,11 +590,11 @@ class PDFPageInterpreter:
# setcolorspace-stroking
def do_CS(self, name):
self.scs = self.csmap.get(literal_name(name), None)
self.scs = self.csmap.get(literal_name(name), [name])
return
# setcolorspace-non-stroking
def do_cs(self, name):
self.ncs = self.csmap.get(literal_name(name), None)
self.ncs = self.csmap.get(literal_name(name), [name])
return
# setgray-stroking
def do_G(self, gray):
@ -770,34 +771,46 @@ class PDFPageInterpreter:
if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj
interpreter = PDFPageInterpreter(self.rsrc, self.device)
interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],
xobj.dic.get('Matrix', MATRIX_IDENTITY))
(x0,y0,x1,y1) = xobj.dic['BBox']
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
(x0,y0) = apply_matrix(ctm, (x0,y0))
(x1,y1) = apply_matrix(ctm, (x1,y1))
interpreter.render_contents(xobjid,
(x0,y0,x1,y1),
xobj.dic.get('Resources'),
[xobj],
ctm=ctm)
return
def process_page(self, page):
if 1 <= self.debug:
print >>stderr, 'Processing page: %r' % page
self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
self.render_contents('page-%d' % page.pageid,
page.mediabox,
page.resources,
page.contents)
return
def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
def render_contents(self, contid, mediabox, resources, contents,
ctm=MATRIX_IDENTITY):
self.initpage(ctm)
self.device.begin_block(contid)
self.device.begin_block(contid, mediabox)
# Handle resource declarations.
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,fontrsrc) in dict_value(v).iteritems():
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
elif k == 'ColorSpace':
for (csid,csspec) in dict_value(v).iteritems():
self.csmap[csid] = list_value(csspec)
elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
if resources:
for (k,v) in dict_value(resources).iteritems():
if 1 <= self.debug:
print >>stderr, 'Resource: %r: %r' % (k,v)
if k == 'Font':
for (fontid,fontrsrc) in dict_value(v).iteritems():
self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
elif k == 'ColorSpace':
for (csid,csspec) in dict_value(v).iteritems():
self.csmap[csid] = list_value(csspec)
elif k == 'ProcSet':
self.rsrc.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
for stream in list_value(contents):
self.execute(stream_value(stream))
self.device.end_block()

View File

@ -18,8 +18,8 @@ import sys, re
stderr = sys.stderr
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
PSLiteral, PSKeyword, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \
PSStackParser
@ -76,8 +76,7 @@ def resolveall(x):
'''
Recursively resolve X and all the internals.
Make sure there is no indirect reference within the nested object.
This procedure might be slow. Do not used it unless
you really need it.
This procedure might be slow.
'''
while isinstance(x, PDFObjRef):
x = x.resolve()
@ -209,13 +208,12 @@ class PDFStream:
##
class PDFPage:
def __init__(self, doc, pageidx, attrs, parent_attrs):
def __init__(self, doc, pageidx, attrs):
self.doc = doc
self.pageid = pageidx
self.attrs = dict_value(attrs)
self.parent_attrs = parent_attrs
self.resources = self.get_attr('Resources')
self.mediabox = self.get_attr('MediaBox')
self.resources = resolve1(self.attrs['Resources'])
self.mediabox = resolve1(self.attrs['MediaBox'])
contents = resolve1(self.attrs['Contents'])
if not isinstance(contents, list):
contents = [ contents ]
@ -225,11 +223,6 @@ class PDFPage:
def __repr__(self):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
def get_attr(self, k):
if k in self.attrs:
return resolve1(self.attrs[k])
return self.parent_attrs.get(k)
## XRefs
@ -239,7 +232,7 @@ class PDFXRef:
def __init__(self, parser):
while 1:
line = parser.nextline()
(_, line) = parser.nextline()
if not line:
raise PDFSyntaxError('premature eof: %r' % parser)
line = line.strip()
@ -253,7 +246,7 @@ class PDFXRef:
self.objid1 = start+nobjs
self.offsets = []
for objid in xrange(start, start+nobjs):
line = parser.nextline()
(_, line) = parser.nextline()
f = line.strip().split(' ')
if len(f) != 3:
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
@ -361,13 +354,12 @@ class PDFDocument:
self.parsed_objs[stream] = objs
obj = objs[stream.dic['N']*2+index]
else:
pos0 = self.parser.linepos
self.parser.seek(index)
prevpos = self.parser.seek(index)
seq = list_value(self.parser.parse())
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
raise PDFSyntaxError('invalid stream spec: %r' % seq)
obj = seq[3]
self.parser.seek(pos0)
self.parser.seek(prevpos)
if 2 <= self.debug:
print >>stderr, 'register: objid=%r: %r' % (objid, obj)
self.objs[objid] = obj
@ -376,7 +368,10 @@ class PDFDocument:
def get_pages(self, debug=0):
assert self.xrefs
def search(obj, parent):
tree = dict_value(obj)
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k not in tree:
tree[k] = v
if tree['Type'] == LITERAL_PAGES:
if 1 <= debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
@ -386,9 +381,9 @@ class PDFDocument:
elif tree['Type'] == LITERAL_PAGE:
if 1 <= debug:
print >>stderr, 'Page: %r' % tree
yield (tree, parent)
for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
yield PDFPage(self, i, tree, parent)
yield tree
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
yield PDFPage(self, i, tree)
return
def set_root(self, root):
@ -440,19 +435,19 @@ class PDFParser(PSStackParser):
raise PDFValueError('/Length is undefined: %r' % dic)
objlen = int_value(dic['Length'])
self.seek(pos)
line = self.nextline() # 'stream'
(_, line) = self.nextline() # 'stream'
self.fp.seek(pos+len(line))
data = self.fp.read(objlen)
self.seek(pos+len(line)+objlen)
while 1:
line = self.nextline()
(linepos, line) = self.nextline()
if not line:
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
(self.linepos, line))
(linepos, line))
if line.strip():
if not line.startswith('endstream'):
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
(self.linepos, line))
(linepos, line))
break
if 1 <= self.debug:
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
@ -510,17 +505,16 @@ class PDFParser(PSStackParser):
self.find_xref()
while 1:
# read xref table
pos0 = self.linepos
line = self.nextline()
(linepos, line) = self.nextline()
if 2 <= self.debug:
print >>stderr, 'line: %r' % line
if line[0].isdigit():
# XRefStream: PDF-1.5
self.seek(pos0)
self.seek(linepos)
xref = PDFXRefStream(self)
elif line.strip() != 'xref':
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
(self.linepos, line))
(linepos, line))
else:
xref = PDFXRef(self)
yield xref
@ -531,10 +525,10 @@ class PDFParser(PSStackParser):
self.seek(int_value(trailer['XRefStm']))
if 'Prev' in trailer:
# find previous xref
pos0 = int_value(trailer['Prev'])
self.seek(pos0)
pos = int_value(trailer['Prev'])
self.seek(pos)
if 1 <= self.debug:
print >>stderr, 'prev trailer: pos=%d' % pos0
print >>stderr, 'prev trailer: pos=%d' % pos
else:
break
return

View File

@ -12,36 +12,48 @@ class PSTypeError(PSException): pass
class PSValueError(PSException): pass
## PostScript Types
## Basic PostScript Types
##
# PSLiteral
class PSLiteral:
'''
PS literals (e.g. "/Name").
Caution: Never create these objects directly.
Use PSLiteralTable.intern() instead.
'''
def __init__(self, name):
self.name = name
return
def __repr__(self):
return '/%s' % self.name
# PSKeyword
class PSKeyword:
'''
PS keywords (e.g. "showpage").
Caution: Never create these objects directly.
Use PSKeywordTable.intern() instead.
'''
def __init__(self, name):
self.name = name
return
def __repr__(self):
return self.name
# PSSymbolTable
class PSSymbolTable:
'''
Symbol table that stores PSLiteral or PSKeyword.
'''
def __init__(self, classe):
self.dic = {}
self.classe = classe
@ -74,7 +86,9 @@ def keyword_name(x):
##
class PSBaseParser:
'''PostScript parser that performs only basic tokenization.'''
'''
Most basic PostScript parser that performs only basic tokenization.
'''
def __init__(self, fp, debug=0):
self.fp = fp
@ -88,21 +102,22 @@ class PSBaseParser:
def seek(self, pos):
'''
seeks to the given pos.
Moves the parser to the given position and returns the previous file position.
'''
if 2 <= self.debug:
print >>stderr, 'seek:', pos
prevpos = self.fp.tell()
self.fp.seek(pos)
self.linepos = pos
self.linebuf = None
self.curpos = 0
self.line = ''
return
self.linebuf = None # line buffer.
self.curpos = 0 # current position in the buffer.
self.linepos = pos # the beginning of the current line.
self.go = False
return prevpos
EOLCHAR = re.compile(r'[\r\n]')
def nextline(self):
'''
fetches the next line that ends either with \\r or \\n.
Fetches the next line, which ends with either \\r or \\n.
'''
line = ''
eol = None
@ -131,12 +146,14 @@ class PSBaseParser:
# fetch further
line += self.linebuf[self.curpos:]
self.linebuf = None
linepos = self.linepos
self.linepos += len(line)
return line
return (linepos, line)
def revreadlines(self):
'''
fetches lines backword. used to locate trailers.
Fetches the next line backward. This is used to locate
the trailers at the end of a file.
'''
self.fp.seek(0, 2)
pos = self.fp.tell()
@ -156,6 +173,7 @@ class PSBaseParser:
buf = ''
return
# regex patterns for basic lexical scanning.
SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
@ -167,38 +185,39 @@ class PSBaseParser:
def parse(self):
'''
Yields a list of basic tokens: keywords, literals, strings,
numbers and parentheses. Comments are skipped.
Nested objects (i.e. arrays and dictionaries) are not handled.
Yields tuples (pos, token), where each token is one of the following:
keywords, literals, strings, numbers and parentheses.
Comments are skipped.
Nested objects (i.e. arrays and dictionaries) are not handled here.
'''
while 1:
# do not strip line! we need to distinguish last '\n' or '\r'
linepos0 = self.linepos
self.line = self.nextline()
if not self.line: break
(linepos, line) = self.nextline()
if not line: break
if 2 <= self.debug:
print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
print >>stderr, 'line: (%d) %r' % (linepos, line)
# do this before removing comment
if self.line.startswith('%%EOF'): break
if line.startswith('%%EOF'): break
charpos = 0
# tokenize
while 1:
m = self.TOKEN.search(self.line, charpos)
self.go = True
while self.go:
m = self.TOKEN.search(line, charpos)
if not m: break
t = m.group(0)
pos = linepos0 + m.start(0)
pos = linepos + m.start(0)
charpos = m.end(0)
if t == '%':
# skip comment
if 2 <= self.debug:
print >>stderr, 'comment: %r' % self.line[charpos:]
print >>stderr, 'comment: %r' % line[charpos:]
break
elif t == '/':
# literal object
mn = self.LITERAL.match(self.line, m.start(0)+1)
mn = self.LITERAL.match(line, m.start(0)+1)
lit = PSLiteralTable.intern(mn.group(0))
yield (pos, lit)
charpos = mn.end(0)
@ -209,30 +228,30 @@ class PSBaseParser:
# normal string object
s = ''
while 1:
ms = self.STRING_NORM.match(self.line, charpos)
ms = self.STRING_NORM.match(line, charpos)
if not ms: break
s1 = ms.group(0)
charpos = ms.end(0)
if len(s1) == 1 and s1[-1] == '\\':
s += s1[-1:]
self.line = self.nextline()
if not self.line:
(linepos, line) = self.nextline()
if not line:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(self.linepos, self.line))
(linepos, line))
charpos = 0
elif charpos == len(self.line):
elif charpos == len(line):
s += s1
self.line = self.nextline()
if not self.line:
(linepos, line) = self.nextline()
if not line:
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
(self.linepos, self.line))
(linepos, line))
charpos = 0
else:
s += s1
break
if self.line[charpos] != ')':
if line[charpos] != ')':
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(self.linepos, self.line))
(linepos, line))
charpos += 1
def convesc(m):
x = m.group(0)
@ -247,11 +266,11 @@ class PSBaseParser:
elif t == '<':
# hex string object
ms = self.STRING_HEX.match(self.line, charpos)
ms = self.STRING_HEX.match(line, charpos)
charpos = ms.end(0)
if self.line[charpos] != '>':
if line[charpos] != '>':
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
(self.linepos, self.line))
(linepos, line))
charpos += 1
def convhex(m1):
return chr(int(m1.group(0), 16))
@ -270,7 +289,7 @@ class PSBaseParser:
print >>stderr, 'number: %r' % n
yield (pos, n)
elif t in ('true','false'):
elif t in ('true', 'false'):
# boolean
if 2 <= self.debug:
print >>stderr, 'boolean: %r' % t