From 196ece791301ca452eb6a09a52c192bf16d55636 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Mon, 7 Jan 2008 13:47:52 +0000 Subject: [PATCH] yum-yum! git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@9 1aa58f4a-7d42-0410-adbc-911cccaed67c --- Makefile | 2 +- README.html | 11 +++++- dumppdf.py | 8 ++--- pdf2txt.py | 9 ++--- pdfinterp.py | 61 ++++++++++++++++++++------------- pdfparser.py | 60 +++++++++++++++------------------ psparser.py | 95 +++++++++++++++++++++++++++++++--------------------- 7 files changed, 141 insertions(+), 105 deletions(-) diff --git a/Makefile b/Makefile index 7989b79..6eecd76 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ TAR=tar SVN=svn PYTHON=python -WORKDIR=.. +WORKDIR=/tmp DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTFILE=$(DISTNAME).tar.gz diff --git a/README.html b/README.html index 8859d94..c6d4011 100644 --- a/README.html +++ b/README.html @@ -21,6 +21,7 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20071231.tar.gz +(220kbytes)

Svn repository:
@@ -46,7 +47,7 @@ http://www.unixuser.org/~euske/pub/CMap.tar.bz2

Dump the contents:

-$ ./dumppdf.py foo.pdf
+$ ./dumppdf.py -a foo.pdf
 

@@ -56,6 +57,14 @@ $ ./pdf2txt.py samples/naacl06-shinyama.pdf $ ./pdf2txt.py -c euc-jp samples/jo.pdf +


+

Similar Projects

+ + +

Terms and conditions

diff --git a/dumppdf.py b/dumppdf.py index 7efcdce..5f7ab75 100755 --- a/dumppdf.py +++ b/dumppdf.py @@ -83,8 +83,8 @@ def dumptrailers(out, doc): out.write('\n\n\n') return -# dumpall -def dumpall(out, doc): +# dumpallobjs +def dumpallobjs(out, doc): out.write('') for xref in doc.xrefs: for objid in xrange(xref.objid0, xref.objid1+1): @@ -93,7 +93,7 @@ def dumpall(out, doc): out.write('\n' % objid) dumpxml(out, obj) out.write('\n\n\n') - except PDFValueError: + except: pass dumptrailers(out, doc) out.write('') @@ -117,7 +117,7 @@ def dumppdf(outfp, fname, objids, pageids, if page.pageid in pageids: dumpxml(outfp, page.attrs) if dumpall: - dumpall(outfp, doc) + dumpallobjs(outfp, doc) if (not objids) and (not pageids) and (not dumpall): dumptrailers(outfp, doc) fp.close() diff --git a/pdf2txt.py b/pdf2txt.py index c52daad..04f06fd 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -23,8 +23,9 @@ class TextConverter(PDFDevice): self.outfp.write('\n') return - def begin_block(self, name): - self.outfp.write('\n' % name) + def begin_block(self, name, (x0,y0,x1,y1)): + self.outfp.write('\n' % + (name,x0,y0,x1,y1)) return def end_block(self): self.outfp.write('\n') @@ -83,10 +84,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0] + print 'usage: %s [-d] [-c codec] [-p pages] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dvp:c:') + (opts, args) = getopt.getopt(argv[1:], 'dp:c:') except getopt.GetoptError: return usage() if not args: return usage() diff --git a/pdfinterp.py b/pdfinterp.py index d24f848..4095b57 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -57,17 +57,18 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) def apply_matrix((a,b,c,d,e,f), (x,y)): - '''Applies a matrix to a coordination.''' + '''Applies a matrix to coordinates.''' return (a*x+c*y+e, b*x+d*y+f) def cs_params(cs): + '''Returns a number of components for a given colorspace.''' t = cs[0] if t == LITERAL_ICC_BASED: return stream_value(cs[1]).dic['N'] elif t == LITERAL_DEVICE_N: return len(list_value(cs[1])) else: - return CS_COMPONENTS[t] + return CS_COMPONENTS.get(t, 0) ## Fonts @@ -438,7 +439,7 @@ class PDFDevice: self.ctm = ctm return - def begin_block(self, name): + def begin_block(self, name, bbox): return def end_block(self): return @@ -589,11 +590,11 @@ class PDFPageInterpreter: # setcolorspace-stroking def do_CS(self, name): - self.scs = self.csmap.get(literal_name(name), None) + self.scs = self.csmap.get(literal_name(name), [name]) return # setcolorspace-non-strokine def do_cs(self, name): - self.ncs = self.csmap.get(literal_name(name), None) + self.ncs = self.csmap.get(literal_name(name), [name]) return # setgray-stroking def do_G(self, gray): @@ -770,34 +771,46 @@ class PDFPageInterpreter: if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj interpreter = PDFPageInterpreter(self.rsrc, self.device) - interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj], - xobj.dic.get('Matrix', MATRIX_IDENTITY)) + (x0,y0,x1,y1) = xobj.dic['BBox'] + ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm) + (x0,y0) = apply_matrix(ctm, (x0,y0)) + (x1,y1) = apply_matrix(ctm, (x1,y1)) + interpreter.render_contents(xobjid, + (x0,y0,x1,y1), + xobj.dic.get('Resources'), + [xobj], + ctm=ctm) return def process_page(self, page): if 1 <= self.debug: print >>stderr, 'Processing page: %r' % page - self.render_contents('page-%d' % page.pageid, page.resources, page.contents) + self.render_contents('page-%d' % page.pageid, + page.mediabox, + page.resources, + page.contents) return - def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY): + def render_contents(self, contid, mediabox, resources, contents, + ctm=MATRIX_IDENTITY): self.initpage(ctm) - self.device.begin_block(contid) + self.device.begin_block(contid, mediabox) # Handle resource declarations. - for (k,v) in dict_value(resources).iteritems(): - if 1 <= self.debug: - print >>stderr, 'Resource: %r: %r' % (k,v) - if k == 'Font': - for (fontid,fontrsrc) in dict_value(v).iteritems(): - self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc) - elif k == 'ColorSpace': - for (csid,csspec) in dict_value(v).iteritems(): - self.csmap[csid] = list_value(csspec) - elif k == 'ProcSet': - self.rsrc.get_procset(list_value(v)) - elif k == 'XObject': - for (xobjid,xobjstrm) in dict_value(v).iteritems(): - self.xobjmap[xobjid] = xobjstrm + if resources: + for (k,v) in dict_value(resources).iteritems(): + if 1 <= self.debug: + print >>stderr, 'Resource: %r: %r' % (k,v) + if k == 'Font': + for (fontid,fontrsrc) in dict_value(v).iteritems(): + self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc) + elif k == 'ColorSpace': + for (csid,csspec) in dict_value(v).iteritems(): + self.csmap[csid] = list_value(csspec) + elif k == 'ProcSet': + self.rsrc.get_procset(list_value(v)) + elif k == 'XObject': + for (xobjid,xobjstrm) in dict_value(v).iteritems(): + self.xobjmap[xobjid] = xobjstrm for stream in list_value(contents): self.execute(stream_value(stream)) self.device.end_block() diff --git a/pdfparser.py b/pdfparser.py index a12c7a1..db2bdcf 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -18,8 +18,8 @@ import sys, re stderr = sys.stderr from utils import choplist, nunpack from psparser import PSException, PSSyntaxError, PSTypeError, \ - PSLiteral, PSKeyword, \ - PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ + PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ + literal_name, keyword_name, \ PSStackParser @@ -76,8 +76,7 @@ def resolveall(x): ''' Recursively resolve X and all the internals. Make sure there is no indirect reference within the nested object. - This procedure might be slow. Do not used it unless - you really need it. + This procedure might be slow. ''' while isinstance(x, PDFObjRef): x = x.resolve() @@ -209,13 +208,12 @@ class PDFStream: ## class PDFPage: - def __init__(self, doc, pageidx, attrs, parent_attrs): + def __init__(self, doc, pageidx, attrs): self.doc = doc self.pageid = pageidx self.attrs = dict_value(attrs) - self.parent_attrs = parent_attrs - self.resources = self.get_attr('Resources') - self.mediabox = self.get_attr('MediaBox') + self.resources = resolve1(self.attrs['Resources']) + self.mediabox = resolve1(self.attrs['MediaBox']) contents = resolve1(self.attrs['Contents']) if not isinstance(contents, list): contents = [ contents ] @@ -224,11 +222,6 @@ class PDFPage: def __repr__(self): return '' % (self.resources, self.mediabox) - - def get_attr(self, k): - if k in self.attrs: - return resolve1(self.attrs[k]) - return self.parent_attrs.get(k) ## XRefs @@ -239,7 +232,7 @@ class PDFXRef: def __init__(self, parser): while 1: - line = parser.nextline() + (_, line) = parser.nextline() if not line: raise PDFSyntaxError('premature eof: %r' % parser) line = line.strip() @@ -253,7 +246,7 @@ class PDFXRef: self.objid1 = start+nobjs self.offsets = [] for objid in xrange(start, start+nobjs): - line = parser.nextline() + (_, line) = parser.nextline() f = line.strip().split(' ') if len(f) != 3: raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line)) @@ -361,13 +354,12 @@ class PDFDocument: self.parsed_objs[stream] = objs obj = objs[stream.dic['N']*2+index] else: - pos0 = self.parser.linepos - self.parser.seek(index) + prevpos = self.parser.seek(index) seq = list_value(self.parser.parse()) if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ): raise PDFSyntaxError('invalid stream spec: %r' % seq) obj = seq[3] - self.parser.seek(pos0) + self.parser.seek(prevpos) if 2 <= self.debug: print >>stderr, 'register: objid=%r: %r' % (objid, obj) self.objs[objid] = obj @@ -376,7 +368,10 @@ class PDFDocument: def get_pages(self, debug=0): assert self.xrefs def search(obj, parent): - tree = dict_value(obj) + tree = dict_value(obj).copy() + for (k,v) in parent.iteritems(): + if k not in tree: + tree[k] = v if tree['Type'] == LITERAL_PAGES: if 1 <= debug: print >>stderr, 'Pages: Kids=%r' % tree['Kids'] @@ -386,9 +381,9 @@ class PDFDocument: elif tree['Type'] == LITERAL_PAGE: if 1 <= debug: print >>stderr, 'Page: %r' % tree - yield (tree, parent) - for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)): - yield PDFPage(self, i, tree, parent) + yield tree + for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)): + yield PDFPage(self, i, tree) return def set_root(self, root): @@ -440,19 +435,19 @@ class PDFParser(PSStackParser): raise PDFValueError('/Length is undefined: %r' % dic) objlen = int_value(dic['Length']) self.seek(pos) - line = self.nextline() # 'stream' + (_, line) = self.nextline() # 'stream' self.fp.seek(pos+len(line)) data = self.fp.read(objlen) self.seek(pos+len(line)+objlen) while 1: - line = self.nextline() + (linepos, line) = self.nextline() if not line: raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' % - (self.linepos, line)) + (linepos, line)) if line.strip(): if not line.startswith('endstream'): raise PDFSyntaxError('need endstream: linepos=%d, line=%r' % - (self.linepos, line)) + (linepos, line)) break if 1 <= self.debug: print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ @@ -510,17 +505,16 @@ class PDFParser(PSStackParser): self.find_xref() while 1: # read xref table - pos0 = self.linepos - line = self.nextline() + (linepos, line) = self.nextline() if 2 <= self.debug: print >>stderr, 'line: %r' % line if line[0].isdigit(): # XRefStream: PDF-1.5 - self.seek(pos0) + self.seek(linepos) xref = PDFXRefStream(self) elif line.strip() != 'xref': raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % - (self.linepos, line)) + (linepos, line)) else: xref = PDFXRef(self) yield xref @@ -531,10 +525,10 @@ class PDFParser(PSStackParser): self.seek(int_value(trailer['XRefStm'])) if 'Prev' in trailer: # find previous xref - pos0 = int_value(trailer['Prev']) - self.seek(pos0) + pos = int_value(trailer['Prev']) + self.seek(pos) if 1 <= self.debug: - print >>stderr, 'prev trailer: pos=%d' % pos0 + print >>stderr, 'prev trailer: pos=%d' % pos else: break return diff --git a/psparser.py b/psparser.py index 5a72d46..201a39e 100644 --- a/psparser.py +++ b/psparser.py @@ -12,36 +12,48 @@ class PSTypeError(PSException): pass class PSValueError(PSException): pass -## PostScript Types +## Basic PostScript Types ## + +# PSLiteral class PSLiteral: + ''' PS literals (e.g. "/Name"). Caution: Never create these objects directly. Use PSLiteralTable.intern() instead. ''' + def __init__(self, name): self.name = name return + def __repr__(self): return '/%s' % self.name +# PSKeyword class PSKeyword: + ''' PS keywords (e.g. "showpage"). Caution: Never create these objects directly. Use PSKeywordTable.intern() instead. ''' + def __init__(self, name): self.name = name return + def __repr__(self): return self.name +# PSSymbolTable class PSSymbolTable: + ''' Symbol table that stores PSLiteral or PSKeyword. ''' + def __init__(self, classe): self.dic = {} self.classe = classe @@ -74,7 +86,9 @@ def keyword_name(x): ## class PSBaseParser: - '''PostScript parser that performs only basic tokenization.''' + ''' + Most basic PostScript parser that performs only basic tokenization. + ''' def __init__(self, fp, debug=0): self.fp = fp @@ -88,21 +102,22 @@ class PSBaseParser: def seek(self, pos): ''' - seeks to the given pos. + Seeks the parser to the given position. ''' if 2 <= self.debug: print >>stderr, 'seek:', pos + prevpos = self.fp.tell() self.fp.seek(pos) - self.linepos = pos - self.linebuf = None - self.curpos = 0 - self.line = '' - return + self.linebuf = None # line buffer. + self.curpos = 0 # current position in the buffer. + self.linepos = pos # the beginning of the current line. + self.go = False + return prevpos EOLCHAR = re.compile(r'[\r\n]') def nextline(self): ''' - fetches the next line that ends either with \\r or \\n. + Fetches a next line that ends either with \\r or \\n. ''' line = '' eol = None @@ -131,12 +146,14 @@ class PSBaseParser: # fetch further line += self.linebuf[self.curpos:] self.linebuf = None + linepos = self.linepos self.linepos += len(line) - return line + return (linepos, line) def revreadlines(self): ''' - fetches lines backword. used to locate trailers. + Fetches a next line backword. This is used to locate + the trailers at the end of a file. ''' self.fp.seek(0, 2) pos = self.fp.tell() @@ -156,6 +173,7 @@ class PSBaseParser: buf = '' return + # regex patterns for basic lexical scanning. SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040' TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+') LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+') @@ -167,38 +185,39 @@ class PSBaseParser: def parse(self): ''' - Yields a list of basic tokens: keywords, literals, strings, - numbers and parentheses. Comments are skipped. - Nested objects (i.e. arrays and dictionaries) are not handled. + Yields a list of tuples (pos, token) of the following: + keywords, literals, strings, numbers and parentheses. + Comments are skipped. + Nested objects (i.e. arrays and dictionaries) are not handled here. ''' while 1: # do not strip line! we need to distinguish last '\n' or '\r' - linepos0 = self.linepos - self.line = self.nextline() - if not self.line: break + (linepos, line) = self.nextline() + if not line: break if 2 <= self.debug: - print >>stderr, 'line: (%d) %r' % (self.linepos, self.line) + print >>stderr, 'line: (%d) %r' % (linepos, line) # do this before removing comment - if self.line.startswith('%%EOF'): break + if line.startswith('%%EOF'): break charpos = 0 # tokenize - while 1: - m = self.TOKEN.search(self.line, charpos) + self.go = True + while self.go: + m = self.TOKEN.search(line, charpos) if not m: break t = m.group(0) - pos = linepos0 + m.start(0) + pos = linepos + m.start(0) charpos = m.end(0) if t == '%': # skip comment if 2 <= self.debug: - print >>stderr, 'comment: %r' % self.line[charpos:] + print >>stderr, 'comment: %r' % line[charpos:] break elif t == '/': # literal object - mn = self.LITERAL.match(self.line, m.start(0)+1) + mn = self.LITERAL.match(line, m.start(0)+1) lit = PSLiteralTable.intern(mn.group(0)) yield (pos, lit) charpos = mn.end(0) @@ -209,30 +228,30 @@ class PSBaseParser: # normal string object s = '' while 1: - ms = self.STRING_NORM.match(self.line, charpos) + ms = self.STRING_NORM.match(line, charpos) if not ms: break s1 = ms.group(0) charpos = ms.end(0) if len(s1) == 1 and s1[-1] == '\\': s += s1[-1:] - self.line = self.nextline() - if not self.line: + (linepos, line) = self.nextline() + if not line: raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (self.linepos, self.line)) + (linepos, line)) charpos = 0 - elif charpos == len(self.line): + elif charpos == len(line): s += s1 - self.line = self.nextline() - if not self.line: + (linepos, line) = self.nextline() + if not line: raise PSSyntaxError('end inside string: linepos=%d, line=%r' % - (self.linepos, self.line)) + (linepos, line)) charpos = 0 else: s += s1 break - if self.line[charpos] != ')': + if line[charpos] != ')': raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (self.linepos, self.line)) + (linepos, line)) charpos += 1 def convesc(m): x = m.group(0) @@ -247,11 +266,11 @@ class PSBaseParser: elif t == '<': # hex string object - ms = self.STRING_HEX.match(self.line, charpos) + ms = self.STRING_HEX.match(line, charpos) charpos = ms.end(0) - if self.line[charpos] != '>': + if line[charpos] != '>': raise PSSyntaxError('no close paren: linepos=%d, line=%r' % - (self.linepos, self.line)) + (linepos, line)) charpos += 1 def convhex(m1): return chr(int(m1.group(0), 16)) @@ -270,7 +289,7 @@ class PSBaseParser: print >>stderr, 'number: %r' % n yield (pos, n) - elif t in ('true','false'): + elif t in ('true', 'false'): # boolean if 2 <= self.debug: print >>stderr, 'boolean: %r' % t