From dc6e5c366d3310d1c75e2bd8eec48582493b039f Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 30 Jan 2010 07:30:01 +0000 Subject: [PATCH] jpeg extraction support added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 67 ++++++++++++++++++++++++++++++++++++++----- pdfminer/layout.py | 20 +++++++++++++ pdfminer/pdfdevice.py | 2 +- pdfminer/pdfinterp.py | 17 ++++++----- pdfminer/pdfparser.py | 50 +++++++++++++++++++++++--------- pdfminer/pdftypes.py | 34 +++++++++++++++------- pdfminer/psparser.py | 10 +++---- tools/dumppdf.py | 35 ++++++++++++---------- tools/pdf2txt.py | 10 ++++--- 9 files changed, 179 insertions(+), 66 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index fe3e37d..4c3fcbf 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,10 +1,11 @@ #!/usr/bin/env python -import sys +import sys, os.path from pdfdevice import PDFDevice, PDFTextDevice from pdffont import PDFUnicodeNotDefined +from pdftypes import LITERALS_DCT_DECODE from layout import LayoutContainer from layout import LTPage, LTText, LTLine, LTRect, LTPolygon -from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine +from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine from utils import enc from utils import apply_matrix_pt, mult_matrix @@ -109,6 +110,16 @@ class PDFPageAggregator(PDFTextDevice): self.cur_item.add(fig) return + def render_image(self, name, stream): + assert isinstance(self.cur_item, LTFigure) + item = LTImage(name, stream['Filter'], + (stream['Width'], stream['Height']), + (self.cur_item.x0, self.cur_item.y0, + self.cur_item.x1, self.cur_item.y1), + stream.get_rawdata()) + self.cur_item.add(item) + return + def paint_path(self, gstate, stroke, fill, evenodd, path): shape = ''.join(x[0] for x in path) if shape == 'ml': @@ -166,11 +177,24 @@ class PDFConverter(PDFPageAggregator): ## class XMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None): + def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None): PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) + self.imgdir = imgdir self.outfp.write('\n' % codec) self.outfp.write('\n') return + + def write_image(self, image): + if image.type in LITERALS_DCT_DECODE: + ext = '.jpg' + else: + return None + name = image.name+ext + path = os.path.join(self.imgdir, name) + fp = file(path, 'wb') + fp.write(image.data) + fp.close() + return name def end_page(self, page): def render(item): @@ -181,11 +205,11 @@ class XMLConverter(PDFConverter): render(child) self.outfp.write('\n') elif isinstance(item, LTLine) and item.direction: - self.outfp.write('' % (item.linewidth, item.direction, item.get_bbox())) + self.outfp.write('\n' % (item.linewidth, item.direction, item.get_bbox())) elif isinstance(item, LTRect): - self.outfp.write('' % (item.linewidth, item.get_bbox())) + self.outfp.write('\n' % (item.linewidth, item.get_bbox())) elif isinstance(item, LTPolygon): - self.outfp.write('' % (item.linewidth, item.get_bbox(), item.get_pts())) + self.outfp.write('\n' % (item.linewidth, item.get_bbox(), item.get_pts())) elif isinstance(item, LTFigure): self.outfp.write('
\n' % (item.id, item.get_bbox())) for child in item: @@ -209,6 +233,13 @@ class XMLConverter(PDFConverter): self.outfp.write('\n') elif isinstance(item, LTText): self.outfp.write('%s\n' % item.text) + elif isinstance(item, LTImage): + x = '' + if self.imgdir: + name = self.write_image(item) + if name: + x = 'name="%s" ' % enc(name) + self.outfp.write('\n' % (x, item.type, item.width, item.height)) else: assert 0, item return @@ -226,10 +257,11 @@ class XMLConverter(PDFConverter): class HTMLConverter(PDFConverter): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, showpageno=True, pagepad=50): + scale=1, showpageno=True, pagepad=50, imgdir=None): PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) self.showpageno = showpageno self.pagepad = pagepad + self.imgdir = imgdir self.scale = scale self.outfp.write('\n') self.outfp.write('\n' % @@ -244,6 +276,24 @@ class HTMLConverter(PDFConverter): (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) return + def write_image(self, image): + if image.type in LITERALS_DCT_DECODE: + ext = '.jpg' + else: + return + name = image.name+ext + path = os.path.join(self.imgdir, name) + fp = file(path, 'wb') + fp.write(image.data) + fp.close() + (x0,y0,x1,y1) = image.dstbbox + self.outfp.write('\n' % + (enc(name), + x0*self.scale, (self.yoffset-y1)*self.scale, + (x1-x0)*self.scale, (y1-y0)*self.scale)) + return + def end_page(self, page): def render(item): if isinstance(item, LTPage): @@ -281,6 +331,9 @@ class HTMLConverter(PDFConverter): self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height) for child in item: render(child) + elif isinstance(item, LTImage): + if self.imgdir: + self.write_image(item) return page = PDFConverter.end_page(self, page) render(page) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 1b940e4..02d1cb8 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -283,6 +283,26 @@ class LTRect(LTPolygon): return +## LTImage +## +class LTImage(object): + + def __init__(self, name, type, srcsize, dstbbox, data): + self.name = name + self.type = type + self.srcsize = srcsize + self.dstbbox = dstbbox + self.data = data + return + + def __repr__(self): + (w,h) = self.srcsize + return '' % (self.id, self.type, w, h) + + def get_weight(self): + return 0 + + ## LTText ## class LTText(object): diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index a93c6a5..b4d4f92 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -44,7 +44,7 @@ class PDFDevice(object): def paint_path(self, graphicstate, stroke, fill, evenodd, path): return - def render_image(self, stream, size): + def render_image(self, name, stream): return def render_string(self, textstate, seq): return diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index dec9818..f6647fe 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -293,7 +293,7 @@ class PDFPageInterpreter(object): else: name = literal_name(spec) if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): - return PDFColorSpace(name, stream_value(spec[1]).dic['N']) + return PDFColorSpace(name, stream_value(spec[1])['N']) elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): return PDFColorSpace(name, len(list_value(spec[1]))) else: @@ -681,18 +681,17 @@ class PDFPageInterpreter(object): return if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj - subtype = xobj.dic.get('Subtype') - if subtype is LITERAL_FORM and 'BBox' in xobj.dic: + subtype = xobj.get('Subtype') + if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() - bbox = list_value(xobj.dic['BBox']) - matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)) + bbox = list_value(xobj['BBox']) + matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) self.device.begin_figure(xobjid, bbox, matrix) - interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm)) + interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) - elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: + elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) - (w,h) = (xobj.dic['Width'], xobj.dic['Height']) - self.device.render_image(xobj, (w,h)) + self.device.render_image(xobjid, xobj) self.device.end_figure(xobjid) else: # unsupported xobject type. diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 43659dd..730cf45 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -159,22 +159,22 @@ class PDFXRefStream(PDFBaseXRef): (_,genno) = parser.nexttoken() # ignored (_,kwd) = parser.nexttoken() (_,stream) = parser.nextobject() - if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF: + if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF: raise PDFNoValidXRef('Invalid PDF stream spec.') - size = stream.dic['Size'] - index_array = stream.dic.get('Index', (0,size)) + size = stream['Size'] + index_array = stream.get('Index', (0,size)) if len(index_array) % 2 != 0: raise PDFSyntaxError('Invalid index number') self.objid_ranges.extend( ObjIdRange(start, nobjs) for (start,nobjs) in choplist(2, index_array) ) - (self.fl1, self.fl2, self.fl3) = stream.dic['W'] + (self.fl1, self.fl2, self.fl3) = stream['W'] self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 - self.trailer = stream.dic + self.trailer = stream.attrs if debug: print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % - (', '.join(map(repr, self.objid_ranges), - self.fl1, self.fl2, self.fl3))) + (', '.join(map(repr, self.objid_ranges)), + self.fl1, self.fl2, self.fl3)) return def get_trailer(self): @@ -430,11 +430,11 @@ class PDFDocument(object): return None if strmid: stream = stream_value(self.getobj(strmid)) - if stream.dic.get('Type') is not LITERAL_OBJSTM: + if stream.get('Type') is not LITERAL_OBJSTM: if STRICT: raise PDFSyntaxError('Not a stream object: %r' % stream) try: - n = stream.dic['N'] + n = stream['N'] except KeyError: if STRICT: raise PDFSyntaxError('N is not defined: %r' % stream) @@ -442,7 +442,7 @@ class PDFDocument(object): if strmid in self.parsed_objs: objs = self.parsed_objs[strmid] else: - parser = PDFObjStrmParser(stream.get_data()) + parser = PDFObjStrmParser(stream.get_data(), self) objs = [] try: while 1: @@ -493,7 +493,12 @@ class PDFDocument(object): raise PDFException('PDFDocument is not initialized') #assert self.xrefs def search(obj, parent): - tree = dict_value(obj).copy() + if isinstance(obj, int): + objid = obj + tree = dict_value(self.getobj(objid)).copy() + else: + objid = obj.objid + tree = dict_value(obj).copy() for (k,v) in parent.iteritems(): if k in self.INHERITABLE_ATTRS and k not in tree: tree[k] = v @@ -506,7 +511,7 @@ class PDFDocument(object): elif tree.get('Type') is LITERAL_PAGE: if 1 <= self.debug: print >>stderr, 'Page: %r' % tree - yield (obj.objid, tree) + yield (objid, tree) if 'Pages' not in self.catalog: return for (pageid,tree) in search(self.catalog['Pages'], self.catalog): yield PDFPage(self, pageid, tree) @@ -709,12 +714,29 @@ class PDFParser(PSStackParser): ## PDFObjStrmParser ## -class PDFObjStrmParser(PSStackParser): +class PDFObjStrmParser(PDFParser): - def __init__(self, data): + def __init__(self, data, doc): PSStackParser.__init__(self, StringIO(data)) + self.doc = doc return def flush(self): self.add_results(*self.popall()) return + + KEYWORD_R = KWD('R') + def do_keyword(self, pos, token): + if token is self.KEYWORD_R: + # reference to indirect object + try: + ((_,objid), (_,genno)) = self.pop(2) + (objid, genno) = (int(objid), int(genno)) + obj = PDFObjRef(self.doc, objid, genno) + self.push((pos, obj)) + except PSSyntaxError: + pass + return + # others + self.push((pos, token)) + return diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index b102b1d..2d62282 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -8,11 +8,15 @@ from psparser import PSException, PSObject from psparser import LIT, KWD, STRICT LITERAL_CRYPT = LIT('Crypt') + +# Abbreviation of Filter names in PDF 4.8.6. "Inline Images" LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl')) LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85')) LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) +LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF')) +LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) ## PDF Objects @@ -145,8 +149,9 @@ def stream_value(x): ## class PDFStream(PDFObject): - def __init__(self, dic, rawdata, decipher=None): - self.dic = dic + def __init__(self, attrs, rawdata, decipher=None): + assert isinstance(attrs, dict) + self.attrs = attrs self.rawdata = rawdata self.decipher = decipher self.data = None @@ -160,7 +165,14 @@ class PDFStream(PDFObject): return def __repr__(self): - return '' % (self.objid, len(self.rawdata), self.dic) + return '' % (self.objid, len(self.rawdata), self.attrs) + + def __contains__(self, name): + return name in self.attrs + def __getitem__(self, name): + return self.attrs[name] + def get(self, name, default=None): + return self.attrs.get(name, default) def decomp(self,data): buf = data @@ -181,11 +193,11 @@ class PDFStream(PDFObject): if self.decipher: # Handle encryption data = self.decipher(self.objid, self.genno, data) - if 'Filter' not in self.dic: - self.data = data - self.rawdata = None + try: + filters = self['Filter'] + except KeyError: + self.rawdata = self.data = data return - filters = self.dic['Filter'] if not isinstance(filters, list): filters = [ filters ] for f in filters: @@ -206,10 +218,10 @@ class PDFStream(PDFObject): else: raise PDFNotImplementedError('Unsupported filter: %r' % f) # apply predictors - if 'DP' in self.dic: - params = self.dic['DP'] - else: - params = self.dic.get('DecodeParms', {}) + try: + params = self['DP'] + except KeyError: + params = self.get('DecodeParms', {}) if 'Predictor' in params: pred = int_value(params['Predictor']) if pred: diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index a2d98cf..00ec441 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -83,16 +83,16 @@ class PSSymbolTable(object): """ def __init__(self, klass): - self.dic = {} + self.dict = {} self.klass = klass return def intern(self, name): - if name in self.dic: - lit = self.dic[name] + if name in self.dict: + lit = self.dict[name] else: lit = self.klass(name) - self.dic[name] = lit + self.dict[name] = lit return lit PSLiteralTable = PSSymbolTable(PSLiteral) @@ -153,7 +153,7 @@ class PSBaseParser(object): return def __repr__(self): - return '' % (self.fp, self.bufpos) + return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos) def flush(self): return diff --git a/tools/dumppdf.py b/tools/dumppdf.py index bc29280..702687c 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -9,7 +9,7 @@ import sys, re from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.pdfparser import PDFDocument, PDFParser -from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1 +from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]') @@ -42,13 +42,18 @@ def dumpxml(out, obj, codec=None): return if isinstance(obj, PDFStream): - out.write('\n\n') - dumpxml(out, obj.dic) - out.write('\n\n') - if codec == 'text': - data = obj.get_data() - out.write('%s\n' % (len(data), esc(data))) - out.write('') + if codec == 'raw': + out.write(obj.get_rawdata()) + elif codec == 'binary': + out.write(obj.get_data()) + else: + out.write('\n\n') + dumpxml(out, obj.attrs) + out.write('\n\n') + if codec == 'text': + data = obj.get_data() + out.write('%s\n' % (len(data), esc(data))) + out.write('') return if isinstance(obj, PDFObjRef): @@ -128,16 +133,16 @@ def dumppdf(outfp, fname, objids, pagenos, password='', if objids: for objid in objids: obj = doc.getobj(objid) - if isinstance(obj, PDFStream) and codec == 'raw': - outfp.write(obj.get_rawdata()) - elif isinstance(obj, PDFStream) and codec == 'binary': - outfp.write(obj.get_data()) - else: - dumpxml(outfp, obj, codec=codec) + dumpxml(outfp, obj, codec=codec) if pagenos: for (pageno,page) in enumerate(doc.get_pages()): if pageno in pagenos: - dumpxml(outfp, page.attrs) + if codec: + for obj in page.contents: + obj = stream_value(obj) + dumpxml(outfp, obj, codec=codec) + else: + dumpxml(outfp, page.attrs) if dumpall: dumpallobjs(outfp, doc, codec=codec) if (not objids) and (not pagenos) and (not dumpall): diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index bbd9e54..7e80f09 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -13,10 +13,10 @@ def main(argv): def usage(): print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' '[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' - '[-t text|html|xml|tag] [-o output] file ...' % argv[0]) + '[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:o:C:D:m:') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:') except getopt.GetoptError: return usage() if not args: return usage() @@ -29,6 +29,7 @@ def main(argv): # output option outfile = None outtype = None + imgdir = None codec = 'utf-8' pageno = 1 scale = 1 @@ -42,6 +43,7 @@ def main(argv): elif k == '-t': outtype = v elif k == '-c': codec = v elif k == '-o': outfile = v + elif k == '-I': imgdir = v elif k == '-s': scale = float(v) elif k == '-n': laparams = None elif k == '-D': laparams.direction = v @@ -73,9 +75,9 @@ def main(argv): if outtype == 'text': device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) elif outtype == 'xml': - device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams) + device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams) + device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir) elif outtype == 'tag': device = TagExtractor(rsrc, outfp, codec=codec) else: