diff --git a/Makefile b/Makefile index 10db3c1..126ea20 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # Makefile for pdfminer PACKAGE=pdfminer -VERSION=20080107 -TAR=tar +VERSION=20080427 +GNUTAR=tar SVN=svn PYTHON=python @@ -24,7 +24,7 @@ clean: pack: clean $(SVN) cleanup $(SVN) export . $(WORKDIR)/$(DISTNAME) - $(TAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner + $(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner rm -rf $(WORKDIR)/$(DISTNAME) pychecker: diff --git a/dumppdf.py b/dumppdf.py index 5f7ab75..7403ad8 100755 --- a/dumppdf.py +++ b/dumppdf.py @@ -19,7 +19,7 @@ def esc(s): # dumpxml -def dumpxml(out, obj): +def dumpxml(out, obj, codec=None): if isinstance(obj, dict): out.write('\n' % len(obj)) for (k,v) in obj.iteritems(): @@ -43,16 +43,12 @@ def dumpxml(out, obj): return if isinstance(obj, PDFStream): - props = obj.dic.copy() - if 'Filter' in props: - del props['Filter'] - if 'DecodeParms' in props: - del props['DecodeParms'] out.write('\n\n') - dumpxml(out, props) - data = obj.get_data() + dumpxml(out, obj.dic) out.write('\n\n') - out.write('%s\n' % (len(data), esc(data))) + if codec: + data = obj.get_data() + out.write('%s\n' % (len(data), esc(data))) out.write('') return @@ -101,17 +97,17 @@ def dumpallobjs(out, doc): # dumppdf def dumppdf(outfp, fname, objids, pageids, - dumpall=False, binary=False, debug=0): + dumpall=False, codec=None, debug=0): doc = PDFDocument(debug=debug) fp = file(fname) parser = PDFParser(doc, fp, debug=debug) if objids: for objid in objids: obj = doc.getobj(objid) - if binary and isinstance(obj, PDFStream): + if codec == 'binary' and isinstance(obj, PDFStream): outfp.write(obj.get_data()) else: - dumpxml(outfp, obj) + dumpxml(outfp, obj, codec=codec) if pageids: for page in doc.get_pages(): if page.pageid in pageids: @@ -129,17 +125,17 @@ def dumppdf(outfp, fname, objids, pageids, def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-a] [-b] [-p pageid] [-i objid] file ...' % argv[0] + print 'usage: %s [-d] [-a] [-c|-b] [-p pageid] [-i objid] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dabi:p:') + (opts, args) = getopt.getopt(argv[1:], 'dacbi:p:') except getopt.GetoptError: return usage() if not args: return usage() debug = 0 objids = [] pageids = set() - binary = False + codec = None dumpall = False outfp = stdout for (k, v) in opts: @@ -147,12 +143,13 @@ def main(argv): elif k == '-i': objids.append(int(v)) elif k == '-p': pageids.add(int(v)) elif k == '-a': dumpall = True - elif k == '-b': binary = True + elif k == '-b': codec = 'binary' + elif k == '-c': codec = 'text' elif k == '-o': outfp = file(v, 'w') # for fname in args: dumppdf(outfp, fname, objids, pageids, - dumpall=dumpall, binary=binary, debug=debug) + dumpall=dumpall, codec=codec, debug=debug) return if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdf2txt.py b/pdf2txt.py index 217547d..5b2e373 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -46,7 +46,8 @@ class FigureItem(PageItem): return ('
' % (self.id, bbox)) def dump(self, outfp, codec): - outfp.write(repr(self)+'\n') + bbox = '%d,%d,%d,%d' % self.bbox + outfp.write('
\n' % (self.id, bbox)) for obj in self.objs: obj.dump(outfp, codec) outfp.write('
\n') @@ -126,6 +127,9 @@ class TextConverter(PDFDevice): self.context.add(fig) return + def render_image(self, stream, size, matrix): + return + def handle_undefined_char(self, cidcoding, cid): if self.debug: print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) @@ -155,7 +159,7 @@ class TextConverter(PDFDevice): font, textstate.fontsize, size, text) self.context.add(item) return - + def dump(self, outfp, codec): for page in self.pages: page.dump(outfp, codec) diff --git a/pdfinterp.py b/pdfinterp.py index efaddeb..c015020 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -41,6 +41,7 @@ LITERAL_PDF = PSLiteralTable.intern('PDF') LITERAL_TEXT = PSLiteralTable.intern('Text') LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_FORM = PSLiteralTable.intern('Form') +LITERAL_IMAGE = PSLiteralTable.intern('Image') LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') @@ -484,6 +485,8 @@ class PDFDevice: def render_string(self, textstate, textmatrix, size, seq): raise NotImplementedError + def render_image(self, stream, size, matrix): + raise NotImplementedError ## PDFContentParser @@ -942,9 +945,10 @@ class PDFPageInterpreter: if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return - if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic: - if 1 <= self.debug: - print >>stderr, 'Processing xobj: %r' % xobj + if 1 <= self.debug: + print >>stderr, 'Processing xobj: %r' % xobj + subtype = xobj.dic.get('Subtype') + if subtype == LITERAL_FORM and 'BBox' in xobj.dic: interpreter = PDFPageInterpreter(self.rsrc, self.device) (x0,y0,x1,y1) = xobj.dic['BBox'] ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm) @@ -954,6 +958,16 @@ class PDFPageInterpreter: self.device.begin_figure(xobjid, bbox) interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm) self.device.end_figure(xobjid) + elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: + (x0,y0) = apply_matrix(self.ctm, (0,0)) + (x1,y1) = apply_matrix(self.ctm, (1,1)) + self.device.begin_figure(xobjid, (x0,y0,x1,y1)) + (w,h) = (xobj.dic['Width'], xobj.dic['Height']) + self.device.render_image(xobj, (w,h), self.ctm) + self.device.end_figure(xobjid) + else: + # unsupported xobject type. + pass return def process_page(self, page):