diff --git a/pdf2txt.py b/pdf2txt.py index 34c4a8c..fd61d0e 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -23,17 +23,19 @@ class TextConverter(PDFDevice): self.outfp.write('\n') return - def begin_page(self, name, (x0,y0,x1,y1)): - self.outfp.write('\n' % - (name,x0,y0,x1,y1)) + def begin_page(self, page): + (x0,y0,x1,y1) = page.mediabox + self.outfp.write('' % + (page.pageid, x0,y0,x1,y1, page.rotate)) return def end_page(self, _): self.outfp.write('\n') return - def begin_figure(self, name, (x0,y0,x1,y1)): - self.outfp.write('
\n' % - (name,x0,y0,x1,y1)) + def begin_figure(self, name, bbox): + (x0,y0,x1,y1) = bbox + self.outfp.write('
\n' % + (name, x0,y0,x1,y1)) return def end_figure(self, _): self.outfp.write('
\n') @@ -80,7 +82,7 @@ class TextConverter(PDFDevice): # pdf2txt def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): device = TextConverter(outfp, rsrc, codec) - outfp.write('') + outfp.write('\n') doc = PDFDocument(debug=debug) fp = file(fname) parser = PDFParser(doc, fp, debug=debug) @@ -89,7 +91,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): if pages and (i not in pages): continue interpreter.process_page(page) fp.close() - outfp.write('') + outfp.write('\n') device.close() return diff --git a/pdfinterp.py b/pdfinterp.py index 952412e..2b611ee 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -443,9 +443,9 @@ class PDFDevice: self.ctm = ctm return - def begin_page(self, name, bbox): + def begin_page(self, page): return - def end_page(self, name): + def end_page(self, page): return def begin_figure(self, name, bbox): return @@ -835,9 +835,9 @@ class PDFPageInterpreter: def process_page(self, page): if 1 <= self.debug: print >>stderr, 'Processing page: %r' % page - self.device.begin_page(page.pageid, page.mediabox) + self.device.begin_page(page) self.render_contents(page.resources, page.contents) - self.device.end_page(page.pageid) + self.device.end_page(page) return def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY): diff --git a/pdfparser.py b/pdfparser.py index ea0f11c..96cc542 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -205,8 +205,16 @@ class PDFPage: self.doc = doc self.pageid = pageidx self.attrs = dict_value(attrs) + self.lastmod = self.attrs.get('LastModified') self.resources = resolve1(self.attrs['Resources']) self.mediabox = resolve1(self.attrs['MediaBox']) + if 'CropBox' in self.attrs: + self.cropbox = resolve1(self.attrs['CropBox']) + else: + self.cropbox = self.mediabox + self.rotate = self.attrs.get('Rotate', 0) + self.annots = self.attrs.get('Annots') + self.beads = self.attrs.get('B') contents = resolve1(self.attrs['Contents']) if not isinstance(contents, list): contents = [ contents ] @@ -293,6 +301,7 @@ class PDFXRefStream: ## PDFDocument ## +INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) class PDFDocument: def __init__(self, debug=0): @@ -368,7 +377,7 @@ class PDFDocument: def search(obj, parent): tree = dict_value(obj).copy() for (k,v) in parent.iteritems(): - if k not in tree: + if k in INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree['Type'] == LITERAL_PAGES: if 1 <= debug: