git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@15 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
dc77b838f7
commit
80d17eb79b
18
pdf2txt.py
18
pdf2txt.py
|
@ -23,17 +23,19 @@ class TextConverter(PDFDevice):
|
||||||
self.outfp.write('\n')
|
self.outfp.write('\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, name, (x0,y0,x1,y1)):
|
def begin_page(self, page):
|
||||||
self.outfp.write('<page name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
(x0,y0,x1,y1) = page.mediabox
|
||||||
(name,x0,y0,x1,y1))
|
self.outfp.write('<page id="%d" mediabox="%d,%d,%d,%d" rotate="%d">' %
|
||||||
|
(page.pageid, x0,y0,x1,y1, page.rotate))
|
||||||
return
|
return
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_figure(self, name, (x0,y0,x1,y1)):
|
def begin_figure(self, name, bbox):
|
||||||
self.outfp.write('<figure name="%s" x0="%d" y0="%d" x1="%d" y1="%d">\n' %
|
(x0,y0,x1,y1) = bbox
|
||||||
(name,x0,y0,x1,y1))
|
self.outfp.write('<figure name="%s" bbox="%d,%d,%d,%d">\n' %
|
||||||
|
(name, x0,y0,x1,y1))
|
||||||
return
|
return
|
||||||
def end_figure(self, _):
|
def end_figure(self, _):
|
||||||
self.outfp.write('</figure>\n')
|
self.outfp.write('</figure>\n')
|
||||||
|
@ -80,7 +82,7 @@ class TextConverter(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
device = TextConverter(outfp, rsrc, codec)
|
device = TextConverter(outfp, rsrc, codec)
|
||||||
outfp.write('<document>')
|
outfp.write('<document>\n')
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
|
@ -89,7 +91,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
||||||
if pages and (i not in pages): continue
|
if pages and (i not in pages): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
fp.close()
|
fp.close()
|
||||||
outfp.write('</document>')
|
outfp.write('</document>\n')
|
||||||
device.close()
|
device.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -443,9 +443,9 @@ class PDFDevice:
|
||||||
self.ctm = ctm
|
self.ctm = ctm
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, name, bbox):
|
def begin_page(self, page):
|
||||||
return
|
return
|
||||||
def end_page(self, name):
|
def end_page(self, page):
|
||||||
return
|
return
|
||||||
def begin_figure(self, name, bbox):
|
def begin_figure(self, name, bbox):
|
||||||
return
|
return
|
||||||
|
@ -835,9 +835,9 @@ class PDFPageInterpreter:
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing page: %r' % page
|
print >>stderr, 'Processing page: %r' % page
|
||||||
self.device.begin_page(page.pageid, page.mediabox)
|
self.device.begin_page(page)
|
||||||
self.render_contents(page.resources, page.contents)
|
self.render_contents(page.resources, page.contents)
|
||||||
self.device.end_page(page.pageid)
|
self.device.end_page(page)
|
||||||
return
|
return
|
||||||
|
|
||||||
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
def render_contents(self, resources, contents, ctm=MATRIX_IDENTITY):
|
||||||
|
|
11
pdfparser.py
11
pdfparser.py
|
@ -205,8 +205,16 @@ class PDFPage:
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.pageid = pageidx
|
self.pageid = pageidx
|
||||||
self.attrs = dict_value(attrs)
|
self.attrs = dict_value(attrs)
|
||||||
|
self.lastmod = self.attrs.get('LastModified')
|
||||||
self.resources = resolve1(self.attrs['Resources'])
|
self.resources = resolve1(self.attrs['Resources'])
|
||||||
self.mediabox = resolve1(self.attrs['MediaBox'])
|
self.mediabox = resolve1(self.attrs['MediaBox'])
|
||||||
|
if 'CropBox' in self.attrs:
|
||||||
|
self.cropbox = resolve1(self.attrs['CropBox'])
|
||||||
|
else:
|
||||||
|
self.cropbox = self.mediabox
|
||||||
|
self.rotate = self.attrs.get('Rotate', 0)
|
||||||
|
self.annots = self.attrs.get('Annots')
|
||||||
|
self.beads = self.attrs.get('B')
|
||||||
contents = resolve1(self.attrs['Contents'])
|
contents = resolve1(self.attrs['Contents'])
|
||||||
if not isinstance(contents, list):
|
if not isinstance(contents, list):
|
||||||
contents = [ contents ]
|
contents = [ contents ]
|
||||||
|
@ -293,6 +301,7 @@ class PDFXRefStream:
|
||||||
|
|
||||||
## PDFDocument
|
## PDFDocument
|
||||||
##
|
##
|
||||||
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
class PDFDocument:
|
class PDFDocument:
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self, debug=0):
|
||||||
|
@ -368,7 +377,7 @@ class PDFDocument:
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
tree = dict_value(obj).copy()
|
tree = dict_value(obj).copy()
|
||||||
for (k,v) in parent.iteritems():
|
for (k,v) in parent.iteritems():
|
||||||
if k not in tree:
|
if k in INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree['Type'] == LITERAL_PAGES:
|
if tree['Type'] == LITERAL_PAGES:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
|
|
Loading…
Reference in New Issue