jpeg extraction support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-30 07:30:01 +00:00
parent a9d7a00ccd
commit dc6e5c366d
9 changed files with 179 additions and 66 deletions

View File

@ -1,10 +1,11 @@
#!/usr/bin/env python
import sys
import sys, os.path
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
from utils import enc
from utils import apply_matrix_pt, mult_matrix
@ -109,6 +110,16 @@ class PDFPageAggregator(PDFTextDevice):
self.cur_item.add(fig)
return
def render_image(self, name, stream):
    """Wrap an image stream in an LTImage and add it to the current figure.

    name: identifier supplied by the caller; stored as the image name.
    stream: image stream object. Its 'Filter', 'Width' and 'Height'
        entries and its raw data (stream.get_rawdata()) are read.

    The new LTImage receives the corner coordinates (x0, y0, x1, y1) of
    the figure currently being built (self.cur_item), which must be an
    LTFigure.
    """
    assert isinstance(self.cur_item, LTFigure)
    figure = self.cur_item
    # 'Filter' is read with get() so that a stream lacking that entry
    # produces an image whose type is None instead of raising KeyError.
    item = LTImage(name, stream.get('Filter'),
                   (stream['Width'], stream['Height']),
                   (figure.x0, figure.y0, figure.x1, figure.y1),
                   stream.get_rawdata())
    figure.add(item)
    return
def paint_path(self, gstate, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path)
if shape == 'ml':
@ -166,12 +177,25 @@ class PDFConverter(PDFPageAggregator):
##
class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.imgdir = imgdir
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n')
return
def write_image(self, image):
    """Save a JPEG image under self.imgdir and return the file name used.

    Only images whose type is one of LITERALS_DCT_DECODE are written;
    image.data is stored unchanged as '<image.name>.jpg'. For any other
    type nothing is written and None is returned.
    """
    if image.type not in LITERALS_DCT_DECODE:
        return None
    name = image.name + '.jpg'
    # NOTE(review): image.name is used as a file name without any
    # sanitising -- confirm it cannot contain path separators.
    path = os.path.join(self.imgdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(image.data)
    finally:
        # Release the handle even when the write fails.
        fp.close()
    return name
def end_page(self, page):
def render(item):
if isinstance(item, LTPage):
@ -181,11 +205,11 @@ class XMLConverter(PDFConverter):
render(child)
self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts()))
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item:
@ -209,6 +233,13 @@ class XMLConverter(PDFConverter):
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text)
elif isinstance(item, LTImage):
x = ''
if self.imgdir:
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
else:
assert 0, item
return
@ -226,10 +257,11 @@ class XMLConverter(PDFConverter):
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50):
scale=1, showpageno=True, pagepad=50, imgdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno
self.pagepad = pagepad
self.imgdir = imgdir
self.scale = scale
self.outfp.write('<html><head>\n')
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
@ -244,6 +276,24 @@ class HTMLConverter(PDFConverter):
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
return
def write_image(self, image):
    """Save a JPEG image under self.imgdir and emit an <img> tag for it.

    Only images whose type is one of LITERALS_DCT_DECODE are handled;
    for any other type the method returns without writing anything.
    image.data is stored unchanged as '<image.name>.jpg', then an
    absolutely positioned <img> element is written to self.outfp. Its
    position and size are computed from image.dstbbox, self.yoffset and
    self.scale.
    """
    if image.type not in LITERALS_DCT_DECODE:
        return
    name = image.name + '.jpg'
    # NOTE(review): image.name is used as a file name without any
    # sanitising -- confirm it cannot contain path separators.
    path = os.path.join(self.imgdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(image.data)
    finally:
        # Release the handle even when the write fails.
        fp.close()
    (x0, y0, x1, y1) = image.dstbbox
    self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
                     'width="%d" height="%d" />\n' %
                     (enc(name),
                      x0*self.scale, (self.yoffset-y1)*self.scale,
                      (x1-x0)*self.scale, (y1-y0)*self.scale))
    return
def end_page(self, page):
def render(item):
if isinstance(item, LTPage):
@ -281,6 +331,9 @@ class HTMLConverter(PDFConverter):
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTImage):
if self.imgdir:
self.write_image(item)
return
page = PDFConverter.end_page(self, page)
render(page)

View File

@ -283,6 +283,26 @@ class LTRect(LTPolygon):
return
## LTImage
##
class LTImage(object):
    """An image found on a page, kept together with its raw data.

    All constructor arguments are stored unchanged:
      name:    identifier of the image.
      type:    value describing how the data is encoded (the converters
               compare it against LITERALS_DCT_DECODE).
      srcsize: (width, height) pair.
      dstbbox: (x0, y0, x1, y1) box the image occupies.
      data:    image data.
    """

    def __init__(self, name, type, srcsize, dstbbox, data):
        self.name = name
        self.type = type
        self.srcsize = srcsize
        self.dstbbox = dstbbox
        self.data = data
        return

    def __repr__(self):
        (w, h) = self.srcsize
        # The identifier is stored as self.name; there is no self.id.
        return '<image %s %s %dx%d>' % (self.name, self.type, w, h)

    @property
    def width(self):
        """First component of srcsize."""
        return self.srcsize[0]

    @property
    def height(self):
        """Second component of srcsize."""
        return self.srcsize[1]

    def get_weight(self):
        """Return 0, the weight of an image."""
        return 0
## LTText
##
class LTText(object):

View File

@ -44,7 +44,7 @@ class PDFDevice(object):
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_image(self, stream, size):
def render_image(self, name, stream):
return
def render_string(self, textstate, seq):
return

View File

@ -293,7 +293,7 @@ class PDFPageInterpreter(object):
else:
name = literal_name(spec)
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
return PDFColorSpace(name, stream_value(spec[1])['N'])
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
return PDFColorSpace(name, len(list_value(spec[1])))
else:
@ -681,18 +681,17 @@ class PDFPageInterpreter(object):
return
if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj
subtype = xobj.dic.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj:
interpreter = self.dup()
bbox = list_value(xobj.dic['BBox'])
matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
bbox = list_value(xobj['BBox'])
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
self.device.render_image(xobj, (w,h))
self.device.render_image(xobjid, xobj)
self.device.end_figure(xobjid)
else:
# unsupported xobject type.

View File

@ -159,22 +159,22 @@ class PDFXRefStream(PDFBaseXRef):
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size']
index_array = stream.dic.get('Index', (0,size))
size = stream['Size']
index_array = stream.get('Index', (0,size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.objid_ranges.extend( ObjIdRange(start, nobjs)
for (start,nobjs) in choplist(2, index_array) )
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
(self.fl1, self.fl2, self.fl3) = stream['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic
self.trailer = stream.attrs
if debug:
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.objid_ranges),
self.fl1, self.fl2, self.fl3)))
(', '.join(map(repr, self.objid_ranges)),
self.fl1, self.fl2, self.fl3))
return
def get_trailer(self):
@ -430,11 +430,11 @@ class PDFDocument(object):
return None
if strmid:
stream = stream_value(self.getobj(strmid))
if stream.dic.get('Type') is not LITERAL_OBJSTM:
if stream.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
n = stream.dic['N']
n = stream['N']
except KeyError:
if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream)
@ -442,7 +442,7 @@ class PDFDocument(object):
if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid]
else:
parser = PDFObjStrmParser(stream.get_data())
parser = PDFObjStrmParser(stream.get_data(), self)
objs = []
try:
while 1:
@ -493,6 +493,11 @@ class PDFDocument(object):
raise PDFException('PDFDocument is not initialized')
#assert self.xrefs
def search(obj, parent):
if isinstance(obj, int):
objid = obj
tree = dict_value(self.getobj(objid)).copy()
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree:
@ -506,7 +511,7 @@ class PDFDocument(object):
elif tree.get('Type') is LITERAL_PAGE:
if 1 <= self.debug:
print >>stderr, 'Page: %r' % tree
yield (obj.objid, tree)
yield (objid, tree)
if 'Pages' not in self.catalog: return
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, pageid, tree)
@ -709,12 +714,29 @@ class PDFParser(PSStackParser):
## PDFObjStrmParser
##
class PDFObjStrmParser(PSStackParser):
class PDFObjStrmParser(PDFParser):
def __init__(self, data):
def __init__(self, data, doc):
PSStackParser.__init__(self, StringIO(data))
self.doc = doc
return
def flush(self):
    """Hand everything returned by popall() over to add_results()."""
    pending = self.popall()
    self.add_results(*pending)
    return
KEYWORD_R = KWD('R')
def do_keyword(self, pos, token):
    """Handle a keyword token found at position pos.

    For KEYWORD_R the two topmost stack entries are popped, converted to
    int as (objid, genno) and replaced by a PDFObjRef bound to self.doc.
    A PSSyntaxError raised while doing so is ignored. Every other
    keyword is pushed back onto the stack unchanged.
    """
    if token is not self.KEYWORD_R:
        # others
        self.push((pos, token))
        return
    # reference to indirect object
    try:
        ((_, objid), (_, genno)) = self.pop(2)
        (objid, genno) = (int(objid), int(genno))
        ref = PDFObjRef(self.doc, objid, genno)
        self.push((pos, ref))
    except PSSyntaxError:
        pass
    return

View File

@ -8,11 +8,15 @@ from psparser import PSException, PSObject
from psparser import LIT, KWD, STRICT
LITERAL_CRYPT = LIT('Crypt')
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
## PDF Objects
@ -145,8 +149,9 @@ def stream_value(x):
##
class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None):
self.dic = dic
def __init__(self, attrs, rawdata, decipher=None):
assert isinstance(attrs, dict)
self.attrs = attrs
self.rawdata = rawdata
self.decipher = decipher
self.data = None
@ -160,7 +165,14 @@ class PDFStream(PDFObject):
return
def __repr__(self):
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
def __contains__(self, name):
    """Return True when name is a key of the stream's attribute dict."""
    attrs = self.attrs
    return name in attrs
def __getitem__(self, name):
    """Return the stream attribute stored under name.

    Raises KeyError when the attribute dict has no such key.
    """
    attrs = self.attrs
    return attrs[name]
def get(self, name, default=None):
    """Return the stream attribute stored under name, or default if absent."""
    attrs = self.attrs
    return attrs.get(name, default)
def decomp(self,data):
buf = data
@ -181,11 +193,11 @@ class PDFStream(PDFObject):
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic:
self.data = data
self.rawdata = None
try:
filters = self['Filter']
except KeyError:
self.rawdata = self.data = data
return
filters = self.dic['Filter']
if not isinstance(filters, list):
filters = [ filters ]
for f in filters:
@ -206,10 +218,10 @@ class PDFStream(PDFObject):
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
if 'DP' in self.dic:
params = self.dic['DP']
else:
params = self.dic.get('DecodeParms', {})
try:
params = self['DP']
except KeyError:
params = self.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:

View File

@ -83,16 +83,16 @@ class PSSymbolTable(object):
"""
def __init__(self, klass):
self.dic = {}
self.dict = {}
self.klass = klass
return
def intern(self, name):
if name in self.dic:
lit = self.dic[name]
if name in self.dict:
lit = self.dict[name]
else:
lit = self.klass(name)
self.dic[name] = lit
self.dict[name] = lit
return lit
PSLiteralTable = PSSymbolTable(PSLiteral)
@ -153,7 +153,7 @@ class PSBaseParser(object):
return
def __repr__(self):
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos)
def flush(self):
return

View File

@ -9,7 +9,7 @@
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
@ -42,8 +42,13 @@ def dumpxml(out, obj, codec=None):
return
if isinstance(obj, PDFStream):
if codec == 'raw':
out.write(obj.get_rawdata())
elif codec == 'binary':
out.write(obj.get_data())
else:
out.write('<stream>\n<props>\n')
dumpxml(out, obj.dic)
dumpxml(out, obj.attrs)
out.write('\n</props>\n')
if codec == 'text':
data = obj.get_data()
@ -128,15 +133,15 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
if objids:
for objid in objids:
obj = doc.getobj(objid)
if isinstance(obj, PDFStream) and codec == 'raw':
outfp.write(obj.get_rawdata())
elif isinstance(obj, PDFStream) and codec == 'binary':
outfp.write(obj.get_data())
else:
dumpxml(outfp, obj, codec=codec)
if pagenos:
for (pageno,page) in enumerate(doc.get_pages()):
if pageno in pagenos:
if codec:
for obj in page.contents:
obj = stream_value(obj)
dumpxml(outfp, obj, codec=codec)
else:
dumpxml(outfp, page.attrs)
if dumpall:
dumpallobjs(outfp, doc, codec=codec)

View File

@ -13,10 +13,10 @@ def main(argv):
def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|xml|tag] [-o output] file ...' % argv[0])
'[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:o:C:D:m:')
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@ -29,6 +29,7 @@ def main(argv):
# output option
outfile = None
outtype = None
imgdir = None
codec = 'utf-8'
pageno = 1
scale = 1
@ -42,6 +43,7 @@ def main(argv):
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-o': outfile = v
elif k == '-I': imgdir = v
elif k == '-s': scale = float(v)
elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v
@ -73,9 +75,9 @@ def main(argv):
if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml':
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec)
else: