JPEG extraction support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-01-30 07:30:01 +00:00
parent a9d7a00ccd
commit dc6e5c366d
9 changed files with 179 additions and 66 deletions

View File

@ -1,10 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys, os.path
from pdfdevice import PDFDevice, PDFTextDevice from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
from utils import enc from utils import enc
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
@ -109,6 +110,16 @@ class PDFPageAggregator(PDFTextDevice):
self.cur_item.add(fig) self.cur_item.add(fig)
return return
def render_image(self, name, stream):
    """Record an image XObject as an LTImage inside the current figure.

    name   -- identifier passed through to LTImage as its name.
    stream -- the image stream; its 'Width' and 'Height' entries and its
              raw (undecoded) data are stored on the LTImage.

    The image's destination box is taken from the enclosing figure
    (self.cur_item), which must be an LTFigure.
    """
    assert isinstance(self.cur_item, LTFigure)
    figure = self.cur_item
    # 'Filter' may be absent (the interpreter only checks 'Width' and
    # 'Height' before calling), so look it up without raising KeyError;
    # an unfiltered image simply gets a type of None.
    item = LTImage(name, stream.get('Filter'),
                   (stream['Width'], stream['Height']),
                   (figure.x0, figure.y0, figure.x1, figure.y1),
                   stream.get_rawdata())
    figure.add(item)
    return
def paint_path(self, gstate, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
shape = ''.join(x[0] for x in path) shape = ''.join(x[0] for x in path)
if shape == 'ml': if shape == 'ml':
@ -166,11 +177,24 @@ class PDFConverter(PDFPageAggregator):
## ##
class XMLConverter(PDFConverter): class XMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None): def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.imgdir = imgdir
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec) self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n') self.outfp.write('<pages>\n')
return return
def write_image(self, image):
    """Save a JPEG (DCT-encoded) image into self.imgdir.

    image -- an object with .type, .name and .data attributes.

    Returns the file name written (relative to self.imgdir), or None
    when the image type is not one of LITERALS_DCT_DECODE and nothing
    was written.
    """
    if image.type not in LITERALS_DCT_DECODE:
        return None
    # The name originates from the PDF being processed; keep only its
    # final path component so the file cannot land outside imgdir.
    name = os.path.basename(image.name) + '.jpg'
    path = os.path.join(self.imgdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(image.data)
    finally:
        # Close the handle even if the write fails.
        fp.close()
    return name
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
@ -181,11 +205,11 @@ class XMLConverter(PDFConverter):
render(child) render(child)
self.outfp.write('</page>\n') self.outfp.write('</page>\n')
elif isinstance(item, LTLine) and item.direction: elif isinstance(item, LTLine) and item.direction:
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox())) self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox()))
elif isinstance(item, LTRect): elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox())) self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox()))
elif isinstance(item, LTPolygon): elif isinstance(item, LTPolygon):
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts())) self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts()))
elif isinstance(item, LTFigure): elif isinstance(item, LTFigure):
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox())) self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
for child in item: for child in item:
@ -209,6 +233,13 @@ class XMLConverter(PDFConverter):
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text) self.outfp.write('<text>%s</text>\n' % item.text)
elif isinstance(item, LTImage):
x = ''
if self.imgdir:
name = self.write_image(item)
if name:
x = 'name="%s" ' % enc(name)
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
else: else:
assert 0, item assert 0, item
return return
@ -226,10 +257,11 @@ class XMLConverter(PDFConverter):
class HTMLConverter(PDFConverter): class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, showpageno=True, pagepad=50): scale=1, showpageno=True, pagepad=50, imgdir=None):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
self.pagepad = pagepad self.pagepad = pagepad
self.imgdir = imgdir
self.scale = scale self.scale = scale
self.outfp.write('<html><head>\n') self.outfp.write('<html><head>\n')
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
@ -244,6 +276,24 @@ class HTMLConverter(PDFConverter):
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale)) (color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
return return
def write_image(self, image):
    """Save a JPEG (DCT-encoded) image into self.imgdir and emit an <img> tag.

    image -- an object with .type, .name, .data and .dstbbox attributes.

    Images whose type is not one of LITERALS_DCT_DECODE are skipped
    silently.  The tag is written to self.outfp, absolutely positioned
    from image.dstbbox, self.yoffset and self.scale.  Returns None.
    """
    if image.type not in LITERALS_DCT_DECODE:
        return
    # The name originates from the PDF being processed; keep only its
    # final path component so the file cannot land outside imgdir.
    name = os.path.basename(image.name) + '.jpg'
    path = os.path.join(self.imgdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(image.data)
    finally:
        # Close the handle even if the write fails.
        fp.close()
    (x0, y0, x1, y1) = image.dstbbox
    self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
                     'width="%d" height="%d" />\n' %
                     (enc(name),
                      x0*self.scale, (self.yoffset-y1)*self.scale,
                      (x1-x0)*self.scale, (y1-y0)*self.scale))
    return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
@ -281,6 +331,9 @@ class HTMLConverter(PDFConverter):
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTImage):
if self.imgdir:
self.write_image(item)
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)

View File

@ -283,6 +283,26 @@ class LTRect(LTPolygon):
return return
## LTImage
##
class LTImage(object):
    """An image placed on a page.

    name    -- identifier of the image.
    type    -- value describing how the data is encoded (stored as given).
    srcsize -- (width, height) pair describing the source image.
    dstbbox -- (x0, y0, x1, y1) box the image occupies on the page.
    data    -- the image data, stored as given.
    """

    def __init__(self, name, type, srcsize, dstbbox, data):
        self.name = name
        self.type = type
        self.srcsize = srcsize
        self.dstbbox = dstbbox
        self.data = data
        return

    # XMLConverter formats item.width / item.height for image items;
    # expose them here, taken from the source size.
    # NOTE(review): assumes the XML attributes are meant to carry the
    # source dimensions rather than the destination box size -- confirm.
    @property
    def width(self):
        """First component of srcsize."""
        return self.srcsize[0]

    @property
    def height(self):
        """Second component of srcsize."""
        return self.srcsize[1]

    def __repr__(self):
        (w, h) = self.srcsize
        # This class has no 'id' attribute; identify the image by name.
        return '<image %s %s %dx%d>' % (self.name, self.type, w, h)

    def get_weight(self):
        """Return the weight of this item, which is always 0 for an image."""
        return 0
## LTText ## LTText
## ##
class LTText(object): class LTText(object):

View File

@ -44,7 +44,7 @@ class PDFDevice(object):
def paint_path(self, graphicstate, stroke, fill, evenodd, path): def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return return
def render_image(self, stream, size): def render_image(self, name, stream):
return return
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
return return

View File

@ -293,7 +293,7 @@ class PDFPageInterpreter(object):
else: else:
name = literal_name(spec) name = literal_name(spec)
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
return PDFColorSpace(name, stream_value(spec[1]).dic['N']) return PDFColorSpace(name, stream_value(spec[1])['N'])
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
return PDFColorSpace(name, len(list_value(spec[1]))) return PDFColorSpace(name, len(list_value(spec[1])))
else: else:
@ -681,18 +681,17 @@ class PDFPageInterpreter(object):
return return
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj print >>stderr, 'Processing xobj: %r' % xobj
subtype = xobj.dic.get('Subtype') subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj.dic: if subtype is LITERAL_FORM and 'BBox' in xobj:
interpreter = self.dup() interpreter = self.dup()
bbox = list_value(xobj.dic['BBox']) bbox = list_value(xobj['BBox'])
matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
self.device.begin_figure(xobjid, bbox, matrix) self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm)) interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
(w,h) = (xobj.dic['Width'], xobj.dic['Height']) self.device.render_image(xobjid, xobj)
self.device.render_image(xobj, (w,h))
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
else: else:
# unsupported xobject type. # unsupported xobject type.

View File

@ -159,22 +159,22 @@ class PDFXRefStream(PDFBaseXRef):
(_,genno) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken() (_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject() (_,stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF: if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.') raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size'] size = stream['Size']
index_array = stream.dic.get('Index', (0,size)) index_array = stream.get('Index', (0,size))
if len(index_array) % 2 != 0: if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number') raise PDFSyntaxError('Invalid index number')
self.objid_ranges.extend( ObjIdRange(start, nobjs) self.objid_ranges.extend( ObjIdRange(start, nobjs)
for (start,nobjs) in choplist(2, index_array) ) for (start,nobjs) in choplist(2, index_array) )
(self.fl1, self.fl2, self.fl3) = stream.dic['W'] (self.fl1, self.fl2, self.fl3) = stream['W']
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic self.trailer = stream.attrs
if debug: if debug:
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.objid_ranges), (', '.join(map(repr, self.objid_ranges)),
self.fl1, self.fl2, self.fl3))) self.fl1, self.fl2, self.fl3))
return return
def get_trailer(self): def get_trailer(self):
@ -430,11 +430,11 @@ class PDFDocument(object):
return None return None
if strmid: if strmid:
stream = stream_value(self.getobj(strmid)) stream = stream_value(self.getobj(strmid))
if stream.dic.get('Type') is not LITERAL_OBJSTM: if stream.get('Type') is not LITERAL_OBJSTM:
if STRICT: if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream) raise PDFSyntaxError('Not a stream object: %r' % stream)
try: try:
n = stream.dic['N'] n = stream['N']
except KeyError: except KeyError:
if STRICT: if STRICT:
raise PDFSyntaxError('N is not defined: %r' % stream) raise PDFSyntaxError('N is not defined: %r' % stream)
@ -442,7 +442,7 @@ class PDFDocument(object):
if strmid in self.parsed_objs: if strmid in self.parsed_objs:
objs = self.parsed_objs[strmid] objs = self.parsed_objs[strmid]
else: else:
parser = PDFObjStrmParser(stream.get_data()) parser = PDFObjStrmParser(stream.get_data(), self)
objs = [] objs = []
try: try:
while 1: while 1:
@ -493,7 +493,12 @@ class PDFDocument(object):
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
#assert self.xrefs #assert self.xrefs
def search(obj, parent): def search(obj, parent):
tree = dict_value(obj).copy() if isinstance(obj, int):
objid = obj
tree = dict_value(self.getobj(objid)).copy()
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems(): for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree: if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
@ -506,7 +511,7 @@ class PDFDocument(object):
elif tree.get('Type') is LITERAL_PAGE: elif tree.get('Type') is LITERAL_PAGE:
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Page: %r' % tree print >>stderr, 'Page: %r' % tree
yield (obj.objid, tree) yield (objid, tree)
if 'Pages' not in self.catalog: return if 'Pages' not in self.catalog: return
for (pageid,tree) in search(self.catalog['Pages'], self.catalog): for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, pageid, tree) yield PDFPage(self, pageid, tree)
@ -709,12 +714,29 @@ class PDFParser(PSStackParser):
## PDFObjStrmParser ## PDFObjStrmParser
## ##
class PDFObjStrmParser(PSStackParser): class PDFObjStrmParser(PDFParser):
def __init__(self, data): def __init__(self, data, doc):
PSStackParser.__init__(self, StringIO(data)) PSStackParser.__init__(self, StringIO(data))
self.doc = doc
return return
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
return return
KEYWORD_R = KWD('R')
def do_keyword(self, pos, token):
    """Handle a keyword token.

    Any keyword other than R is pushed back on the stack unchanged.
    For R, the two topmost stack entries are popped, converted to
    integers and replaced by a PDFObjRef bound to self.doc; a
    PSSyntaxError raised while doing so is ignored.
    """
    if token is not self.KEYWORD_R:
        # Not an indirect reference: keep the token as it is.
        self.push((pos, token))
        return
    try:
        ((_, raw_id), (_, raw_gen)) = self.pop(2)
        objid = int(raw_id)
        genno = int(raw_gen)
        ref = PDFObjRef(self.doc, objid, genno)
        self.push((pos, ref))
    except PSSyntaxError:
        pass
    return

View File

@ -8,11 +8,15 @@ from psparser import PSException, PSObject
from psparser import LIT, KWD, STRICT from psparser import LIT, KWD, STRICT
LITERAL_CRYPT = LIT('Crypt') LITERAL_CRYPT = LIT('Crypt')
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl')) LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85')) LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
## PDF Objects ## PDF Objects
@ -145,8 +149,9 @@ def stream_value(x):
## ##
class PDFStream(PDFObject): class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None): def __init__(self, attrs, rawdata, decipher=None):
self.dic = dic assert isinstance(attrs, dict)
self.attrs = attrs
self.rawdata = rawdata self.rawdata = rawdata
self.decipher = decipher self.decipher = decipher
self.data = None self.data = None
@ -160,7 +165,14 @@ class PDFStream(PDFObject):
return return
def __repr__(self): def __repr__(self):
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic) return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
def __contains__(self, name):
    """Return True if `name` is a key of this stream's attribute dictionary."""
    return name in self.attrs
def __getitem__(self, name):
    """Return the attribute stored under `name`; raises KeyError if it is absent."""
    return self.attrs[name]
def get(self, name, default=None):
    """Return the attribute stored under `name`, or `default` if it is absent."""
    return self.attrs.get(name, default)
def decomp(self,data): def decomp(self,data):
buf = data buf = data
@ -181,11 +193,11 @@ class PDFStream(PDFObject):
if self.decipher: if self.decipher:
# Handle encryption # Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic: try:
self.data = data filters = self['Filter']
self.rawdata = None except KeyError:
self.rawdata = self.data = data
return return
filters = self.dic['Filter']
if not isinstance(filters, list): if not isinstance(filters, list):
filters = [ filters ] filters = [ filters ]
for f in filters: for f in filters:
@ -206,10 +218,10 @@ class PDFStream(PDFObject):
else: else:
raise PDFNotImplementedError('Unsupported filter: %r' % f) raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors # apply predictors
if 'DP' in self.dic: try:
params = self.dic['DP'] params = self['DP']
else: except KeyError:
params = self.dic.get('DecodeParms', {}) params = self.get('DecodeParms', {})
if 'Predictor' in params: if 'Predictor' in params:
pred = int_value(params['Predictor']) pred = int_value(params['Predictor'])
if pred: if pred:

View File

@ -83,16 +83,16 @@ class PSSymbolTable(object):
""" """
def __init__(self, klass): def __init__(self, klass):
self.dic = {} self.dict = {}
self.klass = klass self.klass = klass
return return
def intern(self, name): def intern(self, name):
if name in self.dic: if name in self.dict:
lit = self.dic[name] lit = self.dict[name]
else: else:
lit = self.klass(name) lit = self.klass(name)
self.dic[name] = lit self.dict[name] = lit
return lit return lit
PSLiteralTable = PSSymbolTable(PSLiteral) PSLiteralTable = PSSymbolTable(PSLiteral)
@ -153,7 +153,7 @@ class PSBaseParser(object):
return return
def __repr__(self): def __repr__(self):
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos) return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos)
def flush(self): def flush(self):
return return

View File

@ -9,7 +9,7 @@
import sys, re import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1 from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]') ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
@ -42,13 +42,18 @@ def dumpxml(out, obj, codec=None):
return return
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
out.write('<stream>\n<props>\n') if codec == 'raw':
dumpxml(out, obj.dic) out.write(obj.get_rawdata())
out.write('\n</props>\n') elif codec == 'binary':
if codec == 'text': out.write(obj.get_data())
data = obj.get_data() else:
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data))) out.write('<stream>\n<props>\n')
out.write('</stream>') dumpxml(out, obj.attrs)
out.write('\n</props>\n')
if codec == 'text':
data = obj.get_data()
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
out.write('</stream>')
return return
if isinstance(obj, PDFObjRef): if isinstance(obj, PDFObjRef):
@ -128,16 +133,16 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
if objids: if objids:
for objid in objids: for objid in objids:
obj = doc.getobj(objid) obj = doc.getobj(objid)
if isinstance(obj, PDFStream) and codec == 'raw': dumpxml(outfp, obj, codec=codec)
outfp.write(obj.get_rawdata())
elif isinstance(obj, PDFStream) and codec == 'binary':
outfp.write(obj.get_data())
else:
dumpxml(outfp, obj, codec=codec)
if pagenos: if pagenos:
for (pageno,page) in enumerate(doc.get_pages()): for (pageno,page) in enumerate(doc.get_pages()):
if pageno in pagenos: if pageno in pagenos:
dumpxml(outfp, page.attrs) if codec:
for obj in page.contents:
obj = stream_value(obj)
dumpxml(outfp, obj, codec=codec)
else:
dumpxml(outfp, page.attrs)
if dumpall: if dumpall:
dumpallobjs(outfp, doc, codec=codec) dumpallobjs(outfp, doc, codec=codec)
if (not objids) and (not pagenos) and (not dumpall): if (not objids) and (not pagenos) and (not dumpall):

View File

@ -13,10 +13,10 @@ def main(argv):
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
'[-t text|html|xml|tag] [-o output] file ...' % argv[0]) '[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:o:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -29,6 +29,7 @@ def main(argv):
# output option # output option
outfile = None outfile = None
outtype = None outtype = None
imgdir = None
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
scale = 1 scale = 1
@ -42,6 +43,7 @@ def main(argv):
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-I': imgdir = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
elif k == '-n': laparams = None elif k == '-n': laparams = None
elif k == '-D': laparams.direction = v elif k == '-D': laparams.direction = v
@ -73,9 +75,9 @@ def main(argv):
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams) device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams) device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec) device = TagExtractor(rsrc, outfp, codec=codec)
else: else: