JPEG extraction support added.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@174 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
a9d7a00ccd
commit
dc6e5c366d
|
@ -1,10 +1,11 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys, os.path
|
||||||
from pdfdevice import PDFDevice, PDFTextDevice
|
from pdfdevice import PDFDevice, PDFTextDevice
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
|
from pdftypes import LITERALS_DCT_DECODE
|
||||||
from layout import LayoutContainer
|
from layout import LayoutContainer
|
||||||
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
||||||
from layout import LTFigure, LTTextItem, LTTextBox, LTTextLine
|
from layout import LTFigure, LTImage, LTTextItem, LTTextBox, LTTextLine
|
||||||
from utils import enc
|
from utils import enc
|
||||||
from utils import apply_matrix_pt, mult_matrix
|
from utils import apply_matrix_pt, mult_matrix
|
||||||
|
|
||||||
|
@ -109,6 +110,16 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
self.cur_item.add(fig)
|
self.cur_item.add(fig)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def render_image(self, name, stream):
|
||||||
|
assert isinstance(self.cur_item, LTFigure)
|
||||||
|
item = LTImage(name, stream['Filter'],
|
||||||
|
(stream['Width'], stream['Height']),
|
||||||
|
(self.cur_item.x0, self.cur_item.y0,
|
||||||
|
self.cur_item.x1, self.cur_item.y1),
|
||||||
|
stream.get_rawdata())
|
||||||
|
self.cur_item.add(item)
|
||||||
|
return
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||||
shape = ''.join(x[0] for x in path)
|
shape = ''.join(x[0] for x in path)
|
||||||
if shape == 'ml':
|
if shape == 'ml':
|
||||||
|
@ -166,11 +177,24 @@ class PDFConverter(PDFPageAggregator):
|
||||||
##
|
##
|
||||||
class XMLConverter(PDFConverter):
|
class XMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None):
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None, imgdir=None):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
|
self.imgdir = imgdir
|
||||||
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
||||||
self.outfp.write('<pages>\n')
|
self.outfp.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write_image(self, image):
|
||||||
|
if image.type in LITERALS_DCT_DECODE:
|
||||||
|
ext = '.jpg'
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
name = image.name+ext
|
||||||
|
path = os.path.join(self.imgdir, name)
|
||||||
|
fp = file(path, 'wb')
|
||||||
|
fp.write(image.data)
|
||||||
|
fp.close()
|
||||||
|
return name
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
|
@ -181,11 +205,11 @@ class XMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write('</page>\n')
|
||||||
elif isinstance(item, LTLine) and item.direction:
|
elif isinstance(item, LTLine) and item.direction:
|
||||||
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % (item.linewidth, item.direction, item.get_bbox()))
|
self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />\n' % (item.linewidth, item.direction, item.get_bbox()))
|
||||||
elif isinstance(item, LTRect):
|
elif isinstance(item, LTRect):
|
||||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />' % (item.linewidth, item.get_bbox()))
|
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' % (item.linewidth, item.get_bbox()))
|
||||||
elif isinstance(item, LTPolygon):
|
elif isinstance(item, LTPolygon):
|
||||||
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>' % (item.linewidth, item.get_bbox(), item.get_pts()))
|
self.outfp.write('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' % (item.linewidth, item.get_bbox(), item.get_pts()))
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
|
||||||
for child in item:
|
for child in item:
|
||||||
|
@ -209,6 +233,13 @@ class XMLConverter(PDFConverter):
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
|
elif isinstance(item, LTImage):
|
||||||
|
x = ''
|
||||||
|
if self.imgdir:
|
||||||
|
name = self.write_image(item)
|
||||||
|
if name:
|
||||||
|
x = 'name="%s" ' % enc(name)
|
||||||
|
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' % (x, item.type, item.width, item.height))
|
||||||
else:
|
else:
|
||||||
assert 0, item
|
assert 0, item
|
||||||
return
|
return
|
||||||
|
@ -226,10 +257,11 @@ class XMLConverter(PDFConverter):
|
||||||
class HTMLConverter(PDFConverter):
|
class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
scale=1, showpageno=True, pagepad=50):
|
scale=1, showpageno=True, pagepad=50, imgdir=None):
|
||||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
|
self.imgdir = imgdir
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.outfp.write('<html><head>\n')
|
self.outfp.write('<html><head>\n')
|
||||||
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
|
self.outfp.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' %
|
||||||
|
@ -244,6 +276,24 @@ class HTMLConverter(PDFConverter):
|
||||||
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
(color, width, x*self.scale, y*self.scale, w*self.scale, h*self.scale))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write_image(self, image):
|
||||||
|
if image.type in LITERALS_DCT_DECODE:
|
||||||
|
ext = '.jpg'
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
name = image.name+ext
|
||||||
|
path = os.path.join(self.imgdir, name)
|
||||||
|
fp = file(path, 'wb')
|
||||||
|
fp.write(image.data)
|
||||||
|
fp.close()
|
||||||
|
(x0,y0,x1,y1) = image.dstbbox
|
||||||
|
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||||
|
'width="%d" height="%d" />\n' %
|
||||||
|
(enc(name),
|
||||||
|
x0*self.scale, (self.yoffset-y1)*self.scale,
|
||||||
|
(x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||||
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
|
@ -281,6 +331,9 @@ class HTMLConverter(PDFConverter):
|
||||||
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
self.write_rect('green', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
|
elif isinstance(item, LTImage):
|
||||||
|
if self.imgdir:
|
||||||
|
self.write_image(item)
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
|
|
|
@ -283,6 +283,26 @@ class LTRect(LTPolygon):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## LTImage
|
||||||
|
##
|
||||||
|
class LTImage(object):
|
||||||
|
|
||||||
|
def __init__(self, name, type, srcsize, dstbbox, data):
|
||||||
|
self.name = name
|
||||||
|
self.type = type
|
||||||
|
self.srcsize = srcsize
|
||||||
|
self.dstbbox = dstbbox
|
||||||
|
self.data = data
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
(w,h) = self.srcsize
|
||||||
|
return '<image %s %s %dx%d>' % (self.id, self.type, w, h)
|
||||||
|
|
||||||
|
def get_weight(self):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
## LTText
|
||||||
##
|
##
|
||||||
class LTText(object):
|
class LTText(object):
|
||||||
|
|
|
@ -44,7 +44,7 @@ class PDFDevice(object):
|
||||||
|
|
||||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||||
return
|
return
|
||||||
def render_image(self, stream, size):
|
def render_image(self, name, stream):
|
||||||
return
|
return
|
||||||
def render_string(self, textstate, seq):
|
def render_string(self, textstate, seq):
|
||||||
return
|
return
|
||||||
|
|
|
@ -293,7 +293,7 @@ class PDFPageInterpreter(object):
|
||||||
else:
|
else:
|
||||||
name = literal_name(spec)
|
name = literal_name(spec)
|
||||||
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
||||||
return PDFColorSpace(name, stream_value(spec[1]).dic['N'])
|
return PDFColorSpace(name, stream_value(spec[1])['N'])
|
||||||
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
||||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||||
else:
|
else:
|
||||||
|
@ -681,18 +681,17 @@ class PDFPageInterpreter(object):
|
||||||
return
|
return
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing xobj: %r' % xobj
|
print >>stderr, 'Processing xobj: %r' % xobj
|
||||||
subtype = xobj.dic.get('Subtype')
|
subtype = xobj.get('Subtype')
|
||||||
if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
|
if subtype is LITERAL_FORM and 'BBox' in xobj:
|
||||||
interpreter = self.dup()
|
interpreter = self.dup()
|
||||||
bbox = list_value(xobj.dic['BBox'])
|
bbox = list_value(xobj['BBox'])
|
||||||
matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY))
|
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
|
||||||
self.device.begin_figure(xobjid, bbox, matrix)
|
self.device.begin_figure(xobjid, bbox, matrix)
|
||||||
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
|
interpreter.render_contents(dict_value(xobj.get('Resources')), [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
|
||||||
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
|
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
|
||||||
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
|
self.device.render_image(xobjid, xobj)
|
||||||
self.device.render_image(xobj, (w,h))
|
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
else:
|
else:
|
||||||
# unsupported xobject type.
|
# unsupported xobject type.
|
||||||
|
|
|
@ -159,22 +159,22 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
(_,genno) = parser.nexttoken() # ignored
|
(_,genno) = parser.nexttoken() # ignored
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
(_,stream) = parser.nextobject()
|
(_,stream) = parser.nextobject()
|
||||||
if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
|
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
|
||||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||||
size = stream.dic['Size']
|
size = stream['Size']
|
||||||
index_array = stream.dic.get('Index', (0,size))
|
index_array = stream.get('Index', (0,size))
|
||||||
if len(index_array) % 2 != 0:
|
if len(index_array) % 2 != 0:
|
||||||
raise PDFSyntaxError('Invalid index number')
|
raise PDFSyntaxError('Invalid index number')
|
||||||
self.objid_ranges.extend( ObjIdRange(start, nobjs)
|
self.objid_ranges.extend( ObjIdRange(start, nobjs)
|
||||||
for (start,nobjs) in choplist(2, index_array) )
|
for (start,nobjs) in choplist(2, index_array) )
|
||||||
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
|
(self.fl1, self.fl2, self.fl3) = stream['W']
|
||||||
self.data = stream.get_data()
|
self.data = stream.get_data()
|
||||||
self.entlen = self.fl1+self.fl2+self.fl3
|
self.entlen = self.fl1+self.fl2+self.fl3
|
||||||
self.trailer = stream.dic
|
self.trailer = stream.attrs
|
||||||
if debug:
|
if debug:
|
||||||
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
|
||||||
(', '.join(map(repr, self.objid_ranges),
|
(', '.join(map(repr, self.objid_ranges)),
|
||||||
self.fl1, self.fl2, self.fl3)))
|
self.fl1, self.fl2, self.fl3))
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_trailer(self):
|
def get_trailer(self):
|
||||||
|
@ -430,11 +430,11 @@ class PDFDocument(object):
|
||||||
return None
|
return None
|
||||||
if strmid:
|
if strmid:
|
||||||
stream = stream_value(self.getobj(strmid))
|
stream = stream_value(self.getobj(strmid))
|
||||||
if stream.dic.get('Type') is not LITERAL_OBJSTM:
|
if stream.get('Type') is not LITERAL_OBJSTM:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||||
try:
|
try:
|
||||||
n = stream.dic['N']
|
n = stream['N']
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||||
|
@ -442,7 +442,7 @@ class PDFDocument(object):
|
||||||
if strmid in self.parsed_objs:
|
if strmid in self.parsed_objs:
|
||||||
objs = self.parsed_objs[strmid]
|
objs = self.parsed_objs[strmid]
|
||||||
else:
|
else:
|
||||||
parser = PDFObjStrmParser(stream.get_data())
|
parser = PDFObjStrmParser(stream.get_data(), self)
|
||||||
objs = []
|
objs = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -493,7 +493,12 @@ class PDFDocument(object):
|
||||||
raise PDFException('PDFDocument is not initialized')
|
raise PDFException('PDFDocument is not initialized')
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
tree = dict_value(obj).copy()
|
if isinstance(obj, int):
|
||||||
|
objid = obj
|
||||||
|
tree = dict_value(self.getobj(objid)).copy()
|
||||||
|
else:
|
||||||
|
objid = obj.objid
|
||||||
|
tree = dict_value(obj).copy()
|
||||||
for (k,v) in parent.iteritems():
|
for (k,v) in parent.iteritems():
|
||||||
if k in self.INHERITABLE_ATTRS and k not in tree:
|
if k in self.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
|
@ -506,7 +511,7 @@ class PDFDocument(object):
|
||||||
elif tree.get('Type') is LITERAL_PAGE:
|
elif tree.get('Type') is LITERAL_PAGE:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Page: %r' % tree
|
print >>stderr, 'Page: %r' % tree
|
||||||
yield (obj.objid, tree)
|
yield (objid, tree)
|
||||||
if 'Pages' not in self.catalog: return
|
if 'Pages' not in self.catalog: return
|
||||||
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
|
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
|
||||||
yield PDFPage(self, pageid, tree)
|
yield PDFPage(self, pageid, tree)
|
||||||
|
@ -709,12 +714,29 @@ class PDFParser(PSStackParser):
|
||||||
|
|
||||||
## PDFObjStrmParser
|
## PDFObjStrmParser
|
||||||
##
|
##
|
||||||
class PDFObjStrmParser(PSStackParser):
|
class PDFObjStrmParser(PDFParser):
|
||||||
|
|
||||||
def __init__(self, data):
|
def __init__(self, data, doc):
|
||||||
PSStackParser.__init__(self, StringIO(data))
|
PSStackParser.__init__(self, StringIO(data))
|
||||||
|
self.doc = doc
|
||||||
return
|
return
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
return
|
return
|
||||||
|
|
||||||
|
KEYWORD_R = KWD('R')
|
||||||
|
def do_keyword(self, pos, token):
|
||||||
|
if token is self.KEYWORD_R:
|
||||||
|
# reference to indirect object
|
||||||
|
try:
|
||||||
|
((_,objid), (_,genno)) = self.pop(2)
|
||||||
|
(objid, genno) = (int(objid), int(genno))
|
||||||
|
obj = PDFObjRef(self.doc, objid, genno)
|
||||||
|
self.push((pos, obj))
|
||||||
|
except PSSyntaxError:
|
||||||
|
pass
|
||||||
|
return
|
||||||
|
# others
|
||||||
|
self.push((pos, token))
|
||||||
|
return
|
||||||
|
|
|
@ -8,11 +8,15 @@ from psparser import PSException, PSObject
|
||||||
from psparser import LIT, KWD, STRICT
|
from psparser import LIT, KWD, STRICT
|
||||||
|
|
||||||
LITERAL_CRYPT = LIT('Crypt')
|
LITERAL_CRYPT = LIT('Crypt')
|
||||||
|
|
||||||
|
# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
|
||||||
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
|
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
|
||||||
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
|
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
|
||||||
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
|
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
|
||||||
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
|
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
|
||||||
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
|
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
|
||||||
|
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
|
||||||
|
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
||||||
|
|
||||||
|
|
||||||
## PDF Objects
|
## PDF Objects
|
||||||
|
@ -145,8 +149,9 @@ def stream_value(x):
|
||||||
##
|
##
|
||||||
class PDFStream(PDFObject):
|
class PDFStream(PDFObject):
|
||||||
|
|
||||||
def __init__(self, dic, rawdata, decipher=None):
|
def __init__(self, attrs, rawdata, decipher=None):
|
||||||
self.dic = dic
|
assert isinstance(attrs, dict)
|
||||||
|
self.attrs = attrs
|
||||||
self.rawdata = rawdata
|
self.rawdata = rawdata
|
||||||
self.decipher = decipher
|
self.decipher = decipher
|
||||||
self.data = None
|
self.data = None
|
||||||
|
@ -160,7 +165,14 @@ class PDFStream(PDFObject):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
|
||||||
|
|
||||||
|
def __contains__(self, name):
|
||||||
|
return name in self.attrs
|
||||||
|
def __getitem__(self, name):
|
||||||
|
return self.attrs[name]
|
||||||
|
def get(self, name, default=None):
|
||||||
|
return self.attrs.get(name, default)
|
||||||
|
|
||||||
def decomp(self,data):
|
def decomp(self,data):
|
||||||
buf = data
|
buf = data
|
||||||
|
@ -181,11 +193,11 @@ class PDFStream(PDFObject):
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
# Handle encryption
|
# Handle encryption
|
||||||
data = self.decipher(self.objid, self.genno, data)
|
data = self.decipher(self.objid, self.genno, data)
|
||||||
if 'Filter' not in self.dic:
|
try:
|
||||||
self.data = data
|
filters = self['Filter']
|
||||||
self.rawdata = None
|
except KeyError:
|
||||||
|
self.rawdata = self.data = data
|
||||||
return
|
return
|
||||||
filters = self.dic['Filter']
|
|
||||||
if not isinstance(filters, list):
|
if not isinstance(filters, list):
|
||||||
filters = [ filters ]
|
filters = [ filters ]
|
||||||
for f in filters:
|
for f in filters:
|
||||||
|
@ -206,10 +218,10 @@ class PDFStream(PDFObject):
|
||||||
else:
|
else:
|
||||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||||
# apply predictors
|
# apply predictors
|
||||||
if 'DP' in self.dic:
|
try:
|
||||||
params = self.dic['DP']
|
params = self['DP']
|
||||||
else:
|
except KeyError:
|
||||||
params = self.dic.get('DecodeParms', {})
|
params = self.get('DecodeParms', {})
|
||||||
if 'Predictor' in params:
|
if 'Predictor' in params:
|
||||||
pred = int_value(params['Predictor'])
|
pred = int_value(params['Predictor'])
|
||||||
if pred:
|
if pred:
|
||||||
|
|
|
@ -83,16 +83,16 @@ class PSSymbolTable(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, klass):
|
def __init__(self, klass):
|
||||||
self.dic = {}
|
self.dict = {}
|
||||||
self.klass = klass
|
self.klass = klass
|
||||||
return
|
return
|
||||||
|
|
||||||
def intern(self, name):
|
def intern(self, name):
|
||||||
if name in self.dic:
|
if name in self.dict:
|
||||||
lit = self.dic[name]
|
lit = self.dict[name]
|
||||||
else:
|
else:
|
||||||
lit = self.klass(name)
|
lit = self.klass(name)
|
||||||
self.dic[name] = lit
|
self.dict[name] = lit
|
||||||
return lit
|
return lit
|
||||||
|
|
||||||
PSLiteralTable = PSSymbolTable(PSLiteral)
|
PSLiteralTable = PSSymbolTable(PSLiteral)
|
||||||
|
@ -153,7 +153,7 @@ class PSBaseParser(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
|
return '<%s: %r, bufpos=%d>' % (self.__class__.__name__, self.fp, self.bufpos)
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
return
|
return
|
||||||
|
|
|
@ -9,7 +9,7 @@
|
||||||
import sys, re
|
import sys, re
|
||||||
from pdfminer.psparser import PSKeyword, PSLiteral
|
from pdfminer.psparser import PSKeyword, PSLiteral
|
||||||
from pdfminer.pdfparser import PDFDocument, PDFParser
|
from pdfminer.pdfparser import PDFDocument, PDFParser
|
||||||
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1
|
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
|
||||||
|
|
||||||
|
|
||||||
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
|
ESC_PAT = re.compile(r'[\000-\037&<>()\042\047\134\177-\377]')
|
||||||
|
@ -42,13 +42,18 @@ def dumpxml(out, obj, codec=None):
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
out.write('<stream>\n<props>\n')
|
if codec == 'raw':
|
||||||
dumpxml(out, obj.dic)
|
out.write(obj.get_rawdata())
|
||||||
out.write('\n</props>\n')
|
elif codec == 'binary':
|
||||||
if codec == 'text':
|
out.write(obj.get_data())
|
||||||
data = obj.get_data()
|
else:
|
||||||
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
out.write('<stream>\n<props>\n')
|
||||||
out.write('</stream>')
|
dumpxml(out, obj.attrs)
|
||||||
|
out.write('\n</props>\n')
|
||||||
|
if codec == 'text':
|
||||||
|
data = obj.get_data()
|
||||||
|
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
||||||
|
out.write('</stream>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFObjRef):
|
if isinstance(obj, PDFObjRef):
|
||||||
|
@ -128,16 +133,16 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
if objids:
|
if objids:
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
if isinstance(obj, PDFStream) and codec == 'raw':
|
dumpxml(outfp, obj, codec=codec)
|
||||||
outfp.write(obj.get_rawdata())
|
|
||||||
elif isinstance(obj, PDFStream) and codec == 'binary':
|
|
||||||
outfp.write(obj.get_data())
|
|
||||||
else:
|
|
||||||
dumpxml(outfp, obj, codec=codec)
|
|
||||||
if pagenos:
|
if pagenos:
|
||||||
for (pageno,page) in enumerate(doc.get_pages()):
|
for (pageno,page) in enumerate(doc.get_pages()):
|
||||||
if pageno in pagenos:
|
if pageno in pagenos:
|
||||||
dumpxml(outfp, page.attrs)
|
if codec:
|
||||||
|
for obj in page.contents:
|
||||||
|
obj = stream_value(obj)
|
||||||
|
dumpxml(outfp, obj, codec=codec)
|
||||||
|
else:
|
||||||
|
dumpxml(outfp, page.attrs)
|
||||||
if dumpall:
|
if dumpall:
|
||||||
dumpallobjs(outfp, doc, codec=codec)
|
dumpallobjs(outfp, doc, codec=codec)
|
||||||
if (not objids) and (not pagenos) and (not dumpall):
|
if (not objids) and (not pagenos) and (not dumpall):
|
||||||
|
|
|
@ -13,10 +13,10 @@ def main(argv):
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] '
|
||||||
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
'[-n] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||||
'[-t text|html|xml|tag] [-o output] file ...' % argv[0])
|
'[-t text|html|xml|tag] [-I imgdir] [-o output] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:o:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:nD:M:L:W:t:I:o:C:D:m:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -29,6 +29,7 @@ def main(argv):
|
||||||
# output option
|
# output option
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
|
imgdir = None
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
|
@ -42,6 +43,7 @@ def main(argv):
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
|
elif k == '-I': imgdir = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
elif k == '-n': laparams = None
|
elif k == '-n': laparams = None
|
||||||
elif k == '-D': laparams.direction = v
|
elif k == '-D': laparams.direction = v
|
||||||
|
@ -73,9 +75,9 @@ def main(argv):
|
||||||
if outtype == 'text':
|
if outtype == 'text':
|
||||||
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
||||||
elif outtype == 'xml':
|
elif outtype == 'xml':
|
||||||
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
|
device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, imgdir=imgdir)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
|
device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, imgdir=imgdir)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue