898 lines
25 KiB
Python
898 lines
25 KiB
Python
|
|
import re
|
|
import logging
|
|
from io import BytesIO
|
|
from .cmapdb import CMapDB
|
|
from .cmapdb import CMap
|
|
from .psparser import PSTypeError
|
|
from .psparser import PSEOF
|
|
from .psparser import PSKeyword
|
|
from .psparser import literal_name
|
|
from .psparser import keyword_name
|
|
from .psparser import PSStackParser
|
|
from .psparser import LIT
|
|
from .psparser import KWD
|
|
from . import settings
|
|
from .pdftypes import PDFException
|
|
from .pdftypes import PDFStream
|
|
from .pdftypes import PDFObjRef
|
|
from .pdftypes import resolve1
|
|
from .pdftypes import list_value
|
|
from .pdftypes import dict_value
|
|
from .pdftypes import stream_value
|
|
from .pdffont import PDFFontError
|
|
from .pdffont import PDFType1Font
|
|
from .pdffont import PDFTrueTypeFont
|
|
from .pdffont import PDFType3Font
|
|
from .pdffont import PDFCIDFont
|
|
from .pdfcolor import PDFColorSpace
|
|
from .pdfcolor import PREDEFINED_COLORSPACE
|
|
from .utils import choplist
|
|
from .utils import mult_matrix
|
|
from .utils import MATRIX_IDENTITY
|
|
|
|
import six # Python 2+3 compatibility
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
## Exceptions
|
|
##
|
|
class PDFResourceError(PDFException):
    """Error raised for problems with PDF resources."""
|
|
|
|
class PDFInterpreterError(PDFException):
    """Error raised by the page interpreter (strict mode only in this module)."""
|
|
|
|
## Constants
|
|
##
|
|
# Name literals used below; they are compared by identity (``is``).
(LITERAL_PDF,
 LITERAL_TEXT,
 LITERAL_FONT,
 LITERAL_FORM,
 LITERAL_IMAGE) = map(LIT, ('PDF', 'Text', 'Font', 'Form', 'Image'))
|
|
|
|
## PDFTextState
|
|
##
|
|
class PDFTextState(object):
    """Text parameters tracked while interpreting a content stream.

    Holds the selected font and its size, the spacing/scaling/leading/
    rendering/rise parameters, and the text matrix plus the line matrix
    offset.
    """

    # Every attribute, in the order used by copy() and __repr__().
    _FIELDS = ('font', 'fontsize', 'charspace', 'wordspace', 'scaling',
               'leading', 'render', 'rise', 'matrix', 'linematrix')

    def __init__(self):
        self.font = None
        self.fontsize = self.charspace = self.wordspace = 0
        self.scaling = 100
        self.leading = self.render = self.rise = 0
        # reset() provides self.matrix and self.linematrix.
        self.reset()

    def __repr__(self):
        template = ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
                    ' scaling=%r, leading=%r, render=%r, rise=%r, '
                    ' matrix=%r, linematrix=%r>')
        return template % tuple(getattr(self, name) for name in self._FIELDS)

    def copy(self):
        """Return a new PDFTextState carrying the same attribute values."""
        clone = PDFTextState()
        for name in self._FIELDS:
            setattr(clone, name, getattr(self, name))
        return clone

    def reset(self):
        """Restore the text matrix to identity and the line matrix to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)
|
|
|
|
|
|
## PDFGraphicState
|
|
##
|
|
class PDFGraphicState(object):
    """Graphics parameters tracked while interpreting a content stream.

    Holds the line style parameters (width, cap, join, miter limit, dash),
    rendering intent, flatness, and the stroking / non-stroking colors.
    """

    # Every attribute, in the order used by copy() and __repr__().
    _FIELDS = ('linewidth', 'linecap', 'linejoin', 'miterlimit', 'dash',
               'intent', 'flatness', 'scolor', 'ncolor')

    def __init__(self):
        self.linewidth = 0
        (self.linecap, self.linejoin, self.miterlimit,
         self.dash, self.intent, self.flatness) = (None,) * 6
        self.scolor = None  # stroking color
        self.ncolor = None  # non stroking color

    def copy(self):
        """Return a new PDFGraphicState carrying the same attribute values."""
        clone = PDFGraphicState()
        for name in self._FIELDS:
            setattr(clone, name, getattr(self, name))
        return clone

    def __repr__(self):
        template = ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                    ' miterlimit=%r, dash=%r, intent=%r, flatness=%r, '
                    ' stroking color=%r, non stroking color=%r>')
        return template % tuple(getattr(self, name) for name in self._FIELDS)
|
|
|
|
|
|
## Resource Manager
|
|
##
|
|
class PDFResourceManager(object):
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching=True):
        # When caching is true, fonts created by get_font() are remembered
        # under their object id.
        self.caching = caching
        self._cached_fonts = {}

    def get_procset(self, procs):
        """Walk the ProcSet entries; every entry is accepted and ignored."""
        for _proc in procs:
            # /PDF, /Text and any other procedure set are tolerated alike.
            pass

    def get_cmap(self, cmapname, strict=False):
        """Look up a CMap by name.

        If the CMap is not found, the error is re-raised when ``strict`` is
        true; otherwise an empty ``CMap()`` is returned.
        """
        try:
            cmap = CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            cmap = CMap()
        return cmap

    def get_font(self, objid, spec):
        """Return the font object for ``spec``, creating it if necessary.

        ``objid`` is the cache key; a falsy ``objid`` bypasses the cache.
        Under ``settings.STRICT`` a malformed spec raises ``PDFFontError``;
        otherwise unknown or missing subtypes fall back to a Type1 font.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]

        log.info('get_font: create: objid=%r, spec=%r', objid, spec)
        if settings.STRICT and spec['Type'] is not LITERAL_FONT:
            raise PDFFontError('Type is not /Font')

        # Work out which kind of font is being described.
        if 'Subtype' in spec:
            subtype = literal_name(spec['Subtype'])
        elif settings.STRICT:
            raise PDFFontError('Font Subtype is not specified.')
        else:
            subtype = 'Type1'

        if subtype in ('Type1', 'MMType1'):
            font = PDFType1Font(self, spec)
        elif subtype == 'TrueType':
            font = PDFTrueTypeFont(self, spec)
        elif subtype == 'Type3':
            font = PDFType3Font(self, spec)
        elif subtype in ('CIDFontType0', 'CIDFontType2'):
            font = PDFCIDFont(self, spec)
        elif subtype == 'Type0':
            # Composite font: build the first descendant font, letting the
            # parent's Encoding / ToUnicode entries override the child's.
            descendants = list_value(spec['DescendantFonts'])
            assert descendants
            subspec = dict_value(descendants[0]).copy()
            for key in ('Encoding', 'ToUnicode'):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(None, subspec)
        elif settings.STRICT:
            raise PDFFontError('Invalid Font spec: %r' % spec)
        else:
            font = PDFType1Font(self, spec)  # this is so wrong!

        if objid and self.caching:
            self._cached_fonts[objid] = font
        return font
|
|
|
|
|
|
## PDFContentParser
|
|
##
|
|
class PDFContentParser(PSStackParser):
    """Stack parser that reads a list of content streams as one input.

    The streams are opened one after another as each is exhausted, and
    inline images (``BI`` ... ``ID`` ... ``EI``) are turned into
    ``PDFStream`` objects.
    """

    def __init__(self, streams):
        self.streams = streams
        self.istream = 0
        PSStackParser.__init__(self, None)

    def fillfp(self):
        """Make sure ``self.fp`` is set, opening the next stream if needed.

        Raises ``PSEOF`` when no stream is left.
        """
        if self.fp:
            return
        if len(self.streams) <= self.istream:
            raise PSEOF('Unexpected EOF, file truncated?')
        strm = stream_value(self.streams[self.istream])
        self.istream += 1
        self.fp = BytesIO(strm.get_data())

    def seek(self, pos):
        self.fillfp()
        PSStackParser.seek(self, pos)

    def fillbuf(self):
        """Refill ``self.buf`` once it has been consumed, crossing streams."""
        if self.charpos < len(self.buf):
            return
        while True:
            self.fillfp()
            self.bufpos = self.fp.tell()
            self.buf = self.fp.read(self.BUFSIZ)
            if not self.buf:
                # Current stream is exhausted; drop it so that fillfp()
                # opens the next one.
                self.fp = None
                continue
            break
        self.charpos = 0

    def get_inline_data(self, pos, target=b'EI'):
        """Collect raw bytes from ``pos`` up to ``target`` plus one whitespace.

        Returns ``(pos, data)`` where the trailing target, the whitespace
        byte after it and one final end-of-line sequence are removed.
        """
        self.seek(pos)
        matched = 0  # number of target bytes matched so far
        data = b''
        target_len = len(target)
        while matched <= target_len:
            self.fillbuf()
            if not matched:
                # Not inside a candidate match: jump to the next occurrence
                # of the first target byte, or swallow the whole buffer.
                try:
                    hit = self.buf.index(target[0], self.charpos)
                except ValueError:
                    data += self.buf[self.charpos:]
                    self.charpos = len(self.buf)
                else:
                    data += self.buf[self.charpos:hit + 1]
                    self.charpos = hit + 1
                    matched = 1
                continue
            c = six.int2byte(six.indexbytes(self.buf, self.charpos))
            data += c
            self.charpos += 1
            if target_len <= matched:
                # Whole target seen; it must be followed by whitespace.
                advance = c.isspace()
            else:
                expected = six.int2byte(target[matched]) if six.PY3 else target[matched]
                advance = (c == expected)
            matched = matched + 1 if advance else 0
        data = data[:-(target_len + 1)]  # strip the last part
        data = re.sub(br'(\x0d\x0a|[\x0d\x0a])$', b'', data)
        return (pos, data)

    def flush(self):
        self.add_results(*self.popall())

    KEYWORD_BI = KWD(b'BI')
    KEYWORD_ID = KWD(b'ID')
    KEYWORD_EI = KWD(b'EI')

    def do_keyword(self, pos, token):
        """Handle a keyword; ``BI``/``ID`` build an inline image stream."""
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, 'inline')
            return
        if token is not self.KEYWORD_ID:
            self.push((pos, token))
            return
        try:
            (_, objs) = self.end_type('inline')
            if len(objs) % 2 != 0:
                raise PSTypeError('Invalid dictionary construct: %r' % objs)
            params = {literal_name(k): v for (k, v) in choplist(2, objs)}
            (pos, data) = self.get_inline_data(pos + len(b'ID '))
            self.push((pos, PDFStream(params, data)))
            self.push((pos, self.KEYWORD_EI))
        except PSTypeError:
            if settings.STRICT:
                raise
|
|
|
|
|
|
## Interpreter
|
|
##
|
|
class PDFPageInterpreter(object):
    """Execute PDF content-stream operators against an output device.

    Operands found in a content stream are pushed on ``argstack``; each
    operator keyword is dispatched to the matching ``do_<name>`` method
    (see :meth:`execute`), which updates the text/graphic state and calls
    into ``device``.
    """

    def __init__(self, rsrcmgr, device):
        self.rsrcmgr = rsrcmgr
        self.device = device
        return

    def dup(self):
        """Return a new interpreter of the same class, sharing the
        resource manager and device."""
        return self.__class__(self.rsrcmgr, self.device)

    # init_resources(resources):
    #   Prepare the fonts and XObjects listed in the Resource attribute.
    def init_resources(self, resources):
        """Build ``fontmap``, ``xobjmap`` and ``csmap`` from ``resources``.

        A falsy ``resources`` leaves the maps empty (``csmap`` holding only
        the predefined color spaces).
        """
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
                return PDFColorSpace(name, stream_value(spec[1])['N'])
            elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for (k, v) in six.iteritems(dict_value(resources)):
            log.debug('Resource: %r: %r', k, v)
            if k == 'Font':
                for (fontid, spec) in six.iteritems(dict_value(v)):
                    # Only indirect references carry an object id that can
                    # serve as a font cache key.
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                    spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
            elif k == 'ColorSpace':
                for (csid, spec) in six.iteritems(dict_value(v)):
                    self.csmap[csid] = get_colorspace(resolve1(spec))
            elif k == 'ProcSet':
                self.rsrcmgr.get_procset(list_value(v))
            elif k == 'XObject':
                for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
                    self.xobjmap[xobjid] = xobjstrm
        return

    # init_state(ctm)
    #   Initialize the text and graphic states for rendering a page.
    def init_state(self, ctm):
        """Reset all interpreter state and install ``ctm`` on the device."""
        # gstack: stack for graphical states.
        self.gstack = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFTextState()
        self.graphicstate = PDFGraphicState()
        self.curpath = []
        # argstack: stack for command arguments.
        self.argstack = []
        # set some global states.
        self.scs = self.ncs = None
        if self.csmap:
            # Default both color spaces to the first entry of csmap.
            self.scs = self.ncs = six.next(six.itervalues(self.csmap))
        return

    def push(self, obj):
        """Push an operand on the argument stack."""
        self.argstack.append(obj)
        return

    def pop(self, n):
        """Remove and return the last ``n`` operands (fewer if the stack
        is shorter); ``n == 0`` yields an empty list."""
        if n == 0:
            return []
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self):
        """Return ``(ctm, textstate copy, graphicstate copy)``."""
        return (self.ctm, self.textstate.copy(), self.graphicstate.copy())

    def set_current_state(self, state):
        """Restore a state tuple produced by :meth:`get_current_state`."""
        (self.ctm, self.textstate, self.graphicstate) = state
        self.device.set_ctm(self.ctm)
        return

    # gsave
    def do_q(self):
        self.gstack.append(self.get_current_state())
        return

    # grestore
    def do_Q(self):
        # An unbalanced Q (empty stack) is silently ignored.
        if self.gstack:
            self.set_current_state(self.gstack.pop())
        return

    # concat-matrix
    def do_cm(self, a1, b1, c1, d1, e1, f1):
        self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
        self.device.set_ctm(self.ctm)
        return

    # setlinewidth
    def do_w(self, linewidth):
        self.graphicstate.linewidth = linewidth
        return

    # setlinecap
    def do_J(self, linecap):
        self.graphicstate.linecap = linecap
        return

    # setlinejoin
    def do_j(self, linejoin):
        self.graphicstate.linejoin = linejoin
        return

    # setmiterlimit
    def do_M(self, miterlimit):
        self.graphicstate.miterlimit = miterlimit
        return

    # setdash
    def do_d(self, dash, phase):
        self.graphicstate.dash = (dash, phase)
        return

    # setintent
    def do_ri(self, intent):
        self.graphicstate.intent = intent
        return

    # setflatness
    def do_i(self, flatness):
        self.graphicstate.flatness = flatness
        return

    # load-gstate
    def do_gs(self, name):
        # XXX not implemented; the operand is consumed and ignored.
        return

    # moveto
    def do_m(self, x, y):
        self.curpath.append(('m', x, y))
        return

    # lineto
    def do_l(self, x, y):
        self.curpath.append(('l', x, y))
        return

    # curveto
    def do_c(self, x1, y1, x2, y2, x3, y3):
        self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
        return

    # curveto (v variant: operands are x2 y2 x3 y3)
    def do_v(self, x2, y2, x3, y3):
        self.curpath.append(('v', x2, y2, x3, y3))
        return

    # curveto (y variant: operands are x1 y1 x3 y3)
    def do_y(self, x1, y1, x3, y3):
        self.curpath.append(('y', x1, y1, x3, y3))
        return

    # closepath
    def do_h(self):
        self.curpath.append(('h',))
        return

    # rectangle
    def do_re(self, x, y, w, h):
        self.curpath.append(('m', x, y))
        self.curpath.append(('l', x+w, y))
        self.curpath.append(('l', x+w, y+h))
        self.curpath.append(('l', x, y+h))
        self.curpath.append(('h',))
        return

    # stroke
    def do_S(self):
        self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
        self.curpath = []
        return

    # close-and-stroke
    def do_s(self):
        self.do_h()
        self.do_S()
        return

    # fill
    def do_f(self):
        self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []
        return
    # fill (obsolete)
    do_F = do_f

    # fill-even-odd
    def do_f_a(self):
        self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []
        return

    # fill-and-stroke
    def do_B(self):
        self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []
        return

    # fill-and-stroke-even-odd
    def do_B_a(self):
        self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []
        return

    # close-fill-and-stroke
    def do_b(self):
        self.do_h()
        self.do_B()
        return

    # close-fill-and-stroke-even-odd
    def do_b_a(self):
        self.do_h()
        self.do_B_a()
        return

    # close-only
    def do_n(self):
        self.curpath = []
        return

    # clip
    def do_W(self):
        return

    # clip-even-odd
    def do_W_a(self):
        return

    # setcolorspace-stroking
    def do_CS(self, name):
        try:
            self.scs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
        return

    # setcolorspace-non-stroking
    def do_cs(self, name):
        try:
            self.ncs = self.csmap[literal_name(name)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined ColorSpace: %r' % name)
        return

    def _set_color(self, color, stroking):
        """Record ``color`` as the stroking or the non-stroking color.

        The value goes into ``scolor`` / ``ncolor``, the attributes that
        ``PDFGraphicState`` initialises, copies and reports, so it survives
        a ``q``/``Q`` save-and-restore.
        """
        if stroking:
            self.graphicstate.scolor = color
        else:
            self.graphicstate.ncolor = color
        # Legacy attribute written by earlier versions of these operators;
        # still set so any existing reader of ``color`` keeps working.
        self.graphicstate.color = color
        return

    # setgray-stroking
    def do_G(self, gray):
        self._set_color(gray, True)
        #self.do_CS(LITERAL_DEVICE_GRAY)
        return

    # setgray-non-stroking
    def do_g(self, gray):
        self._set_color(gray, False)
        #self.do_cs(LITERAL_DEVICE_GRAY)
        return

    # setrgb-stroking
    def do_RG(self, r, g, b):
        self._set_color((r, g, b), True)
        #self.do_CS(LITERAL_DEVICE_RGB)
        return

    # setrgb-non-stroking
    def do_rg(self, r, g, b):
        self._set_color((r, g, b), False)
        #self.do_cs(LITERAL_DEVICE_RGB)
        return

    # setcmyk-stroking
    def do_K(self, c, m, y, k):
        self._set_color((c, m, y, k), True)
        #self.do_CS(LITERAL_DEVICE_CMYK)
        return

    # setcmyk-non-stroking
    def do_k(self, c, m, y, k):
        self._set_color((c, m, y, k), False)
        #self.do_cs(LITERAL_DEVICE_CMYK)
        return

    # setcolor
    def do_SCN(self):
        """Set the stroking color from as many operands as the current
        stroking color space has components (1 if none is set)."""
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError('No colorspace specified!')
            n = 1
        self.graphicstate.scolor = self.pop(n)
        return

    def do_scn(self):
        """Non-stroking counterpart of :meth:`do_SCN`."""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError('No colorspace specified!')
            n = 1
        self.graphicstate.ncolor = self.pop(n)
        return

    def do_SC(self):
        self.do_SCN()
        return

    def do_sc(self):
        self.do_scn()
        return

    # shading (not implemented; the operand is consumed and ignored)
    def do_sh(self, name):
        return

    # begin-text
    def do_BT(self):
        self.textstate.reset()
        return

    # end-text
    def do_ET(self):
        return

    # begin-compat
    def do_BX(self):
        return

    # end-compat
    def do_EX(self):
        return

    # marked content operators
    def do_MP(self, tag):
        self.device.do_tag(tag)
        return

    def do_DP(self, tag, props):
        self.device.do_tag(tag, props)
        return

    def do_BMC(self, tag):
        self.device.begin_tag(tag)
        return

    def do_BDC(self, tag, props):
        self.device.begin_tag(tag, props)
        return

    def do_EMC(self):
        self.device.end_tag()
        return

    # setcharspace
    def do_Tc(self, space):
        self.textstate.charspace = space
        return

    # setwordspace
    def do_Tw(self, space):
        self.textstate.wordspace = space
        return

    # textscale
    def do_Tz(self, scale):
        self.textstate.scaling = scale
        return

    # setleading
    def do_TL(self, leading):
        # Stored negated: do_T_a() adds the stored value to move down a line,
        # and do_TD() stores its ty operand directly.
        self.textstate.leading = -leading
        return

    # selectfont
    def do_Tf(self, fontid, fontsize):
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined Font id: %r' % fontid)
            # Lenient mode: fall back to whatever font an empty spec yields.
            self.textstate.font = self.rsrcmgr.get_font(None, {})
        self.textstate.fontsize = fontsize
        return

    # setrendering
    def do_Tr(self, render):
        self.textstate.render = render
        return

    # settextrise
    def do_Ts(self, rise):
        self.textstate.rise = rise
        return

    # text-move
    def do_Td(self, tx, ty):
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
        self.textstate.linematrix = (0, 0)
        return

    # text-move (also sets the leading)
    def do_TD(self, tx, ty):
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
        self.textstate.leading = ty
        self.textstate.linematrix = (0, 0)
        return

    # textmatrix
    def do_Tm(self, a, b, c, d, e, f):
        self.textstate.matrix = (a, b, c, d, e, f)
        self.textstate.linematrix = (0, 0)
        return

    # nextline
    def do_T_a(self):
        (a, b, c, d, e, f) = self.textstate.matrix
        leading = self.textstate.leading
        self.textstate.matrix = (a, b, c, d, leading*c+e, leading*d+f)
        self.textstate.linematrix = (0, 0)
        return

    # show-pos
    def do_TJ(self, seq):
        if self.textstate.font is None:
            if settings.STRICT:
                raise PDFInterpreterError('No font specified!')
            return
        self.device.render_string(self.textstate, seq)
        return

    # show
    def do_Tj(self, s):
        self.do_TJ([s])
        return

    # quote
    def do__q(self, s):
        self.do_T_a()
        self.do_TJ([s])
        return

    # doublequote
    def do__w(self, aw, ac, s):
        """Set word and character spacing, move to the next line, show ``s``.

        The PDF specification defines ``aw ac string "`` as equivalent to
        ``aw Tw  ac Tc  string '``, so the next-line move of the quote
        operator must happen here as well.
        """
        self.do_Tw(aw)
        self.do_Tc(ac)
        self.do__q(s)
        return

    # inline image
    def do_BI(self):  # never called
        return

    def do_ID(self):  # never called
        return

    def do_EI(self, obj):
        """Render an inline image if it declares both ``W`` and ``H``."""
        if 'W' in obj and 'H' in obj:
            iobjid = str(id(obj))
            self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(iobjid, obj)
            self.device.end_figure(iobjid)
        return

    # invoke an XObject
    def do_Do(self, xobjid):
        """Render the named XObject (form or image); others are ignored."""
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
            return
        log.info('Processing xobj: %r', xobj)
        subtype = xobj.get('Subtype')
        if subtype is LITERAL_FORM and 'BBox' in xobj:
            interpreter = self.dup()
            bbox = list_value(xobj['BBox'])
            matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get('Resources')
            if xobjres:
                resources = dict_value(xobjres)
            elif self.resources:
                resources = self.resources.copy()
            else:
                # No resources anywhere (None or empty): pass the value
                # through; init_resources() accepts a falsy argument.
                resources = self.resources
            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
            self.device.end_figure(xobjid)
        elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass
        return

    def process_page(self, page):
        """Render one page: choose the initial CTM from the page's media
        box and rotation, then interpret its content streams."""
        log.info('Processing page: %r', page)
        (x0, y0, x1, y1) = page.mediabox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.end_page(page)
        return

    # render_contents(resources, streams, ctm)
    #   Render the content streams.
    #   This method may be called recursively.
    def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
        log.info('render_contents: resources=%r, streams=%r, ctm=%r',
                 resources, streams, ctm)
        self.init_resources(resources)
        self.init_state(ctm)
        self.execute(list_value(streams))
        return

    def execute(self, streams):
        """Parse ``streams`` and dispatch every operator to ``do_<name>``.

        In operator names ``*`` maps to ``_a``, ``"`` to ``_w`` and ``'``
        to ``_q``. An operator is skipped when fewer operands are available
        than its method takes; unknown operators raise only under
        ``settings.STRICT``.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while 1:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
                if hasattr(self, method):
                    func = getattr(self, method)
                    # Operand count = positional parameters minus ``self``.
                    nargs = six.get_function_code(func).co_argcount-1
                    if nargs:
                        args = self.pop(nargs)
                        log.debug('exec: %s %r', name, args)
                        if len(args) == nargs:
                            func(*args)
                    else:
                        log.debug('exec: %s', name)
                        func()
                else:
                    if settings.STRICT:
                        raise PDFInterpreterError('Unknown operator: %r' % name)
            else:
                self.push(obj)
        return
|