tests pass under Py 2.7 and 3.4
parent
b0e035c24f
commit
faea7291a8
|
@ -13,6 +13,7 @@
|
||||||
import sys
|
import sys
|
||||||
import array
|
import array
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
## BitParser
|
## BitParser
|
||||||
##
|
##
|
||||||
|
@ -26,7 +27,7 @@ class BitParser(object):
|
||||||
def add(klass, root, v, bits):
|
def add(klass, root, v, bits):
|
||||||
p = root
|
p = root
|
||||||
b = None
|
b = None
|
||||||
for i in xrange(len(bits)):
|
for i in range(len(bits)):
|
||||||
if 0 < i:
|
if 0 < i:
|
||||||
if p[b] is None:
|
if p[b] is None:
|
||||||
p[b] = [None, None]
|
p[b] = [None, None]
|
||||||
|
@ -686,6 +687,25 @@ class TestCCITTG4Parser(unittest.TestCase):
|
||||||
|
|
||||||
## CCITTFaxDecoder
|
## CCITTFaxDecoder
|
||||||
##
|
##
|
||||||
|
def test___init__(self):
|
||||||
|
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||||
|
raise SkipTest # TODO: implement your test here
|
||||||
|
|
||||||
|
def test_feedbytes(self):
|
||||||
|
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||||
|
# assert_equal(expected, c_citt_g4_parser.feedbytes(data))
|
||||||
|
raise SkipTest # TODO: implement your test here
|
||||||
|
|
||||||
|
def test_output_line(self):
|
||||||
|
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||||
|
# assert_equal(expected, c_citt_g4_parser.output_line(y, bits))
|
||||||
|
raise SkipTest # TODO: implement your test here
|
||||||
|
|
||||||
|
def test_reset(self):
|
||||||
|
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||||
|
# assert_equal(expected, c_citt_g4_parser.reset())
|
||||||
|
raise SkipTest # TODO: implement your test here
|
||||||
|
|
||||||
class CCITTFaxDecoder(CCITTG4Parser):
|
class CCITTFaxDecoder(CCITTG4Parser):
|
||||||
|
|
||||||
def __init__(self, width, bytealign=False, reversed=False):
|
def __init__(self, width, bytealign=False, reversed=False):
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from .psparser import LIT
|
from .psparser import LIT
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
## PDFColorSpace
|
## PDFColorSpace
|
||||||
##
|
##
|
||||||
|
@ -20,15 +21,17 @@ class PDFColorSpace(object):
|
||||||
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||||
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = dict(
|
PREDEFINED_COLORSPACE = {}
|
||||||
(name, PDFColorSpace(name, n)) for (name, n) in {
|
for (name, n) in six.iteritems({
|
||||||
'CalRGB': 3,
|
'CalRGB': 3,
|
||||||
'CalGray': 1,
|
'CalGray': 1,
|
||||||
'Lab': 3,
|
'Lab': 3,
|
||||||
'DeviceRGB': 3,
|
'DeviceRGB': 3,
|
||||||
'DeviceCMYK': 4,
|
'DeviceCMYK': 4,
|
||||||
'DeviceGray': 1,
|
'DeviceGray': 1,
|
||||||
'Separation': 1,
|
'Separation': 1,
|
||||||
'Indexed': 1,
|
'Indexed': 1,
|
||||||
'Pattern': 1,
|
'Pattern': 1,
|
||||||
}.iteritems())
|
}) :
|
||||||
|
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
try:
|
try:
|
||||||
import hashlib as md5
|
import hashlib as md5
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -107,10 +109,13 @@ class PDFXRef(PDFBaseXRef):
|
||||||
if len(f) != 2:
|
if len(f) != 2:
|
||||||
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
|
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
|
||||||
try:
|
try:
|
||||||
(start, nobjs) = map(long, f)
|
if six.PY2:
|
||||||
|
(start, nobjs) = map(long, f)
|
||||||
|
else:
|
||||||
|
(start, nobjs) = map(int, f)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
|
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
|
||||||
for objid in xrange(start, start+nobjs):
|
for objid in range(start, start+nobjs):
|
||||||
try:
|
try:
|
||||||
(_, line) = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
|
@ -121,17 +126,15 @@ class PDFXRef(PDFBaseXRef):
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
if use != b'n':
|
if use != b'n':
|
||||||
continue
|
continue
|
||||||
self.offsets[objid] = (None, long(pos), int(genno))
|
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
|
||||||
logging.info('xref objects: %r' % self.offsets)
|
logging.info('xref objects: %r' % self.offsets)
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
return
|
return
|
||||||
|
|
||||||
KEYWORD_TRAILER = KWD('trailer')
|
|
||||||
|
|
||||||
def load_trailer(self, parser):
|
def load_trailer(self, parser):
|
||||||
try:
|
try:
|
||||||
(_, kwd) = parser.nexttoken()
|
(_, kwd) = parser.nexttoken()
|
||||||
assert kwd is self.KEYWORD_TRAILER
|
assert kwd.name == 'trailer'
|
||||||
(_, dic) = parser.nextobject()
|
(_, dic) = parser.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
x = parser.pop(1)
|
x = parser.pop(1)
|
||||||
|
@ -145,7 +148,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
return self.trailer
|
return self.trailer
|
||||||
|
|
||||||
def get_objids(self):
|
def get_objids(self):
|
||||||
return self.offsets.iterkeys()
|
return six.iterkeys(self.offsets)
|
||||||
|
|
||||||
def get_pos(self, objid):
|
def get_pos(self, objid):
|
||||||
try:
|
try:
|
||||||
|
@ -175,6 +178,8 @@ class PDFXRefFallback(PDFXRef):
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
logging.info('trailer: %r' % self.get_trailer())
|
logging.info('trailer: %r' % self.get_trailer())
|
||||||
break
|
break
|
||||||
|
if six.PY3:
|
||||||
|
line=line.decode('utf-8')
|
||||||
m = self.PDFOBJ_CUE.match(line)
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
|
@ -634,8 +639,6 @@ class PDFDocument(object):
|
||||||
pass
|
pass
|
||||||
return (objs, n)
|
return (objs, n)
|
||||||
|
|
||||||
KEYWORD_OBJ = KWD('obj')
|
|
||||||
|
|
||||||
def _getobj_parse(self, pos, objid):
|
def _getobj_parse(self, pos, objid):
|
||||||
self._parser.seek(pos)
|
self._parser.seek(pos)
|
||||||
(_, objid1) = self._parser.nexttoken() # objid
|
(_, objid1) = self._parser.nexttoken() # objid
|
||||||
|
@ -643,7 +646,7 @@ class PDFDocument(object):
|
||||||
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
||||||
(_, genno) = self._parser.nexttoken() # genno
|
(_, genno) = self._parser.nexttoken() # genno
|
||||||
(_, kwd) = self._parser.nexttoken()
|
(_, kwd) = self._parser.nexttoken()
|
||||||
if kwd is not self.KEYWORD_OBJ:
|
if kwd.name !='obj':
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||||
(_, obj) = self._parser.nextobject()
|
(_, obj) = self._parser.nextobject()
|
||||||
return obj
|
return obj
|
||||||
|
@ -762,7 +765,7 @@ class PDFDocument(object):
|
||||||
else:
|
else:
|
||||||
raise PDFNoValidXRef('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
logging.info('xref found: pos=%r' % prev)
|
logging.info('xref found: pos=%r' % prev)
|
||||||
return long(prev)
|
return long(prev) if six.PY2 else int(prev)
|
||||||
|
|
||||||
# read xref table
|
# read xref table
|
||||||
def read_xref_from(self, parser, start, xrefs):
|
def read_xref_from(self, parser, start, xrefs):
|
||||||
|
|
|
@ -31,6 +31,7 @@ from .utils import choplist
|
||||||
from .utils import mult_matrix
|
from .utils import mult_matrix
|
||||||
from .utils import MATRIX_IDENTITY
|
from .utils import MATRIX_IDENTITY
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
##
|
##
|
||||||
|
@ -41,15 +42,6 @@ class PDFInterpreterError(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
## Constants
|
|
||||||
##
|
|
||||||
LITERAL_PDF = LIT('PDF')
|
|
||||||
LITERAL_TEXT = LIT('Text')
|
|
||||||
LITERAL_FONT = LIT('Font')
|
|
||||||
LITERAL_FORM = LIT('Form')
|
|
||||||
LITERAL_IMAGE = LIT('Image')
|
|
||||||
|
|
||||||
|
|
||||||
## PDFTextState
|
## PDFTextState
|
||||||
##
|
##
|
||||||
class PDFTextState(object):
|
class PDFTextState(object):
|
||||||
|
@ -341,7 +333,7 @@ class PDFPageInterpreter(object):
|
||||||
return PDFColorSpace(name, len(list_value(spec[1])))
|
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||||
else:
|
else:
|
||||||
return PREDEFINED_COLORSPACE.get(name)
|
return PREDEFINED_COLORSPACE.get(name)
|
||||||
for (k, v) in dict_value(resources).iteritems():
|
for (k, v) in six.iteritems(dict_value(resources)):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
logging.debug('Resource: %r: %r' % (k, v))
|
logging.debug('Resource: %r: %r' % (k, v))
|
||||||
if k == 'Font':
|
if k == 'Font':
|
||||||
|
@ -352,7 +344,7 @@ class PDFPageInterpreter(object):
|
||||||
spec = dict_value(spec)
|
spec = dict_value(spec)
|
||||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||||
elif k == 'ColorSpace':
|
elif k == 'ColorSpace':
|
||||||
for (csid, spec) in dict_value(v).iteritems():
|
for (csid, spec) in six.iteritems(dict_value(v)):
|
||||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||||
elif k == 'ProcSet':
|
elif k == 'ProcSet':
|
||||||
self.rsrcmgr.get_procset(list_value(v))
|
self.rsrcmgr.get_procset(list_value(v))
|
||||||
|
@ -376,7 +368,7 @@ class PDFPageInterpreter(object):
|
||||||
# set some global states.
|
# set some global states.
|
||||||
self.scs = self.ncs = None
|
self.scs = self.ncs = None
|
||||||
if self.csmap:
|
if self.csmap:
|
||||||
self.scs = self.ncs = self.csmap.values()[0]
|
self.scs = self.ncs = six.next(six.itervalues(self.csmap))
|
||||||
return
|
return
|
||||||
|
|
||||||
def push(self, obj):
|
def push(self, obj):
|
||||||
|
|
|
@ -10,10 +10,7 @@ from .pdfparser import PDFParser
|
||||||
from .pdfdocument import PDFDocument
|
from .pdfdocument import PDFDocument
|
||||||
from .pdfdocument import PDFTextExtractionNotAllowed
|
from .pdfdocument import PDFTextExtractionNotAllowed
|
||||||
|
|
||||||
# some predefined literals and keywords.
|
import six # Python 2+3 compatibility
|
||||||
LITERAL_PAGE = LIT('Page')
|
|
||||||
LITERAL_PAGES = LIT('Pages')
|
|
||||||
|
|
||||||
|
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
|
@ -82,15 +79,15 @@ class PDFPage(object):
|
||||||
else:
|
else:
|
||||||
objid = obj.objid
|
objid = obj.objid
|
||||||
tree = dict_value(obj).copy()
|
tree = dict_value(obj).copy()
|
||||||
for (k, v) in parent.iteritems():
|
for (k, v) in six.iteritems(parent):
|
||||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
if tree.get('Type').name=='Pages' and 'Kids' in tree:
|
||||||
logging.info('Pages: Kids=%r' % tree['Kids'])
|
logging.info('Pages: Kids=%r' % tree['Kids'])
|
||||||
for c in list_value(tree['Kids']):
|
for c in list_value(tree['Kids']):
|
||||||
for x in search(c, tree):
|
for x in search(c, tree):
|
||||||
yield x
|
yield x
|
||||||
elif tree.get('Type') is LITERAL_PAGE:
|
elif tree.get('Type').name=='Page':
|
||||||
logging.info('Page: %r' % tree)
|
logging.info('Page: %r' % tree)
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
pages = False
|
pages = False
|
||||||
|
|
|
@ -120,9 +120,7 @@ class PDFParser(PSStackParser):
|
||||||
data += line
|
data += line
|
||||||
self.seek(pos+objlen)
|
self.seek(pos+objlen)
|
||||||
# XXX limit objlen not to exceed object boundary
|
# XXX limit objlen not to exceed object boundary
|
||||||
if self.debug:
|
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % (pos, objlen, dic, data[:10]))
|
||||||
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
|
||||||
(pos, objlen, dic, data[:10]))
|
|
||||||
obj = PDFStream(dic, data, self.doc.decipher)
|
obj = PDFStream(dic, data, self.doc.decipher)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,29 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
|
def bytes(s,i,j=None):
|
||||||
|
"""implements s[i], s[i:], s[i:j] for Python2 and Python3"""
|
||||||
|
if six.PY2:
|
||||||
|
if j is None:
|
||||||
|
return s[i]
|
||||||
|
if j<0:
|
||||||
|
return s[i:]
|
||||||
|
return s[i:j]
|
||||||
|
else: # six.PY3
|
||||||
|
if i<0 : i=len(s)+i
|
||||||
|
if j is None: j=i+1
|
||||||
|
if j<0 : j=len(s)
|
||||||
|
return b''.join(six.int2byte(s[_]) for _ in range(i,j))
|
||||||
|
|
||||||
from .utils import choplist
|
from .utils import choplist
|
||||||
|
|
||||||
STRICT = 0
|
STRICT = 0
|
||||||
|
|
||||||
|
|
||||||
## PS Exceptions
|
## PS Exceptions
|
||||||
##
|
##
|
||||||
class PSException(Exception):
|
class PSException(Exception):
|
||||||
|
@ -57,10 +75,10 @@ class PSLiteral(PSObject):
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '/%s' % self.name
|
name=self.name
|
||||||
|
return '/%r' % name
|
||||||
|
|
||||||
|
|
||||||
## PSKeyword
|
## PSKeyword
|
||||||
|
@ -82,7 +100,8 @@ class PSKeyword(PSObject):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.name
|
name=self.name
|
||||||
|
return '/%r' % name
|
||||||
|
|
||||||
|
|
||||||
## PSSymbolTable
|
## PSSymbolTable
|
||||||
|
@ -159,8 +178,6 @@ class PSBaseParser(object):
|
||||||
"""
|
"""
|
||||||
BUFSIZ = 4096
|
BUFSIZ = 4096
|
||||||
|
|
||||||
debug = 0
|
|
||||||
|
|
||||||
def __init__(self, fp):
|
def __init__(self, fp):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.seek(0)
|
self.seek(0)
|
||||||
|
@ -191,8 +208,7 @@ class PSBaseParser(object):
|
||||||
def seek(self, pos):
|
def seek(self, pos):
|
||||||
"""Seeks the parser to the given position.
|
"""Seeks the parser to the given position.
|
||||||
"""
|
"""
|
||||||
if self.debug:
|
logging.debug('seek: %r' % pos)
|
||||||
logging.debug('seek: %r' % pos)
|
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
# reset the status for nextline()
|
# reset the status for nextline()
|
||||||
self.bufpos = pos
|
self.bufpos = pos
|
||||||
|
@ -225,7 +241,7 @@ class PSBaseParser(object):
|
||||||
while 1:
|
while 1:
|
||||||
self.fillbuf()
|
self.fillbuf()
|
||||||
if eol:
|
if eol:
|
||||||
c = self.buf[self.charpos]
|
c = bytes(self.buf,self.charpos)
|
||||||
# handle b'\r\n'
|
# handle b'\r\n'
|
||||||
if c == b'\n':
|
if c == b'\n':
|
||||||
linebuf += c
|
linebuf += c
|
||||||
|
@ -233,17 +249,17 @@ class PSBaseParser(object):
|
||||||
break
|
break
|
||||||
m = EOL.search(self.buf, self.charpos)
|
m = EOL.search(self.buf, self.charpos)
|
||||||
if m:
|
if m:
|
||||||
linebuf += self.buf[self.charpos:m.end(0)]
|
linebuf += bytes(self.buf,self.charpos,m.end(0))
|
||||||
self.charpos = m.end(0)
|
self.charpos = m.end(0)
|
||||||
if linebuf[-1] == b'\r':
|
if bytes(linebuf,-1) == b'\r':
|
||||||
eol = True
|
eol = True
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
linebuf += self.buf[self.charpos:]
|
linebuf += bytes(self.buf,self.charpos,-1)
|
||||||
self.charpos = len(self.buf)
|
self.charpos = len(self.buf)
|
||||||
if self.debug:
|
logging.debug('nextline: %r, %r' % (linepos, linebuf))
|
||||||
logging.debug('nextline: %r, %r' % (linepos, linebuf))
|
|
||||||
return (linepos, linebuf)
|
return (linepos, linebuf)
|
||||||
|
|
||||||
def revreadlines(self):
|
def revreadlines(self):
|
||||||
|
@ -266,8 +282,8 @@ class PSBaseParser(object):
|
||||||
if n == -1:
|
if n == -1:
|
||||||
buf = s + buf
|
buf = s + buf
|
||||||
break
|
break
|
||||||
yield s[n:]+buf
|
yield bytes(s,n,-1)+buf
|
||||||
s = s[:n]
|
s = bytes(s,0,n)
|
||||||
buf = b''
|
buf = b''
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -276,7 +292,7 @@ class PSBaseParser(object):
|
||||||
if not m:
|
if not m:
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
c = s[j]
|
c = bytes(s,j)
|
||||||
self._curtokenpos = self.bufpos+j
|
self._curtokenpos = self.bufpos+j
|
||||||
if c == b'%':
|
if c == b'%':
|
||||||
self._curtoken = b'%'
|
self._curtoken = b'%'
|
||||||
|
@ -322,10 +338,10 @@ class PSBaseParser(object):
|
||||||
def _parse_comment(self, s, i):
|
def _parse_comment(self, s, i):
|
||||||
m = EOL.search(s, i)
|
m = EOL.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return (self._parse_comment, len(s))
|
return (self._parse_comment, len(s))
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
# We ignore comments.
|
# We ignore comments.
|
||||||
#self._tokens.append(self._curtoken)
|
#self._tokens.append(self._curtoken)
|
||||||
|
@ -334,37 +350,41 @@ class PSBaseParser(object):
|
||||||
def _parse_literal(self, s, i):
|
def _parse_literal(self, s, i):
|
||||||
m = END_LITERAL.search(s, i)
|
m = END_LITERAL.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
c = s[j]
|
c = bytes(s,j)
|
||||||
if c == b'#':
|
if c == b'#':
|
||||||
self.hex = b''
|
self.hex = b''
|
||||||
self._parse1 = self._parse_literal_hex
|
self._parse1 = self._parse_literal_hex
|
||||||
return j+1
|
return j+1
|
||||||
self._add_token(LIT(unicode(self._curtoken)))
|
try:
|
||||||
|
self._curtoken=str(self._curtoken,'utf-8')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
self._add_token(LIT(self._curtoken))
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def _parse_literal_hex(self, s, i):
|
def _parse_literal_hex(self, s, i):
|
||||||
c = s[i]
|
c = bytes(s,i)
|
||||||
if HEX.match(c) and len(self.hex) < 2:
|
if HEX.match(c) and len(self.hex) < 2:
|
||||||
self.hex += c
|
self.hex += c
|
||||||
return i+1
|
return i+1
|
||||||
if self.hex:
|
if self.hex:
|
||||||
self._curtoken += chr(int(self.hex, 16))
|
self._curtoken += six.int2byte(int(self.hex, 16))
|
||||||
self._parse1 = self._parse_literal
|
self._parse1 = self._parse_literal
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def _parse_number(self, s, i):
|
def _parse_number(self, s, i):
|
||||||
m = END_NUMBER.search(s, i)
|
m = END_NUMBER.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
c = s[j]
|
c = bytes(s,j)
|
||||||
if c == b'.':
|
if c == b'.':
|
||||||
self._curtoken += c
|
self._curtoken += c
|
||||||
self._parse1 = self._parse_float
|
self._parse1 = self._parse_float
|
||||||
|
@ -379,10 +399,10 @@ class PSBaseParser(object):
|
||||||
def _parse_float(self, s, i):
|
def _parse_float(self, s, i):
|
||||||
m = END_NUMBER.search(s, i)
|
m = END_NUMBER.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
try:
|
try:
|
||||||
self._add_token(float(self._curtoken))
|
self._add_token(float(self._curtoken))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -393,10 +413,10 @@ class PSBaseParser(object):
|
||||||
def _parse_keyword(self, s, i):
|
def _parse_keyword(self, s, i):
|
||||||
m = END_KEYWORD.search(s, i)
|
m = END_KEYWORD.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
if self._curtoken == b'true':
|
if self._curtoken == b'true':
|
||||||
token = True
|
token = True
|
||||||
elif self._curtoken == b'false':
|
elif self._curtoken == b'false':
|
||||||
|
@ -410,11 +430,11 @@ class PSBaseParser(object):
|
||||||
def _parse_string(self, s, i):
|
def _parse_string(self, s, i):
|
||||||
m = END_STRING.search(s, i)
|
m = END_STRING.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
c = s[j]
|
c = bytes(s,j)
|
||||||
if c == b'\\':
|
if c == b'\\':
|
||||||
self.oct = b''
|
self.oct = b''
|
||||||
self._parse1 = self._parse_string_1
|
self._parse1 = self._parse_string_1
|
||||||
|
@ -428,26 +448,26 @@ class PSBaseParser(object):
|
||||||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||||
self._curtoken += c
|
self._curtoken += c
|
||||||
return j+1
|
return j+1
|
||||||
self._add_token(str(self._curtoken))
|
self._add_token(self._curtoken)
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j+1
|
return j+1
|
||||||
|
|
||||||
def _parse_string_1(self, s, i):
|
def _parse_string_1(self, s, i):
|
||||||
c = s[i]
|
c = bytes(s,i)
|
||||||
if OCT_STRING.match(c) and len(self.oct) < 3:
|
if OCT_STRING.match(c) and len(self.oct) < 3:
|
||||||
self.oct += c
|
self.oct += c
|
||||||
return i+1
|
return i+1
|
||||||
if self.oct:
|
if self.oct:
|
||||||
self._curtoken += chr(int(self.oct, 8))
|
self._curtoken += six.int2byte(int(self.oct, 8))
|
||||||
self._parse1 = self._parse_string
|
self._parse1 = self._parse_string
|
||||||
return i
|
return i
|
||||||
if c in ESC_STRING:
|
if c in ESC_STRING:
|
||||||
self._curtoken += chr(ESC_STRING[c])
|
self._curtoken += six.int2byte(ESC_STRING[c])
|
||||||
self._parse1 = self._parse_string
|
self._parse1 = self._parse_string
|
||||||
return i+1
|
return i+1
|
||||||
|
|
||||||
def _parse_wopen(self, s, i):
|
def _parse_wopen(self, s, i):
|
||||||
c = s[i]
|
c = bytes(s,i)
|
||||||
if c == b'<':
|
if c == b'<':
|
||||||
self._add_token(KEYWORD_DICT_BEGIN)
|
self._add_token(KEYWORD_DICT_BEGIN)
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
|
@ -457,7 +477,7 @@ class PSBaseParser(object):
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def _parse_wclose(self, s, i):
|
def _parse_wclose(self, s, i):
|
||||||
c = s[i]
|
c = bytes(s,i)
|
||||||
if c == b'>':
|
if c == b'>':
|
||||||
self._add_token(KEYWORD_DICT_END)
|
self._add_token(KEYWORD_DICT_END)
|
||||||
i += 1
|
i += 1
|
||||||
|
@ -467,12 +487,11 @@ class PSBaseParser(object):
|
||||||
def _parse_hexstring(self, s, i):
|
def _parse_hexstring(self, s, i):
|
||||||
m = END_HEX_STRING.search(s, i)
|
m = END_HEX_STRING.search(s, i)
|
||||||
if not m:
|
if not m:
|
||||||
self._curtoken += s[i:]
|
self._curtoken += bytes(s,i,-1)
|
||||||
return len(s)
|
return len(s)
|
||||||
j = m.start(0)
|
j = m.start(0)
|
||||||
self._curtoken += s[i:j]
|
self._curtoken += bytes(s,i,j)
|
||||||
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
|
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
|
||||||
SPC.sub(b'', self._curtoken))
|
|
||||||
self._add_token(token)
|
self._add_token(token)
|
||||||
self._parse1 = self._parse_main
|
self._parse1 = self._parse_main
|
||||||
return j
|
return j
|
||||||
|
@ -482,8 +501,7 @@ class PSBaseParser(object):
|
||||||
self.fillbuf()
|
self.fillbuf()
|
||||||
self.charpos = self._parse1(self.buf, self.charpos)
|
self.charpos = self._parse1(self.buf, self.charpos)
|
||||||
token = self._tokens.pop(0)
|
token = self._tokens.pop(0)
|
||||||
if self.debug:
|
logging.debug('nexttoken: (%r:%r)' % token)
|
||||||
logging.debug('nexttoken: %r' % token)
|
|
||||||
return token
|
return token
|
||||||
|
|
||||||
|
|
||||||
|
@ -523,16 +541,17 @@ class PSStackParser(PSBaseParser):
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
def add_results(self, *objs):
|
def add_results(self, *objs):
|
||||||
if self.debug:
|
try:
|
||||||
logging.debug('add_results: %r' % objs)
|
logging.debug('add_results: %s' % repr(objs))
|
||||||
|
except:
|
||||||
|
logging.debug('add_results: (unprintable object)')
|
||||||
self.results.extend(objs)
|
self.results.extend(objs)
|
||||||
return
|
return
|
||||||
|
|
||||||
def start_type(self, pos, type):
|
def start_type(self, pos, type):
|
||||||
self.context.append((pos, self.curtype, self.curstack))
|
self.context.append((pos, self.curtype, self.curstack))
|
||||||
(self.curtype, self.curstack) = (type, [])
|
(self.curtype, self.curstack) = (type, [])
|
||||||
if self.debug:
|
logging.debug('start_type: pos=%r, type=%r' % (pos, type))
|
||||||
logging.debug('start_type: pos=%r, type=%r' % (pos, type))
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_type(self, type):
|
def end_type(self, type):
|
||||||
|
@ -540,8 +559,7 @@ class PSStackParser(PSBaseParser):
|
||||||
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
||||||
objs = [obj for (_, obj) in self.curstack]
|
objs = [obj for (_, obj) in self.curstack]
|
||||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||||
if self.debug:
|
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
|
||||||
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
|
|
||||||
return (pos, objs)
|
return (pos, objs)
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
|
@ -556,7 +574,7 @@ class PSStackParser(PSBaseParser):
|
||||||
while not self.results:
|
while not self.results:
|
||||||
(pos, token) = self.nexttoken()
|
(pos, token) = self.nexttoken()
|
||||||
#print (pos,token), (self.curtype, self.curstack)
|
#print (pos,token), (self.curtype, self.curstack)
|
||||||
if isinstance(token, (int, long, float, bool, str, PSLiteral)):
|
if isinstance(token, (six.integer_types, float, bool, six.string_types, six.binary_type, PSLiteral)):
|
||||||
# normal token
|
# normal token
|
||||||
self.push((pos, token))
|
self.push((pos, token))
|
||||||
elif token == KEYWORD_ARRAY_BEGIN:
|
elif token == KEYWORD_ARRAY_BEGIN:
|
||||||
|
@ -594,115 +612,20 @@ class PSStackParser(PSBaseParser):
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise
|
raise
|
||||||
else:
|
elif isinstance(token,PSKeyword):
|
||||||
if self.debug:
|
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
|
||||||
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % \
|
|
||||||
(pos, token, self.curstack))
|
|
||||||
self.do_keyword(pos, token)
|
self.do_keyword(pos, token)
|
||||||
|
else:
|
||||||
|
logging.error('unknown token: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
|
||||||
|
self.do_keyword(pos, token)
|
||||||
|
raise
|
||||||
if self.context:
|
if self.context:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
self.flush()
|
self.flush()
|
||||||
obj = self.results.pop(0)
|
obj = self.results.pop(0)
|
||||||
if self.debug:
|
try:
|
||||||
logging.debug('nextobject: %r' % obj)
|
logging.debug('nextobject: %s' % repr(obj))
|
||||||
|
except:
|
||||||
|
logging.debug('nextobject: (unprintable object)')
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
|
|
||||||
## Simplistic Test cases
|
|
||||||
##
|
|
||||||
class TestPSBaseParser(unittest.TestCase):
|
|
||||||
|
|
||||||
TESTDATA = br'''%!PS
|
|
||||||
begin end
|
|
||||||
" @ #
|
|
||||||
/a/BCD /Some_Name /foo#5f#xbaa
|
|
||||||
0 +1 -2 .5 1.234
|
|
||||||
(abc) () (abc ( def ) ghi)
|
|
||||||
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
|
|
||||||
(this % is not a comment.)
|
|
||||||
(foo
|
|
||||||
baa)
|
|
||||||
(foo\
|
|
||||||
baa)
|
|
||||||
<> <20> < 40 4020 >
|
|
||||||
<abcd00
|
|
||||||
12345>
|
|
||||||
func/a/b{(c)do*}def
|
|
||||||
[ 1 (z) ! ]
|
|
||||||
<< /foo (bar) >>
|
|
||||||
'''
|
|
||||||
|
|
||||||
TOKENS = [
|
|
||||||
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
|
|
||||||
(21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
|
||||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
|
||||||
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
|
||||||
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
|
||||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
|
|
||||||
(191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
|
|
||||||
(226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
|
|
||||||
(234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
|
|
||||||
(242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
|
|
||||||
(256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
|
|
||||||
(272, KWD(b'>>'))
|
|
||||||
]
|
|
||||||
|
|
||||||
OBJS = [
|
|
||||||
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
|
||||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
|
||||||
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
|
|
||||||
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
|
|
||||||
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
|
|
||||||
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
|
|
||||||
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
|
|
||||||
(258, {'foo': 'bar'}),
|
|
||||||
]
|
|
||||||
|
|
||||||
def get_tokens(self, s):
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
class MyParser(PSBaseParser):
|
|
||||||
def flush(self):
|
|
||||||
self.add_results(*self.popall())
|
|
||||||
parser = MyParser(BytesIO(s))
|
|
||||||
r = []
|
|
||||||
try:
|
|
||||||
while 1:
|
|
||||||
r.append(parser.nexttoken())
|
|
||||||
except PSEOF:
|
|
||||||
pass
|
|
||||||
return r
|
|
||||||
|
|
||||||
def get_objects(self, s):
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
class MyParser(PSStackParser):
|
|
||||||
def flush(self):
|
|
||||||
self.add_results(*self.popall())
|
|
||||||
parser = MyParser(BytesIO(s))
|
|
||||||
r = []
|
|
||||||
try:
|
|
||||||
while 1:
|
|
||||||
r.append(parser.nextobject())
|
|
||||||
except PSEOF:
|
|
||||||
pass
|
|
||||||
return r
|
|
||||||
|
|
||||||
def test_1(self):
|
|
||||||
tokens = self.get_tokens(self.TESTDATA)
|
|
||||||
print (tokens)
|
|
||||||
self.assertEqual(tokens, self.TOKENS)
|
|
||||||
return
|
|
||||||
|
|
||||||
def test_2(self):
|
|
||||||
objs = self.get_objects(self.TESTDATA)
|
|
||||||
print (objs)
|
|
||||||
self.assertEqual(objs, self.OBJS)
|
|
||||||
return
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
unittest.main()
|
|
||||||
|
|
|
@ -3,8 +3,9 @@
|
||||||
Miscellaneous Routines.
|
Miscellaneous Routines.
|
||||||
"""
|
"""
|
||||||
import struct
|
import struct
|
||||||
from sys import maxint as INF
|
INF=2147483647 #from sys import maxint as INF #doesn't work anymore under Python3, but PDF still uses 32 bits ints
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
## PNG Predictor
|
## PNG Predictor
|
||||||
##
|
##
|
||||||
|
@ -184,7 +185,7 @@ def nunpack(s, default=0):
|
||||||
|
|
||||||
|
|
||||||
# decode_text
|
# decode_text
|
||||||
PDFDocEncoding = ''.join(unichr(x) for x in (
|
PDFDocEncoding = ''.join(six.unichr(x) for x in (
|
||||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||||
|
|
Loading…
Reference in New Issue