tests pass under Py 2.7 and 3.4
parent
b0e035c24f
commit
faea7291a8
|
@ -13,6 +13,7 @@
|
|||
import sys
|
||||
import array
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
## BitParser
|
||||
##
|
||||
|
@ -26,7 +27,7 @@ class BitParser(object):
|
|||
def add(klass, root, v, bits):
|
||||
p = root
|
||||
b = None
|
||||
for i in xrange(len(bits)):
|
||||
for i in range(len(bits)):
|
||||
if 0 < i:
|
||||
if p[b] is None:
|
||||
p[b] = [None, None]
|
||||
|
@ -686,6 +687,25 @@ class TestCCITTG4Parser(unittest.TestCase):
|
|||
|
||||
## CCITTFaxDecoder
|
||||
##
|
||||
def test___init__(self):
|
||||
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||
raise SkipTest # TODO: implement your test here
|
||||
|
||||
def test_feedbytes(self):
|
||||
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||
# assert_equal(expected, c_citt_g4_parser.feedbytes(data))
|
||||
raise SkipTest # TODO: implement your test here
|
||||
|
||||
def test_output_line(self):
|
||||
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||
# assert_equal(expected, c_citt_g4_parser.output_line(y, bits))
|
||||
raise SkipTest # TODO: implement your test here
|
||||
|
||||
def test_reset(self):
|
||||
# c_citt_g4_parser = CCITTG4Parser(width, bytealign)
|
||||
# assert_equal(expected, c_citt_g4_parser.reset())
|
||||
raise SkipTest # TODO: implement your test here
|
||||
|
||||
class CCITTFaxDecoder(CCITTG4Parser):
|
||||
|
||||
def __init__(self, width, bytealign=False, reversed=False):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
from .psparser import LIT
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
## PDFColorSpace
|
||||
##
|
||||
|
@ -20,15 +21,17 @@ class PDFColorSpace(object):
|
|||
return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||
|
||||
|
||||
PREDEFINED_COLORSPACE = dict(
|
||||
(name, PDFColorSpace(name, n)) for (name, n) in {
|
||||
'CalRGB': 3,
|
||||
'CalGray': 1,
|
||||
'Lab': 3,
|
||||
'DeviceRGB': 3,
|
||||
'DeviceCMYK': 4,
|
||||
'DeviceGray': 1,
|
||||
'Separation': 1,
|
||||
'Indexed': 1,
|
||||
'Pattern': 1,
|
||||
}.iteritems())
|
||||
PREDEFINED_COLORSPACE = {}
|
||||
for (name, n) in six.iteritems({
|
||||
'CalRGB': 3,
|
||||
'CalGray': 1,
|
||||
'Lab': 3,
|
||||
'DeviceRGB': 3,
|
||||
'DeviceCMYK': 4,
|
||||
'DeviceGray': 1,
|
||||
'Separation': 1,
|
||||
'Indexed': 1,
|
||||
'Pattern': 1,
|
||||
}) :
|
||||
PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n)
|
||||
|
|
@ -2,6 +2,8 @@
|
|||
import re
|
||||
import struct
|
||||
import logging
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
try:
|
||||
import hashlib as md5
|
||||
except ImportError:
|
||||
|
@ -107,10 +109,13 @@ class PDFXRef(PDFBaseXRef):
|
|||
if len(f) != 2:
|
||||
raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
|
||||
try:
|
||||
(start, nobjs) = map(long, f)
|
||||
if six.PY2:
|
||||
(start, nobjs) = map(long, f)
|
||||
else:
|
||||
(start, nobjs) = map(int, f)
|
||||
except ValueError:
|
||||
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
|
||||
for objid in xrange(start, start+nobjs):
|
||||
for objid in range(start, start+nobjs):
|
||||
try:
|
||||
(_, line) = parser.nextline()
|
||||
except PSEOF:
|
||||
|
@ -121,17 +126,15 @@ class PDFXRef(PDFBaseXRef):
|
|||
(pos, genno, use) = f
|
||||
if use != b'n':
|
||||
continue
|
||||
self.offsets[objid] = (None, long(pos), int(genno))
|
||||
self.offsets[objid] = (None, long(pos) if six.PY2 else int(pos), int(genno))
|
||||
logging.info('xref objects: %r' % self.offsets)
|
||||
self.load_trailer(parser)
|
||||
return
|
||||
|
||||
KEYWORD_TRAILER = KWD('trailer')
|
||||
|
||||
def load_trailer(self, parser):
|
||||
try:
|
||||
(_, kwd) = parser.nexttoken()
|
||||
assert kwd is self.KEYWORD_TRAILER
|
||||
assert kwd.name == 'trailer'
|
||||
(_, dic) = parser.nextobject()
|
||||
except PSEOF:
|
||||
x = parser.pop(1)
|
||||
|
@ -145,7 +148,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
return self.trailer
|
||||
|
||||
def get_objids(self):
|
||||
return self.offsets.iterkeys()
|
||||
return six.iterkeys(self.offsets)
|
||||
|
||||
def get_pos(self, objid):
|
||||
try:
|
||||
|
@ -175,6 +178,8 @@ class PDFXRefFallback(PDFXRef):
|
|||
self.load_trailer(parser)
|
||||
logging.info('trailer: %r' % self.get_trailer())
|
||||
break
|
||||
if six.PY3:
|
||||
line=line.decode('utf-8')
|
||||
m = self.PDFOBJ_CUE.match(line)
|
||||
if not m:
|
||||
continue
|
||||
|
@ -634,8 +639,6 @@ class PDFDocument(object):
|
|||
pass
|
||||
return (objs, n)
|
||||
|
||||
KEYWORD_OBJ = KWD('obj')
|
||||
|
||||
def _getobj_parse(self, pos, objid):
|
||||
self._parser.seek(pos)
|
||||
(_, objid1) = self._parser.nexttoken() # objid
|
||||
|
@ -643,7 +646,7 @@ class PDFDocument(object):
|
|||
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
|
||||
(_, genno) = self._parser.nexttoken() # genno
|
||||
(_, kwd) = self._parser.nexttoken()
|
||||
if kwd is not self.KEYWORD_OBJ:
|
||||
if kwd.name !='obj':
|
||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
|
||||
(_, obj) = self._parser.nextobject()
|
||||
return obj
|
||||
|
@ -762,7 +765,7 @@ class PDFDocument(object):
|
|||
else:
|
||||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
logging.info('xref found: pos=%r' % prev)
|
||||
return long(prev)
|
||||
return long(prev) if six.PY2 else int(prev)
|
||||
|
||||
# read xref table
|
||||
def read_xref_from(self, parser, start, xrefs):
|
||||
|
|
|
@ -31,6 +31,7 @@ from .utils import choplist
|
|||
from .utils import mult_matrix
|
||||
from .utils import MATRIX_IDENTITY
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
## Exceptions
|
||||
##
|
||||
|
@ -41,15 +42,6 @@ class PDFInterpreterError(PDFException):
|
|||
pass
|
||||
|
||||
|
||||
## Constants
|
||||
##
|
||||
LITERAL_PDF = LIT('PDF')
|
||||
LITERAL_TEXT = LIT('Text')
|
||||
LITERAL_FONT = LIT('Font')
|
||||
LITERAL_FORM = LIT('Form')
|
||||
LITERAL_IMAGE = LIT('Image')
|
||||
|
||||
|
||||
## PDFTextState
|
||||
##
|
||||
class PDFTextState(object):
|
||||
|
@ -341,7 +333,7 @@ class PDFPageInterpreter(object):
|
|||
return PDFColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE.get(name)
|
||||
for (k, v) in dict_value(resources).iteritems():
|
||||
for (k, v) in six.iteritems(dict_value(resources)):
|
||||
if self.debug:
|
||||
logging.debug('Resource: %r: %r' % (k, v))
|
||||
if k == 'Font':
|
||||
|
@ -352,7 +344,7 @@ class PDFPageInterpreter(object):
|
|||
spec = dict_value(spec)
|
||||
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
|
||||
elif k == 'ColorSpace':
|
||||
for (csid, spec) in dict_value(v).iteritems():
|
||||
for (csid, spec) in six.iteritems(dict_value(v)):
|
||||
self.csmap[csid] = get_colorspace(resolve1(spec))
|
||||
elif k == 'ProcSet':
|
||||
self.rsrcmgr.get_procset(list_value(v))
|
||||
|
@ -376,7 +368,7 @@ class PDFPageInterpreter(object):
|
|||
# set some global states.
|
||||
self.scs = self.ncs = None
|
||||
if self.csmap:
|
||||
self.scs = self.ncs = self.csmap.values()[0]
|
||||
self.scs = self.ncs = six.next(six.itervalues(self.csmap))
|
||||
return
|
||||
|
||||
def push(self, obj):
|
||||
|
|
|
@ -10,10 +10,7 @@ from .pdfparser import PDFParser
|
|||
from .pdfdocument import PDFDocument
|
||||
from .pdfdocument import PDFTextExtractionNotAllowed
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
|
@ -82,15 +79,15 @@ class PDFPage(object):
|
|||
else:
|
||||
objid = obj.objid
|
||||
tree = dict_value(obj).copy()
|
||||
for (k, v) in parent.iteritems():
|
||||
for (k, v) in six.iteritems(parent):
|
||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||
if tree.get('Type').name=='Pages' and 'Kids' in tree:
|
||||
logging.info('Pages: Kids=%r' % tree['Kids'])
|
||||
for c in list_value(tree['Kids']):
|
||||
for x in search(c, tree):
|
||||
yield x
|
||||
elif tree.get('Type') is LITERAL_PAGE:
|
||||
elif tree.get('Type').name=='Page':
|
||||
logging.info('Page: %r' % tree)
|
||||
yield (objid, tree)
|
||||
pages = False
|
||||
|
|
|
@ -120,9 +120,7 @@ class PDFParser(PSStackParser):
|
|||
data += line
|
||||
self.seek(pos+objlen)
|
||||
# XXX limit objlen not to exceed object boundary
|
||||
if self.debug:
|
||||
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||
(pos, objlen, dic, data[:10]))
|
||||
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % (pos, objlen, dic, data[:10]))
|
||||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
self.push((pos, obj))
|
||||
|
||||
|
|
|
@ -1,11 +1,29 @@
|
|||
#!/usr/bin/env python
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
def bytes(s, i, j=None):
    """Implement s[i], s[i:] and s[i:j] identically on Python 2 and 3.

    Indexing a Python 3 ``bytes`` object yields an ``int``, whereas the
    parser compares single characters against one-byte literals such as
    ``b'\\n'``.  This helper always returns a (byte) string.

    NOTE: the name deliberately shadows the ``bytes`` builtin inside this
    module because callers already use it; do not rename.

    :param s: the byte string (any sliceable sequence works).
    :param i: start index; may be negative.
    :param j: ``None`` to fetch the single element at ``i`` as a length-1
        string, any negative value to slice to the end (``s[i:]``),
        otherwise the exclusive end index (``s[i:j]``).
    :returns: the requested slice of ``s``, same type as ``s``.
    :raises IndexError: if ``j`` is ``None`` and ``i`` is out of range.
    """
    if j is None:
        # A one-element slice keeps the result a string on both Python
        # versions.  ``i + 1`` is 0 when i == -1, which would produce an
        # empty slice, so substitute None ("to the end") in that case.
        item = s[i:(i + 1) or None]
        if len(item) != 1:
            # Preserve the behaviour of plain s[i] for bad indices.
            raise IndexError('index out of range')
        return item
    if j < 0:
        # Negative j is this helper's marker for "up to the end".
        return s[i:]
    # Native slicing clamps an oversized j the same way on every version.
    return s[i:j]
|
||||
|
||||
from .utils import choplist
|
||||
|
||||
STRICT = 0
|
||||
|
||||
|
||||
## PS Exceptions
|
||||
##
|
||||
class PSException(Exception):
|
||||
|
@ -57,10 +75,10 @@ class PSLiteral(PSObject):
|
|||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '/%s' % self.name
|
||||
name=self.name
|
||||
return '/%r' % name
|
||||
|
||||
|
||||
## PSKeyword
|
||||
|
@ -82,7 +100,8 @@ class PSKeyword(PSObject):
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return self.name
|
||||
name=self.name
|
||||
return '/%r' % name
|
||||
|
||||
|
||||
## PSSymbolTable
|
||||
|
@ -159,8 +178,6 @@ class PSBaseParser(object):
|
|||
"""
|
||||
BUFSIZ = 4096
|
||||
|
||||
debug = 0
|
||||
|
||||
def __init__(self, fp):
|
||||
self.fp = fp
|
||||
self.seek(0)
|
||||
|
@ -191,8 +208,7 @@ class PSBaseParser(object):
|
|||
def seek(self, pos):
|
||||
"""Seeks the parser to the given position.
|
||||
"""
|
||||
if self.debug:
|
||||
logging.debug('seek: %r' % pos)
|
||||
logging.debug('seek: %r' % pos)
|
||||
self.fp.seek(pos)
|
||||
# reset the status for nextline()
|
||||
self.bufpos = pos
|
||||
|
@ -225,7 +241,7 @@ class PSBaseParser(object):
|
|||
while 1:
|
||||
self.fillbuf()
|
||||
if eol:
|
||||
c = self.buf[self.charpos]
|
||||
c = bytes(self.buf,self.charpos)
|
||||
# handle b'\r\n'
|
||||
if c == b'\n':
|
||||
linebuf += c
|
||||
|
@ -233,17 +249,17 @@ class PSBaseParser(object):
|
|||
break
|
||||
m = EOL.search(self.buf, self.charpos)
|
||||
if m:
|
||||
linebuf += self.buf[self.charpos:m.end(0)]
|
||||
linebuf += bytes(self.buf,self.charpos,m.end(0))
|
||||
self.charpos = m.end(0)
|
||||
if linebuf[-1] == b'\r':
|
||||
if bytes(linebuf,-1) == b'\r':
|
||||
eol = True
|
||||
else:
|
||||
break
|
||||
else:
|
||||
linebuf += self.buf[self.charpos:]
|
||||
linebuf += bytes(self.buf,self.charpos,-1)
|
||||
self.charpos = len(self.buf)
|
||||
if self.debug:
|
||||
logging.debug('nextline: %r, %r' % (linepos, linebuf))
|
||||
logging.debug('nextline: %r, %r' % (linepos, linebuf))
|
||||
|
||||
return (linepos, linebuf)
|
||||
|
||||
def revreadlines(self):
|
||||
|
@ -266,8 +282,8 @@ class PSBaseParser(object):
|
|||
if n == -1:
|
||||
buf = s + buf
|
||||
break
|
||||
yield s[n:]+buf
|
||||
s = s[:n]
|
||||
yield bytes(s,n,-1)+buf
|
||||
s = bytes(s,0,n)
|
||||
buf = b''
|
||||
return
|
||||
|
||||
|
@ -276,7 +292,7 @@ class PSBaseParser(object):
|
|||
if not m:
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
c = s[j]
|
||||
c = bytes(s,j)
|
||||
self._curtokenpos = self.bufpos+j
|
||||
if c == b'%':
|
||||
self._curtoken = b'%'
|
||||
|
@ -322,10 +338,10 @@ class PSBaseParser(object):
|
|||
def _parse_comment(self, s, i):
|
||||
m = EOL.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return (self._parse_comment, len(s))
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
self._curtoken += bytes(s,i,j)
|
||||
self._parse1 = self._parse_main
|
||||
# We ignore comments.
|
||||
#self._tokens.append(self._curtoken)
|
||||
|
@ -334,37 +350,41 @@ class PSBaseParser(object):
|
|||
def _parse_literal(self, s, i):
|
||||
m = END_LITERAL.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j]
|
||||
self._curtoken += bytes(s,i,j)
|
||||
c = bytes(s,j)
|
||||
if c == b'#':
|
||||
self.hex = b''
|
||||
self._parse1 = self._parse_literal_hex
|
||||
return j+1
|
||||
self._add_token(LIT(unicode(self._curtoken)))
|
||||
try:
|
||||
self._curtoken=str(self._curtoken,'utf-8')
|
||||
except:
|
||||
pass
|
||||
self._add_token(LIT(self._curtoken))
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
||||
def _parse_literal_hex(self, s, i):
|
||||
c = s[i]
|
||||
c = bytes(s,i)
|
||||
if HEX.match(c) and len(self.hex) < 2:
|
||||
self.hex += c
|
||||
return i+1
|
||||
if self.hex:
|
||||
self._curtoken += chr(int(self.hex, 16))
|
||||
self._curtoken += six.int2byte(int(self.hex, 16))
|
||||
self._parse1 = self._parse_literal
|
||||
return i
|
||||
|
||||
def _parse_number(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j]
|
||||
self._curtoken += bytes(s,i,j)
|
||||
c = bytes(s,j)
|
||||
if c == b'.':
|
||||
self._curtoken += c
|
||||
self._parse1 = self._parse_float
|
||||
|
@ -379,10 +399,10 @@ class PSBaseParser(object):
|
|||
def _parse_float(self, s, i):
|
||||
m = END_NUMBER.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
self._curtoken += bytes(s,i,j)
|
||||
try:
|
||||
self._add_token(float(self._curtoken))
|
||||
except ValueError:
|
||||
|
@ -393,10 +413,10 @@ class PSBaseParser(object):
|
|||
def _parse_keyword(self, s, i):
|
||||
m = END_KEYWORD.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
self._curtoken += bytes(s,i,j)
|
||||
if self._curtoken == b'true':
|
||||
token = True
|
||||
elif self._curtoken == b'false':
|
||||
|
@ -410,11 +430,11 @@ class PSBaseParser(object):
|
|||
def _parse_string(self, s, i):
|
||||
m = END_STRING.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
c = s[j]
|
||||
self._curtoken += bytes(s,i,j)
|
||||
c = bytes(s,j)
|
||||
if c == b'\\':
|
||||
self.oct = b''
|
||||
self._parse1 = self._parse_string_1
|
||||
|
@ -428,26 +448,26 @@ class PSBaseParser(object):
|
|||
if self.paren: # WTF, they said balanced parens need no special treatment.
|
||||
self._curtoken += c
|
||||
return j+1
|
||||
self._add_token(str(self._curtoken))
|
||||
self._add_token(self._curtoken)
|
||||
self._parse1 = self._parse_main
|
||||
return j+1
|
||||
|
||||
def _parse_string_1(self, s, i):
|
||||
c = s[i]
|
||||
c = bytes(s,i)
|
||||
if OCT_STRING.match(c) and len(self.oct) < 3:
|
||||
self.oct += c
|
||||
return i+1
|
||||
if self.oct:
|
||||
self._curtoken += chr(int(self.oct, 8))
|
||||
self._curtoken += six.int2byte(int(self.oct, 8))
|
||||
self._parse1 = self._parse_string
|
||||
return i
|
||||
if c in ESC_STRING:
|
||||
self._curtoken += chr(ESC_STRING[c])
|
||||
self._curtoken += six.int2byte(ESC_STRING[c])
|
||||
self._parse1 = self._parse_string
|
||||
return i+1
|
||||
|
||||
def _parse_wopen(self, s, i):
|
||||
c = s[i]
|
||||
c = bytes(s,i)
|
||||
if c == b'<':
|
||||
self._add_token(KEYWORD_DICT_BEGIN)
|
||||
self._parse1 = self._parse_main
|
||||
|
@ -457,7 +477,7 @@ class PSBaseParser(object):
|
|||
return i
|
||||
|
||||
def _parse_wclose(self, s, i):
|
||||
c = s[i]
|
||||
c = bytes(s,i)
|
||||
if c == b'>':
|
||||
self._add_token(KEYWORD_DICT_END)
|
||||
i += 1
|
||||
|
@ -467,12 +487,11 @@ class PSBaseParser(object):
|
|||
def _parse_hexstring(self, s, i):
|
||||
m = END_HEX_STRING.search(s, i)
|
||||
if not m:
|
||||
self._curtoken += s[i:]
|
||||
self._curtoken += bytes(s,i,-1)
|
||||
return len(s)
|
||||
j = m.start(0)
|
||||
self._curtoken += s[i:j]
|
||||
token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)),
|
||||
SPC.sub(b'', self._curtoken))
|
||||
self._curtoken += bytes(s,i,j)
|
||||
token = HEX_PAIR.sub(lambda m: six.int2byte(int(m.group(0), 16)),SPC.sub(b'', self._curtoken))
|
||||
self._add_token(token)
|
||||
self._parse1 = self._parse_main
|
||||
return j
|
||||
|
@ -482,8 +501,7 @@ class PSBaseParser(object):
|
|||
self.fillbuf()
|
||||
self.charpos = self._parse1(self.buf, self.charpos)
|
||||
token = self._tokens.pop(0)
|
||||
if self.debug:
|
||||
logging.debug('nexttoken: %r' % token)
|
||||
logging.debug('nexttoken: (%r:%r)' % token)
|
||||
return token
|
||||
|
||||
|
||||
|
@ -523,16 +541,17 @@ class PSStackParser(PSBaseParser):
|
|||
return objs
|
||||
|
||||
def add_results(self, *objs):
|
||||
if self.debug:
|
||||
logging.debug('add_results: %r' % objs)
|
||||
try:
|
||||
logging.debug('add_results: %s' % repr(objs))
|
||||
except:
|
||||
logging.debug('add_results: (unprintable object)')
|
||||
self.results.extend(objs)
|
||||
return
|
||||
|
||||
def start_type(self, pos, type):
|
||||
self.context.append((pos, self.curtype, self.curstack))
|
||||
(self.curtype, self.curstack) = (type, [])
|
||||
if self.debug:
|
||||
logging.debug('start_type: pos=%r, type=%r' % (pos, type))
|
||||
logging.debug('start_type: pos=%r, type=%r' % (pos, type))
|
||||
return
|
||||
|
||||
def end_type(self, type):
|
||||
|
@ -540,8 +559,7 @@ class PSStackParser(PSBaseParser):
|
|||
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
|
||||
objs = [obj for (_, obj) in self.curstack]
|
||||
(pos, self.curtype, self.curstack) = self.context.pop()
|
||||
if self.debug:
|
||||
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
|
||||
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
|
||||
return (pos, objs)
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
|
@ -556,7 +574,7 @@ class PSStackParser(PSBaseParser):
|
|||
while not self.results:
|
||||
(pos, token) = self.nexttoken()
|
||||
#print (pos,token), (self.curtype, self.curstack)
|
||||
if isinstance(token, (int, long, float, bool, str, PSLiteral)):
|
||||
if isinstance(token, (six.integer_types, float, bool, six.string_types, six.binary_type, PSLiteral)):
|
||||
# normal token
|
||||
self.push((pos, token))
|
||||
elif token == KEYWORD_ARRAY_BEGIN:
|
||||
|
@ -594,115 +612,20 @@ class PSStackParser(PSBaseParser):
|
|||
except PSTypeError:
|
||||
if STRICT:
|
||||
raise
|
||||
else:
|
||||
if self.debug:
|
||||
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||
(pos, token, self.curstack))
|
||||
elif isinstance(token,PSKeyword):
|
||||
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
|
||||
self.do_keyword(pos, token)
|
||||
else:
|
||||
logging.error('unknown token: pos=%r, token=%r, stack=%r' % (pos, token, self.curstack))
|
||||
self.do_keyword(pos, token)
|
||||
raise
|
||||
if self.context:
|
||||
continue
|
||||
else:
|
||||
self.flush()
|
||||
obj = self.results.pop(0)
|
||||
if self.debug:
|
||||
logging.debug('nextobject: %r' % obj)
|
||||
try:
|
||||
logging.debug('nextobject: %s' % repr(obj))
|
||||
except:
|
||||
logging.debug('nextobject: (unprintable object)')
|
||||
return obj
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
## Simplistic Test cases
|
||||
##
|
||||
class TestPSBaseParser(unittest.TestCase):
|
||||
|
||||
TESTDATA = br'''%!PS
|
||||
begin end
|
||||
" @ #
|
||||
/a/BCD /Some_Name /foo#5f#xbaa
|
||||
0 +1 -2 .5 1.234
|
||||
(abc) () (abc ( def ) ghi)
|
||||
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
|
||||
(this % is not a comment.)
|
||||
(foo
|
||||
baa)
|
||||
(foo\
|
||||
baa)
|
||||
<> <20> < 40 4020 >
|
||||
<abcd00
|
||||
12345>
|
||||
func/a/b{(c)do*}def
|
||||
[ 1 (z) ! ]
|
||||
<< /foo (bar) >>
|
||||
'''
|
||||
|
||||
TOKENS = [
|
||||
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
|
||||
(21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
||||
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
|
||||
(191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
|
||||
(226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
|
||||
(234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
|
||||
(242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
|
||||
(256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
|
||||
(272, KWD(b'>>'))
|
||||
]
|
||||
|
||||
OBJS = [
|
||||
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||
(65, 1.234), (71, 'abc'), (77, ''), (80, 'abc ( def ) ghi'),
|
||||
(98, 'def \x00 4ghi'), (118, 'bach\\slask'), (132, 'foo\nbaa'),
|
||||
(143, 'this % is not a comment.'), (170, 'foo\nbaa'), (180, 'foobaa'),
|
||||
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
|
||||
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
|
||||
(258, {'foo': 'bar'}),
|
||||
]
|
||||
|
||||
def get_tokens(self, s):
|
||||
from io import BytesIO
|
||||
|
||||
class MyParser(PSBaseParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
parser = MyParser(BytesIO(s))
|
||||
r = []
|
||||
try:
|
||||
while 1:
|
||||
r.append(parser.nexttoken())
|
||||
except PSEOF:
|
||||
pass
|
||||
return r
|
||||
|
||||
def get_objects(self, s):
|
||||
from io import BytesIO
|
||||
|
||||
class MyParser(PSStackParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
parser = MyParser(BytesIO(s))
|
||||
r = []
|
||||
try:
|
||||
while 1:
|
||||
r.append(parser.nextobject())
|
||||
except PSEOF:
|
||||
pass
|
||||
return r
|
||||
|
||||
def test_1(self):
|
||||
tokens = self.get_tokens(self.TESTDATA)
|
||||
print (tokens)
|
||||
self.assertEqual(tokens, self.TOKENS)
|
||||
return
|
||||
|
||||
def test_2(self):
|
||||
objs = self.get_objects(self.TESTDATA)
|
||||
print (objs)
|
||||
self.assertEqual(objs, self.OBJS)
|
||||
return
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
|
@ -3,8 +3,9 @@
|
|||
Miscellaneous Routines.
|
||||
"""
|
||||
import struct
|
||||
from sys import maxint as INF
|
||||
INF=2147483647 #from sys import maxint as INF #doesn't work anymore under Python3, but PDF still uses 32 bits ints
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
## PNG Predictor
|
||||
##
|
||||
|
@ -184,7 +185,7 @@ def nunpack(s, default=0):
|
|||
|
||||
|
||||
# decode_text
|
||||
PDFDocEncoding = ''.join(unichr(x) for x in (
|
||||
PDFDocEncoding = ''.join(six.unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||
|
|
Loading…
Reference in New Issue