PEP8: Whitespace changes to match pep8

pull/1/head
Matthew Duggan 2013-11-07 17:35:04 +09:00
parent c1da8b835c
commit 2caa5edc25
22 changed files with 1395 additions and 1125 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
__version__ = '20131022'
if __name__ == '__main__': print __version__
if __name__ == '__main__':
print __version__

View File

@ -6,6 +6,7 @@ This code is in the public domain.
"""
## Arcfour
##
class Arcfour(object):

View File

@ -9,6 +9,7 @@ This code is in the public domain.
import re
import struct
# ascii85decode(data)
def ascii85decode(data):
"""
@ -51,6 +52,8 @@ def ascii85decode(data):
# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1

View File

@ -25,6 +25,7 @@ class BitParser(object):
@classmethod
def add(klass, root, v, bits):
p = root
b = None
for i in xrange(len(bits)):
if 0 < i:
if p[b] is None:
@ -309,9 +310,14 @@ class CCITTG4Parser(BitParser):
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
class EOFB(Exception): pass
class InvalidData(Exception): pass
class ByteSkip(Exception): pass
class EOFB(Exception):
pass
class InvalidData(Exception):
pass
class ByteSkip(Exception):
pass
def __init__(self, width, bytealign=False):
BitParser.__init__(self)
@ -358,7 +364,8 @@ class CCITTG4Parser(BitParser):
raise self.InvalidData(mode)
def _parse_horiz1(self, n):
if n is None: raise self.InvalidData
if n is None:
raise self.InvalidData
self._n1 += n
if n < 64:
self._n2 = 0
@ -370,7 +377,8 @@ class CCITTG4Parser(BitParser):
return self.BLACK
def _parse_horiz2(self, n):
if n is None: raise self.InvalidData
if n is None:
raise self.InvalidData
self._n2 += n
if n < 64:
self._color = 1-self._color
@ -384,7 +392,8 @@ class CCITTG4Parser(BitParser):
return self.BLACK
def _parse_uncompressed(self, bits):
if not bits: raise self.InvalidData
if not bits:
raise self.InvalidData
if bits.startswith('T'):
self._accept = self._parse_mode
self._color = int(bits[1])
@ -441,12 +450,13 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and
self._refline[x1] != self._color): break
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break
self._refline[x1] != self._color):
break
x1 += 1
x1 += dx
x0 = max(0, self._curpos)
@ -467,21 +477,23 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and
self._refline[x1] != self._color): break
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break
self._refline[x1] != self._color):
break
x1 += 1
while 1:
if x1 == 0:
if (self._color == 0 and
self._refline[x1] == self._color): break
if (self._color == 0 and self._refline[x1] == self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] != self._color and
self._refline[x1] == self._color): break
self._refline[x1] == self._color):
break
x1 += 1
for x in xrange(self._curpos, x1):
self._curline[x] = self._color
@ -494,11 +506,13 @@ class CCITTG4Parser(BitParser):
self._curpos = 0
x = self._curpos
for _ in xrange(n1):
if len(self._curline) <= x: break
if len(self._curline) <= x:
break
self._curline[x] = self._color
x += 1
for _ in xrange(n2):
if len(self._curline) <= x: break
if len(self._curline) <= x:
break
self._curline[x] = 1-self._color
x += 1
self._curpos = x
@ -512,10 +526,11 @@ class CCITTG4Parser(BitParser):
self._flush_line()
return
import unittest
## Test cases
##
import unittest
class TestCCITTG4Parser(unittest.TestCase):
def get_parser(self, bits):
@ -711,11 +726,13 @@ def main(argv):
import pygame
if not argv[1:]:
return unittest.main()
class Parser(CCITTG4Parser):
def __init__(self, width, bytealign=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width, 1000))
return
def output_line(self, y, bits):
for (x, b) in enumerate(bits):
if b:
@ -723,6 +740,7 @@ def main(argv):
else:
self.img.set_at((x, y), (0, 0, 0))
return
def close(self):
pygame.image.save(self.img, 'out.bmp')
return
@ -735,4 +753,5 @@ def main(argv):
fp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -26,7 +26,8 @@ from encodingdb import name2unicode
from utils import choplist, nunpack
class CMapError(Exception): pass
class CMapError(Exception):
pass
## CMap
@ -44,6 +45,7 @@ class CMap(object):
def use_cmap(self, cmap):
assert isinstance(cmap, CMap)
def copy(dst, src):
for (k, v) in src.iteritems():
if isinstance(v, dict):
@ -102,7 +104,6 @@ class IdentityCMap(object):
return ()
## UnicodeMap
##
class UnicodeMap(object):
@ -232,17 +233,16 @@ class CMapDB(object):
_cmap_cache = {}
_umap_cache = {}
class CMapNotFound(CMapError): pass
class CMapNotFound(CMapError):
pass
@classmethod
def _load_data(klass, name):
filename = '%s.pickle.gz' % name
if klass.debug:
print >>sys.stderr, 'loading:', name
cmap_paths = (
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),
)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),)
for directory in cmap_paths:
path = os.path.join(directory, filename)
if os.path.exists(path):
@ -306,7 +306,8 @@ class CMapParser(PSStackParser):
elif name == 'endcmap':
self._in_cmap = False
return
if not self._in_cmap: return
if not self._in_cmap:
return
#
if name == 'def':
try:
@ -337,13 +338,15 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ]
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
eprefix = e[:-4]
if sprefix != eprefix: continue
if sprefix != eprefix:
continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
@ -359,7 +362,7 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ]
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid))
@ -369,10 +372,11 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ]
objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
len(s) != len(e)):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1
@ -393,7 +397,7 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ]
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unichr(nunpack(cid), code)
@ -409,6 +413,7 @@ class CMapParser(PSStackParser):
self.push((pos, token))
return
# test
def main(argv):
args = argv[1:]
@ -421,4 +426,5 @@ def main(argv):
cmap.dump()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -176,7 +176,8 @@ class TextConverter(PDFConverter):
# is text. This stops all the image and drawing output from being
# recorded and taking up RAM.
def render_image(self, name, stream):
if self.imagewriter is None: return
if self.imagewriter is None:
return
PDFConverter.render_image(self, name, stream)
return
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
for child in item:
show_group(child)
return
def render(item):
if isinstance(item, LTPage):
self._yoffset += item.y1
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
show_group(child)
self.outfp.write('</textgroup>\n')
return
def render(item):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %

View File

@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
from latin_enc import ENCODING
STRIP_NAME = re.compile(r'[0-9]+')
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in glyphname2unicode:
return glyphname2unicode[name]
m = STRIP_NAME.search(name)
if not m: raise KeyError(name)
if not m:
raise KeyError(name)
return unichr(int(m.group(0)))
@ -28,10 +31,14 @@ class EncodingDB(object):
pdf2unicode = {}
for (name, std, mac, win, pdf) in ENCODING:
c = name2unicode(name)
if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c
if std:
std2unicode[std] = c
if mac:
mac2unicode[mac] = c
if win:
win2unicode[win] = c
if pdf:
pdf2unicode[pdf] = c
encodings = {
'StandardEncoding': std2unicode,

View File

@ -7,9 +7,11 @@ import os, os.path
from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
def align32(x):
return ((x+3)/4)*4
## BMPWriter
##
class BMPWriter(object):

View File

@ -339,6 +339,7 @@ class LTTextLine(LTTextContainer):
def find_neighbors(self, plane, ratio):
raise NotImplementedError
class LTTextLineHorizontal(LTTextLine):
def __init__(self, word_margin):
@ -364,6 +365,7 @@ class LTTextLineHorizontal(LTTextLine):
(abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d))]
class LTTextLineVertical(LTTextLine):
def __init__(self, word_margin):
@ -407,6 +409,7 @@ class LTTextBox(LTTextContainer):
(self.__class__.__name__,
self.index, bbox2str(self.bbox), self.get_text()))
class LTTextBoxHorizontal(LTTextBox):
def analyze(self, laparams):
@ -417,6 +420,7 @@ class LTTextBoxHorizontal(LTTextBox):
def get_writing_mode(self):
return 'lr-tb'
class LTTextBoxVertical(LTTextBox):
def analyze(self, laparams):
@ -437,6 +441,7 @@ class LTTextGroup(LTTextContainer):
self.extend(objs)
return
class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams):
@ -447,6 +452,7 @@ class LTTextGroupLRTB(LTTextGroup):
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
return
class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams):
@ -555,7 +561,8 @@ class LTLayoutContainer(LTContainer):
done = set()
for line in lines:
box = boxes[line]
if box in done: continue
if box in done:
continue
done.add(box)
if not box.is_empty():
yield box
@ -563,6 +570,7 @@ class LTLayoutContainer(LTContainer):
def group_textboxes(self, laparams, boxes):
assert boxes
def dist(obj1, obj2):
"""A distance function between two TextBoxes.
@ -580,6 +588,7 @@ class LTLayoutContainer(LTContainer):
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
def isany(obj1, obj2):
"""Check if there's any other object between obj1 and obj2.
"""
@ -628,7 +637,8 @@ class LTLayoutContainer(LTContainer):
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
for obj in otherobjs:
obj.analyze(laparams)
if not textobjs: return
if not textobjs:
return
textlines = list(self.get_textlines(laparams, textobjs))
assert len(textobjs) <= sum(len(line._objs) for line in textlines)
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
@ -666,7 +676,8 @@ class LTFigure(LTLayoutContainer):
bbox2str(self.bbox), matrix2str(self.matrix)))
def analyze(self, laparams):
if not laparams.all_texts: return
if not laparams.all_texts:
return
LTLayoutContainer.analyze(self, laparams)
return

View File

@ -44,7 +44,8 @@ class LZWDecoder(object):
v = (v << r) | (self.buff & ((1 << r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
if not x:
raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
@ -97,6 +98,7 @@ class LZWDecoder(object):
(self.nbits, code, x, self.table[258:]))
return
# lzwdecode
def lzwdecode(data):
"""

View File

@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
class PDFColorSpace(object):
def __init__(self, name, ncomponents):

View File

@ -28,24 +28,31 @@ class PDFDevice(object):
def begin_tag(self, tag, props=None):
return
def end_tag(self):
return
def do_tag(self, tag, props=None):
return
def begin_page(self, page, ctm):
return
def end_page(self, page):
return
def begin_figure(self, name, bbox, matrix):
return
def end_figure(self, name):
return
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_image(self, name, stream):
return
def render_string(self, textstate, seq):
return
@ -132,7 +139,8 @@ class TagExtractor(PDFDevice):
font = textstate.font
text = ''
for obj in seq:
if not isinstance(obj, str): continue
if not isinstance(obj, str):
continue
chars = font.decode(obj)
for cid in chars:
try:

View File

@ -23,11 +23,24 @@ from utils import decode_text
## Exceptions
##
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFNoValidXRef(PDFSyntaxError):
pass
class PDFNoOutlines(PDFException):
pass
class PDFDestinationNotFound(PDFException):
pass
class PDFEncryptionError(PDFException):
pass
class PDFPasswordIncorrect(PDFEncryptionError):
pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
while 1:
try:
(pos, line) = parser.nextline()
if not line.strip(): continue
if not line.strip():
continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != 'n': continue
if use != 'n':
continue
self.offsets[objid] = (None, long(pos), int(genno))
if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets
@ -100,6 +115,7 @@ class PDFXRef(PDFBaseXRef):
return
KEYWORD_TRAILER = KWD('trailer')
def load_trailer(self, parser):
try:
(_, kwd) = parser.nexttoken()
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load(self, parser, debug=0):
parser.seek(0)
while 1:
@ -148,7 +165,8 @@ class PDFXRefFallback(PDFXRef):
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
break
m = self.PDFOBJ_CUE.match(line)
if not m: continue
if not m:
continue
(objid, genno) = m.groups()
objid = int(objid)
genno = int(genno)
@ -292,7 +310,8 @@ class PDFDocument(object):
self.xrefs.append(xref)
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer: continue
if not trailer:
continue
# If there is encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
@ -316,6 +335,7 @@ class PDFDocument(object):
# This step is mandatory even if there's no password associated
# with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
@ -414,6 +434,7 @@ class PDFDocument(object):
return (objs, n)
KEYWORD_OBJ = KWD('obj')
def _getobj_parse(self, pos, objid):
self._parser.seek(pos)
(_, objid1) = self._parser.nexttoken() # objid
@ -465,6 +486,7 @@ class PDFDocument(object):
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
@ -490,10 +512,12 @@ class PDFDocument(object):
raise KeyError((cat, key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
if 'Limits' in d:
(k1, k2) = list_value(d['Limits'])
if key < k1 or k2 < key: return None
if key < k1 or k2 < key:
return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
@ -501,7 +525,8 @@ class PDFDocument(object):
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v: return v
if v:
return v
raise KeyError((cat, key))
return lookup(d0)
@ -528,7 +553,8 @@ class PDFDocument(object):
line = line.strip()
if 2 <= self.debug:
print >>sys.stderr, 'find_xref: %r' % line
if line == 'startxref': break
if line == 'startxref':
break
if line:
prev = line
else:

View File

@ -40,6 +40,7 @@ def get_widths(seq):
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
def get_widths2(seq):
widths = {}
r = []
@ -112,18 +113,21 @@ class Type1FontHeaderParser(PSStackParser):
return
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
def getdict(data):
d = {}
fp = StringIO(data)
stack = []
while 1:
c = fp.read(1)
if not c: break
if not c:
break
b0 = ord(c)
if b0 <= 21:
d[b0] = stack
@ -150,7 +154,8 @@ def getdict(data):
value = -((b0-251) << 8)-b1-108
else:
b2 = ord(fp.read(1))
if 128 <= b1: b1 -= 256
if 128 <= b1:
b1 -= 256
if b0 == 28:
value = b1 << 8 | b2
else:
@ -158,6 +163,7 @@ def getdict(data):
stack.append(value)
return d
class CFFFont(object):
STANDARD_STRINGS = (
@ -356,7 +362,8 @@ class CFFFont(object):
##
class TrueTypeFont(object):
class CMapNotFound(Exception): pass
class CMapNotFound(Exception):
pass
def __init__(self, name, fp):
self.name = name
@ -397,7 +404,8 @@ class TrueTypeFont(object):
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
for (i, firstcode, entcount, delta, pos) in hdrs:
if not entcount: continue
if not entcount:
continue
first = firstcode + (firstbytes[i] << 8)
fp.seek(pos)
for c in xrange(entcount):
@ -433,9 +441,12 @@ class TrueTypeFont(object):
## Fonts
##
class PDFFontError(PDFException):
pass
class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass
class PDFUnicodeNotDefined(PDFFontError):
pass
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')
@ -474,6 +485,7 @@ class PDFFont(object):
def get_ascent(self):
return self.ascent * self.vscale
def get_descent(self):
return self.descent * self.vscale
@ -482,6 +494,7 @@ class PDFFont(object):
if w == 0:
w = -self.default_width
return w * self.hscale
def get_height(self):
h = self.bbox[3]-self.bbox[1]
if h == 0:
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
except KeyError:
raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
def __repr__(self):
return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
def __repr__(self):
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
def to_unichr(self, cid):
try:
if not self.unicode_map: raise KeyError(cid)
if not self.unicode_map:
raise KeyError(cid)
return self.unicode_map.get_unichr(cid)
except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
@ -705,4 +722,5 @@ def main(argv):
fp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -30,8 +30,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
## Exceptions
##
class PDFResourceError(PDFException): pass
class PDFInterpreterError(PDFException): pass
class PDFResourceError(PDFException):
pass
class PDFInterpreterError(PDFException):
pass
## Constants
@ -120,6 +124,7 @@ class PDFGraphicState(object):
(self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness))
## Resource Manager
##
class PDFResourceManager(object):
@ -152,7 +157,8 @@ class PDFResourceManager(object):
try:
return CMapDB.get_cmap(cmapname)
except CMapDB.CMapNotFound:
if strict: raise
if strict:
raise
return CMap()
def get_font(self, objid, spec):
@ -227,12 +233,14 @@ class PDFContentParser(PSStackParser):
return
def fillbuf(self):
if self.charpos < len(self.buf): return
if self.charpos < len(self.buf):
return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf: break
if self.buf:
break
self.fp = None
self.charpos = 0
return
@ -274,6 +282,7 @@ class PDFContentParser(PSStackParser):
KEYWORD_BI = KWD('BI')
KEYWORD_ID = KWD('ID')
KEYWORD_EI = KWD('EI')
def do_keyword(self, pos, token):
if token is self.KEYWORD_BI:
# inline image within a content stream
@ -289,7 +298,8 @@ class PDFContentParser(PSStackParser):
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
else:
self.push((pos, token))
return
@ -316,7 +326,9 @@ class PDFPageInterpreter(object):
self.fontmap = {}
self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy()
if not resources: return
if not resources:
return
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
@ -371,7 +383,8 @@ class PDFPageInterpreter(object):
return
def pop(self, n):
if n == 0: return []
if n == 0:
return []
x = self.argstack[-n:]
self.argstack = self.argstack[:-n]
return x
@ -388,6 +401,7 @@ class PDFPageInterpreter(object):
def do_q(self):
self.gstack.append(self.get_current_state())
return
# grestore
def do_Q(self):
if self.gstack:
@ -404,30 +418,37 @@ class PDFPageInterpreter(object):
def do_w(self, linewidth):
self.graphicstate.linewidth = linewidth
return
# setlinecap
def do_J(self, linecap):
self.graphicstate.linecap = linecap
return
# setlinejoin
def do_j(self, linejoin):
self.graphicstate.linejoin = linejoin
return
# setmiterlimit
def do_M(self, miterlimit):
self.graphicstate.miterlimit = miterlimit
return
# setdash
def do_d(self, dash, phase):
self.graphicstate.dash = (dash, phase)
return
# setintent
def do_ri(self, intent):
self.graphicstate.intent = intent
return
# setflatness
def do_i(self, flatness):
self.graphicstate.flatness = flatness
return
# load-gstate
def do_gs(self, name):
#XXX
@ -437,26 +458,32 @@ class PDFPageInterpreter(object):
def do_m(self, x, y):
self.curpath.append(('m', x, y))
return
# lineto
def do_l(self, x, y):
self.curpath.append(('l', x, y))
return
# curveto
def do_c(self, x1, y1, x2, y2, x3, y3):
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
return
# urveto
def do_v(self, x2, y2, x3, y3):
self.curpath.append(('v', x2, y2, x3, y3))
return
# rveto
def do_y(self, x1, y1, x3, y3):
self.curpath.append(('y', x1, y1, x3, y3))
return
# closepath
def do_h(self):
self.curpath.append(('h',))
return
# rectangle
def do_re(self, x, y, w, h):
self.curpath.append(('m', x, y))
@ -471,11 +498,13 @@ class PDFPageInterpreter(object):
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = []
return
# close-and-stroke
def do_s(self):
self.do_h()
self.do_S()
return
# fill
def do_f(self):
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
@ -483,68 +512,85 @@ class PDFPageInterpreter(object):
return
# fill (obsolete)
do_F = do_f
# fill-even-odd
def do_f_a(self):
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = []
return
# fill-and-stroke
def do_B(self):
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = []
return
# fill-and-stroke-even-odd
def do_B_a(self):
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = []
return
# close-fill-and-stroke
def do_b(self):
self.do_h()
self.do_B()
return
# close-fill-and-stroke-even-odd
def do_b_a(self):
self.do_h()
self.do_B_a()
return
# close-only
def do_n(self):
self.curpath = []
return
# clip
def do_W(self): return
def do_W(self):
return
# clip-even-odd
def do_W_a(self): return
def do_W_a(self):
return
# setcolorspace-stroking
def do_CS(self, name):
self.scs = self.csmap[literal_name(name)]
return
# setcolorspace-non-stroking
def do_cs(self, name):
self.ncs = self.csmap[literal_name(name)]
return
# setgray-stroking
def do_G(self, gray):
#self.do_CS(LITERAL_DEVICE_GRAY)
return
# setgray-non-stroking
def do_g(self, gray):
#self.do_cs(LITERAL_DEVICE_GRAY)
return
# setrgb-stroking
def do_RG(self, r, g, b):
#self.do_CS(LITERAL_DEVICE_RGB)
return
# setrgb-non-stroking
def do_rg(self, r, g, b):
#self.do_cs(LITERAL_DEVICE_RGB)
return
# setcmyk-stroking
def do_K(self, c, m, y, k):
#self.do_CS(LITERAL_DEVICE_CMYK)
return
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
#self.do_cs(LITERAL_DEVICE_CMYK)
@ -560,6 +606,7 @@ class PDFPageInterpreter(object):
n = 1
self.pop(n)
return
def do_scn(self):
if self.ncs:
n = self.ncs.ncomponents
@ -569,42 +616,53 @@ class PDFPageInterpreter(object):
n = 1
self.pop(n)
return
def do_SC(self):
self.do_SCN()
return
def do_sc(self):
self.do_scn()
return
# shading-name
def do_sh(self, name): return
def do_sh(self, name):
return
# begin-text
def do_BT(self):
self.textstate.reset()
return
# end-text
def do_ET(self):
return
# begin-compat
def do_BX(self): return
def do_BX(self):
return
# end-compat
def do_EX(self): return
def do_EX(self):
return
# marked content operators
def do_MP(self, tag):
self.device.do_tag(tag)
return
def do_DP(self, tag, props):
self.device.do_tag(tag, props)
return
def do_BMC(self, tag):
self.device.begin_tag(tag)
return
def do_BDC(self, tag, props):
self.device.begin_tag(tag, props)
return
def do_EMC(self):
self.device.end_tag()
return
@ -613,18 +671,22 @@ class PDFPageInterpreter(object):
def do_Tc(self, space):
self.textstate.charspace = space
return
# setwordspace
def do_Tw(self, space):
self.textstate.wordspace = space
return
# textscale
def do_Tz(self, scale):
self.textstate.scaling = scale
return
# setleading
def do_TL(self, leading):
self.textstate.leading = -leading
return
# selectfont
def do_Tf(self, fontid, fontsize):
try:
@ -635,10 +697,12 @@ class PDFPageInterpreter(object):
self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize
return
# setrendering
def do_Tr(self, render):
self.textstate.render = render
return
# settextrise
def do_Ts(self, rise):
self.textstate.rise = rise
@ -651,6 +715,7 @@ class PDFPageInterpreter(object):
self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
return
# text-move
def do_TD(self, tx, ty):
(a, b, c, d, e, f) = self.textstate.matrix
@ -659,11 +724,13 @@ class PDFPageInterpreter(object):
self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
return
# textmatrix
def do_Tm(self, a, b, c, d, e, f):
self.textstate.matrix = (a, b, c, d, e, f)
self.textstate.linematrix = (0, 0)
return
# nextline
def do_T_a(self):
(a, b, c, d, e, f) = self.textstate.matrix
@ -680,15 +747,18 @@ class PDFPageInterpreter(object):
return
self.device.render_string(self.textstate, seq)
return
# show
def do_Tj(self, s):
self.do_TJ([s])
return
# quote
def do__q(self, s):
self.do_T_a()
self.do_TJ([s])
return
# doublequote
def do__w(self, aw, ac, s):
self.do_Tw(aw)
@ -699,8 +769,10 @@ class PDFPageInterpreter(object):
# inline image
def do_BI(self): # never called
return
def do_ID(self): # never called
return
def do_EI(self, obj):
if 'W' in obj and 'H' in obj:
iobjid = str(id(obj))

View File

@ -71,6 +71,7 @@ class PDFPage(object):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod
def create_pages(klass, document, debug=0):
def search(obj, parent):
@ -110,7 +111,8 @@ class PDFPage(object):
pass
return
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
@classmethod
def get_pages(klass, fp,
@ -128,7 +130,9 @@ class PDFPage(object):
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# Process each page contained in the document.
for (pageno, page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos): continue
if pagenos and (pageno not in pagenos):
continue
yield page
if maxpages and maxpages <= pageno+1: break
if maxpages and maxpages <= pageno+1:
break
return

View File

@ -15,7 +15,8 @@ from pdftypes import str_value, list_value, dict_value, stream_value
## Exceptions
##
class PDFSyntaxError(PDFException): pass
class PDFSyntaxError(PDFException):
pass
## PDFParser
@ -55,6 +56,7 @@ class PDFParser(PSStackParser):
KEYWORD_STREAM = KWD('stream')
KEYWORD_XREF = KWD('xref')
KEYWORD_STARTXREF = KWD('startxref')
def do_keyword(self, pos, token):
"""Handles PDF-related keywords."""

View File

@ -23,13 +23,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
## PDF Objects
##
class PDFObject(PSObject): pass
class PDFObject(PSObject):
pass
class PDFException(PSException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFObjectNotFound(PDFException): pass
class PDFNotImplementedError(PDFException): pass
class PDFException(PSException):
pass
class PDFTypeError(PDFException):
pass
class PDFValueError(PDFException):
pass
class PDFObjectNotFound(PDFException):
pass
class PDFNotImplementedError(PDFException):
pass
## PDFObjRef
@ -66,6 +81,7 @@ def resolve1(x, default=None):
x = x.resolve(default=default)
return x
def resolve_all(x, default=None):
"""Recursively resolves the given object and all the internals.
@ -81,6 +97,7 @@ def resolve_all(x, default=None):
x[k] = resolve_all(v, default=default)
return x
def decipher_all(decipher, objid, genno, x):
"""Recursively deciphers the given object.
"""
@ -93,6 +110,7 @@ def decipher_all(decipher, objid, genno, x):
x[k] = decipher_all(decipher, objid, genno, v)
return x
# Type checking
def int_value(x):
x = resolve1(x)
@ -102,6 +120,7 @@ def int_value(x):
return 0
return x
def float_value(x):
x = resolve1(x)
if not isinstance(x, float):
@ -110,6 +129,7 @@ def float_value(x):
return 0.0
return x
def num_value(x):
x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)):
@ -118,6 +138,7 @@ def num_value(x):
return 0
return x
def str_value(x):
x = resolve1(x)
if not isinstance(x, str):
@ -126,6 +147,7 @@ def str_value(x):
return ''
return x
def list_value(x):
x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)):
@ -134,6 +156,7 @@ def list_value(x):
return []
return x
def dict_value(x):
x = resolve1(x)
if not isinstance(x, dict):
@ -142,6 +165,7 @@ def dict_value(x):
return {}
return x
def stream_value(x):
x = resolve1(x)
if not isinstance(x, PDFStream):
@ -195,12 +219,14 @@ class PDFStream(PDFObject):
def get_filters(self):
filters = self.get_any(('F', 'Filter'))
if not filters: return []
if isinstance(filters, list): return filters
if not filters:
return []
if isinstance(filters, list):
return filters
return [filters]
def decode(self):
assert self.data is None and self.rawdata != None
assert self.data is None and self.rawdata is not None
data = self.rawdata
if self.decipher:
# Handle encryption

View File

@ -8,11 +8,24 @@ STRICT = 0
## PS Exceptions
##
class PSException(Exception): pass
class PSEOF(PSException): pass
class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass
class PSValueError(PSException): pass
class PSException(Exception):
pass
class PSEOF(PSException):
pass
class PSSyntaxError(PSException):
pass
class PSTypeError(PSException):
pass
class PSValueError(PSException):
pass
## Basic PostScript Types
@ -114,6 +127,7 @@ def literal_name(x):
return str(x)
return x.name
def keyword_name(x):
if not isinstance(x, PSKeyword):
if STRICT:
@ -137,6 +151,8 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
class PSBaseParser(object):
"""Most basic PostScript parser that performs only tokenization.
@ -190,7 +206,8 @@ class PSBaseParser(object):
return
def fillbuf(self):
if self.charpos < len(self.buf): return
if self.charpos < len(self.buf):
return
# fetch next chunk.
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
@ -242,7 +259,8 @@ class PSBaseParser(object):
pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos)
s = self.fp.read(prevpos-pos)
if not s: break
if not s:
break
while 1:
n = max(s.rfind('\r'), s.rfind('\n'))
if n == -1:
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
try:
self.push(self.end_type('a'))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
elif token == KEYWORD_DICT_BEGIN:
# begin dictionary
self.start_type(pos, 'd')
@ -567,7 +586,8 @@ class PSStackParser(PSBaseParser):
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
self.push((pos, d))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
elif token == KEYWORD_PROC_BEGIN:
# begin proc
self.start_type(pos, 'p')
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
try:
self.push(self.end_type('p'))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
else:
if 2 <= self.debug:
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
return obj
import unittest
## Simplistic Test cases
##
import unittest
class TestPSBaseParser(unittest.TestCase):
TESTDATA = r'''%!PS
@ -645,6 +668,7 @@ func/a/b{(c)do*}def
def get_tokens(self, s):
import StringIO
class MyParser(PSBaseParser):
def flush(self):
self.add_results(*self.popall())
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
def get_objects(self, s):
import StringIO
class MyParser(PSStackParser):
def flush(self):
self.add_results(*self.popall())
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
self.assertEqual(objs, self.OBJS)
return
if __name__ == '__main__': unittest.main()
if __name__ == '__main__':
unittest.main()

View File

@ -13,9 +13,17 @@ by Philip J. Erdelsky:
import sys
import struct
def KEYLENGTH(keybits):
    """Return keybits divided by 8 (a bit count converted to bytes).

    Floor division is spelled out with `//` so the result stays an
    integer even when true division is in effect (Python 3 or
    `from __future__ import division`).
    """
    return keybits // 8


def RKLENGTH(keybits):
    """Return keybits // 8 + 28.

    NOTE(review): presumably the size of the round-key array for the
    given key size -- confirm against the key-schedule code.
    """
    return keybits // 8 + 28


def NROUNDS(keybits):
    """Return keybits // 32 + 6, i.e. 10, 12 or 14 for 128-, 192- or
    256-bit keys."""
    return keybits // 32 + 6
Te0 = [
0xc66363a5L, 0xf87c7c84L, 0xee777799L, 0xf67b7b8dL,
@ -703,6 +711,7 @@ else:
def GETU32(x): return struct.unpack('>I', x)[0]
def PUTU32(x): return struct.pack('>I', x)
# Expand the cipher key into the encryption key schedule.
#
# @return the number of rounds for the given cipher key size.
@ -1051,6 +1060,7 @@ class RijndaelDecryptor(object):
assert len(ciphertext) == 16
return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
# encrypt(key, fin, fout, keybits=256)
class RijndaelEncryptor(object):

View File

@ -8,6 +8,7 @@
import sys
def rldecode(data):
"""
RunLength decoder (Adobe version) implementation based on PDF Reference

View File

@ -52,20 +52,24 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix(m1, m0):
    """Returns the multiplication of two matrices.

    Both arguments are 6-element sequences (a, b, c, d, e, f).  Under the
    convention used by apply_matrix_pt (x' = a*x + c*y + e,
    y' = b*x + d*y + f) the result is the single matrix equivalent to
    applying m1 first and m0 second.

    The sequences are unpacked in the body rather than in the signature:
    tuple parameters are a syntax error in Python 3, and positional
    callers are unaffected by the change.
    """
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    """Translates a matrix by (x, y).

    m is a 6-element sequence (a, b, c, d, e, f) and v a 2-element
    sequence (x, y).  The first four components are returned unchanged;
    the last two become x*a+y*c+e and x*b+y*d+f.

    The sequences are unpacked in the body rather than in the signature,
    because tuple parameters are a syntax error in Python 3.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
def apply_matrix_pt(m, v):
    """Applies a matrix to a point.

    m is a 6-element sequence (a, b, c, d, e, f) and v a point (x, y).
    Returns (a*x+c*y+e, b*x+d*y+f).

    The sequences are unpacked in the body rather than in the signature,
    because tuple parameters are a syntax error in Python 3.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    m is a 6-element sequence (a, b, c, d, e, f) and v a vector (p, q).
    The e and f components of m do not contribute to the result, which is
    (a*p+c*q, b*p+d*q).

    The sequences are unpacked in the body rather than in the signature,
    because tuple parameters are a syntax error in Python 3.
    """
    (a, b, c, d, e, f) = m
    (p, q) = v
    return (a*p+c*q, b*p+d*q)
@ -79,17 +83,20 @@ def uniq(objs):
"""Eliminates duplicated elements."""
done = set()
for obj in objs:
if obj in done: continue
if obj in done:
continue
done.add(obj)
yield obj
return
# csort
def csort(objs, key=lambda x: x):
    """Order-preserving sorting function.

    Returns a new list of the elements of objs ordered by key(obj);
    elements whose keys compare equal keep their original relative order.

    sorted() is guaranteed to be stable, so no explicit index tie-breaker
    is needed.  Dropping the former index dictionary also means that
    unhashable elements are accepted and that a one-shot iterator is no
    longer exhausted before it gets sorted.
    """
    return sorted(objs, key=key)
# fsplit
def fsplit(pred, objs):
"""Split a list into two classes according to the predicate."""
@ -102,12 +109,14 @@ def fsplit(pred, objs):
f.append(obj)
return (t, f)
# drange
def drange(v0, v1, d):
"""Returns a discrete range."""
assert v0 < v1
return xrange(int(v0)/d, int(v1+d)/d)
# get_bound
def get_bound(pts):
"""Compute a minimal rectangle that covers all the points."""
@ -119,6 +128,7 @@ def get_bound(pts):
y1 = max(y1, y)
return (x0, y0, x1, y1)
# pick
def pick(seq, func, maxobj=None):
"""Picks the object obj where func(obj) has the highest value."""
@ -129,6 +139,7 @@ def pick(seq, func, maxobj=None):
(maxscore, maxobj) = (score, obj)
return maxobj
# choplist
def choplist(n, seq):
"""Groups every n elements of the list."""
@ -140,6 +151,7 @@ def choplist(n, seq):
r = []
return
# nunpack
def nunpack(s, default=0):
"""Unpacks 1 to 4 byte integers (big endian)."""
@ -157,6 +169,7 @@ def nunpack(s, default=0):
else:
raise TypeError('invalid length: %d' % l)
# decode_text
PDFDocEncoding = ''.join(unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
@ -192,6 +205,8 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
"""Decodes a PDFDocEncoding string to Unicode."""
if s.startswith('\xfe\xff'):
@ -199,15 +214,18 @@ def decode_text(s):
else:
return ''.join(PDFDocEncoding[ord(c)] for c in s)
# enc
def enc(x, codec='ascii'):
    """Encodes a string for SGML/XML/HTML

    The characters &, >, < and " are replaced by their entities, then the
    text is encoded with the given codec; characters the codec cannot
    represent become numeric character references.
    """
    # '&' must come first so the ampersands introduced by the other
    # entities are not escaped a second time.
    entities = (('&', '&amp;'), ('>', '&gt;'), ('<', '&lt;'), ('"', '&quot;'))
    for (char, entity) in entities:
        x = x.replace(char, entity)
    return x.encode(codec, 'xmlcharrefreplace')
def bbox2str(bbox):
    """Formats a 4-element box (x0, y0, x1, y1) as 'x0,y0,x1,y1' with
    three decimal places per value.

    The box is unpacked in the body rather than in the signature, because
    tuple parameters are a syntax error in Python 3.
    """
    (x0, y0, x1, y1) = bbox
    return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
def matrix2str(m):
    """Formats a 6-element matrix (a, b, c, d, e, f) as
    '[a,b,c,d, (e,f)]' with two decimal places per value.

    The matrix is unpacked in the body rather than in the signature,
    because tuple parameters are a syntax error in Python 3.
    """
    (a, b, c, d, e, f) = m
    return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
@ -282,11 +300,14 @@ class Plane(object):
def find(self, bbox):
    """Yields the stored objects that overlap the box (x0, y0, x1, y1).

    Every grid key returned by self._getrange() for the box is visited;
    keys missing from self._grid are skipped.  Each object is examined
    once even when it is registered under several keys.  An object is
    yielded only when its own box (obj.x0, obj.y0, obj.x1, obj.y1)
    strictly overlaps the query box -- boxes that merely touch along an
    edge are not reported.

    The box is unpacked in the body rather than in the signature, because
    tuple parameters are a syntax error in Python 3.
    """
    (x0, y0, x1, y1) = bbox
    done = set()
    for k in self._getrange((x0, y0, x1, y1)):
        if k not in self._grid:
            continue
        for obj in self._grid[k]:
            if obj in done:
                continue
            # Mark as seen before the overlap test so a rejected object
            # is not re-tested for every key it is registered under.
            done.add(obj)
            if (obj.x1 <= x0 or x1 <= obj.x0 or
                    obj.y1 <= y0 or y1 <= obj.y0):
                continue
            yield obj