PEP8: Whitespace changes to match pep8
parent
c1da8b835c
commit
2caa5edc25
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
__version__ = '20131022'
|
||||
|
||||
if __name__ == '__main__': print __version__
|
||||
if __name__ == '__main__':
|
||||
print __version__
|
||||
|
|
|
@ -6,6 +6,7 @@ This code is in the public domain.
|
|||
|
||||
"""
|
||||
|
||||
|
||||
## Arcfour
|
||||
##
|
||||
class Arcfour(object):
|
||||
|
|
|
@ -9,6 +9,7 @@ This code is in the public domain.
|
|||
import re
|
||||
import struct
|
||||
|
||||
|
||||
# ascii85decode(data)
|
||||
def ascii85decode(data):
|
||||
"""
|
||||
|
@ -51,6 +52,8 @@ def ascii85decode(data):
|
|||
# asciihexdecode(data)
|
||||
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
|
||||
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
|
||||
|
||||
|
||||
def asciihexdecode(data):
|
||||
"""
|
||||
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
|
||||
|
|
|
@ -25,6 +25,7 @@ class BitParser(object):
|
|||
@classmethod
|
||||
def add(klass, root, v, bits):
|
||||
p = root
|
||||
b = None
|
||||
for i in xrange(len(bits)):
|
||||
if 0 < i:
|
||||
if p[b] is None:
|
||||
|
@ -309,9 +310,14 @@ class CCITTG4Parser(BitParser):
|
|||
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
|
||||
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
|
||||
|
||||
class EOFB(Exception): pass
|
||||
class InvalidData(Exception): pass
|
||||
class ByteSkip(Exception): pass
|
||||
class EOFB(Exception):
|
||||
pass
|
||||
|
||||
class InvalidData(Exception):
|
||||
pass
|
||||
|
||||
class ByteSkip(Exception):
|
||||
pass
|
||||
|
||||
def __init__(self, width, bytealign=False):
|
||||
BitParser.__init__(self)
|
||||
|
@ -358,7 +364,8 @@ class CCITTG4Parser(BitParser):
|
|||
raise self.InvalidData(mode)
|
||||
|
||||
def _parse_horiz1(self, n):
|
||||
if n is None: raise self.InvalidData
|
||||
if n is None:
|
||||
raise self.InvalidData
|
||||
self._n1 += n
|
||||
if n < 64:
|
||||
self._n2 = 0
|
||||
|
@ -370,7 +377,8 @@ class CCITTG4Parser(BitParser):
|
|||
return self.BLACK
|
||||
|
||||
def _parse_horiz2(self, n):
|
||||
if n is None: raise self.InvalidData
|
||||
if n is None:
|
||||
raise self.InvalidData
|
||||
self._n2 += n
|
||||
if n < 64:
|
||||
self._color = 1-self._color
|
||||
|
@ -384,7 +392,8 @@ class CCITTG4Parser(BitParser):
|
|||
return self.BLACK
|
||||
|
||||
def _parse_uncompressed(self, bits):
|
||||
if not bits: raise self.InvalidData
|
||||
if not bits:
|
||||
raise self.InvalidData
|
||||
if bits.startswith('T'):
|
||||
self._accept = self._parse_mode
|
||||
self._color = int(bits[1])
|
||||
|
@ -441,12 +450,13 @@ class CCITTG4Parser(BitParser):
|
|||
x1 = self._curpos+1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 1 and
|
||||
self._refline[x1] != self._color): break
|
||||
if (self._color == 1 and self._refline[x1] != self._color):
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] == self._color and
|
||||
self._refline[x1] != self._color): break
|
||||
self._refline[x1] != self._color):
|
||||
break
|
||||
x1 += 1
|
||||
x1 += dx
|
||||
x0 = max(0, self._curpos)
|
||||
|
@ -467,21 +477,23 @@ class CCITTG4Parser(BitParser):
|
|||
x1 = self._curpos+1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 1 and
|
||||
self._refline[x1] != self._color): break
|
||||
if (self._color == 1 and self._refline[x1] != self._color):
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] == self._color and
|
||||
self._refline[x1] != self._color): break
|
||||
self._refline[x1] != self._color):
|
||||
break
|
||||
x1 += 1
|
||||
while 1:
|
||||
if x1 == 0:
|
||||
if (self._color == 0 and
|
||||
self._refline[x1] == self._color): break
|
||||
if (self._color == 0 and self._refline[x1] == self._color):
|
||||
break
|
||||
elif x1 == len(self._refline):
|
||||
break
|
||||
elif (self._refline[x1-1] != self._color and
|
||||
self._refline[x1] == self._color): break
|
||||
self._refline[x1] == self._color):
|
||||
break
|
||||
x1 += 1
|
||||
for x in xrange(self._curpos, x1):
|
||||
self._curline[x] = self._color
|
||||
|
@ -494,11 +506,13 @@ class CCITTG4Parser(BitParser):
|
|||
self._curpos = 0
|
||||
x = self._curpos
|
||||
for _ in xrange(n1):
|
||||
if len(self._curline) <= x: break
|
||||
if len(self._curline) <= x:
|
||||
break
|
||||
self._curline[x] = self._color
|
||||
x += 1
|
||||
for _ in xrange(n2):
|
||||
if len(self._curline) <= x: break
|
||||
if len(self._curline) <= x:
|
||||
break
|
||||
self._curline[x] = 1-self._color
|
||||
x += 1
|
||||
self._curpos = x
|
||||
|
@ -512,10 +526,11 @@ class CCITTG4Parser(BitParser):
|
|||
self._flush_line()
|
||||
return
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
## Test cases
|
||||
##
|
||||
import unittest
|
||||
class TestCCITTG4Parser(unittest.TestCase):
|
||||
|
||||
def get_parser(self, bits):
|
||||
|
@ -711,11 +726,13 @@ def main(argv):
|
|||
import pygame
|
||||
if not argv[1:]:
|
||||
return unittest.main()
|
||||
|
||||
class Parser(CCITTG4Parser):
|
||||
def __init__(self, width, bytealign=False):
|
||||
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||
self.img = pygame.Surface((self.width, 1000))
|
||||
return
|
||||
|
||||
def output_line(self, y, bits):
|
||||
for (x, b) in enumerate(bits):
|
||||
if b:
|
||||
|
@ -723,6 +740,7 @@ def main(argv):
|
|||
else:
|
||||
self.img.set_at((x, y), (0, 0, 0))
|
||||
return
|
||||
|
||||
def close(self):
|
||||
pygame.image.save(self.img, 'out.bmp')
|
||||
return
|
||||
|
@ -735,4 +753,5 @@ def main(argv):
|
|||
fp.close()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -26,7 +26,8 @@ from encodingdb import name2unicode
|
|||
from utils import choplist, nunpack
|
||||
|
||||
|
||||
class CMapError(Exception): pass
|
||||
class CMapError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
## CMap
|
||||
|
@ -44,6 +45,7 @@ class CMap(object):
|
|||
|
||||
def use_cmap(self, cmap):
|
||||
assert isinstance(cmap, CMap)
|
||||
|
||||
def copy(dst, src):
|
||||
for (k, v) in src.iteritems():
|
||||
if isinstance(v, dict):
|
||||
|
@ -102,7 +104,6 @@ class IdentityCMap(object):
|
|||
return ()
|
||||
|
||||
|
||||
|
||||
## UnicodeMap
|
||||
##
|
||||
class UnicodeMap(object):
|
||||
|
@ -232,17 +233,16 @@ class CMapDB(object):
|
|||
_cmap_cache = {}
|
||||
_umap_cache = {}
|
||||
|
||||
class CMapNotFound(CMapError): pass
|
||||
class CMapNotFound(CMapError):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def _load_data(klass, name):
|
||||
filename = '%s.pickle.gz' % name
|
||||
if klass.debug:
|
||||
print >>sys.stderr, 'loading:', name
|
||||
cmap_paths = (
|
||||
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||
os.path.join(os.path.dirname(__file__), 'cmap'),
|
||||
)
|
||||
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
|
||||
os.path.join(os.path.dirname(__file__), 'cmap'),)
|
||||
for directory in cmap_paths:
|
||||
path = os.path.join(directory, filename)
|
||||
if os.path.exists(path):
|
||||
|
@ -306,7 +306,8 @@ class CMapParser(PSStackParser):
|
|||
elif name == 'endcmap':
|
||||
self._in_cmap = False
|
||||
return
|
||||
if not self._in_cmap: return
|
||||
if not self._in_cmap:
|
||||
return
|
||||
#
|
||||
if name == 'def':
|
||||
try:
|
||||
|
@ -337,13 +338,15 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endcidrange':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, cid) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
not isinstance(cid, int) or len(s) != len(e)): continue
|
||||
not isinstance(cid, int) or len(s) != len(e)):
|
||||
continue
|
||||
sprefix = s[:-4]
|
||||
eprefix = e[:-4]
|
||||
if sprefix != eprefix: continue
|
||||
if sprefix != eprefix:
|
||||
continue
|
||||
svar = s[-4:]
|
||||
evar = e[-4:]
|
||||
s1 = nunpack(svar)
|
||||
|
@ -359,7 +362,7 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endcidchar':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (cid, code) in choplist(2, objs):
|
||||
if isinstance(code, str) and isinstance(cid, str):
|
||||
self.cmap.add_code2cid(code, nunpack(cid))
|
||||
|
@ -369,10 +372,11 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endbfrange':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (s, e, code) in choplist(3, objs):
|
||||
if (not isinstance(s, str) or not isinstance(e, str) or
|
||||
len(s) != len(e)): continue
|
||||
len(s) != len(e)):
|
||||
continue
|
||||
s1 = nunpack(s)
|
||||
e1 = nunpack(e)
|
||||
#assert s1 <= e1
|
||||
|
@ -393,7 +397,7 @@ class CMapParser(PSStackParser):
|
|||
self.popall()
|
||||
return
|
||||
if name == 'endbfchar':
|
||||
objs = [ obj for (_,obj) in self.popall() ]
|
||||
objs = [obj for (__, obj) in self.popall()]
|
||||
for (cid, code) in choplist(2, objs):
|
||||
if isinstance(cid, str) and isinstance(code, str):
|
||||
self.cmap.add_cid2unichr(nunpack(cid), code)
|
||||
|
@ -409,6 +413,7 @@ class CMapParser(PSStackParser):
|
|||
self.push((pos, token))
|
||||
return
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
|
||||
args = argv[1:]
|
||||
|
@ -421,4 +426,5 @@ def main(argv):
|
|||
cmap.dump()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -176,7 +176,8 @@ class TextConverter(PDFConverter):
|
|||
# is text. This stops all the image and drawing output from being
|
||||
# recorded and taking up RAM.
|
||||
def render_image(self, name, stream):
|
||||
if self.imagewriter is None: return
|
||||
if self.imagewriter is None:
|
||||
return
|
||||
PDFConverter.render_image(self, name, stream)
|
||||
return
|
||||
|
||||
|
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
|
|||
for child in item:
|
||||
show_group(child)
|
||||
return
|
||||
|
||||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self._yoffset += item.y1
|
||||
|
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
|
|||
show_group(child)
|
||||
self.outfp.write('</textgroup>\n')
|
||||
return
|
||||
|
||||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
|
|
|
@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
|
|||
from latin_enc import ENCODING
|
||||
|
||||
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
|
||||
|
||||
## name2unicode
|
||||
##
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
def name2unicode(name):
|
||||
"""Converts Adobe glyph names to Unicode numbers."""
|
||||
if name in glyphname2unicode:
|
||||
return glyphname2unicode[name]
|
||||
m = STRIP_NAME.search(name)
|
||||
if not m: raise KeyError(name)
|
||||
if not m:
|
||||
raise KeyError(name)
|
||||
return unichr(int(m.group(0)))
|
||||
|
||||
|
||||
|
@ -28,10 +31,14 @@ class EncodingDB(object):
|
|||
pdf2unicode = {}
|
||||
for (name, std, mac, win, pdf) in ENCODING:
|
||||
c = name2unicode(name)
|
||||
if std: std2unicode[std] = c
|
||||
if mac: mac2unicode[mac] = c
|
||||
if win: win2unicode[win] = c
|
||||
if pdf: pdf2unicode[pdf] = c
|
||||
if std:
|
||||
std2unicode[std] = c
|
||||
if mac:
|
||||
mac2unicode[mac] = c
|
||||
if win:
|
||||
win2unicode[win] = c
|
||||
if pdf:
|
||||
pdf2unicode[pdf] = c
|
||||
|
||||
encodings = {
|
||||
'StandardEncoding': std2unicode,
|
||||
|
|
|
@ -7,9 +7,11 @@ import os, os.path
|
|||
from pdftypes import LITERALS_DCT_DECODE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||
|
||||
|
||||
def align32(x):
|
||||
return ((x+3)/4)*4
|
||||
|
||||
|
||||
## BMPWriter
|
||||
##
|
||||
class BMPWriter(object):
|
||||
|
|
|
@ -339,6 +339,7 @@ class LTTextLine(LTTextContainer):
|
|||
def find_neighbors(self, plane, ratio):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class LTTextLineHorizontal(LTTextLine):
|
||||
|
||||
def __init__(self, word_margin):
|
||||
|
@ -364,6 +365,7 @@ class LTTextLineHorizontal(LTTextLine):
|
|||
(abs(obj.x0-self.x0) < d or
|
||||
abs(obj.x1-self.x1) < d))]
|
||||
|
||||
|
||||
class LTTextLineVertical(LTTextLine):
|
||||
|
||||
def __init__(self, word_margin):
|
||||
|
@ -407,6 +409,7 @@ class LTTextBox(LTTextContainer):
|
|||
(self.__class__.__name__,
|
||||
self.index, bbox2str(self.bbox), self.get_text()))
|
||||
|
||||
|
||||
class LTTextBoxHorizontal(LTTextBox):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -417,6 +420,7 @@ class LTTextBoxHorizontal(LTTextBox):
|
|||
def get_writing_mode(self):
|
||||
return 'lr-tb'
|
||||
|
||||
|
||||
class LTTextBoxVertical(LTTextBox):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -437,6 +441,7 @@ class LTTextGroup(LTTextContainer):
|
|||
self.extend(objs)
|
||||
return
|
||||
|
||||
|
||||
class LTTextGroupLRTB(LTTextGroup):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -447,6 +452,7 @@ class LTTextGroupLRTB(LTTextGroup):
|
|||
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
||||
return
|
||||
|
||||
|
||||
class LTTextGroupTBRL(LTTextGroup):
|
||||
|
||||
def analyze(self, laparams):
|
||||
|
@ -555,7 +561,8 @@ class LTLayoutContainer(LTContainer):
|
|||
done = set()
|
||||
for line in lines:
|
||||
box = boxes[line]
|
||||
if box in done: continue
|
||||
if box in done:
|
||||
continue
|
||||
done.add(box)
|
||||
if not box.is_empty():
|
||||
yield box
|
||||
|
@ -563,6 +570,7 @@ class LTLayoutContainer(LTContainer):
|
|||
|
||||
def group_textboxes(self, laparams, boxes):
|
||||
assert boxes
|
||||
|
||||
def dist(obj1, obj2):
|
||||
"""A distance function between two TextBoxes.
|
||||
|
||||
|
@ -580,6 +588,7 @@ class LTLayoutContainer(LTContainer):
|
|||
x1 = max(obj1.x1, obj2.x1)
|
||||
y1 = max(obj1.y1, obj2.y1)
|
||||
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
|
||||
|
||||
def isany(obj1, obj2):
|
||||
"""Check if there's any other object between obj1 and obj2.
|
||||
"""
|
||||
|
@ -628,7 +637,8 @@ class LTLayoutContainer(LTContainer):
|
|||
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
|
||||
for obj in otherobjs:
|
||||
obj.analyze(laparams)
|
||||
if not textobjs: return
|
||||
if not textobjs:
|
||||
return
|
||||
textlines = list(self.get_textlines(laparams, textobjs))
|
||||
assert len(textobjs) <= sum(len(line._objs) for line in textlines)
|
||||
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
|
||||
|
@ -666,7 +676,8 @@ class LTFigure(LTLayoutContainer):
|
|||
bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||
|
||||
def analyze(self, laparams):
|
||||
if not laparams.all_texts: return
|
||||
if not laparams.all_texts:
|
||||
return
|
||||
LTLayoutContainer.analyze(self, laparams)
|
||||
return
|
||||
|
||||
|
|
|
@ -44,7 +44,8 @@ class LZWDecoder(object):
|
|||
v = (v << r) | (self.buff & ((1 << r)-1))
|
||||
bits -= r
|
||||
x = self.fp.read(1)
|
||||
if not x: raise EOFError
|
||||
if not x:
|
||||
raise EOFError
|
||||
self.buff = ord(x)
|
||||
self.bpos = 0
|
||||
return v
|
||||
|
@ -97,6 +98,7 @@ class LZWDecoder(object):
|
|||
(self.nbits, code, x, self.table[258:]))
|
||||
return
|
||||
|
||||
|
||||
# lzwdecode
|
||||
def lzwdecode(data):
|
||||
"""
|
||||
|
|
|
@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
|
|||
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
|
||||
|
||||
|
||||
class PDFColorSpace(object):
|
||||
|
||||
def __init__(self, name, ncomponents):
|
||||
|
|
|
@ -28,24 +28,31 @@ class PDFDevice(object):
|
|||
|
||||
def begin_tag(self, tag, props=None):
|
||||
return
|
||||
|
||||
def end_tag(self):
|
||||
return
|
||||
|
||||
def do_tag(self, tag, props=None):
|
||||
return
|
||||
|
||||
def begin_page(self, page, ctm):
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
return
|
||||
|
||||
def begin_figure(self, name, bbox, matrix):
|
||||
return
|
||||
|
||||
def end_figure(self, name):
|
||||
return
|
||||
|
||||
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
|
||||
return
|
||||
|
||||
def render_image(self, name, stream):
|
||||
return
|
||||
|
||||
def render_string(self, textstate, seq):
|
||||
return
|
||||
|
||||
|
@ -132,7 +139,8 @@ class TagExtractor(PDFDevice):
|
|||
font = textstate.font
|
||||
text = ''
|
||||
for obj in seq:
|
||||
if not isinstance(obj, str): continue
|
||||
if not isinstance(obj, str):
|
||||
continue
|
||||
chars = font.decode(obj)
|
||||
for cid in chars:
|
||||
try:
|
||||
|
|
|
@ -23,11 +23,24 @@ from utils import decode_text
|
|||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFNoValidXRef(PDFSyntaxError): pass
|
||||
class PDFNoOutlines(PDFException): pass
|
||||
class PDFDestinationNotFound(PDFException): pass
|
||||
class PDFEncryptionError(PDFException): pass
|
||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
class PDFNoValidXRef(PDFSyntaxError):
|
||||
pass
|
||||
|
||||
|
||||
class PDFNoOutlines(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFDestinationNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFEncryptionError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFPasswordIncorrect(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_OBJSTM = LIT('ObjStm')
|
||||
|
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
|
|||
while 1:
|
||||
try:
|
||||
(pos, line) = parser.nextline()
|
||||
if not line.strip(): continue
|
||||
if not line.strip():
|
||||
continue
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||
if not line:
|
||||
|
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
|
|||
if len(f) != 3:
|
||||
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
|
||||
(pos, genno, use) = f
|
||||
if use != 'n': continue
|
||||
if use != 'n':
|
||||
continue
|
||||
self.offsets[objid] = (None, long(pos), int(genno))
|
||||
if 1 <= debug:
|
||||
print >>sys.stderr, 'xref objects:', self.offsets
|
||||
|
@ -100,6 +115,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
return
|
||||
|
||||
KEYWORD_TRAILER = KWD('trailer')
|
||||
|
||||
def load_trailer(self, parser):
|
||||
try:
|
||||
(_, kwd) = parser.nexttoken()
|
||||
|
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
|
|||
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
|
||||
|
||||
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||
|
||||
def load(self, parser, debug=0):
|
||||
parser.seek(0)
|
||||
while 1:
|
||||
|
@ -148,7 +165,8 @@ class PDFXRefFallback(PDFXRef):
|
|||
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
|
||||
break
|
||||
m = self.PDFOBJ_CUE.match(line)
|
||||
if not m: continue
|
||||
if not m:
|
||||
continue
|
||||
(objid, genno) = m.groups()
|
||||
objid = int(objid)
|
||||
genno = int(genno)
|
||||
|
@ -292,7 +310,8 @@ class PDFDocument(object):
|
|||
self.xrefs.append(xref)
|
||||
for xref in self.xrefs:
|
||||
trailer = xref.get_trailer()
|
||||
if not trailer: continue
|
||||
if not trailer:
|
||||
continue
|
||||
# If there's an encryption info, remember it.
|
||||
if 'Encrypt' in trailer:
|
||||
#assert not self.encryption
|
||||
|
@ -316,6 +335,7 @@ class PDFDocument(object):
|
|||
# This step is mandatory even if there's no password associated
|
||||
# with the document.
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
def initialize(self, password=''):
|
||||
if not self.encryption:
|
||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||
|
@ -414,6 +434,7 @@ class PDFDocument(object):
|
|||
return (objs, n)
|
||||
|
||||
KEYWORD_OBJ = KWD('obj')
|
||||
|
||||
def _getobj_parse(self, pos, objid):
|
||||
self._parser.seek(pos)
|
||||
(_, objid1) = self._parser.nexttoken() # objid
|
||||
|
@ -465,6 +486,7 @@ class PDFDocument(object):
|
|||
def get_outlines(self):
|
||||
if 'Outlines' not in self.catalog:
|
||||
raise PDFNoOutlines
|
||||
|
||||
def search(entry, level):
|
||||
entry = dict_value(entry)
|
||||
if 'Title' in entry:
|
||||
|
@ -490,10 +512,12 @@ class PDFDocument(object):
|
|||
raise KeyError((cat, key))
|
||||
# may raise KeyError
|
||||
d0 = dict_value(names[cat])
|
||||
|
||||
def lookup(d):
|
||||
if 'Limits' in d:
|
||||
(k1, k2) = list_value(d['Limits'])
|
||||
if key < k1 or k2 < key: return None
|
||||
if key < k1 or k2 < key:
|
||||
return None
|
||||
if 'Names' in d:
|
||||
objs = list_value(d['Names'])
|
||||
names = dict(choplist(2, objs))
|
||||
|
@ -501,7 +525,8 @@ class PDFDocument(object):
|
|||
if 'Kids' in d:
|
||||
for c in list_value(d['Kids']):
|
||||
v = lookup(dict_value(c))
|
||||
if v: return v
|
||||
if v:
|
||||
return v
|
||||
raise KeyError((cat, key))
|
||||
return lookup(d0)
|
||||
|
||||
|
@ -528,7 +553,8 @@ class PDFDocument(object):
|
|||
line = line.strip()
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'find_xref: %r' % line
|
||||
if line == 'startxref': break
|
||||
if line == 'startxref':
|
||||
break
|
||||
if line:
|
||||
prev = line
|
||||
else:
|
||||
|
|
|
@ -40,6 +40,7 @@ def get_widths(seq):
|
|||
#assert get_widths([1,2,3]) == {1:3, 2:3}
|
||||
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
|
||||
|
||||
|
||||
def get_widths2(seq):
|
||||
widths = {}
|
||||
r = []
|
||||
|
@ -112,18 +113,21 @@ class Type1FontHeaderParser(PSStackParser):
|
|||
return
|
||||
|
||||
|
||||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
|
||||
|
||||
|
||||
## CFFFont
|
||||
## (Format specified in Adobe Technical Note: #5176
|
||||
## "The Compact Font Format Specification")
|
||||
##
|
||||
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
|
||||
def getdict(data):
|
||||
d = {}
|
||||
fp = StringIO(data)
|
||||
stack = []
|
||||
while 1:
|
||||
c = fp.read(1)
|
||||
if not c: break
|
||||
if not c:
|
||||
break
|
||||
b0 = ord(c)
|
||||
if b0 <= 21:
|
||||
d[b0] = stack
|
||||
|
@ -150,7 +154,8 @@ def getdict(data):
|
|||
value = -((b0-251) << 8)-b1-108
|
||||
else:
|
||||
b2 = ord(fp.read(1))
|
||||
if 128 <= b1: b1 -= 256
|
||||
if 128 <= b1:
|
||||
b1 -= 256
|
||||
if b0 == 28:
|
||||
value = b1 << 8 | b2
|
||||
else:
|
||||
|
@ -158,6 +163,7 @@ def getdict(data):
|
|||
stack.append(value)
|
||||
return d
|
||||
|
||||
|
||||
class CFFFont(object):
|
||||
|
||||
STANDARD_STRINGS = (
|
||||
|
@ -356,7 +362,8 @@ class CFFFont(object):
|
|||
##
|
||||
class TrueTypeFont(object):
|
||||
|
||||
class CMapNotFound(Exception): pass
|
||||
class CMapNotFound(Exception):
|
||||
pass
|
||||
|
||||
def __init__(self, name, fp):
|
||||
self.name = name
|
||||
|
@ -397,7 +404,8 @@ class TrueTypeFont(object):
|
|||
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
|
||||
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
|
||||
for (i, firstcode, entcount, delta, pos) in hdrs:
|
||||
if not entcount: continue
|
||||
if not entcount:
|
||||
continue
|
||||
first = firstcode + (firstbytes[i] << 8)
|
||||
fp.seek(pos)
|
||||
for c in xrange(entcount):
|
||||
|
@ -433,9 +441,12 @@ class TrueTypeFont(object):
|
|||
|
||||
## Fonts
|
||||
##
|
||||
class PDFFontError(PDFException):
|
||||
pass
|
||||
|
||||
class PDFFontError(PDFException): pass
|
||||
class PDFUnicodeNotDefined(PDFFontError): pass
|
||||
|
||||
class PDFUnicodeNotDefined(PDFFontError):
|
||||
pass
|
||||
|
||||
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
|
||||
LITERAL_TYPE1C = LIT('Type1C')
|
||||
|
@ -474,6 +485,7 @@ class PDFFont(object):
|
|||
|
||||
def get_ascent(self):
|
||||
return self.ascent * self.vscale
|
||||
|
||||
def get_descent(self):
|
||||
return self.descent * self.vscale
|
||||
|
||||
|
@ -482,6 +494,7 @@ class PDFFont(object):
|
|||
if w == 0:
|
||||
w = -self.default_width
|
||||
return w * self.hscale
|
||||
|
||||
def get_height(self):
|
||||
h = self.bbox[3]-self.bbox[1]
|
||||
if h == 0:
|
||||
|
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
|
|||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
|
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
|
|||
def __repr__(self):
|
||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
|
||||
|
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
|
|||
|
||||
def to_unichr(self, cid):
|
||||
try:
|
||||
if not self.unicode_map: raise KeyError(cid)
|
||||
if not self.unicode_map:
|
||||
raise KeyError(cid)
|
||||
return self.unicode_map.get_unichr(cid)
|
||||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
|
@ -705,4 +722,5 @@ def main(argv):
|
|||
fp.close()
|
||||
return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main(sys.argv))
|
||||
|
|
|
@ -30,8 +30,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
|
|||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFResourceError(PDFException): pass
|
||||
class PDFInterpreterError(PDFException): pass
|
||||
class PDFResourceError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFInterpreterError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## Constants
|
||||
|
@ -120,6 +124,7 @@ class PDFGraphicState(object):
|
|||
(self.linewidth, self.linecap, self.linejoin,
|
||||
self.miterlimit, self.dash, self.intent, self.flatness))
|
||||
|
||||
|
||||
## Resource Manager
|
||||
##
|
||||
class PDFResourceManager(object):
|
||||
|
@ -152,7 +157,8 @@ class PDFResourceManager(object):
|
|||
try:
|
||||
return CMapDB.get_cmap(cmapname)
|
||||
except CMapDB.CMapNotFound:
|
||||
if strict: raise
|
||||
if strict:
|
||||
raise
|
||||
return CMap()
|
||||
|
||||
def get_font(self, objid, spec):
|
||||
|
@ -227,12 +233,14 @@ class PDFContentParser(PSStackParser):
|
|||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf): return
|
||||
if self.charpos < len(self.buf):
|
||||
return
|
||||
while 1:
|
||||
self.fillfp()
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
if self.buf: break
|
||||
if self.buf:
|
||||
break
|
||||
self.fp = None
|
||||
self.charpos = 0
|
||||
return
|
||||
|
@ -274,6 +282,7 @@ class PDFContentParser(PSStackParser):
|
|||
KEYWORD_BI = KWD('BI')
|
||||
KEYWORD_ID = KWD('ID')
|
||||
KEYWORD_EI = KWD('EI')
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
if token is self.KEYWORD_BI:
|
||||
# inline image within a content stream
|
||||
|
@ -289,7 +298,8 @@ class PDFContentParser(PSStackParser):
|
|||
self.push((pos, obj))
|
||||
self.push((pos, self.KEYWORD_EI))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
else:
|
||||
self.push((pos, token))
|
||||
return
|
||||
|
@ -316,7 +326,9 @@ class PDFPageInterpreter(object):
|
|||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
self.csmap = PREDEFINED_COLORSPACE.copy()
|
||||
if not resources: return
|
||||
if not resources:
|
||||
return
|
||||
|
||||
def get_colorspace(spec):
|
||||
if isinstance(spec, list):
|
||||
name = literal_name(spec[0])
|
||||
|
@ -371,7 +383,8 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
|
||||
def pop(self, n):
|
||||
if n == 0: return []
|
||||
if n == 0:
|
||||
return []
|
||||
x = self.argstack[-n:]
|
||||
self.argstack = self.argstack[:-n]
|
||||
return x
|
||||
|
@ -388,6 +401,7 @@ class PDFPageInterpreter(object):
|
|||
def do_q(self):
|
||||
self.gstack.append(self.get_current_state())
|
||||
return
|
||||
|
||||
# grestore
|
||||
def do_Q(self):
|
||||
if self.gstack:
|
||||
|
@ -404,30 +418,37 @@ class PDFPageInterpreter(object):
|
|||
def do_w(self, linewidth):
|
||||
self.graphicstate.linewidth = linewidth
|
||||
return
|
||||
|
||||
# setlinecap
|
||||
def do_J(self, linecap):
|
||||
self.graphicstate.linecap = linecap
|
||||
return
|
||||
|
||||
# setlinejoin
|
||||
def do_j(self, linejoin):
|
||||
self.graphicstate.linejoin = linejoin
|
||||
return
|
||||
|
||||
# setmiterlimit
|
||||
def do_M(self, miterlimit):
|
||||
self.graphicstate.miterlimit = miterlimit
|
||||
return
|
||||
|
||||
# setdash
|
||||
def do_d(self, dash, phase):
|
||||
self.graphicstate.dash = (dash, phase)
|
||||
return
|
||||
|
||||
# setintent
|
||||
def do_ri(self, intent):
|
||||
self.graphicstate.intent = intent
|
||||
return
|
||||
|
||||
# setflatness
|
||||
def do_i(self, flatness):
|
||||
self.graphicstate.flatness = flatness
|
||||
return
|
||||
|
||||
# load-gstate
|
||||
def do_gs(self, name):
|
||||
#XXX
|
||||
|
@ -437,26 +458,32 @@ class PDFPageInterpreter(object):
|
|||
def do_m(self, x, y):
|
||||
self.curpath.append(('m', x, y))
|
||||
return
|
||||
|
||||
# lineto
|
||||
def do_l(self, x, y):
|
||||
self.curpath.append(('l', x, y))
|
||||
return
|
||||
|
||||
# curveto
|
||||
def do_c(self, x1, y1, x2, y2, x3, y3):
|
||||
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
|
||||
return
|
||||
|
||||
# curveto (PDF operator v: the current point serves as the first control point)
|
||||
def do_v(self, x2, y2, x3, y3):
|
||||
self.curpath.append(('v', x2, y2, x3, y3))
|
||||
return
|
||||
|
||||
# curveto (PDF operator y: the end point serves as the second control point)
|
||||
def do_y(self, x1, y1, x3, y3):
|
||||
self.curpath.append(('y', x1, y1, x3, y3))
|
||||
return
|
||||
|
||||
# closepath
|
||||
def do_h(self):
|
||||
self.curpath.append(('h',))
|
||||
return
|
||||
|
||||
# rectangle
|
||||
def do_re(self, x, y, w, h):
|
||||
self.curpath.append(('m', x, y))
|
||||
|
@ -471,11 +498,13 @@ class PDFPageInterpreter(object):
|
|||
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# close-and-stroke
|
||||
def do_s(self):
|
||||
self.do_h()
|
||||
self.do_S()
|
||||
return
|
||||
|
||||
# fill
|
||||
def do_f(self):
|
||||
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
|
||||
|
@ -483,68 +512,85 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
# fill (obsolete)
|
||||
do_F = do_f
|
||||
|
||||
# fill-even-odd
|
||||
def do_f_a(self):
|
||||
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# fill-and-stroke
|
||||
def do_B(self):
|
||||
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# fill-and-stroke-even-odd
|
||||
def do_B_a(self):
|
||||
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# close-fill-and-stroke
|
||||
def do_b(self):
|
||||
self.do_h()
|
||||
self.do_B()
|
||||
return
|
||||
|
||||
# close-fill-and-stroke-even-odd
|
||||
def do_b_a(self):
|
||||
self.do_h()
|
||||
self.do_B_a()
|
||||
return
|
||||
|
||||
# close-only
|
||||
def do_n(self):
|
||||
self.curpath = []
|
||||
return
|
||||
|
||||
# clip
|
||||
def do_W(self): return
|
||||
def do_W(self):
|
||||
return
|
||||
|
||||
# clip-even-odd
|
||||
def do_W_a(self): return
|
||||
def do_W_a(self):
|
||||
return
|
||||
|
||||
# setcolorspace-stroking
|
||||
def do_CS(self, name):
|
||||
self.scs = self.csmap[literal_name(name)]
|
||||
return
|
||||
|
||||
# setcolorspace-non-stroking
|
||||
def do_cs(self, name):
|
||||
self.ncs = self.csmap[literal_name(name)]
|
||||
return
|
||||
|
||||
# setgray-stroking
|
||||
def do_G(self, gray):
|
||||
#self.do_CS(LITERAL_DEVICE_GRAY)
|
||||
return
|
||||
|
||||
# setgray-non-stroking
|
||||
def do_g(self, gray):
|
||||
#self.do_cs(LITERAL_DEVICE_GRAY)
|
||||
return
|
||||
|
||||
# setrgb-stroking
|
||||
def do_RG(self, r, g, b):
|
||||
#self.do_CS(LITERAL_DEVICE_RGB)
|
||||
return
|
||||
|
||||
# setrgb-non-stroking
|
||||
def do_rg(self, r, g, b):
|
||||
#self.do_cs(LITERAL_DEVICE_RGB)
|
||||
return
|
||||
|
||||
# setcmyk-stroking
|
||||
def do_K(self, c, m, y, k):
|
||||
#self.do_CS(LITERAL_DEVICE_CMYK)
|
||||
return
|
||||
|
||||
# setcmyk-non-stroking
|
||||
def do_k(self, c, m, y, k):
|
||||
#self.do_cs(LITERAL_DEVICE_CMYK)
|
||||
|
@ -560,6 +606,7 @@ class PDFPageInterpreter(object):
|
|||
n = 1
|
||||
self.pop(n)
|
||||
return
|
||||
|
||||
def do_scn(self):
|
||||
if self.ncs:
|
||||
n = self.ncs.ncomponents
|
||||
|
@ -569,42 +616,53 @@ class PDFPageInterpreter(object):
|
|||
n = 1
|
||||
self.pop(n)
|
||||
return
|
||||
|
||||
def do_SC(self):
|
||||
self.do_SCN()
|
||||
return
|
||||
|
||||
def do_sc(self):
|
||||
self.do_scn()
|
||||
return
|
||||
|
||||
# sharing-name
|
||||
def do_sh(self, name): return
|
||||
def do_sh(self, name):
|
||||
return
|
||||
|
||||
# begin-text
|
||||
def do_BT(self):
|
||||
self.textstate.reset()
|
||||
return
|
||||
|
||||
# end-text
|
||||
def do_ET(self):
|
||||
return
|
||||
|
||||
# begin-compat
|
||||
def do_BX(self): return
|
||||
def do_BX(self):
|
||||
return
|
||||
|
||||
# end-compat
|
||||
def do_EX(self): return
|
||||
def do_EX(self):
|
||||
return
|
||||
|
||||
# marked content operators
|
||||
def do_MP(self, tag):
|
||||
self.device.do_tag(tag)
|
||||
return
|
||||
|
||||
def do_DP(self, tag, props):
|
||||
self.device.do_tag(tag, props)
|
||||
return
|
||||
|
||||
def do_BMC(self, tag):
|
||||
self.device.begin_tag(tag)
|
||||
return
|
||||
|
||||
def do_BDC(self, tag, props):
|
||||
self.device.begin_tag(tag, props)
|
||||
return
|
||||
|
||||
def do_EMC(self):
|
||||
self.device.end_tag()
|
||||
return
|
||||
|
@ -613,18 +671,22 @@ class PDFPageInterpreter(object):
|
|||
def do_Tc(self, space):
|
||||
self.textstate.charspace = space
|
||||
return
|
||||
|
||||
# setwordspace
|
||||
def do_Tw(self, space):
|
||||
self.textstate.wordspace = space
|
||||
return
|
||||
|
||||
# textscale
|
||||
def do_Tz(self, scale):
|
||||
self.textstate.scaling = scale
|
||||
return
|
||||
|
||||
# setleading
|
||||
def do_TL(self, leading):
|
||||
self.textstate.leading = -leading
|
||||
return
|
||||
|
||||
# selectfont
|
||||
def do_Tf(self, fontid, fontsize):
|
||||
try:
|
||||
|
@ -635,10 +697,12 @@ class PDFPageInterpreter(object):
|
|||
self.textstate.font = self.rsrcmgr.get_font(None, {})
|
||||
self.textstate.fontsize = fontsize
|
||||
return
|
||||
|
||||
# setrendering
|
||||
def do_Tr(self, render):
|
||||
self.textstate.render = render
|
||||
return
|
||||
|
||||
# settextrise
|
||||
def do_Ts(self, rise):
|
||||
self.textstate.rise = rise
|
||||
|
@ -651,6 +715,7 @@ class PDFPageInterpreter(object):
|
|||
self.textstate.linematrix = (0, 0)
|
||||
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
|
||||
return
|
||||
|
||||
# text-move
|
||||
def do_TD(self, tx, ty):
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
|
@ -659,11 +724,13 @@ class PDFPageInterpreter(object):
|
|||
self.textstate.linematrix = (0, 0)
|
||||
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
|
||||
return
|
||||
|
||||
# textmatrix
|
||||
def do_Tm(self, a, b, c, d, e, f):
|
||||
self.textstate.matrix = (a, b, c, d, e, f)
|
||||
self.textstate.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
# nextline
|
||||
def do_T_a(self):
|
||||
(a, b, c, d, e, f) = self.textstate.matrix
|
||||
|
@ -680,15 +747,18 @@ class PDFPageInterpreter(object):
|
|||
return
|
||||
self.device.render_string(self.textstate, seq)
|
||||
return
|
||||
|
||||
# show
|
||||
def do_Tj(self, s):
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# quote
|
||||
def do__q(self, s):
|
||||
self.do_T_a()
|
||||
self.do_TJ([s])
|
||||
return
|
||||
|
||||
# doublequote
|
||||
def do__w(self, aw, ac, s):
|
||||
self.do_Tw(aw)
|
||||
|
@ -699,8 +769,10 @@ class PDFPageInterpreter(object):
|
|||
# inline image
|
||||
def do_BI(self): # never called
|
||||
return
|
||||
|
||||
def do_ID(self): # never called
|
||||
return
|
||||
|
||||
def do_EI(self, obj):
|
||||
if 'W' in obj and 'H' in obj:
|
||||
iobjid = str(id(obj))
|
||||
|
|
|
@ -71,6 +71,7 @@ class PDFPage(object):
|
|||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
|
||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||
|
||||
@classmethod
|
||||
def create_pages(klass, document, debug=0):
|
||||
def search(obj, parent):
|
||||
|
@ -110,7 +111,8 @@ class PDFPage(object):
|
|||
pass
|
||||
return
|
||||
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
|
||||
class PDFTextExtractionNotAllowed(PDFEncryptionError):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def get_pages(klass, fp,
|
||||
|
@ -128,7 +130,9 @@ class PDFPage(object):
|
|||
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
|
||||
# Process each page contained in the document.
|
||||
for (pageno, page) in enumerate(klass.create_pages(doc)):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
if pagenos and (pageno not in pagenos):
|
||||
continue
|
||||
yield page
|
||||
if maxpages and maxpages <= pageno+1: break
|
||||
if maxpages and maxpages <= pageno+1:
|
||||
break
|
||||
return
|
||||
|
|
|
@ -15,7 +15,8 @@ from pdftypes import str_value, list_value, dict_value, stream_value
|
|||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFSyntaxError(PDFException): pass
|
||||
class PDFSyntaxError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## PDFParser
|
||||
|
@ -55,6 +56,7 @@ class PDFParser(PSStackParser):
|
|||
KEYWORD_STREAM = KWD('stream')
|
||||
KEYWORD_XREF = KWD('xref')
|
||||
KEYWORD_STARTXREF = KWD('startxref')
|
||||
|
||||
def do_keyword(self, pos, token):
|
||||
"""Handles PDF-related keywords."""
|
||||
|
||||
|
|
|
@ -23,13 +23,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
|
|||
|
||||
## PDF Objects
|
||||
##
|
||||
class PDFObject(PSObject): pass
|
||||
class PDFObject(PSObject):
|
||||
pass
|
||||
|
||||
class PDFException(PSException): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
class PDFObjectNotFound(PDFException): pass
|
||||
class PDFNotImplementedError(PDFException): pass
|
||||
|
||||
class PDFException(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFTypeError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFValueError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFObjectNotFound(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
class PDFNotImplementedError(PDFException):
|
||||
pass
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
|
@ -66,6 +81,7 @@ def resolve1(x, default=None):
|
|||
x = x.resolve(default=default)
|
||||
return x
|
||||
|
||||
|
||||
def resolve_all(x, default=None):
|
||||
"""Recursively resolves the given object and all the internals.
|
||||
|
||||
|
@ -81,6 +97,7 @@ def resolve_all(x, default=None):
|
|||
x[k] = resolve_all(v, default=default)
|
||||
return x
|
||||
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
|
||||
"""Recursively deciphers the given object.
|
||||
"""
|
||||
|
@ -93,6 +110,7 @@ def decipher_all(decipher, objid, genno, x):
|
|||
x[k] = decipher_all(decipher, objid, genno, v)
|
||||
return x
|
||||
|
||||
|
||||
# Type checking
|
||||
def int_value(x):
|
||||
x = resolve1(x)
|
||||
|
@ -102,6 +120,7 @@ def int_value(x):
|
|||
return 0
|
||||
return x
|
||||
|
||||
|
||||
def float_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, float):
|
||||
|
@ -110,6 +129,7 @@ def float_value(x):
|
|||
return 0.0
|
||||
return x
|
||||
|
||||
|
||||
def num_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, int) or isinstance(x, float)):
|
||||
|
@ -118,6 +138,7 @@ def num_value(x):
|
|||
return 0
|
||||
return x
|
||||
|
||||
|
||||
def str_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, str):
|
||||
|
@ -126,6 +147,7 @@ def str_value(x):
|
|||
return ''
|
||||
return x
|
||||
|
||||
|
||||
def list_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||
|
@ -134,6 +156,7 @@ def list_value(x):
|
|||
return []
|
||||
return x
|
||||
|
||||
|
||||
def dict_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, dict):
|
||||
|
@ -142,6 +165,7 @@ def dict_value(x):
|
|||
return {}
|
||||
return x
|
||||
|
||||
|
||||
def stream_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, PDFStream):
|
||||
|
@ -195,12 +219,14 @@ class PDFStream(PDFObject):
|
|||
|
||||
def get_filters(self):
|
||||
filters = self.get_any(('F', 'Filter'))
|
||||
if not filters: return []
|
||||
if isinstance(filters, list): return filters
|
||||
if not filters:
|
||||
return []
|
||||
if isinstance(filters, list):
|
||||
return filters
|
||||
return [filters]
|
||||
|
||||
def decode(self):
|
||||
assert self.data is None and self.rawdata != None
|
||||
assert self.data is None and self.rawdata is not None
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
|
|
|
@ -8,11 +8,24 @@ STRICT = 0
|
|||
|
||||
## PS Exceptions
|
||||
##
|
||||
class PSException(Exception): pass
|
||||
class PSEOF(PSException): pass
|
||||
class PSSyntaxError(PSException): pass
|
||||
class PSTypeError(PSException): pass
|
||||
class PSValueError(PSException): pass
|
||||
class PSException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class PSEOF(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PSSyntaxError(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PSTypeError(PSException):
|
||||
pass
|
||||
|
||||
|
||||
class PSValueError(PSException):
|
||||
pass
|
||||
|
||||
|
||||
## Basic PostScript Types
|
||||
|
@ -114,6 +127,7 @@ def literal_name(x):
|
|||
return str(x)
|
||||
return x.name
|
||||
|
||||
|
||||
def keyword_name(x):
|
||||
if not isinstance(x, PSKeyword):
|
||||
if STRICT:
|
||||
|
@ -137,6 +151,8 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
|||
END_STRING = re.compile(r'[()\134]')
|
||||
OCT_STRING = re.compile(r'[0-7]')
|
||||
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
|
||||
|
||||
|
||||
class PSBaseParser(object):
|
||||
|
||||
"""Most basic PostScript parser that performs only tokenization.
|
||||
|
@ -190,7 +206,8 @@ class PSBaseParser(object):
|
|||
return
|
||||
|
||||
def fillbuf(self):
|
||||
if self.charpos < len(self.buf): return
|
||||
if self.charpos < len(self.buf):
|
||||
return
|
||||
# fetch next chunk.
|
||||
self.bufpos = self.fp.tell()
|
||||
self.buf = self.fp.read(self.BUFSIZ)
|
||||
|
@ -242,7 +259,8 @@ class PSBaseParser(object):
|
|||
pos = max(0, pos-self.BUFSIZ)
|
||||
self.fp.seek(pos)
|
||||
s = self.fp.read(prevpos-pos)
|
||||
if not s: break
|
||||
if not s:
|
||||
break
|
||||
while 1:
|
||||
n = max(s.rfind('\r'), s.rfind('\n'))
|
||||
if n == -1:
|
||||
|
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
|
|||
try:
|
||||
self.push(self.end_type('a'))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
elif token == KEYWORD_DICT_BEGIN:
|
||||
# begin dictionary
|
||||
self.start_type(pos, 'd')
|
||||
|
@ -567,7 +586,8 @@ class PSStackParser(PSBaseParser):
|
|||
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
|
||||
self.push((pos, d))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
elif token == KEYWORD_PROC_BEGIN:
|
||||
# begin proc
|
||||
self.start_type(pos, 'p')
|
||||
|
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
|
|||
try:
|
||||
self.push(self.end_type('p'))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
if STRICT:
|
||||
raise
|
||||
else:
|
||||
if 2 <= self.debug:
|
||||
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||
|
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
|
|||
return obj
|
||||
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
## Simplistic Test cases
|
||||
##
|
||||
import unittest
|
||||
class TestPSBaseParser(unittest.TestCase):
|
||||
|
||||
TESTDATA = r'''%!PS
|
||||
|
@ -645,6 +668,7 @@ func/a/b{(c)do*}def
|
|||
|
||||
def get_tokens(self, s):
|
||||
import StringIO
|
||||
|
||||
class MyParser(PSBaseParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
|
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
|
|||
|
||||
def get_objects(self, s):
|
||||
import StringIO
|
||||
|
||||
class MyParser(PSStackParser):
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
|
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
|
|||
self.assertEqual(objs, self.OBJS)
|
||||
return
|
||||
|
||||
if __name__ == '__main__': unittest.main()
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
|
@ -13,9 +13,17 @@ by Philip J. Erdelsky:
|
|||
import sys
|
||||
import struct
|
||||
|
||||
def KEYLENGTH(keybits): return (keybits)/8
|
||||
def RKLENGTH(keybits): return (keybits)/8+28
|
||||
def NROUNDS(keybits): return (keybits)/32+6
|
||||
|
||||
def KEYLENGTH(keybits):
|
||||
return (keybits)/8
|
||||
|
||||
|
||||
def RKLENGTH(keybits):
|
||||
return (keybits)/8+28
|
||||
|
||||
|
||||
def NROUNDS(keybits):
|
||||
return (keybits)/32+6
|
||||
|
||||
Te0 = [
|
||||
0xc66363a5L, 0xf87c7c84L, 0xee777799L, 0xf67b7b8dL,
|
||||
|
@ -703,6 +711,7 @@ else:
|
|||
def GETU32(x): return struct.unpack('>I', x)[0]
|
||||
def PUTU32(x): return struct.pack('>I', x)
|
||||
|
||||
|
||||
# Expand the cipher key into the encryption key schedule.
|
||||
#
|
||||
# @return the number of rounds for the given cipher key size.
|
||||
|
@ -1051,6 +1060,7 @@ class RijndaelDecryptor(object):
|
|||
assert len(ciphertext) == 16
|
||||
return rijndaelDecrypt(self.rk, self.nrounds, ciphertext)
|
||||
|
||||
|
||||
# encrypt(key, fin, fout, keybits=256)
|
||||
class RijndaelEncryptor(object):
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
import sys
|
||||
|
||||
|
||||
def rldecode(data):
|
||||
"""
|
||||
RunLength decoder (Adobe version) implementation based on PDF Reference
|
||||
|
|
|
@ -52,20 +52,24 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
|||
##
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
|
||||
def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
|
||||
"""Returns the multiplication of two matrices."""
|
||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
||||
a0*c1+c0*d1, b0*c1+d0*d1,
|
||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||
|
||||
|
||||
def translate_matrix((a, b, c, d, e, f), (x, y)):
|
||||
"""Translates a matrix by (x, y)."""
|
||||
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
|
||||
|
||||
|
||||
def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
|
||||
"""Applies a matrix to a point."""
|
||||
return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
|
||||
def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
|
||||
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
|
||||
return (a*p+c*q, b*p+d*q)
|
||||
|
@ -79,17 +83,20 @@ def uniq(objs):
|
|||
"""Eliminates duplicated elements."""
|
||||
done = set()
|
||||
for obj in objs:
|
||||
if obj in done: continue
|
||||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
yield obj
|
||||
return
|
||||
|
||||
|
||||
# csort
|
||||
def csort(objs, key=lambda x: x):
|
||||
"""Order-preserving sorting function."""
|
||||
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
|
||||
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
|
||||
|
||||
|
||||
# fsplit
|
||||
def fsplit(pred, objs):
|
||||
"""Split a list into two classes according to the predicate."""
|
||||
|
@ -102,12 +109,14 @@ def fsplit(pred, objs):
|
|||
f.append(obj)
|
||||
return (t, f)
|
||||
|
||||
|
||||
# drange
|
||||
def drange(v0, v1, d):
|
||||
"""Returns a discrete range."""
|
||||
assert v0 < v1
|
||||
return xrange(int(v0)/d, int(v1+d)/d)
|
||||
|
||||
|
||||
# get_bound
|
||||
def get_bound(pts):
|
||||
"""Compute a minimal rectangle that covers all the points."""
|
||||
|
@ -119,6 +128,7 @@ def get_bound(pts):
|
|||
y1 = max(y1, y)
|
||||
return (x0, y0, x1, y1)
|
||||
|
||||
|
||||
# pick
|
||||
def pick(seq, func, maxobj=None):
|
||||
"""Picks the object obj where func(obj) has the highest value."""
|
||||
|
@ -129,6 +139,7 @@ def pick(seq, func, maxobj=None):
|
|||
(maxscore, maxobj) = (score, obj)
|
||||
return maxobj
|
||||
|
||||
|
||||
# choplist
|
||||
def choplist(n, seq):
|
||||
"""Groups every n elements of the list."""
|
||||
|
@ -140,6 +151,7 @@ def choplist(n, seq):
|
|||
r = []
|
||||
return
|
||||
|
||||
|
||||
# nunpack
|
||||
def nunpack(s, default=0):
|
||||
"""Unpacks 1 to 4 byte integers (big endian)."""
|
||||
|
@ -157,6 +169,7 @@ def nunpack(s, default=0):
|
|||
else:
|
||||
raise TypeError('invalid length: %d' % l)
|
||||
|
||||
|
||||
# decode_text
|
||||
PDFDocEncoding = ''.join(unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
|
@ -192,6 +205,8 @@ PDFDocEncoding = ''.join( unichr(x) for x in (
|
|||
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||
))
|
||||
|
||||
|
||||
def decode_text(s):
|
||||
"""Decodes a PDFDocEncoding string to Unicode."""
|
||||
if s.startswith('\xfe\xff'):
|
||||
|
@ -199,15 +214,18 @@ def decode_text(s):
|
|||
else:
|
||||
return ''.join(PDFDocEncoding[ord(c)] for c in s)
|
||||
|
||||
|
||||
# enc
|
||||
def enc(x, codec='ascii'):
    """Encodes a string for SGML/XML/HTML.

    The markup-significant characters ``&``, ``>``, ``<`` and ``"`` are
    replaced by their named entities, then the result is encoded with
    *codec*. Characters the codec cannot represent are emitted as numeric
    character references (``xmlcharrefreplace``).

    x: the text to escape.
    codec: name of the target encoding (default ``'ascii'``).
    Returns the escaped, encoded string.
    """
    # '&' has to be escaped first; otherwise the ampersands introduced by
    # the following replacements would themselves be escaped again.
    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
    return x.encode(codec, 'xmlcharrefreplace')
|
||||
|
||||
|
||||
def bbox2str((x0, y0, x1, y1)):
|
||||
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
||||
|
||||
|
||||
def matrix2str((a, b, c, d, e, f)):
|
||||
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
|
||||
|
||||
|
@ -282,11 +300,14 @@ class Plane(object):
|
|||
def find(self, (x0, y0, x1, y1)):
|
||||
done = set()
|
||||
for k in self._getrange((x0, y0, x1, y1)):
|
||||
if k not in self._grid: continue
|
||||
if k not in self._grid:
|
||||
continue
|
||||
for obj in self._grid[k]:
|
||||
if obj in done: continue
|
||||
if obj in done:
|
||||
continue
|
||||
done.add(obj)
|
||||
if (obj.x1 <= x0 or x1 <= obj.x0 or
|
||||
obj.y1 <= y0 or y1 <= obj.y0): continue
|
||||
obj.y1 <= y0 or y1 <= obj.y0):
|
||||
continue
|
||||
yield obj
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue