pull/1/head
Yusuke Shinyama 2013-11-07 19:50:41 +09:00
commit 2b56b2eedf
28 changed files with 1484 additions and 1216 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
__version__ = '20131022' __version__ = '20131022'
if __name__ == '__main__': print __version__ if __name__ == '__main__':
print __version__

View File

@ -6,6 +6,7 @@ This code is in the public domain.
""" """
## Arcfour ## Arcfour
## ##
class Arcfour(object): class Arcfour(object):

View File

@ -9,6 +9,7 @@ This code is in the public domain.
import re import re
import struct import struct
# ascii85decode(data) # ascii85decode(data)
def ascii85decode(data): def ascii85decode(data):
""" """
@ -16,13 +17,13 @@ def ascii85decode(data):
letters, using 85 different types of characters (as 256**4 < 85**5). letters, using 85 different types of characters (as 256**4 < 85**5).
When the length of the original bytes is not a multiple of 4, a special When the length of the original bytes is not a multiple of 4, a special
rule is used for round up. rule is used for round up.
The Adobe's ASCII85 implementation is slightly different from The Adobe's ASCII85 implementation is slightly different from
its original in handling the last characters. its original in handling the last characters.
The sample string is taken from: The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85 http://en.wikipedia.org/w/index.php?title=Ascii85
>>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q') >>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
'Man is distinguished' 'Man is distinguished'
>>> ascii85decode('E,9)oF*2M7/c~>') >>> ascii85decode('E,9)oF*2M7/c~>')
@ -35,7 +36,7 @@ def ascii85decode(data):
n += 1 n += 1
b = b*85+(ord(c)-33) b = b*85+(ord(c)-33)
if n == 5: if n == 5:
out += struct.pack('>L',b) out += struct.pack('>L', b)
n = b = 0 n = b = 0
elif c == 'z': elif c == 'z':
assert n == 0 assert n == 0
@ -44,13 +45,15 @@ def ascii85decode(data):
if n: if n:
for _ in range(5-n): for _ in range(5-n):
b = b*85+84 b = b*85+84
out += struct.pack('>L',b)[:n-1] out += struct.pack('>L', b)[:n-1]
break break
return out return out
# asciihexdecode(data) # asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE) hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data): def asciihexdecode(data):
""" """
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
@ -60,7 +63,7 @@ def asciihexdecode(data):
EOD. Any other characters will cause an error. If the filter encounters EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit. will behave as if a 0 followed the last digit.
>>> asciihexdecode('61 62 2e6364 65') >>> asciihexdecode('61 62 2e6364 65')
'ab.cde' 'ab.cde'
>>> asciihexdecode('61 62 2e6364 657>') >>> asciihexdecode('61 62 2e6364 657>')

View File

@ -29,7 +29,7 @@ class BitParser(object):
for i in xrange(len(bits)): for i in xrange(len(bits)):
if 0 < i: if 0 < i:
if p[b] is None: if p[b] is None:
p[b] = [None,None] p[b] = [None, None]
p = p[b] p = p[b]
if bits[i] == '1': if bits[i] == '1':
b = 1 b = 1
@ -41,7 +41,7 @@ class BitParser(object):
def feedbytes(self, data): def feedbytes(self, data):
for c in data: for c in data:
b = ord(c) b = ord(c)
for m in (128,64,32,16,8,4,2,1): for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m) self._parse_bit(b & m)
return return
@ -62,7 +62,7 @@ class BitParser(object):
## ##
class CCITTG4Parser(BitParser): class CCITTG4Parser(BitParser):
MODE = [None,None] MODE = [None, None]
BitParser.add(MODE, 0, '1') BitParser.add(MODE, 0, '1')
BitParser.add(MODE, +1, '011') BitParser.add(MODE, +1, '011')
BitParser.add(MODE, -1, '010') BitParser.add(MODE, -1, '010')
@ -82,7 +82,7 @@ class CCITTG4Parser(BitParser):
BitParser.add(MODE, 'x7', '0000001110') BitParser.add(MODE, 'x7', '0000001110')
BitParser.add(MODE, 'e', '000000000001000000000001') BitParser.add(MODE, 'e', '000000000001000000000001')
WHITE = [None,None] WHITE = [None, None]
BitParser.add(WHITE, 0 , '00110101') BitParser.add(WHITE, 0 , '00110101')
BitParser.add(WHITE, 1 , '000111') BitParser.add(WHITE, 1 , '000111')
BitParser.add(WHITE, 2 , '0111') BitParser.add(WHITE, 2 , '0111')
@ -188,7 +188,7 @@ class CCITTG4Parser(BitParser):
BitParser.add(WHITE, 2496, '000000011110') BitParser.add(WHITE, 2496, '000000011110')
BitParser.add(WHITE, 2560, '000000011111') BitParser.add(WHITE, 2560, '000000011111')
BLACK = [None,None] BLACK = [None, None]
BitParser.add(BLACK, 0 , '0000110111') BitParser.add(BLACK, 0 , '0000110111')
BitParser.add(BLACK, 1 , '010') BitParser.add(BLACK, 1 , '010')
BitParser.add(BLACK, 2 , '11') BitParser.add(BLACK, 2 , '11')
@ -294,25 +294,30 @@ class CCITTG4Parser(BitParser):
BitParser.add(BLACK, 2496, '000000011110') BitParser.add(BLACK, 2496, '000000011110')
BitParser.add(BLACK, 2560, '000000011111') BitParser.add(BLACK, 2560, '000000011111')
UNCOMPRESSED = [None,None] UNCOMPRESSED = [None, None]
BitParser.add(UNCOMPRESSED, '1' , '1') BitParser.add(UNCOMPRESSED, '1', '1')
BitParser.add(UNCOMPRESSED, '01' , '01') BitParser.add(UNCOMPRESSED, '01', '01')
BitParser.add(UNCOMPRESSED, '001' , '001') BitParser.add(UNCOMPRESSED, '001', '001')
BitParser.add(UNCOMPRESSED, '0001' , '0001') BitParser.add(UNCOMPRESSED, '0001', '0001')
BitParser.add(UNCOMPRESSED, '00001' , '00001') BitParser.add(UNCOMPRESSED, '00001', '00001')
BitParser.add(UNCOMPRESSED, '00000' , '000001') BitParser.add(UNCOMPRESSED, '00000', '000001')
BitParser.add(UNCOMPRESSED, 'T00' , '00000011') BitParser.add(UNCOMPRESSED, 'T00', '00000011')
BitParser.add(UNCOMPRESSED, 'T10' , '00000010') BitParser.add(UNCOMPRESSED, 'T10', '00000010')
BitParser.add(UNCOMPRESSED, 'T000' , '000000011') BitParser.add(UNCOMPRESSED, 'T000', '000000011')
BitParser.add(UNCOMPRESSED, 'T100' , '000000010') BitParser.add(UNCOMPRESSED, 'T100', '000000010')
BitParser.add(UNCOMPRESSED, 'T0000' , '0000000011') BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010') BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011') BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010') BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
class EOFB(Exception): pass class EOFB(Exception):
class InvalidData(Exception): pass pass
class ByteSkip(Exception): pass
class InvalidData(Exception):
pass
class ByteSkip(Exception):
pass
def __init__(self, width, bytealign=False): def __init__(self, width, bytealign=False):
BitParser.__init__(self) BitParser.__init__(self)
@ -325,7 +330,7 @@ class CCITTG4Parser(BitParser):
for c in data: for c in data:
b = ord(c) b = ord(c)
try: try:
for m in (128,64,32,16,8,4,2,1): for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m) self._parse_bit(b & m)
except self.ByteSkip: except self.ByteSkip:
self._accept = self._parse_mode self._accept = self._parse_mode
@ -359,7 +364,8 @@ class CCITTG4Parser(BitParser):
raise self.InvalidData(mode) raise self.InvalidData(mode)
def _parse_horiz1(self, n): def _parse_horiz1(self, n):
if n is None: raise self.InvalidData if n is None:
raise self.InvalidData
self._n1 += n self._n1 += n
if n < 64: if n < 64:
self._n2 = 0 self._n2 = 0
@ -371,7 +377,8 @@ class CCITTG4Parser(BitParser):
return self.BLACK return self.BLACK
def _parse_horiz2(self, n): def _parse_horiz2(self, n):
if n is None: raise self.InvalidData if n is None:
raise self.InvalidData
self._n2 += n self._n2 += n
if n < 64: if n < 64:
self._color = 1-self._color self._color = 1-self._color
@ -385,9 +392,10 @@ class CCITTG4Parser(BitParser):
return self.BLACK return self.BLACK
def _parse_uncompressed(self, bits): def _parse_uncompressed(self, bits):
if not bits: raise self.InvalidData if not bits:
raise self.InvalidData
if bits.startswith('T'): if bits.startswith('T'):
self._accept = self._parse_mode self._accept = self._parse_mode
self._color = int(bits[1]) self._color = int(bits[1])
self._do_uncompressed(bits[2:]) self._do_uncompressed(bits[2:])
return self.MODE return self.MODE
@ -396,17 +404,17 @@ class CCITTG4Parser(BitParser):
return self.UNCOMPRESSED return self.UNCOMPRESSED
def _get_bits(self): def _get_bits(self):
return ''.join( str(b) for b in self._curline[:self._curpos] ) return ''.join(str(b) for b in self._curline[:self._curpos])
def _get_refline(self, i): def _get_refline(self, i):
if i < 0: if i < 0:
return '[]'+''.join( str(b) for b in self._refline ) return '[]'+''.join(str(b) for b in self._refline)
elif len(self._refline) <= i: elif len(self._refline) <= i:
return ''.join( str(b) for b in self._refline )+'[]' return ''.join(str(b) for b in self._refline)+'[]'
else: else:
return (''.join( str(b) for b in self._refline[:i] )+ return (''.join(str(b) for b in self._refline[:i]) +
'['+str(self._refline[i])+']'+ '['+str(self._refline[i])+']' +
''.join( str(b) for b in self._refline[i+1:] )) ''.join(str(b) for b in self._refline[i+1:]))
def reset(self): def reset(self):
self._y = 0 self._y = 0
@ -417,16 +425,16 @@ class CCITTG4Parser(BitParser):
return return
def output_line(self, y, bits): def output_line(self, y, bits):
print y, ''.join( str(b) for b in bits ) print y, ''.join(str(b) for b in bits)
return return
def _reset_line(self): def _reset_line(self):
self._refline = self._curline self._refline = self._curline
self._curline = array.array('b', [1]*self.width) self._curline = array.array('b', [1]*self.width)
self._curpos = -1 self._curpos = -1
self._color = 1 self._color = 1
return return
def _flush_line(self): def _flush_line(self):
if self.width <= self._curpos: if self.width <= self._curpos:
self.output_line(self._y, self._curline) self.output_line(self._y, self._curline)
@ -442,12 +450,13 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos+1 x1 = self._curpos+1
while 1: while 1:
if x1 == 0: if x1 == 0:
if (self._color == 1 and if (self._color == 1 and self._refline[x1] != self._color):
self._refline[x1] != self._color): break break
elif x1 == len(self._refline): elif x1 == len(self._refline):
break break
elif (self._refline[x1-1] == self._color and elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break self._refline[x1] != self._color):
break
x1 += 1 x1 += 1
x1 += dx x1 += dx
x0 = max(0, self._curpos) x0 = max(0, self._curpos)
@ -461,50 +470,54 @@ class CCITTG4Parser(BitParser):
self._curpos = x1 self._curpos = x1
self._color = 1-self._color self._color = 1-self._color
return return
def _do_pass(self): def _do_pass(self):
#print '* pass: curpos=%r, color=%r' % (self._curpos, self._color) #print '* pass: curpos=%r, color=%r' % (self._curpos, self._color)
#print ' refline:', self._get_refline(self._curpos+1) #print ' refline:', self._get_refline(self._curpos+1)
x1 = self._curpos+1 x1 = self._curpos+1
while 1: while 1:
if x1 == 0: if x1 == 0:
if (self._color == 1 and if (self._color == 1 and self._refline[x1] != self._color):
self._refline[x1] != self._color): break break
elif x1 == len(self._refline): elif x1 == len(self._refline):
break break
elif (self._refline[x1-1] == self._color and elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break self._refline[x1] != self._color):
break
x1 += 1 x1 += 1
while 1: while 1:
if x1 == 0: if x1 == 0:
if (self._color == 0 and if (self._color == 0 and self._refline[x1] == self._color):
self._refline[x1] == self._color): break break
elif x1 == len(self._refline): elif x1 == len(self._refline):
break break
elif (self._refline[x1-1] != self._color and elif (self._refline[x1-1] != self._color and
self._refline[x1] == self._color): break self._refline[x1] == self._color):
break
x1 += 1 x1 += 1
for x in xrange(self._curpos, x1): for x in xrange(self._curpos, x1):
self._curline[x] = self._color self._curline[x] = self._color
self._curpos = x1 self._curpos = x1
return return
def _do_horizontal(self, n1, n2): def _do_horizontal(self, n1, n2):
#print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color) #print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color)
if self._curpos < 0: if self._curpos < 0:
self._curpos = 0 self._curpos = 0
x = self._curpos x = self._curpos
for _ in xrange(n1): for _ in xrange(n1):
if len(self._curline) <= x: break if len(self._curline) <= x:
break
self._curline[x] = self._color self._curline[x] = self._color
x += 1 x += 1
for _ in xrange(n2): for _ in xrange(n2):
if len(self._curline) <= x: break if len(self._curline) <= x:
break
self._curline[x] = 1-self._color self._curline[x] = 1-self._color
x += 1 x += 1
self._curpos = x self._curpos = x
return return
def _do_uncompressed(self, bits): def _do_uncompressed(self, bits):
#print '* uncompressed(%r): curpos=%r' % (bits, self._curpos) #print '* uncompressed(%r): curpos=%r' % (bits, self._curpos)
for c in bits: for c in bits:
@ -513,15 +526,16 @@ class CCITTG4Parser(BitParser):
self._flush_line() self._flush_line()
return return
import unittest
## Test cases ## Test cases
## ##
import unittest
class TestCCITTG4Parser(unittest.TestCase): class TestCCITTG4Parser(unittest.TestCase):
def get_parser(self, bits): def get_parser(self, bits):
parser = CCITTG4Parser(len(bits)) parser = CCITTG4Parser(len(bits))
parser._curline = [ int(c) for c in bits ] parser._curline = [int(c) for c in bits]
parser._reset_line() parser._reset_line()
return parser return parser
@ -656,7 +670,7 @@ class TestCCITTG4Parser(unittest.TestCase):
parser._do_vertical(-1) parser._do_vertical(-1)
parser._do_vertical(-1) parser._do_vertical(-1)
parser._do_vertical(1) parser._do_vertical(1)
parser._do_horizontal(1,1) parser._do_horizontal(1, 1)
self.assertEqual(parser._get_bits(), '011101') self.assertEqual(parser._get_bits(), '011101')
return return
@ -673,23 +687,23 @@ class TestCCITTG4Parser(unittest.TestCase):
## CCITTFaxDecoder ## CCITTFaxDecoder
## ##
class CCITTFaxDecoder(CCITTG4Parser): class CCITTFaxDecoder(CCITTG4Parser):
def __init__(self, width, bytealign=False, reversed=False): def __init__(self, width, bytealign=False, reversed=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign) CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.reversed = reversed self.reversed = reversed
self._buf = '' self._buf = ''
return return
def close(self): def close(self):
return self._buf return self._buf
def output_line(self, y, bits): def output_line(self, y, bits):
bytes = array.array('B', [0]*((len(bits)+7)/8)) bytes = array.array('B', [0]*((len(bits)+7)/8))
if self.reversed: if self.reversed:
bits = [ 1-b for b in bits ] bits = [1-b for b in bits]
for (i,b) in enumerate(bits): for (i, b) in enumerate(bits):
if b: if b:
bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8] bytes[i/8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
self._buf += bytes.tostring() self._buf += bytes.tostring()
return return
@ -705,35 +719,39 @@ def ccittfaxdecode(data, params):
raise ValueError(K) raise ValueError(K)
parser.feedbytes(data) parser.feedbytes(data)
return parser.close() return parser.close()
# test # test
def main(argv): def main(argv):
import pygame import pygame
if not argv[1:]: if not argv[1:]:
return unittest.main() return unittest.main()
class Parser(CCITTG4Parser): class Parser(CCITTG4Parser):
def __init__(self, width, bytealign=False): def __init__(self, width, bytealign=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign) CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width,1000)) self.img = pygame.Surface((self.width, 1000))
return return
def output_line(self, y, bits): def output_line(self, y, bits):
for (x,b) in enumerate(bits): for (x, b) in enumerate(bits):
if b: if b:
self.img.set_at((x,y), (255,255,255)) self.img.set_at((x, y), (255, 255, 255))
else: else:
self.img.set_at((x,y), (0,0,0)) self.img.set_at((x, y), (0, 0, 0))
return return
def close(self): def close(self):
pygame.image.save(self.img, 'out.bmp') pygame.image.save(self.img, 'out.bmp')
return return
for path in argv[1:]: for path in argv[1:]:
fp = file(path,'rb') fp = file(path, 'rb')
(_,_,k,w,h,_) = path.split('.') (_, _, k, w, h, _) = path.split('.')
parser = Parser(int(w)) parser = Parser(int(w))
parser.feedbytes(fp.read()) parser.feedbytes(fp.read())
parser.close() parser.close()
fp.close() fp.close()
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -25,7 +25,8 @@ from encodingdb import name2unicode
from utils import choplist, nunpack from utils import choplist, nunpack
class CMapError(Exception): pass class CMapError(Exception):
pass
## CMap ## CMap
@ -43,8 +44,9 @@ class CMap(object):
def use_cmap(self, cmap): def use_cmap(self, cmap):
assert isinstance(cmap, CMap) assert isinstance(cmap, CMap)
def copy(dst, src): def copy(dst, src):
for (k,v) in src.iteritems(): for (k, v) in src.iteritems():
if isinstance(v, dict): if isinstance(v, dict):
d = {} d = {}
dst[k] = d dst[k] = d
@ -73,14 +75,14 @@ class CMap(object):
if code2cid is None: if code2cid is None:
code2cid = self.code2cid code2cid = self.code2cid
code = () code = ()
for (k,v) in sorted(code2cid.iteritems()): for (k, v) in sorted(code2cid.iteritems()):
c = code+(k,) c = code+(k,)
if isinstance(v, int): if isinstance(v, int):
out.write('code %r = cid %d\n' % (c,v)) out.write('code %r = cid %d\n' % (c, v))
else: else:
self.dump(out=out, code2cid=v, code=c) self.dump(out=out, code2cid=v, code=c)
return return
## IdentityCMap ## IdentityCMap
## ##
@ -99,8 +101,7 @@ class IdentityCMap(object):
return struct.unpack('>%dH' % n, code) return struct.unpack('>%dH' % n, code)
else: else:
return () return ()
## UnicodeMap ## UnicodeMap
## ##
@ -118,8 +119,8 @@ class UnicodeMap(object):
return self.cid2unichr[cid] return self.cid2unichr[cid]
def dump(self, out=sys.stdout): def dump(self, out=sys.stdout):
for (k,v) in sorted(self.cid2unichr.iteritems()): for (k, v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k,v)) out.write('cid %d = unicode %r\n' % (k, v))
return return
@ -152,7 +153,7 @@ class FileCMap(CMap):
else: else:
t = {} t = {}
d[c] = t d[c] = t
d =t d = t
c = ord(code[-1]) c = ord(code[-1])
d[c] = cid d[c] = cid
return return
@ -161,7 +162,7 @@ class FileCMap(CMap):
## FileUnicodeMap ## FileUnicodeMap
## ##
class FileUnicodeMap(UnicodeMap): class FileUnicodeMap(UnicodeMap):
def __init__(self): def __init__(self):
UnicodeMap.__init__(self) UnicodeMap.__init__(self)
self.attrs = {} self.attrs = {}
@ -204,12 +205,12 @@ class PyCMap(CMap):
def is_vertical(self): def is_vertical(self):
return self._is_vertical return self._is_vertical
## PyUnicodeMap ## PyUnicodeMap
## ##
class PyUnicodeMap(UnicodeMap): class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical): def __init__(self, name, module, vertical):
if vertical: if vertical:
cid2unichr = module.CID2UNICHR_V cid2unichr = module.CID2UNICHR_V
@ -230,18 +231,17 @@ class CMapDB(object):
debug = 0 debug = 0
_cmap_cache = {} _cmap_cache = {}
_umap_cache = {} _umap_cache = {}
class CMapNotFound(CMapError): pass class CMapNotFound(CMapError):
pass
@classmethod @classmethod
def _load_data(klass, name): def _load_data(klass, name):
filename = '%s.pickle.gz' % name filename = '%s.pickle.gz' % name
if klass.debug: if klass.debug:
print >>sys.stderr, 'loading:', name print >>sys.stderr, 'loading:', name
cmap_paths = ( cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'), os.path.join(os.path.dirname(__file__), 'cmap'),)
os.path.join(os.path.dirname(__file__), 'cmap'),
)
for directory in cmap_paths: for directory in cmap_paths:
path = os.path.join(directory, filename) path = os.path.join(directory, filename)
if os.path.exists(path): if os.path.exists(path):
@ -305,11 +305,12 @@ class CMapParser(PSStackParser):
elif name == 'endcmap': elif name == 'endcmap':
self._in_cmap = False self._in_cmap = False
return return
if not self._in_cmap: return if not self._in_cmap:
return
# #
if name == 'def': if name == 'def':
try: try:
((_,k),(_,v)) = self.pop(2) ((_, k), (_, v)) = self.pop(2)
self.cmap.set_attr(literal_name(k), v) self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError: except PSSyntaxError:
pass pass
@ -317,7 +318,7 @@ class CMapParser(PSStackParser):
if name == 'usecmap': if name == 'usecmap':
try: try:
((_,cmapname),) = self.pop(1) ((_, cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname))) self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError: except PSSyntaxError:
pass pass
@ -336,13 +337,15 @@ class CMapParser(PSStackParser):
self.popall() self.popall()
return return
if name == 'endcidrange': if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ] objs = [obj for (__, obj) in self.popall()]
for (s,e,cid) in choplist(3, objs): for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4] sprefix = s[:-4]
eprefix = e[:-4] eprefix = e[:-4]
if sprefix != eprefix: continue if sprefix != eprefix:
continue
svar = s[-4:] svar = s[-4:]
evar = e[-4:] evar = e[-4:]
s1 = nunpack(svar) s1 = nunpack(svar)
@ -350,7 +353,7 @@ class CMapParser(PSStackParser):
vlen = len(svar) vlen = len(svar)
#assert s1 <= e1 #assert s1 <= e1
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = sprefix+struct.pack('>L',s1+i)[-vlen:] x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i) self.cmap.add_code2cid(x, cid+i)
return return
@ -358,8 +361,8 @@ class CMapParser(PSStackParser):
self.popall() self.popall()
return return
if name == 'endcidchar': if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ] objs = [obj for (__, obj) in self.popall()]
for (cid,code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str): if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid)) self.cmap.add_code2cid(code, nunpack(cid))
return return
@ -368,10 +371,11 @@ class CMapParser(PSStackParser):
self.popall() self.popall()
return return
if name == 'endbfrange': if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ] objs = [obj for (__, obj) in self.popall()]
for (s,e,code) in choplist(3, objs): for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue len(s) != len(e)):
continue
s1 = nunpack(s) s1 = nunpack(s)
e1 = nunpack(e) e1 = nunpack(e)
#assert s1 <= e1 #assert s1 <= e1
@ -384,7 +388,7 @@ class CMapParser(PSStackParser):
prefix = code[:-4] prefix = code[:-4]
vlen = len(var) vlen = len(var)
for i in xrange(e1-s1+1): for i in xrange(e1-s1+1):
x = prefix+struct.pack('>L',base+i)[-vlen:] x = prefix+struct.pack('>L', base+i)[-vlen:]
self.cmap.add_cid2unichr(s1+i, x) self.cmap.add_cid2unichr(s1+i, x)
return return
@ -392,8 +396,8 @@ class CMapParser(PSStackParser):
self.popall() self.popall()
return return
if name == 'endbfchar': if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ] objs = [obj for (__, obj) in self.popall()]
for (cid,code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str): if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unichr(nunpack(cid), code) self.cmap.add_cid2unichr(nunpack(cid), code)
return return
@ -408,6 +412,7 @@ class CMapParser(PSStackParser):
self.push((pos, token)) self.push((pos, token))
return return
# test # test
def main(argv): def main(argv):
args = argv[1:] args = argv[1:]
@ -420,4 +425,5 @@ def main(argv):
cmap.dump() cmap.dump()
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -21,9 +21,9 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
(x0,y0,x1,y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
(x0,y0) = apply_matrix_pt(ctm, (x0,y0)) (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
(x1,y1) = apply_matrix_pt(ctm, (x1,y1)) (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0-x1), abs(y0-y1)) mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
self.cur_item = LTPage(self.pageno, mediabox) self.cur_item = LTPage(self.pageno, mediabox)
return return
@ -61,26 +61,26 @@ class PDFLayoutAnalyzer(PDFTextDevice):
shape = ''.join(x[0] for x in path) shape = ''.join(x[0] for x in path)
if shape == 'ml': if shape == 'ml':
# horizontal/vertical line # horizontal/vertical line
(_,x0,y0) = path[0] (_, x0, y0) = path[0]
(_,x1,y1) = path[1] (_, x1, y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
if x0 == x1 or y0 == y1: if x0 == x1 or y0 == y1:
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1))) self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
return return
if shape == 'mlllh': if shape == 'mlllh':
# rectangle # rectangle
(_,x0,y0) = path[0] (_, x0, y0) = path[0]
(_,x1,y1) = path[1] (_, x1, y1) = path[1]
(_,x2,y2) = path[2] (_, x2, y2) = path[2]
(_,x3,y3) = path[3] (_, x3, y3) = path[3]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2)) (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3)) (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
return return
# other shapes # other shapes
pts = [] pts = []
@ -119,7 +119,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams) PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.result = None self.result = None
return return
def receive_layout(self, ltpage): def receive_layout(self, ltpage):
self.result = ltpage self.result = ltpage
return return
@ -137,7 +137,7 @@ class PDFConverter(PDFLayoutAnalyzer):
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
## TextConverter ## TextConverter
## ##
@ -176,10 +176,11 @@ class TextConverter(PDFConverter):
# is text. This stops all the image and drawing ouput from being # is text. This stops all the image and drawing ouput from being
# recorded and taking up RAM. # recorded and taking up RAM.
def render_image(self, name, stream): def render_image(self, name, stream):
if self.imagewriter is None: return if self.imagewriter is None:
return
PDFConverter.render_image(self, name, stream) PDFConverter.render_image(self, name, stream)
return return
def paint_path(self, gstate, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
return return
@ -196,18 +197,18 @@ class HTMLConverter(PDFConverter):
'textgroup': 'red', 'textgroup': 'red',
'curve': 'black', 'curve': 'black',
'page': 'gray', 'page': 'gray',
} }
TEXT_COLORS = { TEXT_COLORS = {
'textbox': 'blue', 'textbox': 'blue',
'char': 'black', 'char': 'black',
} }
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True, scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
pagemargin=50, imagewriter=None, pagemargin=50, imagewriter=None,
rect_colors={'curve':'black', 'page':'gray'}, rect_colors={'curve': 'black', 'page': 'gray'},
text_colors={'char':'black'}): text_colors={'char': 'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.scale = scale self.scale = scale
self.fontscale = fontscale self.fontscale = fontscale
@ -238,7 +239,7 @@ class HTMLConverter(PDFConverter):
def write_footer(self): def write_footer(self):
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno))) ', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
self.write('</body></html>\n') self.write('</body></html>\n')
return return
@ -295,7 +296,7 @@ class HTMLConverter(PDFConverter):
self._font = self._fontstack.pop() self._font = self._fontstack.pop()
self.write('</div>') self.write('</div>')
return return
def put_text(self, text, fontname, fontsize): def put_text(self, text, fontname, fontsize):
font = (fontname, fontsize) font = (fontname, fontsize)
if font != self._font: if font != self._font:
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
for child in item: for child in item:
show_group(child) show_group(child)
return return
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self._yoffset += item.y1 self._yoffset += item.y1
@ -399,7 +401,7 @@ class XMLConverter(PDFConverter):
def write_footer(self): def write_footer(self):
self.outfp.write('</pages>\n') self.outfp.write('</pages>\n')
return return
def write_text(self, text): def write_text(self, text):
self.outfp.write(enc(text, self.codec)) self.outfp.write(enc(text, self.codec))
return return
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
show_group(child) show_group(child)
self.outfp.write('</textgroup>\n') self.outfp.write('</textgroup>\n')
return return
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' % self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %

View File

@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
from latin_enc import ENCODING from latin_enc import ENCODING
STRIP_NAME = re.compile(r'[0-9]+')
## name2unicode ## name2unicode
## ##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name): def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers.""" """Converts Adobe glyph names to Unicode numbers."""
if name in glyphname2unicode: if name in glyphname2unicode:
return glyphname2unicode[name] return glyphname2unicode[name]
m = STRIP_NAME.search(name) m = STRIP_NAME.search(name)
if not m: raise KeyError(name) if not m:
raise KeyError(name)
return unichr(int(m.group(0))) return unichr(int(m.group(0)))
@ -26,19 +29,23 @@ class EncodingDB(object):
mac2unicode = {} mac2unicode = {}
win2unicode = {} win2unicode = {}
pdf2unicode = {} pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING: for (name, std, mac, win, pdf) in ENCODING:
c = name2unicode(name) c = name2unicode(name)
if std: std2unicode[std] = c if std:
if mac: mac2unicode[mac] = c std2unicode[std] = c
if win: win2unicode[win] = c if mac:
if pdf: pdf2unicode[pdf] = c mac2unicode[mac] = c
if win:
win2unicode[win] = c
if pdf:
pdf2unicode[pdf] = c
encodings = { encodings = {
'StandardEncoding': std2unicode, 'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode, 'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode, 'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode, 'PDFDocEncoding': pdf2unicode,
} }
@classmethod @classmethod
def get_encoding(klass, name, diff=None): def get_encoding(klass, name, diff=None):

View File

@ -8,7 +8,7 @@ written with a proportional font.
The following data were extracted from the AFM files: The following data were extracted from the AFM files:
http://www.ctan.org/tex-archive/fonts/adobe/afm/ http://www.ctan.org/tex-archive/fonts/adobe/afm/
""" """
### BEGIN Verbatim copy of the license part ### BEGIN Verbatim copy of the license part

View File

@ -5,9 +5,11 @@ import os, os.path
from pdftypes import LITERALS_DCT_DECODE from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
def align32(x): def align32(x):
return ((x+3)/4)*4 return ((x+3)/4)*4
## BMPWriter ## BMPWriter
## ##
class BMPWriter(object): class BMPWriter(object):
@ -36,12 +38,12 @@ class BMPWriter(object):
self.fp.write(info) self.fp.write(info)
if ncols == 2: if ncols == 2:
# B&W color table # B&W color table
for i in (0,255): for i in (0, 255):
self.fp.write(struct.pack('BBBx', i,i,i)) self.fp.write(struct.pack('BBBx', i, i, i))
elif ncols == 256: elif ncols == 256:
# grayscale color table # grayscale color table
for i in xrange(256): for i in xrange(256):
self.fp.write(struct.pack('BBBx', i,i,i)) self.fp.write(struct.pack('BBBx', i, i, i))
self.pos0 = self.fp.tell() self.pos0 = self.fp.tell()
self.pos1 = self.pos0 + self.datasize self.pos1 = self.pos0 + self.datasize
return return
@ -68,7 +70,7 @@ class ImageWriter(object):
(width, height) = image.srcsize (width, height) = image.srcsize
if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE: if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
ext = '.jpg' ext = '.jpg'
elif (image.bits == 1 or elif (image.bits == 1 or
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)): image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
ext = '.%dx%d.bmp' % (width, height) ext = '.%dx%d.bmp' % (width, height)
else: else:
@ -82,7 +84,7 @@ class ImageWriter(object):
from PIL import Image from PIL import Image
from PIL import ImageChops from PIL import ImageChops
ifp = cStringIO.StringIO(raw_data) ifp = cStringIO.StringIO(raw_data)
i = Image.open(ifp) i = Image.open(ifp)
i = ImageChops.invert(i) i = ImageChops.invert(i)
i = i.convert('RGB') i = i.convert('RGB')
i.save(fp, 'JPEG') i.save(fp, 'JPEG')

View File

@ -81,7 +81,7 @@ class LTComponent(LTItem):
return ('<%s %s>' % return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox))) (self.__class__.__name__, bbox2str(self.bbox)))
def set_bbox(self, (x0,y0,x1,y1)): def set_bbox(self, (x0, y0, x1, y1)):
self.x0 = x0 self.x0 = x0
self.y0 = y0 self.y0 = y0
self.x1 = x1 self.x1 = x1
@ -93,7 +93,7 @@ class LTComponent(LTItem):
def is_empty(self): def is_empty(self):
return self.width <= 0 or self.height <= 0 return self.width <= 0 or self.height <= 0
def is_hoverlap(self, obj): def is_hoverlap(self, obj):
assert isinstance(obj, LTComponent) assert isinstance(obj, LTComponent)
return obj.x0 <= self.x1 and self.x0 <= obj.x1 return obj.x0 <= self.x1 and self.x0 <= obj.x1
@ -142,7 +142,7 @@ class LTCurve(LTComponent):
return return
def get_pts(self): def get_pts(self):
return ','.join( '%.3f,%.3f' % p for p in self.pts ) return ','.join('%.3f,%.3f' % p for p in self.pts)
## LTLine ## LTLine
@ -158,8 +158,8 @@ class LTLine(LTCurve):
## ##
class LTRect(LTCurve): class LTRect(LTCurve):
def __init__(self, linewidth, (x0,y0,x1,y1)): def __init__(self, linewidth, (x0, y0, x1, y1)):
LTCurve.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)]) LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
return return
@ -212,7 +212,7 @@ class LTChar(LTComponent, LTText):
if font.is_vertical(): if font.is_vertical():
# vertical # vertical
width = font.get_width() * fontsize width = font.get_width() * fontsize
(vx,vy) = textdisp (vx, vy) = textdisp
if vx is None: if vx is None:
vx = width/2 vx = width/2
else: else:
@ -229,15 +229,15 @@ class LTChar(LTComponent, LTText):
ty = descent + rise ty = descent + rise
bll = (0, ty) bll = (0, ty)
bur = (self.adv, ty+height) bur = (self.adv, ty+height)
(a,b,c,d,e,f) = self.matrix (a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0) self.upright = (0 < a*d*scaling and b*c <= 0)
(x0,y0) = apply_matrix_pt(self.matrix, bll) (x0, y0) = apply_matrix_pt(self.matrix, bll)
(x1,y1) = apply_matrix_pt(self.matrix, bur) (x1, y1) = apply_matrix_pt(self.matrix, bur)
if x1 < x0: if x1 < x0:
(x0,x1) = (x1,x0) (x0, x1) = (x1, x0)
if y1 < y0: if y1 < y0:
(y0,y1) = (y1,y0) (y0, y1) = (y1, y0)
LTComponent.__init__(self, (x0,y0,x1,y1)) LTComponent.__init__(self, (x0, y0, x1, y1))
if font.is_vertical(): if font.is_vertical():
self.size = self.width self.size = self.width
else: else:
@ -246,7 +246,7 @@ class LTChar(LTComponent, LTText):
def __repr__(self): def __repr__(self):
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' % return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
(self.__class__.__name__, bbox2str(self.bbox), (self.__class__.__name__, bbox2str(self.bbox),
matrix2str(self.matrix), self.fontname, self.adv, matrix2str(self.matrix), self.fontname, self.adv,
self.get_text())) self.get_text()))
@ -257,7 +257,7 @@ class LTChar(LTComponent, LTText):
"""Returns True if two characters can coexist in the same line.""" """Returns True if two characters can coexist in the same line."""
return True return True
## LTContainer ## LTContainer
## ##
class LTContainer(LTComponent): class LTContainer(LTComponent):
@ -286,14 +286,14 @@ class LTContainer(LTComponent):
for obj in self._objs: for obj in self._objs:
obj.analyze(laparams) obj.analyze(laparams)
return return
## LTExpandableContainer ## LTExpandableContainer
## ##
class LTExpandableContainer(LTContainer): class LTExpandableContainer(LTContainer):
def __init__(self): def __init__(self):
LTContainer.__init__(self, (+INF,+INF,-INF,-INF)) LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
return return
def add(self, obj): def add(self, obj):
@ -313,8 +313,8 @@ class LTTextContainer(LTExpandableContainer, LTText):
return return
def get_text(self): def get_text(self):
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) ) return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
## LTTextLine ## LTTextLine
## ##
@ -338,6 +338,7 @@ class LTTextLine(LTTextContainer):
def find_neighbors(self, plane, ratio): def find_neighbors(self, plane, ratio):
raise NotImplementedError raise NotImplementedError
class LTTextLineHorizontal(LTTextLine): class LTTextLineHorizontal(LTTextLine):
def __init__(self, word_margin): def __init__(self, word_margin):
@ -357,12 +358,13 @@ class LTTextLineHorizontal(LTTextLine):
def find_neighbors(self, plane, ratio): def find_neighbors(self, plane, ratio):
d = ratio*self.height d = ratio*self.height
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d)) objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
return [ obj for obj in objs return [obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and if (isinstance(obj, LTTextLineHorizontal) and
abs(obj.height-self.height) < d and abs(obj.height-self.height) < d and
(abs(obj.x0-self.x0) < d or (abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d)) ] abs(obj.x1-self.x1) < d))]
class LTTextLineVertical(LTTextLine): class LTTextLineVertical(LTTextLine):
def __init__(self, word_margin): def __init__(self, word_margin):
@ -378,16 +380,16 @@ class LTTextLineVertical(LTTextLine):
self._y0 = obj.y0 self._y0 = obj.y0
LTTextLine.add(self, obj) LTTextLine.add(self, obj)
return return
def find_neighbors(self, plane, ratio): def find_neighbors(self, plane, ratio):
d = ratio*self.width d = ratio*self.width
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1)) objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
return [ obj for obj in objs return [obj for obj in objs
if (isinstance(obj, LTTextLineVertical) and if (isinstance(obj, LTTextLineVertical) and
abs(obj.width-self.width) < d and abs(obj.width-self.width) < d and
(abs(obj.y0-self.y0) < d or (abs(obj.y0-self.y0) < d or
abs(obj.y1-self.y1) < d)) ] abs(obj.y1-self.y1) < d))]
## LTTextBox ## LTTextBox
## ##
@ -406,8 +408,9 @@ class LTTextBox(LTTextContainer):
(self.__class__.__name__, (self.__class__.__name__,
self.index, bbox2str(self.bbox), self.get_text())) self.index, bbox2str(self.bbox), self.get_text()))
class LTTextBoxHorizontal(LTTextBox): class LTTextBoxHorizontal(LTTextBox):
def analyze(self, laparams): def analyze(self, laparams):
LTTextBox.analyze(self, laparams) LTTextBox.analyze(self, laparams)
self._objs = csort(self._objs, key=lambda obj: -obj.y1) self._objs = csort(self._objs, key=lambda obj: -obj.y1)
@ -416,6 +419,7 @@ class LTTextBoxHorizontal(LTTextBox):
def get_writing_mode(self): def get_writing_mode(self):
return 'lr-tb' return 'lr-tb'
class LTTextBoxVertical(LTTextBox): class LTTextBoxVertical(LTTextBox):
def analyze(self, laparams): def analyze(self, laparams):
@ -436,8 +440,9 @@ class LTTextGroup(LTTextContainer):
self.extend(objs) self.extend(objs)
return return
class LTTextGroupLRTB(LTTextGroup): class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams): def analyze(self, laparams):
LTTextGroup.analyze(self, laparams) LTTextGroup.analyze(self, laparams)
# reorder the objects from top-left to bottom-right. # reorder the objects from top-left to bottom-right.
@ -446,14 +451,15 @@ class LTTextGroupLRTB(LTTextGroup):
(1+laparams.boxes_flow)*(obj.y0+obj.y1)) (1+laparams.boxes_flow)*(obj.y0+obj.y1))
return return
class LTTextGroupTBRL(LTTextGroup): class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams): def analyze(self, laparams):
LTTextGroup.analyze(self, laparams) LTTextGroup.analyze(self, laparams)
# reorder the objects from top-right to bottom-left. # reorder the objects from top-right to bottom-left.
self._objs = csort(self._objs, key=lambda obj: self._objs = csort(self._objs, key=lambda obj:
-(1+laparams.boxes_flow)*(obj.x0+obj.x1) -(1+laparams.boxes_flow)*(obj.x0+obj.x1)
-(1-laparams.boxes_flow)*(obj.y1)) - (1-laparams.boxes_flow)*(obj.y1))
return return
@ -465,14 +471,14 @@ class LTLayoutContainer(LTContainer):
LTContainer.__init__(self, bbox) LTContainer.__init__(self, bbox)
self.groups = None self.groups = None
return return
def get_textlines(self, laparams, objs): def get_textlines(self, laparams, objs):
obj0 = None obj0 = None
line = None line = None
for obj1 in objs: for obj1 in objs:
if obj0 is not None: if obj0 is not None:
k = 0 k = 0
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin): obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin):
# obj0 and obj1 is horizontally aligned: # obj0 and obj1 is horizontally aligned:
@ -487,7 +493,7 @@ class LTLayoutContainer(LTContainer):
# (char_margin) # (char_margin)
k |= 1 k |= 1
if (laparams.detect_vertical and if (laparams.detect_vertical and
obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin): obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin):
# obj0 and obj1 is vertically aligned: # obj0 and obj1 is vertically aligned:
@ -505,8 +511,8 @@ class LTLayoutContainer(LTContainer):
# |<-->| # |<-->|
# (line_overlap) # (line_overlap)
k |= 2 k |= 2
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or if ((k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical)) ): (k & 2 and isinstance(line, LTTextLineVertical))):
line.add(obj1) line.add(obj1)
elif line is not None: elif line is not None:
yield line yield line
@ -554,7 +560,8 @@ class LTLayoutContainer(LTContainer):
done = set() done = set()
for line in lines: for line in lines:
box = boxes[line] box = boxes[line]
if box in done: continue if box in done:
continue
done.add(box) done.add(box)
if not box.is_empty(): if not box.is_empty():
yield box yield box
@ -562,32 +569,34 @@ class LTLayoutContainer(LTContainer):
def group_textboxes(self, laparams, boxes): def group_textboxes(self, laparams, boxes):
assert boxes assert boxes
def dist(obj1, obj2): def dist(obj1, obj2):
"""A distance function between two TextBoxes. """A distance function between two TextBoxes.
Consider the bounding rectangle for obj1 and obj2. Consider the bounding rectangle for obj1 and obj2.
Return its area less the areas of obj1 and obj2, Return its area less the areas of obj1 and obj2,
shown as 'www' below. This value may be negative. shown as 'www' below. This value may be negative.
+------+..........+ (x1,y1) +------+..........+ (x1, y1)
| obj1 |wwwwwwwwww: | obj1 |wwwwwwwwww:
+------+www+------+ +------+www+------+
:wwwwwwwwww| obj2 | :wwwwwwwwww| obj2 |
(x0,y0) +..........+------+ (x0, y0) +..........+------+
""" """
x0 = min(obj1.x0,obj2.x0) x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0,obj2.y0) y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1,obj2.x1) x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1,obj2.y1) y1 = max(obj1.y1, obj2.y1)
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
def isany(obj1, obj2): def isany(obj1, obj2):
"""Check if there's any other object between obj1 and obj2. """Check if there's any other object between obj1 and obj2.
""" """
x0 = min(obj1.x0,obj2.x0) x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0,obj2.y0) y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1,obj2.x1) x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1,obj2.y1) y1 = max(obj1.y1, obj2.y1)
objs = set(plane.find((x0,y0,x1,y1))) objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1,obj2)) return objs.difference((obj1, obj2))
# XXX this still takes O(n^2) :( # XXX this still takes O(n^2) :(
dists = [] dists = []
for i in xrange(len(boxes)): for i in xrange(len(boxes)):
@ -599,49 +608,50 @@ class LTLayoutContainer(LTContainer):
plane = Plane(self.bbox) plane = Plane(self.bbox)
plane.extend(boxes) plane.extend(boxes)
while dists: while dists:
(c,d,obj1,obj2) = dists.pop(0) (c, d, obj1, obj2) = dists.pop(0)
if c == 0 and isany(obj1, obj2): if c == 0 and isany(obj1, obj2):
dists.append((1,d,obj1,obj2)) dists.append((1, d, obj1, obj2))
continue continue
if (isinstance(obj1, LTTextBoxVertical) or if (isinstance(obj1, LTTextBoxVertical) or
isinstance(obj1, LTTextGroupTBRL) or isinstance(obj1, LTTextGroupTBRL) or
isinstance(obj2, LTTextBoxVertical) or isinstance(obj2, LTTextBoxVertical) or
isinstance(obj2, LTTextGroupTBRL)): isinstance(obj2, LTTextGroupTBRL)):
group = LTTextGroupTBRL([obj1,obj2]) group = LTTextGroupTBRL([obj1, obj2])
else: else:
group = LTTextGroupLRTB([obj1,obj2]) group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1) plane.remove(obj1)
plane.remove(obj2) plane.remove(obj2)
# this line is optimized -- don't change without profiling # this line is optimized -- don't change without profiling
dists = [ n for n in dists if n[2] in plane._objs and n[3] in plane._objs ] dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
for other in plane: for other in plane:
dists.append((0, dist(group,other), group, other)) dists.append((0, dist(group, other), group, other))
dists.sort() dists.sort()
plane.add(group) plane.add(group)
assert len(plane) == 1 assert len(plane) == 1
return list(plane) return list(plane)
def analyze(self, laparams): def analyze(self, laparams):
# textobjs is a list of LTChar objects, i.e. # textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page. # it has all the individual characters in the page.
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs) (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
for obj in otherobjs: for obj in otherobjs:
obj.analyze(laparams) obj.analyze(laparams)
if not textobjs: return if not textobjs:
return
textlines = list(self.get_textlines(laparams, textobjs)) textlines = list(self.get_textlines(laparams, textobjs))
assert len(textobjs) <= sum( len(line._objs) for line in textlines ) assert len(textobjs) <= sum(len(line._objs) for line in textlines)
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines) (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
for obj in empties: for obj in empties:
obj.analyze(laparams) obj.analyze(laparams)
textboxes = list(self.get_textboxes(laparams, textlines)) textboxes = list(self.get_textboxes(laparams, textlines))
assert len(textlines) == sum( len(box._objs) for box in textboxes ) assert len(textlines) == sum(len(box._objs) for box in textboxes)
if textboxes: if textboxes:
self.groups = self.group_textboxes(laparams, textboxes) self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner() assigner = IndexAssigner()
for group in self.groups: for group in self.groups:
group.analyze(laparams) group.analyze(laparams)
assigner.run(group) assigner.run(group)
textboxes.sort(key=lambda box:box.index) textboxes.sort(key=lambda box: box.index)
self._objs = textboxes + otherobjs + empties self._objs = textboxes + otherobjs + empties
return return
@ -653,9 +663,9 @@ class LTFigure(LTLayoutContainer):
def __init__(self, name, bbox, matrix): def __init__(self, name, bbox, matrix):
self.name = name self.name = name
self.matrix = matrix self.matrix = matrix
(x,y,w,h) = bbox (x, y, w, h) = bbox
bbox = get_bound( apply_matrix_pt(matrix, (p,q)) bbox = get_bound(apply_matrix_pt(matrix, (p, q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
LTLayoutContainer.__init__(self, bbox) LTLayoutContainer.__init__(self, bbox)
return return
@ -665,9 +675,10 @@ class LTFigure(LTLayoutContainer):
bbox2str(self.bbox), matrix2str(self.matrix))) bbox2str(self.bbox), matrix2str(self.matrix)))
def analyze(self, laparams): def analyze(self, laparams):
if not laparams.all_texts: return if not laparams.all_texts:
return
LTLayoutContainer.analyze(self, laparams) LTLayoutContainer.analyze(self, laparams)
return return
## LTPage ## LTPage

View File

@ -34,17 +34,18 @@ class LZWDecoder(object):
# |-----8-bits-----| # |-----8-bits-----|
# |-bpos-|-bits-| | # |-bpos-|-bits-| |
# | |----r----| # | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1)) v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
self.bpos += bits self.bpos += bits
break break
else: else:
# |-----8-bits-----| # |-----8-bits-----|
# |-bpos-|---bits----... # |-bpos-|---bits----...
# | |----r----| # | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1)) v = (v << r) | (self.buff & ((1 << r)-1))
bits -= r bits -= r
x = self.fp.read(1) x = self.fp.read(1)
if not x: raise EOFError if not x:
raise EOFError
self.buff = ord(x) self.buff = ord(x)
self.bpos = 0 self.bpos = 0
return v return v
@ -52,9 +53,9 @@ class LZWDecoder(object):
def feed(self, code): def feed(self, code):
x = '' x = ''
if code == 256: if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255 self.table = [chr(c) for c in xrange(256)] # 0-255
self.table.append(None) # 256 self.table.append(None) # 256
self.table.append(None) # 257 self.table.append(None) # 257
self.prevbuf = '' self.prevbuf = ''
self.nbits = 9 self.nbits = 9
elif code == 257: elif code == 257:
@ -97,6 +98,7 @@ class LZWDecoder(object):
(self.nbits, code, x, self.table[258:])) (self.nbits, code, x, self.table[258:]))
return return
# lzwdecode # lzwdecode
def lzwdecode(data): def lzwdecode(data):
""" """

View File

@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB') LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK') LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
class PDFColorSpace(object): class PDFColorSpace(object):
def __init__(self, name, ncomponents): def __init__(self, name, ncomponents):
@ -20,14 +21,14 @@ class PDFColorSpace(object):
PREDEFINED_COLORSPACE = dict( PREDEFINED_COLORSPACE = dict(
(name, PDFColorSpace(name,n)) for (name,n) in { (name, PDFColorSpace(name, n)) for (name, n) in {
'CalRGB': 3, 'CalRGB': 3,
'CalGray': 1, 'CalGray': 1,
'Lab': 3, 'Lab': 3,
'DeviceRGB': 3, 'DeviceRGB': 3,
'DeviceCMYK': 4, 'DeviceCMYK': 4,
'DeviceGray': 1, 'DeviceGray': 1,
'Separation': 1, 'Separation': 1,
'Indexed': 1, 'Indexed': 1,
'Pattern': 1, 'Pattern': 1,
}.iteritems()) }.iteritems())

View File

@ -27,24 +27,31 @@ class PDFDevice(object):
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
return return
def end_tag(self): def end_tag(self):
return return
def do_tag(self, tag, props=None): def do_tag(self, tag, props=None):
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
return return
def end_page(self, page): def end_page(self, page):
return return
def begin_figure(self, name, bbox, matrix): def begin_figure(self, name, bbox, matrix):
return return
def end_figure(self, name): def end_figure(self, name):
return return
def paint_path(self, graphicstate, stroke, fill, evenodd, path): def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return return
def render_image(self, name, stream): def render_image(self, name, stream):
return return
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
return return
@ -73,8 +80,8 @@ class PDFTextDevice(PDFDevice):
seq, matrix, textstate.linematrix, font, fontsize, seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale) scaling, charspace, wordspace, rise, dxscale)
return return
def render_string_horizontal(self, seq, matrix, (x,y), def render_string_horizontal(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale): font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
@ -85,14 +92,14 @@ class PDFTextDevice(PDFDevice):
for cid in font.decode(obj): for cid in font.decode(obj):
if needcharspace: if needcharspace:
x += charspace x += charspace
x += self.render_char(translate_matrix(matrix, (x,y)), x += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid) font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace: if cid == 32 and wordspace:
x += wordspace x += wordspace
needcharspace = True needcharspace = True
return (x, y) return (x, y)
def render_string_vertical(self, seq, matrix, (x,y), def render_string_vertical(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale): font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
@ -103,7 +110,7 @@ class PDFTextDevice(PDFDevice):
for cid in font.decode(obj): for cid in font.decode(obj):
if needcharspace: if needcharspace:
y += charspace y += charspace
y += self.render_char(translate_matrix(matrix, (x,y)), y += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid) font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace: if cid == 32 and wordspace:
y += wordspace y += wordspace
@ -131,7 +138,8 @@ class TagExtractor(PDFDevice):
font = textstate.font font = textstate.font
text = '' text = ''
for obj in seq: for obj in seq:
if not isinstance(obj, str): continue if not isinstance(obj, str):
continue
chars = font.decode(obj) chars = font.decode(obj)
for cid in chars: for cid in chars:
try: try:
@ -155,8 +163,8 @@ class TagExtractor(PDFDevice):
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
s = '' s = ''
if isinstance(props, dict): if isinstance(props, dict):
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v) s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
in sorted(props.iteritems()) ) in sorted(props.iteritems()))
self.outfp.write('<%s%s>' % (enc(tag.name), s)) self.outfp.write('<%s%s>' % (enc(tag.name), s))
self._stack.append(tag) self._stack.append(tag)
return return

View File

@ -23,11 +23,24 @@ from utils import decode_text
## Exceptions ## Exceptions
## ##
class PDFNoValidXRef(PDFSyntaxError): pass class PDFNoValidXRef(PDFSyntaxError):
class PDFNoOutlines(PDFException): pass pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFNoOutlines(PDFException):
pass
class PDFDestinationNotFound(PDFException):
pass
class PDFEncryptionError(PDFException):
pass
class PDFPasswordIncorrect(PDFEncryptionError):
pass
# some predefined literals and keywords. # some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm') LITERAL_OBJSTM = LIT('ObjStm')
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
while 1: while 1:
try: try:
(pos, line) = parser.nextline() (pos, line) = parser.nextline()
if not line.strip(): continue if not line.strip():
continue
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line: if not line:
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
if len(f) != 3: if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f (pos, genno, use) = f
if use != 'n': continue if use != 'n':
continue
self.offsets[objid] = (None, long(pos), int(genno)) self.offsets[objid] = (None, long(pos), int(genno))
if 1 <= debug: if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets print >>sys.stderr, 'xref objects:', self.offsets
@ -100,16 +115,17 @@ class PDFXRef(PDFBaseXRef):
return return
KEYWORD_TRAILER = KWD('trailer') KEYWORD_TRAILER = KWD('trailer')
def load_trailer(self, parser): def load_trailer(self, parser):
try: try:
(_,kwd) = parser.nexttoken() (_, kwd) = parser.nexttoken()
assert kwd is self.KEYWORD_TRAILER assert kwd is self.KEYWORD_TRAILER
(_,dic) = parser.nextobject() (_, dic) = parser.nextobject()
except PSEOF: except PSEOF:
x = parser.pop(1) x = parser.pop(1)
if not x: if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted') raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0] (_, dic) = x[0]
self.trailer.update(dict_value(dic)) self.trailer.update(dict_value(dic))
return return
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys()) return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load(self, parser, debug=0): def load(self, parser, debug=0):
parser.seek(0) parser.seek(0)
while 1: while 1:
@ -148,14 +165,15 @@ class PDFXRefFallback(PDFXRef):
print >>sys.stderr, 'trailer: %r' % self.get_trailer() print >>sys.stderr, 'trailer: %r' % self.get_trailer()
break break
m = self.PDFOBJ_CUE.match(line) m = self.PDFOBJ_CUE.match(line)
if not m: continue if not m:
continue
(objid, genno) = m.groups() (objid, genno) = m.groups()
objid = int(objid) objid = int(objid)
genno = int(genno) genno = int(genno)
self.offsets[objid] = (None, pos, genno) self.offsets[objid] = (None, pos, genno)
# expand ObjStm. # expand ObjStm.
parser.seek(pos) parser.seek(pos)
(_,obj) = parser.nextobject() (_, obj) = parser.nextobject()
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM: if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
stream = stream_value(obj) stream = stream_value(obj)
try: try:
@ -168,7 +186,7 @@ class PDFXRefFallback(PDFXRef):
objs = [] objs = []
try: try:
while 1: while 1:
(_,obj) = parser1.nextobject() (_, obj) = parser1.nextobject()
objs.append(obj) objs.append(obj)
except PSEOF: except PSEOF:
pass pass
@ -193,14 +211,14 @@ class PDFXRefStream(PDFBaseXRef):
return '<PDFXRefStream: ranges=%r>' % (self.ranges) return '<PDFXRefStream: ranges=%r>' % (self.ranges)
def load(self, parser, debug=0): def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored (_, objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken() (_, kwd) = parser.nexttoken()
(_,stream) = parser.nextobject() (_, stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF: if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.') raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size'] size = stream['Size']
index_array = stream.get('Index', (1,size)) index_array = stream.get('Index', (1, size))
if len(index_array) % 2 != 0: if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number') raise PDFSyntaxError('Invalid index number')
self.ranges.extend(choplist(2, index_array)) self.ranges.extend(choplist(2, index_array))
@ -210,22 +228,22 @@ class PDFXRefStream(PDFBaseXRef):
self.trailer = stream.attrs self.trailer = stream.attrs
if 1 <= debug: if 1 <= debug:
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.ranges)), (', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)) self.fl1, self.fl2, self.fl3))
return return
def get_trailer(self): def get_trailer(self):
return self.trailer return self.trailer
def get_objids(self): def get_objids(self):
for (start,nobjs) in self.ranges: for (start, nobjs) in self.ranges:
for i in xrange(nobjs): for i in xrange(nobjs):
yield start+i yield start+i
return return
def get_pos(self, objid): def get_pos(self, objid):
index = 0 index = 0
for (start,nobjs) in self.ranges: for (start, nobjs) in self.ranges:
if start <= objid and objid < start+nobjs: if start <= objid and objid < start+nobjs:
index += objid - start index += objid - start
else: else:
@ -260,7 +278,7 @@ class PDFDocument(object):
doc = PDFDocument(parser) doc = PDFDocument(parser)
doc.initialize(password) doc.initialize(password)
obj = doc.getobj(objid) obj = doc.getobj(objid)
""" """
debug = 0 debug = 0
@ -292,7 +310,8 @@ class PDFDocument(object):
self.xrefs.append(xref) self.xrefs.append(xref)
for xref in self.xrefs: for xref in self.xrefs:
trailer = xref.get_trailer() trailer = xref.get_trailer()
if not trailer: continue if not trailer:
continue
# If there's an encryption info, remember it. # If there's an encryption info, remember it.
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
#assert not self.encryption #assert not self.encryption
@ -316,6 +335,7 @@ class PDFDocument(object):
# This step is mandatory even if there's no password associated # This step is mandatory even if there's no password associated
# with the document. # with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''): def initialize(self, password=''):
if not self.encryption: if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True self.is_printable = self.is_modifiable = self.is_extractable = True
@ -326,9 +346,9 @@ class PDFDocument(object):
V = int_value(param.get('V', 0)) V = int_value(param.get('V', 0))
if not (V == 1 or V == 2): if not (V == 1 or V == 2):
raise PDFEncryptionError('Unknown algorithm: param=%r' % param) raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
length = int_value(param.get('Length', 40)) # Key length (bits) length = int_value(param.get('Length', 40)) # Key length (bits)
O = str_value(param['O']) O = str_value(param['O'])
R = int_value(param['R']) # Revision R = int_value(param['R']) # Revision
if 5 <= R: if 5 <= R:
raise PDFEncryptionError('Unknown revision: %r' % R) raise PDFEncryptionError('Unknown revision: %r' % R)
U = str_value(param['U']) U = str_value(param['U'])
@ -337,11 +357,11 @@ class PDFDocument(object):
self.is_modifiable = bool(P & 8) self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16) self.is_extractable = bool(P & 16)
# Algorithm 3.2 # Algorithm 3.2
password = (password+self.PASSWORD_PADDING)[:32] # 1 password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2 hash = md5.md5(password) # 2
hash.update(O) # 3 hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4 hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5 hash.update(docid[0]) # 5
if 4 <= R: if 4 <= R:
# 6 # 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported') raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
@ -355,13 +375,13 @@ class PDFDocument(object):
u1 = Arcfour(key).process(self.PASSWORD_PADDING) u1 = Arcfour(key).process(self.PASSWORD_PADDING)
elif R == 3: elif R == 3:
# Algorithm 3.5 # Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2 hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3 hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4 x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1): for i in xrange(1, 19+1):
k = ''.join( chr(ord(c) ^ i) for c in key ) k = ''.join(chr(ord(c) ^ i) for c in key)
x = Arcfour(k).process(x) x = Arcfour(k).process(x)
u1 = x+x # 32bytes total u1 = x+x # 32bytes total
if R == 2: if R == 2:
is_authenticated = (u1 == U) is_authenticated = (u1 == U)
else: else:
@ -373,18 +393,18 @@ class PDFDocument(object):
return return
def decrypt_rc4(self, objid, genno, data): def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2] key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
hash = md5.md5(key) hash = md5.md5(key)
key = hash.digest()[:min(len(key),16)] key = hash.digest()[:min(len(key), 16)]
return Arcfour(key).process(data) return Arcfour(key).process(data)
def _getobj_objstm(self, stream, index, objid): def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs: if stream.objid in self._parsed_objs:
(objs,n) = self._parsed_objs[stream.objid] (objs, n) = self._parsed_objs[stream.objid]
else: else:
(objs,n) = self._get_objects(stream) (objs, n) = self._get_objects(stream)
if self.caching: if self.caching:
self._parsed_objs[stream.objid] = (objs,n) self._parsed_objs[stream.objid] = (objs, n)
i = n*2+index i = n*2+index
try: try:
obj = objs[i] obj = objs[i]
@ -407,25 +427,26 @@ class PDFDocument(object):
objs = [] objs = []
try: try:
while 1: while 1:
(_,obj) = parser.nextobject() (_, obj) = parser.nextobject()
objs.append(obj) objs.append(obj)
except PSEOF: except PSEOF:
pass pass
return (objs, n) return (objs, n)
KEYWORD_OBJ = KWD('obj') KEYWORD_OBJ = KWD('obj')
def _getobj_parse(self, pos, objid): def _getobj_parse(self, pos, objid):
self._parser.seek(pos) self._parser.seek(pos)
(_,objid1) = self._parser.nexttoken() # objid (_, objid1) = self._parser.nexttoken() # objid
if objid1 != objid: if objid1 != objid:
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid)) raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
(_,genno) = self._parser.nexttoken() # genno (_, genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken() (_, kwd) = self._parser.nexttoken()
if kwd is not self.KEYWORD_OBJ: if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos) raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
(_,obj) = self._parser.nextobject() (_, obj) = self._parser.nextobject()
return obj return obj
# can raise PDFObjectNotFound # can raise PDFObjectNotFound
def getobj(self, objid): def getobj(self, objid):
assert objid != 0 assert objid != 0
@ -465,6 +486,7 @@ class PDFDocument(object):
def get_outlines(self): def get_outlines(self):
if 'Outlines' not in self.catalog: if 'Outlines' not in self.catalog:
raise PDFNoOutlines raise PDFNoOutlines
def search(entry, level): def search(entry, level):
entry = dict_value(entry) entry = dict_value(entry)
if 'Title' in entry: if 'Title' in entry:
@ -487,13 +509,15 @@ class PDFDocument(object):
try: try:
names = dict_value(self.catalog['Names']) names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError): except (PDFTypeError, KeyError):
raise KeyError((cat,key)) raise KeyError((cat, key))
# may raise KeyError # may raise KeyError
d0 = dict_value(names[cat]) d0 = dict_value(names[cat])
def lookup(d): def lookup(d):
if 'Limits' in d: if 'Limits' in d:
(k1,k2) = list_value(d['Limits']) (k1, k2) = list_value(d['Limits'])
if key < k1 or k2 < key: return None if key < k1 or k2 < key:
return None
if 'Names' in d: if 'Names' in d:
objs = list_value(d['Names']) objs = list_value(d['Names'])
names = dict(choplist(2, objs)) names = dict(choplist(2, objs))
@ -501,8 +525,9 @@ class PDFDocument(object):
if 'Kids' in d: if 'Kids' in d:
for c in list_value(d['Kids']): for c in list_value(d['Kids']):
v = lookup(dict_value(c)) v = lookup(dict_value(c))
if v: return v if v:
raise KeyError((cat,key)) return v
raise KeyError((cat, key))
return lookup(d0) return lookup(d0)
def get_dest(self, name): def get_dest(self, name):
@ -528,7 +553,8 @@ class PDFDocument(object):
line = line.strip() line = line.strip()
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'find_xref: %r' % line print >>sys.stderr, 'find_xref: %r' % line
if line == 'startxref': break if line == 'startxref':
break
if line: if line:
prev = line prev = line
else: else:

View File

@ -25,13 +25,13 @@ def get_widths(seq):
if isinstance(v, list): if isinstance(v, list):
if r: if r:
char1 = r[-1] char1 = r[-1]
for (i,w) in enumerate(v): for (i, w) in enumerate(v):
widths[char1+i] = w widths[char1+i] = w
r = [] r = []
elif isinstance(v, int): elif isinstance(v, int):
r.append(v) r.append(v)
if len(r) == 3: if len(r) == 3:
(char1,char2,w) = r (char1, char2, w) = r
for i in xrange(char1, char2+1): for i in xrange(char1, char2+1):
widths[i] = w widths[i] = w
r = [] r = []
@ -40,6 +40,7 @@ def get_widths(seq):
#assert get_widths([1,2,3]) == {1:3, 2:3} #assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8} #assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
def get_widths2(seq): def get_widths2(seq):
widths = {} widths = {}
r = [] r = []
@ -47,20 +48,20 @@ def get_widths2(seq):
if isinstance(v, list): if isinstance(v, list):
if r: if r:
char1 = r[-1] char1 = r[-1]
for (i,(w,vx,vy)) in enumerate(choplist(3,v)): for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
widths[char1+i] = (w,(vx,vy)) widths[char1+i] = (w, (vx, vy))
r = [] r = []
elif isinstance(v, int): elif isinstance(v, int):
r.append(v) r.append(v)
if len(r) == 5: if len(r) == 5:
(char1,char2,w,vx,vy) = r (char1, char2, w, vx, vy) = r
for i in xrange(char1, char2+1): for i in xrange(char1, char2+1):
widths[i] = (w,(vx,vy)) widths[i] = (w, (vx, vy))
r = [] r = []
return widths return widths
#assert get_widths2([1]) == {} #assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))} #assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))} #assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
## FontMetricsDB ## FontMetricsDB
@ -94,7 +95,7 @@ class Type1FontHeaderParser(PSStackParser):
def get_encoding(self): def get_encoding(self):
while 1: while 1:
try: try:
(cid,name) = self.nextobject() (cid, name) = self.nextobject()
except PSEOF: except PSEOF:
break break
try: try:
@ -102,28 +103,31 @@ class Type1FontHeaderParser(PSStackParser):
except KeyError: except KeyError:
pass pass
return self._cid2unicode return self._cid2unicode
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token is self.KEYWORD_PUT: if token is self.KEYWORD_PUT:
((_,key),(_,value)) = self.pop(2) ((_, key), (_, value)) = self.pop(2)
if (isinstance(key, int) and if (isinstance(key, int) and
isinstance(value, PSLiteral)): isinstance(value, PSLiteral)):
self.add_results((key, literal_name(value))) self.add_results((key, literal_name(value)))
return return
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
## CFFFont ## CFFFont
## (Format specified in Adobe Technical Note: #5176 ## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification") ## "The Compact Font Format Specification")
## ##
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
def getdict(data): def getdict(data):
d = {} d = {}
fp = StringIO(data) fp = StringIO(data)
stack = [] stack = []
while 1: while 1:
c = fp.read(1) c = fp.read(1)
if not c: break if not c:
break
b0 = ord(c) b0 = ord(c)
if b0 <= 21: if b0 <= 21:
d[b0] = stack d[b0] = stack
@ -145,19 +149,21 @@ def getdict(data):
else: else:
b1 = ord(fp.read(1)) b1 = ord(fp.read(1))
if 247 <= b0 and b0 <= 250: if 247 <= b0 and b0 <= 250:
value = ((b0-247)<<8)+b1+108 value = ((b0-247) << 8)+b1+108
elif 251 <= b0 and b0 <= 254: elif 251 <= b0 and b0 <= 254:
value = -((b0-251)<<8)-b1-108 value = -((b0-251) << 8)-b1-108
else: else:
b2 = ord(fp.read(1)) b2 = ord(fp.read(1))
if 128 <= b1: b1 -= 256 if 128 <= b1:
b1 -= 256
if b0 == 28: if b0 == 28:
value = b1<<8 | b2 value = b1 << 8 | b2
else: else:
value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0] value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
stack.append(value) stack.append(value)
return d return d
class CFFFont(object): class CFFFont(object):
STANDARD_STRINGS = ( STANDARD_STRINGS = (
@ -239,7 +245,7 @@ class CFFFont(object):
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000', 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
'001.001', '001.002', '001.003', 'Black', 'Bold', 'Book', '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
'Light', 'Medium', 'Regular', 'Roman', 'Semibold', 'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
) )
class INDEX(object): class INDEX(object):
@ -264,13 +270,13 @@ class CFFFont(object):
return self.fp.read(self.offsets[i+1]-self.offsets[i]) return self.fp.read(self.offsets[i+1]-self.offsets[i])
def __iter__(self): def __iter__(self):
return iter( self[i] for i in xrange(len(self)) ) return iter(self[i] for i in xrange(len(self)))
def __init__(self, name, fp): def __init__(self, name, fp):
self.name = name self.name = name
self.fp = fp self.fp = fp
# Header # Header
(_major,_minor,hdrsize,offsize) = struct.unpack('BBBB', self.fp.read(4)) (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
self.fp.read(hdrsize-4) self.fp.read(hdrsize-4)
# Name INDEX # Name INDEX
self.name_index = self.INDEX(self.fp) self.name_index = self.INDEX(self.fp)
@ -297,7 +303,7 @@ class CFFFont(object):
if format == '\x00': if format == '\x00':
# Format 0 # Format 0
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
for (code,gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))): for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
self.code2gid[code] = gid self.code2gid[code] = gid
self.gid2code[gid] = code self.gid2code[gid] = code
elif format == '\x01': elif format == '\x01':
@ -305,8 +311,8 @@ class CFFFont(object):
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
code = 0 code = 0
for i in xrange(n): for i in xrange(n):
(first,nleft) = struct.unpack('BB', self.fp.read(2)) (first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first,first+nleft+1): for gid in xrange(first, first+nleft+1):
self.code2gid[code] = gid self.code2gid[code] = gid
self.gid2code[gid] = code self.gid2code[gid] = code
code += 1 code += 1
@ -320,7 +326,7 @@ class CFFFont(object):
if format == '\x00': if format == '\x00':
# Format 0 # Format 0
n = self.nglyphs-1 n = self.nglyphs-1
for (gid,sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))): for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
gid += 1 gid += 1
name = self.getstr(sid) name = self.getstr(sid)
self.name2gid[name] = gid self.name2gid[name] = gid
@ -330,8 +336,8 @@ class CFFFont(object):
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
sid = 0 sid = 0
for i in xrange(n): for i in xrange(n):
(first,nleft) = struct.unpack('BB', self.fp.read(2)) (first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first,first+nleft+1): for gid in xrange(first, first+nleft+1):
name = self.getstr(sid) name = self.getstr(sid)
self.name2gid[name] = gid self.name2gid[name] = gid
self.gid2name[gid] = name self.gid2name[gid] = name
@ -356,7 +362,8 @@ class CFFFont(object):
## ##
class TrueTypeFont(object): class TrueTypeFont(object):
class CMapNotFound(Exception): pass class CMapNotFound(Exception):
pass
def __init__(self, name, fp): def __init__(self, name, fp):
self.name = name self.name = name
@ -389,15 +396,16 @@ class TrueTypeFont(object):
elif fmttype == 2: elif fmttype == 2:
subheaderkeys = struct.unpack('>256H', fp.read(512)) subheaderkeys = struct.unpack('>256H', fp.read(512))
firstbytes = [0]*8192 firstbytes = [0]*8192
for (i,k) in enumerate(subheaderkeys): for (i, k) in enumerate(subheaderkeys):
firstbytes[k/8] = i firstbytes[k/8] = i
nhdrs = max(subheaderkeys)/8 + 1 nhdrs = max(subheaderkeys)/8 + 1
hdrs = [] hdrs = []
for i in xrange(nhdrs): for i in xrange(nhdrs):
(firstcode,entcount,delta,offset) = struct.unpack('>HHhH', fp.read(8)) (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
for (i,firstcode,entcount,delta,pos) in hdrs: for (i, firstcode, entcount, delta, pos) in hdrs:
if not entcount: continue if not entcount:
continue
first = firstcode + (firstbytes[i] << 8) first = firstcode + (firstbytes[i] << 8)
fp.seek(pos) fp.seek(pos)
for c in xrange(entcount): for c in xrange(entcount):
@ -414,7 +422,7 @@ class TrueTypeFont(object):
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount)) idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
pos = fp.tell() pos = fp.tell()
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount)) idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
if idr: if idr:
fp.seek(pos+idr) fp.seek(pos+idr)
for c in xrange(sc, ec+1): for c in xrange(sc, ec+1):
@ -426,16 +434,19 @@ class TrueTypeFont(object):
assert 0 assert 0
# create unicode map # create unicode map
unicode_map = FileUnicodeMap() unicode_map = FileUnicodeMap()
for (char,gid) in char2gid.iteritems(): for (char, gid) in char2gid.iteritems():
unicode_map.add_cid2unichr(gid, char) unicode_map.add_cid2unichr(gid, char)
return unicode_map return unicode_map
## Fonts ## Fonts
## ##
class PDFFontError(PDFException):
pass
class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass class PDFUnicodeNotDefined(PDFFontError):
pass
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding') LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C') LITERAL_TYPE1C = LIT('Type1C')
@ -456,7 +467,7 @@ class PDFFont(object):
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0)) self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0)) self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
self.leading = num_value(descriptor.get('Leading', 0)) self.leading = num_value(descriptor.get('Leading', 0))
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
self.hscale = self.vscale = .001 self.hscale = self.vscale = .001
return return
@ -474,6 +485,7 @@ class PDFFont(object):
def get_ascent(self): def get_ascent(self):
return self.ascent * self.vscale return self.ascent * self.vscale
def get_descent(self): def get_descent(self):
return self.descent * self.vscale return self.descent * self.vscale
@ -482,6 +494,7 @@ class PDFFont(object):
if w == 0: if w == 0:
w = -self.default_width w = -self.default_width
return w * self.hscale return w * self.hscale
def get_height(self): def get_height(self):
h = self.bbox[3]-self.bbox[1] h = self.bbox[3]-self.bbox[1]
if h == 0: if h == 0:
@ -501,7 +514,7 @@ class PDFFont(object):
return 0 return 0
def string_width(self, s): def string_width(self, s):
return sum( self.char_width(cid) for cid in self.decode(s) ) return sum(self.char_width(cid) for cid in self.decode(s))
# PDFSimpleFont # PDFSimpleFont
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
except KeyError: except KeyError:
raise PDFUnicodeNotDefined(None, cid) raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font # PDFType1Font
class PDFType1Font(PDFSimpleFont): class PDFType1Font(PDFSimpleFont):
@ -557,7 +571,7 @@ class PDFType1Font(PDFSimpleFont):
firstchar = int_value(spec.get('FirstChar', 0)) firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 255)) lastchar = int_value(spec.get('LastChar', 255))
widths = list_value(spec.get('Widths', [0]*256)) widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
PDFSimpleFont.__init__(self, descriptor, widths, spec) PDFSimpleFont.__init__(self, descriptor, widths, spec)
if 'Encoding' not in spec and 'FontFile' in descriptor: if 'Encoding' not in spec and 'FontFile' in descriptor:
# try to recover the missing encoding info from the font file. # try to recover the missing encoding info from the font file.
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
def __repr__(self): def __repr__(self):
return '<PDFType1Font: basefont=%r>' % self.basefont return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont # PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font): class PDFTrueTypeFont(PDFType1Font):
def __repr__(self): def __repr__(self):
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
# PDFType3Font # PDFType3Font
class PDFType3Font(PDFSimpleFont): class PDFType3Font(PDFSimpleFont):
@ -584,16 +600,16 @@ class PDFType3Font(PDFSimpleFont):
firstchar = int_value(spec.get('FirstChar', 0)) firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 0)) lastchar = int_value(spec.get('LastChar', 0))
widths = list_value(spec.get('Widths', [0]*256)) widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths)) widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
if 'FontDescriptor' in spec: if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
else: else:
descriptor = {'Ascent':0, 'Descent':0, descriptor = {'Ascent': 0, 'Descent': 0,
'FontBBox':spec['FontBBox']} 'FontBBox': spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec) PDFSimpleFont.__init__(self, descriptor, widths, spec)
self.matrix = tuple(list_value(spec.get('FontMatrix'))) self.matrix = tuple(list_value(spec.get('FontMatrix')))
(_,self.descent,_,self.ascent) = self.bbox (_, self.descent, _, self.ascent) = self.bbox
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1)) (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
return return
def __repr__(self): def __repr__(self):
@ -657,10 +673,10 @@ class PDFCIDFont(PDFFont):
if self.vertical: if self.vertical:
# writing mode: vertical # writing mode: vertical
widths = get_widths2(list_value(spec.get('W2', []))) widths = get_widths2(list_value(spec.get('W2', [])))
self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.iteritems() ) self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
(vy,w) = spec.get('DW2', [880, -1000]) (vy, w) = spec.get('DW2', [880, -1000])
self.default_disp = (None,vy) self.default_disp = (None, vy)
widths = dict( (cid,w) for (cid,(w,_)) in widths.iteritems() ) widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
default_width = w default_width = w
else: else:
# writing mode: horizontal # writing mode: horizontal
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
def to_unichr(self, cid): def to_unichr(self, cid):
try: try:
if not self.unicode_map: raise KeyError(cid) if not self.unicode_map:
raise KeyError(cid)
return self.unicode_map.get_unichr(cid) return self.unicode_map.get_unichr(cid)
except KeyError: except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid) raise PDFUnicodeNotDefined(self.cidcoding, cid)
@ -705,4 +722,5 @@ def main(argv):
fp.close() fp.close()
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -26,8 +26,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
## Exceptions ## Exceptions
## ##
class PDFResourceError(PDFException): pass class PDFResourceError(PDFException):
class PDFInterpreterError(PDFException): pass pass
class PDFInterpreterError(PDFException):
pass
## Constants ## Constants
@ -116,12 +120,13 @@ class PDFGraphicState(object):
(self.linewidth, self.linecap, self.linejoin, (self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness)) self.miterlimit, self.dash, self.intent, self.flatness))
## Resource Manager ## Resource Manager
## ##
class PDFResourceManager(object): class PDFResourceManager(object):
"""Repository of shared resources. """Repository of shared resources.
ResourceManager facilitates reuse of shared resources ResourceManager facilitates reuse of shared resources
such as fonts and images so that large objects are not such as fonts and images so that large objects are not
allocated multiple times. allocated multiple times.
@ -148,7 +153,8 @@ class PDFResourceManager(object):
try: try:
return CMapDB.get_cmap(cmapname) return CMapDB.get_cmap(cmapname)
except CMapDB.CMapNotFound: except CMapDB.CMapNotFound:
if strict: raise if strict:
raise
return CMap() return CMap()
def get_font(self, objid, spec): def get_font(self, objid, spec):
@ -191,7 +197,7 @@ class PDFResourceManager(object):
else: else:
if STRICT: if STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec) raise PDFFontError('Invalid Font spec: %r' % spec)
font = PDFType1Font(self, spec) # this is so wrong! font = PDFType1Font(self, spec) # this is so wrong!
if objid and self.caching: if objid and self.caching:
self._cached_fonts[objid] = font self._cached_fonts[objid] = font
return font return font
@ -223,12 +229,14 @@ class PDFContentParser(PSStackParser):
return return
def fillbuf(self): def fillbuf(self):
if self.charpos < len(self.buf): return if self.charpos < len(self.buf):
return
while 1: while 1:
self.fillfp() self.fillfp()
self.bufpos = self.fp.tell() self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ) self.buf = self.fp.read(self.BUFSIZ)
if self.buf: break if self.buf:
break
self.fp = None self.fp = None
self.charpos = 0 self.charpos = 0
return return
@ -259,7 +267,7 @@ class PDFContentParser(PSStackParser):
except ValueError: except ValueError:
data += self.buf[self.charpos:] data += self.buf[self.charpos:]
self.charpos = len(self.buf) self.charpos = len(self.buf)
data = data[:-(len(target)+1)] # strip the last part data = data[:-(len(target)+1)] # strip the last part
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data) data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
return (pos, data) return (pos, data)
@ -270,6 +278,7 @@ class PDFContentParser(PSStackParser):
KEYWORD_BI = KWD('BI') KEYWORD_BI = KWD('BI')
KEYWORD_ID = KWD('ID') KEYWORD_ID = KWD('ID')
KEYWORD_EI = KWD('EI') KEYWORD_EI = KWD('EI')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token is self.KEYWORD_BI: if token is self.KEYWORD_BI:
# inline image within a content stream # inline image within a content stream
@ -279,13 +288,14 @@ class PDFContentParser(PSStackParser):
(_, objs) = self.end_type('inline') (_, objs) = self.end_type('inline')
if len(objs) % 2 != 0: if len(objs) % 2 != 0:
raise PSTypeError('Invalid dictionary construct: %r' % objs) raise PSTypeError('Invalid dictionary construct: %r' % objs)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) ) d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
(pos, data) = self.get_inline_data(pos+len('ID ')) (pos, data) = self.get_inline_data(pos+len('ID '))
obj = PDFStream(d, data) obj = PDFStream(d, data)
self.push((pos, obj)) self.push((pos, obj))
self.push((pos, self.KEYWORD_EI)) self.push((pos, self.KEYWORD_EI))
except PSTypeError: except PSTypeError:
if STRICT: raise if STRICT:
raise
else: else:
self.push((pos, token)) self.push((pos, token))
return return
@ -312,7 +322,9 @@ class PDFPageInterpreter(object):
self.fontmap = {} self.fontmap = {}
self.xobjmap = {} self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy() self.csmap = PREDEFINED_COLORSPACE.copy()
if not resources: return if not resources:
return
def get_colorspace(spec): def get_colorspace(spec):
if isinstance(spec, list): if isinstance(spec, list):
name = literal_name(spec[0]) name = literal_name(spec[0])
@ -324,23 +336,23 @@ class PDFPageInterpreter(object):
return PDFColorSpace(name, len(list_value(spec[1]))) return PDFColorSpace(name, len(list_value(spec[1])))
else: else:
return PREDEFINED_COLORSPACE.get(name) return PREDEFINED_COLORSPACE.get(name)
for (k,v) in dict_value(resources).iteritems(): for (k, v) in dict_value(resources).iteritems():
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'Resource: %r: %r' % (k,v) print >>sys.stderr, 'Resource: %r: %r' % (k, v)
if k == 'Font': if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems(): for (fontid, spec) in dict_value(v).iteritems():
objid = None objid = None
if isinstance(spec, PDFObjRef): if isinstance(spec, PDFObjRef):
objid = spec.objid objid = spec.objid
spec = dict_value(spec) spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace': elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems(): for (csid, spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec)) self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet': elif k == 'ProcSet':
self.rsrcmgr.get_procset(list_value(v)) self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject': elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems(): for (xobjid, xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm self.xobjmap[xobjid] = xobjstrm
return return
@ -367,7 +379,8 @@ class PDFPageInterpreter(object):
return return
def pop(self, n): def pop(self, n):
if n == 0: return [] if n == 0:
return []
x = self.argstack[-n:] x = self.argstack[-n:]
self.argstack = self.argstack[:-n] self.argstack = self.argstack[:-n]
return x return x
@ -384,6 +397,7 @@ class PDFPageInterpreter(object):
def do_q(self): def do_q(self):
self.gstack.append(self.get_current_state()) self.gstack.append(self.get_current_state())
return return
# grestore # grestore
def do_Q(self): def do_Q(self):
if self.gstack: if self.gstack:
@ -392,7 +406,7 @@ class PDFPageInterpreter(object):
# concat-matrix # concat-matrix
def do_cm(self, a1, b1, c1, d1, e1, f1): def do_cm(self, a1, b1, c1, d1, e1, f1):
self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm) self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
self.device.set_ctm(self.ctm) self.device.set_ctm(self.ctm)
return return
@ -400,30 +414,37 @@ class PDFPageInterpreter(object):
def do_w(self, linewidth): def do_w(self, linewidth):
self.graphicstate.linewidth = linewidth self.graphicstate.linewidth = linewidth
return return
# setlinecap # setlinecap
def do_J(self, linecap): def do_J(self, linecap):
self.graphicstate.linecap = linecap self.graphicstate.linecap = linecap
return return
# setlinejoin # setlinejoin
def do_j(self, linejoin): def do_j(self, linejoin):
self.graphicstate.linejoin = linejoin self.graphicstate.linejoin = linejoin
return return
# setmiterlimit # setmiterlimit
def do_M(self, miterlimit): def do_M(self, miterlimit):
self.graphicstate.miterlimit = miterlimit self.graphicstate.miterlimit = miterlimit
return return
# setdash # setdash
def do_d(self, dash, phase): def do_d(self, dash, phase):
self.graphicstate.dash = (dash, phase) self.graphicstate.dash = (dash, phase)
return return
# setintent # setintent
def do_ri(self, intent): def do_ri(self, intent):
self.graphicstate.intent = intent self.graphicstate.intent = intent
return return
# setflatness # setflatness
def do_i(self, flatness): def do_i(self, flatness):
self.graphicstate.flatness = flatness self.graphicstate.flatness = flatness
return return
# load-gstate # load-gstate
def do_gs(self, name): def do_gs(self, name):
#XXX #XXX
@ -431,34 +452,40 @@ class PDFPageInterpreter(object):
# moveto # moveto
def do_m(self, x, y): def do_m(self, x, y):
self.curpath.append(('m',x,y)) self.curpath.append(('m', x, y))
return return
# lineto # lineto
def do_l(self, x, y): def do_l(self, x, y):
self.curpath.append(('l',x,y)) self.curpath.append(('l', x, y))
return return
# curveto # curveto
def do_c(self, x1, y1, x2, y2, x3, y3): def do_c(self, x1, y1, x2, y2, x3, y3):
self.curpath.append(('c',x1,y1,x2,y2,x3,y3)) self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
return return
# urveto # urveto
def do_v(self, x2, y2, x3, y3): def do_v(self, x2, y2, x3, y3):
self.curpath.append(('v',x2,y2,x3,y3)) self.curpath.append(('v', x2, y2, x3, y3))
return return
# rveto # rveto
def do_y(self, x1, y1, x3, y3): def do_y(self, x1, y1, x3, y3):
self.curpath.append(('y',x1,y1,x3,y3)) self.curpath.append(('y', x1, y1, x3, y3))
return return
# closepath # closepath
def do_h(self): def do_h(self):
self.curpath.append(('h',)) self.curpath.append(('h',))
return return
# rectangle # rectangle
def do_re(self, x, y, w, h): def do_re(self, x, y, w, h):
self.curpath.append(('m',x,y)) self.curpath.append(('m', x, y))
self.curpath.append(('l',x+w,y)) self.curpath.append(('l', x+w, y))
self.curpath.append(('l',x+w,y+h)) self.curpath.append(('l', x+w, y+h))
self.curpath.append(('l',x,y+h)) self.curpath.append(('l', x, y+h))
self.curpath.append(('h',)) self.curpath.append(('h',))
return return
@ -467,11 +494,13 @@ class PDFPageInterpreter(object):
self.device.paint_path(self.graphicstate, True, False, False, self.curpath) self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = [] self.curpath = []
return return
# close-and-stroke # close-and-stroke
def do_s(self): def do_s(self):
self.do_h() self.do_h()
self.do_S() self.do_S()
return return
# fill # fill
def do_f(self): def do_f(self):
self.device.paint_path(self.graphicstate, False, True, False, self.curpath) self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
@ -479,68 +508,85 @@ class PDFPageInterpreter(object):
return return
# fill (obsolete) # fill (obsolete)
do_F = do_f do_F = do_f
# fill-even-odd # fill-even-odd
def do_f_a(self): def do_f_a(self):
self.device.paint_path(self.graphicstate, False, True, True, self.curpath) self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = [] self.curpath = []
return return
# fill-and-stroke # fill-and-stroke
def do_B(self): def do_B(self):
self.device.paint_path(self.graphicstate, True, True, False, self.curpath) self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = [] self.curpath = []
return return
# fill-and-stroke-even-odd # fill-and-stroke-even-odd
def do_B_a(self): def do_B_a(self):
self.device.paint_path(self.graphicstate, True, True, True, self.curpath) self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = [] self.curpath = []
return return
# close-fill-and-stroke # close-fill-and-stroke
def do_b(self): def do_b(self):
self.do_h() self.do_h()
self.do_B() self.do_B()
return return
# close-fill-and-stroke-even-odd # close-fill-and-stroke-even-odd
def do_b_a(self): def do_b_a(self):
self.do_h() self.do_h()
self.do_B_a() self.do_B_a()
return return
# close-only # close-only
def do_n(self): def do_n(self):
self.curpath = [] self.curpath = []
return return
# clip # clip
def do_W(self): return def do_W(self):
return
# clip-even-odd # clip-even-odd
def do_W_a(self): return def do_W_a(self):
return
# setcolorspace-stroking # setcolorspace-stroking
def do_CS(self, name): def do_CS(self, name):
self.scs = self.csmap[literal_name(name)] self.scs = self.csmap[literal_name(name)]
return return
# setcolorspace-non-strokine # setcolorspace-non-strokine
def do_cs(self, name): def do_cs(self, name):
self.ncs = self.csmap[literal_name(name)] self.ncs = self.csmap[literal_name(name)]
return return
# setgray-stroking # setgray-stroking
def do_G(self, gray): def do_G(self, gray):
#self.do_CS(LITERAL_DEVICE_GRAY) #self.do_CS(LITERAL_DEVICE_GRAY)
return return
# setgray-non-stroking # setgray-non-stroking
def do_g(self, gray): def do_g(self, gray):
#self.do_cs(LITERAL_DEVICE_GRAY) #self.do_cs(LITERAL_DEVICE_GRAY)
return return
# setrgb-stroking # setrgb-stroking
def do_RG(self, r, g, b): def do_RG(self, r, g, b):
#self.do_CS(LITERAL_DEVICE_RGB) #self.do_CS(LITERAL_DEVICE_RGB)
return return
# setrgb-non-stroking # setrgb-non-stroking
def do_rg(self, r, g, b): def do_rg(self, r, g, b):
#self.do_cs(LITERAL_DEVICE_RGB) #self.do_cs(LITERAL_DEVICE_RGB)
return return
# setcmyk-stroking # setcmyk-stroking
def do_K(self, c, m, y, k): def do_K(self, c, m, y, k):
#self.do_CS(LITERAL_DEVICE_CMYK) #self.do_CS(LITERAL_DEVICE_CMYK)
return return
# setcmyk-non-stroking # setcmyk-non-stroking
def do_k(self, c, m, y, k): def do_k(self, c, m, y, k):
#self.do_cs(LITERAL_DEVICE_CMYK) #self.do_cs(LITERAL_DEVICE_CMYK)
@ -556,6 +602,7 @@ class PDFPageInterpreter(object):
n = 1 n = 1
self.pop(n) self.pop(n)
return return
def do_scn(self): def do_scn(self):
if self.ncs: if self.ncs:
n = self.ncs.ncomponents n = self.ncs.ncomponents
@ -565,42 +612,53 @@ class PDFPageInterpreter(object):
n = 1 n = 1
self.pop(n) self.pop(n)
return return
def do_SC(self): def do_SC(self):
self.do_SCN() self.do_SCN()
return return
def do_sc(self): def do_sc(self):
self.do_scn() self.do_scn()
return return
# sharing-name # sharing-name
def do_sh(self, name): return def do_sh(self, name):
return
# begin-text # begin-text
def do_BT(self): def do_BT(self):
self.textstate.reset() self.textstate.reset()
return return
# end-text # end-text
def do_ET(self): def do_ET(self):
return return
# begin-compat # begin-compat
def do_BX(self): return def do_BX(self):
return
# end-compat # end-compat
def do_EX(self): return def do_EX(self):
return
# marked content operators # marked content operators
def do_MP(self, tag): def do_MP(self, tag):
self.device.do_tag(tag) self.device.do_tag(tag)
return return
def do_DP(self, tag, props): def do_DP(self, tag, props):
self.device.do_tag(tag, props) self.device.do_tag(tag, props)
return return
def do_BMC(self, tag): def do_BMC(self, tag):
self.device.begin_tag(tag) self.device.begin_tag(tag)
return return
def do_BDC(self, tag, props): def do_BDC(self, tag, props):
self.device.begin_tag(tag, props) self.device.begin_tag(tag, props)
return return
def do_EMC(self): def do_EMC(self):
self.device.end_tag() self.device.end_tag()
return return
@ -609,18 +667,22 @@ class PDFPageInterpreter(object):
def do_Tc(self, space): def do_Tc(self, space):
self.textstate.charspace = space self.textstate.charspace = space
return return
# setwordspace # setwordspace
def do_Tw(self, space): def do_Tw(self, space):
self.textstate.wordspace = space self.textstate.wordspace = space
return return
# textscale # textscale
def do_Tz(self, scale): def do_Tz(self, scale):
self.textstate.scaling = scale self.textstate.scaling = scale
return return
# setleading # setleading
def do_TL(self, leading): def do_TL(self, leading):
self.textstate.leading = -leading self.textstate.leading = -leading
return return
# selectfont # selectfont
def do_Tf(self, fontid, fontsize): def do_Tf(self, fontid, fontsize):
try: try:
@ -631,10 +693,12 @@ class PDFPageInterpreter(object):
self.textstate.font = self.rsrcmgr.get_font(None, {}) self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize self.textstate.fontsize = fontsize
return return
# setrendering # setrendering
def do_Tr(self, render): def do_Tr(self, render):
self.textstate.render = render self.textstate.render = render
return return
# settextrise # settextrise
def do_Ts(self, rise): def do_Ts(self, rise):
self.textstate.rise = rise self.textstate.rise = rise
@ -642,49 +706,55 @@ class PDFPageInterpreter(object):
# text-move # text-move
def do_Td(self, tx, ty): def do_Td(self, tx, ty):
(a,b,c,d,e,f) = self.textstate.matrix (a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f) self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate) #print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
return return
# text-move # text-move
def do_TD(self, tx, ty): def do_TD(self, tx, ty):
(a,b,c,d,e,f) = self.textstate.matrix (a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f) self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.leading = ty self.textstate.leading = ty
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate) #print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
return return
# textmatrix # textmatrix
def do_Tm(self, a,b,c,d,e,f): def do_Tm(self, a, b, c, d, e, f):
self.textstate.matrix = (a,b,c,d,e,f) self.textstate.matrix = (a, b, c, d, e, f)
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
return return
# nextline # nextline
def do_T_a(self): def do_T_a(self):
(a,b,c,d,e,f) = self.textstate.matrix (a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f) self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
self.textstate.linematrix = (0, 0) self.textstate.linematrix = (0, 0)
return return
# show-pos # show-pos
def do_TJ(self, seq): def do_TJ(self, seq):
#print >>sys.stderr, 'TJ(%r): %r' % (seq,self.textstate) #print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
if self.textstate.font is None: if self.textstate.font is None:
if STRICT: if STRICT:
raise PDFInterpreterError('No font specified!') raise PDFInterpreterError('No font specified!')
return return
self.device.render_string(self.textstate, seq) self.device.render_string(self.textstate, seq)
return return
# show # show
def do_Tj(self, s): def do_Tj(self, s):
self.do_TJ([s]) self.do_TJ([s])
return return
# quote # quote
def do__q(self, s): def do__q(self, s):
self.do_T_a() self.do_T_a()
self.do_TJ([s]) self.do_TJ([s])
return return
# doublequote # doublequote
def do__w(self, aw, ac, s): def do__w(self, aw, ac, s):
self.do_Tw(aw) self.do_Tw(aw)
@ -693,14 +763,16 @@ class PDFPageInterpreter(object):
return return
# inline image # inline image
def do_BI(self): # never called def do_BI(self): # never called
return return
def do_ID(self): # never called
def do_ID(self): # never called
return return
def do_EI(self, obj): def do_EI(self, obj):
if 'W' in obj and 'H' in obj: if 'W' in obj and 'H' in obj:
iobjid = str(id(obj)) iobjid = str(id(obj))
self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(iobjid, obj) self.device.render_image(iobjid, obj)
self.device.end_figure(iobjid) self.device.end_figure(iobjid)
return return
@ -721,7 +793,7 @@ class PDFPageInterpreter(object):
interpreter = self.dup() interpreter = self.dup()
bbox = list_value(xobj['BBox']) bbox = list_value(xobj['BBox'])
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY)) matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
# According to PDF reference 1.7 section 4.9.1, XObjects in # According to PDF reference 1.7 section 4.9.1, XObjects in
# earlier PDFs (prior to v1.2) use the page's Resources entry # earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry. # instead of having their own Resources entry.
resources = dict_value(xobj.get('Resources')) or self.resources.copy() resources = dict_value(xobj.get('Resources')) or self.resources.copy()
@ -729,7 +801,7 @@ class PDFPageInterpreter(object):
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm)) interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj: elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(xobjid, xobj) self.device.render_image(xobjid, xobj)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
else: else:
@ -740,15 +812,15 @@ class PDFPageInterpreter(object):
def process_page(self, page): def process_page(self, page):
if 1 <= self.debug: if 1 <= self.debug:
print >>sys.stderr, 'Processing page: %r' % page print >>sys.stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
if page.rotate == 90: if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1) ctm = (0, -1, 1, 0, -y0, x1)
elif page.rotate == 180: elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1) ctm = (-1, 0, 0, -1, x1, y1)
elif page.rotate == 270: elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0) ctm = (0, 1, -1, 0, y1, -x0)
else: else:
ctm = (1,0,0,1, -x0,-y0) ctm = (1, 0, 0, 1, -x0, -y0)
self.device.begin_page(page, ctm) self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm) self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page) self.device.end_page(page)
@ -760,7 +832,7 @@ class PDFPageInterpreter(object):
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
if 1 <= self.debug: if 1 <= self.debug:
print >>sys.stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' % print >>sys.stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
(resources, streams, ctm)) (resources, streams, ctm))
self.init_resources(resources) self.init_resources(resources)
self.init_state(ctm) self.init_state(ctm)
self.execute(list_value(streams)) self.execute(list_value(streams))
@ -774,12 +846,12 @@ class PDFPageInterpreter(object):
return return
while 1: while 1:
try: try:
(_,obj) = parser.nextobject() (_, obj) = parser.nextobject()
except PSEOF: except PSEOF:
break break
if isinstance(obj, PSKeyword): if isinstance(obj, PSKeyword):
name = keyword_name(obj) name = keyword_name(obj)
method = 'do_%s' % name.replace('*','_a').replace('"','_w').replace("'",'_q') method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
if hasattr(self, method): if hasattr(self, method):
func = getattr(self, method) func = getattr(self, method)
nargs = func.func_code.co_argcount-1 nargs = func.func_code.co_argcount-1

View File

@ -39,7 +39,7 @@ class PDFPage(object):
def __init__(self, doc, pageid, attrs): def __init__(self, doc, pageid, attrs):
"""Initialize a page object. """Initialize a page object.
doc: a PDFDocument object. doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page. pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes. attrs: a dictionary of page attributes.
@ -62,7 +62,7 @@ class PDFPage(object):
else: else:
contents = [] contents = []
if not isinstance(contents, list): if not isinstance(contents, list):
contents = [ contents ] contents = [contents]
self.contents = contents self.contents = contents
return return
@ -70,6 +70,7 @@ class PDFPage(object):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox) return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod @classmethod
def create_pages(klass, document, debug=0): def create_pages(klass, document, debug=0):
def search(obj, parent): def search(obj, parent):
@ -79,7 +80,7 @@ class PDFPage(object):
else: else:
objid = obj.objid objid = obj.objid
tree = dict_value(obj).copy() tree = dict_value(obj).copy()
for (k,v) in parent.iteritems(): for (k, v) in parent.iteritems():
if k in klass.INHERITABLE_ATTRS and k not in tree: if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
@ -94,7 +95,7 @@ class PDFPage(object):
yield (objid, tree) yield (objid, tree)
pages = False pages = False
if 'Pages' in document.catalog: if 'Pages' in document.catalog:
for (objid,tree) in search(document.catalog['Pages'], document.catalog): for (objid, tree) in search(document.catalog['Pages'], document.catalog):
yield klass(document, objid, tree) yield klass(document, objid, tree)
pages = True pages = True
if not pages: if not pages:
@ -109,7 +110,8 @@ class PDFPage(object):
pass pass
return return
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
@classmethod @classmethod
def get_pages(klass, fp, def get_pages(klass, fp,
@ -126,8 +128,10 @@ class PDFPage(object):
if check_extractable and not doc.is_extractable: if check_extractable and not doc.is_extractable:
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp) raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# Process each page contained in the document. # Process each page contained in the document.
for (pageno,page) in enumerate(klass.create_pages(doc)): for (pageno, page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos): continue if pagenos and (pageno not in pagenos):
continue
yield page yield page
if maxpages and maxpages <= pageno+1: break if maxpages and maxpages <= pageno+1:
break
return return

View File

@ -15,7 +15,8 @@ from pdftypes import dict_value
## Exceptions ## Exceptions
## ##
class PDFSyntaxError(PDFException): pass class PDFSyntaxError(PDFException):
pass
## PDFParser ## PDFParser
@ -35,7 +36,7 @@ class PDFParser(PSStackParser):
parser.set_document(doc) parser.set_document(doc)
parser.seek(offset) parser.seek(offset)
parser.nextobject() parser.nextobject()
""" """
def __init__(self, fp): def __init__(self, fp):
@ -55,12 +56,13 @@ class PDFParser(PSStackParser):
KEYWORD_STREAM = KWD('stream') KEYWORD_STREAM = KWD('stream')
KEYWORD_XREF = KWD('xref') KEYWORD_XREF = KWD('xref')
KEYWORD_STARTXREF = KWD('startxref') KEYWORD_STARTXREF = KWD('startxref')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
"""Handles PDF-related keywords.""" """Handles PDF-related keywords."""
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
self.add_results(*self.pop(1)) self.add_results(*self.pop(1))
elif token is self.KEYWORD_ENDOBJ: elif token is self.KEYWORD_ENDOBJ:
self.add_results(*self.pop(4)) self.add_results(*self.pop(4))
@ -71,7 +73,7 @@ class PDFParser(PSStackParser):
elif token is self.KEYWORD_R: elif token is self.KEYWORD_R:
# reference to indirect object # reference to indirect object
try: try:
((_,objid), (_,genno)) = self.pop(2) ((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno)) (objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno) obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj)) self.push((pos, obj))
@ -80,7 +82,7 @@ class PDFParser(PSStackParser):
elif token is self.KEYWORD_STREAM: elif token is self.KEYWORD_STREAM:
# stream object # stream object
((_,dic),) = self.pop(1) ((_, dic),) = self.pop(1)
dic = dict_value(dic) dic = dict_value(dic)
objlen = 0 objlen = 0
if not self.fallback: if not self.fallback:
@ -118,14 +120,14 @@ class PDFParser(PSStackParser):
# XXX limit objlen not to exceed object boundary # XXX limit objlen not to exceed object boundary
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
(pos, objlen, dic, data[:10]) (pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher) obj = PDFStream(dic, data, self.doc.decipher)
self.push((pos, obj)) self.push((pos, obj))
else: else:
# others # others
self.push((pos, token)) self.push((pos, token))
return return
@ -153,7 +155,7 @@ class PDFStreamParser(PDFParser):
if token is self.KEYWORD_R: if token is self.KEYWORD_R:
# reference to indirect object # reference to indirect object
try: try:
((_,objid), (_,genno)) = self.pop(2) ((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno)) (objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno) obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj)) self.push((pos, obj))

View File

@ -22,13 +22,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
## PDF Objects ## PDF Objects
## ##
class PDFObject(PSObject): pass class PDFObject(PSObject):
pass
class PDFException(PSException): pass
class PDFTypeError(PDFException): pass class PDFException(PSException):
class PDFValueError(PDFException): pass pass
class PDFObjectNotFound(PDFException): pass
class PDFNotImplementedError(PDFException): pass
class PDFTypeError(PDFException):
pass
class PDFValueError(PDFException):
pass
class PDFObjectNotFound(PDFException):
pass
class PDFNotImplementedError(PDFException):
pass
## PDFObjRef ## PDFObjRef
@ -65,33 +80,36 @@ def resolve1(x, default=None):
x = x.resolve(default=default) x = x.resolve(default=default)
return x return x
def resolve_all(x, default=None): def resolve_all(x, default=None):
"""Recursively resolves the given object and all the internals. """Recursively resolves the given object and all the internals.
Make sure there is no indirect reference within the nested object. Make sure there is no indirect reference within the nested object.
This procedure might be slow. This procedure might be slow.
""" """
while isinstance(x, PDFObjRef): while isinstance(x, PDFObjRef):
x = x.resolve(default=default) x = x.resolve(default=default)
if isinstance(x, list): if isinstance(x, list):
x = [ resolve_all(v, default=default) for v in x ] x = [resolve_all(v, default=default) for v in x]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k, v) in x.iteritems():
x[k] = resolve_all(v, default=default) x[k] = resolve_all(v, default=default)
return x return x
def decipher_all(decipher, objid, genno, x): def decipher_all(decipher, objid, genno, x):
"""Recursively deciphers the given object. """Recursively deciphers the given object.
""" """
if isinstance(x, str): if isinstance(x, str):
return decipher(objid, genno, x) return decipher(objid, genno, x)
if isinstance(x, list): if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ] x = [decipher_all(decipher, objid, genno, v) for v in x]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k,v) in x.iteritems(): for (k, v) in x.iteritems():
x[k] = decipher_all(decipher, objid, genno, v) x[k] = decipher_all(decipher, objid, genno, v)
return x return x
# Type cheking # Type cheking
def int_value(x): def int_value(x):
x = resolve1(x) x = resolve1(x)
@ -101,6 +119,7 @@ def int_value(x):
return 0 return 0
return x return x
def float_value(x): def float_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, float): if not isinstance(x, float):
@ -109,6 +128,7 @@ def float_value(x):
return 0.0 return 0.0
return x return x
def num_value(x): def num_value(x):
x = resolve1(x) x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)): if not (isinstance(x, int) or isinstance(x, float)):
@ -117,6 +137,7 @@ def num_value(x):
return 0 return 0
return x return x
def str_value(x): def str_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, str): if not isinstance(x, str):
@ -125,6 +146,7 @@ def str_value(x):
return '' return ''
return x return x
def list_value(x): def list_value(x):
x = resolve1(x) x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)): if not (isinstance(x, list) or isinstance(x, tuple)):
@ -133,6 +155,7 @@ def list_value(x):
return [] return []
return x return x
def dict_value(x): def dict_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, dict): if not isinstance(x, dict):
@ -141,6 +164,7 @@ def dict_value(x):
return {} return {}
return x return x
def stream_value(x): def stream_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, PDFStream): if not isinstance(x, PDFStream):
@ -179,13 +203,13 @@ class PDFStream(PDFObject):
def __contains__(self, name): def __contains__(self, name):
return name in self.attrs return name in self.attrs
def __getitem__(self, name): def __getitem__(self, name):
return self.attrs[name] return self.attrs[name]
def get(self, name, default=None): def get(self, name, default=None):
return self.attrs.get(name, default) return self.attrs.get(name, default)
def get_any(self, names, default=None): def get_any(self, names, default=None):
for name in names: for name in names:
if name in self.attrs: if name in self.attrs:
@ -194,12 +218,14 @@ class PDFStream(PDFObject):
def get_filters(self): def get_filters(self):
filters = self.get_any(('F', 'Filter')) filters = self.get_any(('F', 'Filter'))
if not filters: return [] if not filters:
if isinstance(filters, list): return filters return []
return [ filters ] if isinstance(filters, list):
return filters
return [filters]
def decode(self): def decode(self):
assert self.data is None and self.rawdata != None assert self.data is None and self.rawdata is not None
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher:
# Handle encryption # Handle encryption

View File

@ -8,11 +8,24 @@ STRICT = 0
## PS Exceptions ## PS Exceptions
## ##
class PSException(Exception): pass class PSException(Exception):
class PSEOF(PSException): pass pass
class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass
class PSValueError(PSException): pass class PSEOF(PSException):
pass
class PSSyntaxError(PSException):
pass
class PSTypeError(PSException):
pass
class PSValueError(PSException):
pass
## Basic PostScript Types ## Basic PostScript Types
@ -32,7 +45,7 @@ class PSObject(object):
class PSLiteral(PSObject): class PSLiteral(PSObject):
"""A class that represents a PostScript literal. """A class that represents a PostScript literal.
Postscript literals are used as identifiers, such as Postscript literals are used as identifiers, such as
variable names, property names and dictionary keys. variable names, property names and dictionary keys.
Literals are case sensitive and denoted by a preceding Literals are case sensitive and denoted by a preceding
@ -55,11 +68,11 @@ class PSLiteral(PSObject):
class PSKeyword(PSObject): class PSKeyword(PSObject):
"""A class that represents a PostScript keyword. """A class that represents a PostScript keyword.
PostScript keywords are a dozen of predefined words. PostScript keywords are a dozen of predefined words.
Commands and directives in PostScript are expressed by keywords. Commands and directives in PostScript are expressed by keywords.
They are also used to denote the content boundaries. They are also used to denote the content boundaries.
Note: Do not create an instance of PSKeyword directly. Note: Do not create an instance of PSKeyword directly.
Always use PSKeywordTable.intern(). Always use PSKeywordTable.intern().
""" """
@ -80,7 +93,7 @@ class PSSymbolTable(object):
Interned objects can be checked its identity with "is" operator. Interned objects can be checked its identity with "is" operator.
""" """
def __init__(self, klass): def __init__(self, klass):
self.dict = {} self.dict = {}
self.klass = klass self.klass = klass
@ -114,6 +127,7 @@ def literal_name(x):
return str(x) return str(x)
return x.name return x.name
def keyword_name(x): def keyword_name(x):
if not isinstance(x, PSKeyword): if not isinstance(x, PSKeyword):
if STRICT: if STRICT:
@ -136,7 +150,9 @@ END_NUMBER = re.compile(r'[^0-9]')
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]') END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]') OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
class PSBaseParser(object): class PSBaseParser(object):
"""Most basic PostScript parser that performs only tokenization. """Most basic PostScript parser that performs only tokenization.
@ -190,7 +206,8 @@ class PSBaseParser(object):
return return
def fillbuf(self): def fillbuf(self):
if self.charpos < len(self.buf): return if self.charpos < len(self.buf):
return
# fetch next chunk. # fetch next chunk.
self.bufpos = self.fp.tell() self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ) self.buf = self.fp.read(self.BUFSIZ)
@ -242,7 +259,8 @@ class PSBaseParser(object):
pos = max(0, pos-self.BUFSIZ) pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos) self.fp.seek(pos)
s = self.fp.read(prevpos-pos) s = self.fp.read(prevpos-pos)
if not s: break if not s:
break
while 1: while 1:
n = max(s.rfind('\r'), s.rfind('\n')) n = max(s.rfind('\r'), s.rfind('\n'))
if n == -1: if n == -1:
@ -357,7 +375,7 @@ class PSBaseParser(object):
pass pass
self._parse1 = self._parse_main self._parse1 = self._parse_main
return j return j
def _parse_float(self, s, i): def _parse_float(self, s, i):
m = END_NUMBER.search(s, i) m = END_NUMBER.search(s, i)
if not m: if not m:
@ -407,7 +425,7 @@ class PSBaseParser(object):
return j+1 return j+1
if c == ')': if c == ')':
self.paren -= 1 self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment. if self.paren: # WTF, they said balanced parens need no special treatment.
self._curtoken += c self._curtoken += c
return j+1 return j+1
self._add_token(self._curtoken) self._add_token(self._curtoken)
@ -493,17 +511,17 @@ class PSStackParser(PSBaseParser):
def push(self, *objs): def push(self, *objs):
self.curstack.extend(objs) self.curstack.extend(objs)
return return
def pop(self, n): def pop(self, n):
objs = self.curstack[-n:] objs = self.curstack[-n:]
self.curstack[-n:] = [] self.curstack[-n:] = []
return objs return objs
def popall(self): def popall(self):
objs = self.curstack objs = self.curstack
self.curstack = [] self.curstack = []
return objs return objs
def add_results(self, *objs): def add_results(self, *objs):
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'add_results: %r' % (objs,) print >>sys.stderr, 'add_results: %r' % (objs,)
@ -516,11 +534,11 @@ class PSStackParser(PSBaseParser):
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'start_type: pos=%r, type=%r' % (pos, type) print >>sys.stderr, 'start_type: pos=%r, type=%r' % (pos, type)
return return
def end_type(self, type): def end_type(self, type):
if self.curtype != type: if self.curtype != type:
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
objs = [ obj for (_,obj) in self.curstack ] objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop() (pos, self.curtype, self.curstack) = self.context.pop()
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs) print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
try: try:
self.push(self.end_type('a')) self.push(self.end_type('a'))
except PSTypeError: except PSTypeError:
if STRICT: raise if STRICT:
raise
elif token == KEYWORD_DICT_BEGIN: elif token == KEYWORD_DICT_BEGIN:
# begin dictionary # begin dictionary
self.start_type(pos, 'd') self.start_type(pos, 'd')
@ -564,10 +583,11 @@ class PSStackParser(PSBaseParser):
if len(objs) % 2 != 0: if len(objs) % 2 != 0:
raise PSSyntaxError('Invalid dictionary construct: %r' % objs) raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
# construct a Python dictionary. # construct a Python dictionary.
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None ) d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
self.push((pos, d)) self.push((pos, d))
except PSTypeError: except PSTypeError:
if STRICT: raise if STRICT:
raise
elif token == KEYWORD_PROC_BEGIN: elif token == KEYWORD_PROC_BEGIN:
# begin proc # begin proc
self.start_type(pos, 'p') self.start_type(pos, 'p')
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
try: try:
self.push(self.end_type('p')) self.push(self.end_type('p'))
except PSTypeError: except PSTypeError:
if STRICT: raise if STRICT:
raise
else: else:
if 2 <= self.debug: if 2 <= self.debug:
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \ print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
return obj return obj
import unittest
## Simplistic Test cases ## Simplistic Test cases
## ##
import unittest
class TestPSBaseParser(unittest.TestCase): class TestPSBaseParser(unittest.TestCase):
TESTDATA = r'''%!PS TESTDATA = r'''%!PS
@ -630,7 +653,7 @@ func/a/b{(c)do*}def
(242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')), (242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')),
(256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'), (256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'),
(272, KWD('>>')) (272, KWD('>>'))
] ]
OBJS = [ OBJS = [
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
@ -641,10 +664,11 @@ func/a/b{(c)do*}def
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'), (191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']), (230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
(258, {'foo': 'bar'}), (258, {'foo': 'bar'}),
] ]
def get_tokens(self, s): def get_tokens(self, s):
import StringIO import StringIO
class MyParser(PSBaseParser): class MyParser(PSBaseParser):
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
def get_objects(self, s): def get_objects(self, s):
import StringIO import StringIO
class MyParser(PSStackParser): class MyParser(PSStackParser):
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
self.assertEqual(objs, self.OBJS) self.assertEqual(objs, self.OBJS)
return return
if __name__ == '__main__': unittest.main() if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@ def rldecode(data):
'1234567777777abcde' '1234567777777abcde'
""" """
decoded = [] decoded = []
i=0 i = 0
while i < len(data): while i < len(data):
#print "data[%d]=:%d:" % (i,ord(data[i])) #print "data[%d]=:%d:" % (i,ord(data[i]))
length = ord(data[i]) length = ord(data[i])

View File

@ -32,13 +32,13 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
buf += chr(c) buf += chr(c)
elif pred == '\x02': elif pred == '\x02':
# PNG up # PNG up
for (a,b) in zip(line0,line1): for (a, b) in zip(line0, line1):
c = (ord(a)+ord(b)) & 255 c = (ord(a)+ord(b)) & 255
buf += chr(c) buf += chr(c)
elif pred == '\x03': elif pred == '\x03':
# PNG average (UNTESTED) # PNG average (UNTESTED)
c = 0 c = 0
for (a,b) in zip(line0,line1): for (a, b) in zip(line0, line1):
c = ((c+ord(a)+ord(b))/2) & 255 c = ((c+ord(a)+ord(b))/2) & 255
buf += chr(c) buf += chr(c)
else: else:
@ -52,21 +52,25 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
## ##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
"""Returns the multiplication of two matrices.""" """Returns the multiplication of two matrices."""
return (a0*a1+c0*b1, b0*a1+d0*b1, return (a0*a1+c0*b1, b0*a1+d0*b1,
a0*c1+c0*d1, b0*c1+d0*d1, a0*c1+c0*d1, b0*c1+d0*d1,
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix((a,b,c,d,e,f), (x,y)):
"""Translates a matrix by (x,y)."""
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
def apply_matrix_pt((a,b,c,d,e,f), (x,y)): def translate_matrix((a, b, c, d, e, f), (x, y)):
"""Translates a matrix by (x, y)."""
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
"""Applies a matrix to a point.""" """Applies a matrix to a point."""
return (a*x+c*y+e, b*x+d*y+f) return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))""" """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
return (a*p+c*q, b*p+d*q) return (a*p+c*q, b*p+d*q)
@ -79,17 +83,20 @@ def uniq(objs):
"""Eliminates duplicated elements.""" """Eliminates duplicated elements."""
done = set() done = set()
for obj in objs: for obj in objs:
if obj in done: continue if obj in done:
continue
done.add(obj) done.add(obj)
yield obj yield obj
return return
# csort # csort
def csort(objs, key=lambda x:x): def csort(objs, key=lambda x: x):
"""Order-preserving sorting function.""" """Order-preserving sorting function."""
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) ) idxs = dict((obj, i) for (i, obj) in enumerate(objs))
return sorted(objs, key=lambda obj: (key(obj), idxs[obj])) return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
# fsplit # fsplit
def fsplit(pred, objs): def fsplit(pred, objs):
"""Split a list into two classes according to the predicate.""" """Split a list into two classes according to the predicate."""
@ -100,7 +107,8 @@ def fsplit(pred, objs):
t.append(obj) t.append(obj)
else: else:
f.append(obj) f.append(obj)
return (t,f) return (t, f)
# drange # drange
def drange(v0, v1, d): def drange(v0, v1, d):
@ -108,16 +116,18 @@ def drange(v0, v1, d):
assert v0 < v1 assert v0 < v1
return xrange(int(v0)/d, int(v1+d)/d) return xrange(int(v0)/d, int(v1+d)/d)
# get_bound # get_bound
def get_bound(pts): def get_bound(pts):
"""Compute a minimal rectangle that covers all the points.""" """Compute a minimal rectangle that covers all the points."""
(x0, y0, x1, y1) = (INF, INF, -INF, -INF) (x0, y0, x1, y1) = (INF, INF, -INF, -INF)
for (x,y) in pts: for (x, y) in pts:
x0 = min(x0, x) x0 = min(x0, x)
y0 = min(y0, y) y0 = min(y0, y)
x1 = max(x1, x) x1 = max(x1, x)
y1 = max(y1, y) y1 = max(y1, y)
return (x0,y0,x1,y1) return (x0, y0, x1, y1)
# pick # pick
def pick(seq, func, maxobj=None): def pick(seq, func, maxobj=None):
@ -126,9 +136,10 @@ def pick(seq, func, maxobj=None):
for obj in seq: for obj in seq:
score = func(obj) score = func(obj)
if maxscore is None or maxscore < score: if maxscore is None or maxscore < score:
(maxscore,maxobj) = (score,obj) (maxscore, maxobj) = (score, obj)
return maxobj return maxobj
# choplist # choplist
def choplist(n, seq): def choplist(n, seq):
"""Groups every n elements of the list.""" """Groups every n elements of the list."""
@ -140,6 +151,7 @@ def choplist(n, seq):
r = [] r = []
return return
# nunpack # nunpack
def nunpack(s, default=0): def nunpack(s, default=0):
"""Unpacks 1 to 4 byte integers (big endian).""" """Unpacks 1 to 4 byte integers (big endian)."""
@ -157,59 +169,65 @@ def nunpack(s, default=0):
else: else:
raise TypeError('invalid length: %d' % l) raise TypeError('invalid length: %d' % l)
# decode_text # decode_text
PDFDocEncoding = ''.join( unichr(x) for x in ( PDFDocEncoding = ''.join(unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc, 0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018, 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160, 0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000, 0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
)) ))
def decode_text(s): def decode_text(s):
"""Decodes a PDFDocEncoding string to Unicode.""" """Decodes a PDFDocEncoding string to Unicode."""
if s.startswith('\xfe\xff'): if s.startswith('\xfe\xff'):
return unicode(s[2:], 'utf-16be', 'ignore') return unicode(s[2:], 'utf-16be', 'ignore')
else: else:
return ''.join( PDFDocEncoding[ord(c)] for c in s ) return ''.join(PDFDocEncoding[ord(c)] for c in s)
# enc # enc
def enc(x, codec='ascii'): def enc(x, codec='ascii'):
"""Encodes a string for SGML/XML/HTML""" """Encodes a string for SGML/XML/HTML"""
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;') x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
def bbox2str((x0,y0,x1,y1)):
def bbox2str((x0, y0, x1, y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
def matrix2str((a,b,c,d,e,f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a,b,c,d,e,f) def matrix2str((a, b, c, d, e, f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
## Plane ## Plane
@ -240,14 +258,14 @@ class Plane(object):
def __contains__(self, obj): def __contains__(self, obj):
return obj in self._objs return obj in self._objs
def _getrange(self, (x0,y0,x1,y1)): def _getrange(self, (x0, y0, x1, y1)):
x0 = max(self.x0, x0) x0 = max(self.x0, x0)
y0 = max(self.y0, y0) y0 = max(self.y0, y0)
x1 = min(self.x1, x1) x1 = min(self.x1, x1)
y1 = min(self.y1, y1) y1 = min(self.y1, y1)
for y in drange(y0, y1, self.gridsize): for y in drange(y0, y1, self.gridsize):
for x in drange(x0, x1, self.gridsize): for x in drange(x0, x1, self.gridsize):
yield (x,y) yield (x, y)
return return
# extend(objs) # extend(objs)
@ -255,7 +273,7 @@ class Plane(object):
for obj in objs: for obj in objs:
self.add(obj) self.add(obj)
return return
# add(obj): place an object. # add(obj): place an object.
def add(self, obj): def add(self, obj):
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)): for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
@ -279,14 +297,17 @@ class Plane(object):
return return
# find(): finds objects that are in a certain area. # find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)): def find(self, (x0, y0, x1, y1)):
done = set() done = set()
for k in self._getrange((x0,y0,x1,y1)): for k in self._getrange((x0, y0, x1, y1)):
if k not in self._grid: continue if k not in self._grid:
continue
for obj in self._grid[k]: for obj in self._grid[k]:
if obj in done: continue if obj in done:
continue
done.add(obj) done.add(obj)
if (obj.x1 <= x0 or x1 <= obj.x0 or if (obj.x1 <= x0 or x1 <= obj.x0 or
obj.y1 <= y0 or y1 <= obj.y0): continue obj.y1 <= y0 or y1 <= obj.y0):
continue
yield obj yield obj
return return

View File

@ -7,9 +7,9 @@ setup(
version=__version__, version=__version__,
description='PDF parser and analyzer', description='PDF parser and analyzer',
long_description='''PDFMiner is a tool for extracting information from PDF documents. long_description='''PDFMiner is a tool for extracting information from PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting Unlike other PDF-related tools, it focuses entirely on getting
and analyzing text data. PDFMiner allows to obtain and analyzing text data. PDFMiner allows to obtain
the exact location of texts in a page, as well as the exact location of texts in a page, as well as
other information such as fonts or lines. other information such as fonts or lines.
It includes a PDF converter that can transform PDF files It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible into other text formats (such as HTML). It has an extensible

View File

@ -50,7 +50,7 @@ class CMapConverter(object):
assert values[0] == 'CID' assert values[0] == 'CID'
encs = values encs = values
continue continue
def put(dmap, code, cid, force=False): def put(dmap, code, cid, force=False):
for b in code[:-1]: for b in code[:-1]:
b = ord(b) b = ord(b)
@ -64,7 +64,7 @@ class CMapConverter(object):
if force or ((b not in dmap) or dmap[b] == cid): if force or ((b not in dmap) or dmap[b] == cid):
dmap[b] = cid dmap[b] = cid
return return
def add(unimap, enc, code): def add(unimap, enc, code):
try: try:
codec = self.enc2codec[enc] codec = self.enc2codec[enc]
@ -78,20 +78,20 @@ class CMapConverter(object):
except UnicodeError: except UnicodeError:
pass pass
return return
def pick(unimap): def pick(unimap):
chars = unimap.items() chars = unimap.items()
chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True) chars.sort(key=(lambda (c,n):(n,-ord(c))), reverse=True)
(c,_) = chars[0] (c,_) = chars[0]
return c return c
cid = int(values[0]) cid = int(values[0])
unimap_h = {} unimap_h = {}
unimap_v = {} unimap_v = {}
for (enc,value) in zip(encs, values): for (enc,value) in zip(encs, values):
if enc == 'CID': continue if enc == 'CID': continue
if value == '*': continue if value == '*': continue
# hcodes, vcodes: encoded bytes for each writing mode. # hcodes, vcodes: encoded bytes for each writing mode.
hcodes = [] hcodes = []
vcodes = [] vcodes = []
@ -121,7 +121,7 @@ class CMapConverter(object):
for code in hcodes: for code in hcodes:
put(hmap, code, cid) put(hmap, code, cid)
put(vmap, code, cid) put(vmap, code, cid)
# Determine the "most popular" candidate. # Determine the "most popular" candidate.
if unimap_h: if unimap_h:
self.cid2unichr_h[cid] = pick(unimap_h) self.cid2unichr_h[cid] = pick(unimap_h)
@ -137,7 +137,7 @@ class CMapConverter(object):
) )
fp.write(pickle.dumps(data)) fp.write(pickle.dumps(data))
return return
def dump_unicodemap(self, fp): def dump_unicodemap(self, fp):
data = dict( data = dict(
CID2UNICHR_H=self.cid2unichr_h, CID2UNICHR_H=self.cid2unichr_h,
@ -151,7 +151,7 @@ def main(argv):
import getopt import getopt
import gzip import gzip
import os.path import os.path
def usage(): def usage():
print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0] print 'usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0]
return 100 return 100

View File

@ -25,7 +25,7 @@ def dumpxml(out, obj, codec=None):
if obj is None: if obj is None:
out.write('<null />') out.write('<null />')
return return
if isinstance(obj, dict): if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj)) out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems(): for (k,v) in obj.iteritems():
@ -179,7 +179,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
out.write(fileobj.get_data()) out.write(fileobj.get_data())
out.close() out.close()
return return
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser) doc = PDFDocument(parser)

View File

@ -14,7 +14,7 @@ This is an in-house mapping table for some Latin-1 characters
LATIN2ASCII = { LATIN2ASCII = {
#0x00a0: '', #0x00a0: '',
#0x00a7: '', #0x00a7: '',
# iso-8859-1 # iso-8859-1
0x00c0: 'A`', 0x00c0: 'A`',
0x00c1: "A'", 0x00c1: "A'",

View File

@ -159,7 +159,7 @@ class WebApp(object):
def convert(self): def convert(self):
self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ) self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
if (self.method != 'POST' or if (self.method != 'POST' or
'c' not in self.form or 'c' not in self.form or
'f' not in self.form): 'f' not in self.form):
self.response_200() self.response_200()