pull/1/head
Yusuke Shinyama 2013-11-07 19:50:41 +09:00
commit 2b56b2eedf
28 changed files with 1484 additions and 1216 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
__version__ = '20131022'
if __name__ == '__main__': print __version__
if __name__ == '__main__':
print __version__

View File

@ -6,6 +6,7 @@ This code is in the public domain.
"""
## Arcfour
##
class Arcfour(object):

View File

@ -9,6 +9,7 @@ This code is in the public domain.
import re
import struct
# ascii85decode(data)
def ascii85decode(data):
"""
@ -16,13 +17,13 @@ def ascii85decode(data):
letters, using 85 different types of characters (as 256**4 < 85**5).
When the length of the original bytes is not a multiple of 4, a special
rule is used for round up.
Adobe's ASCII85 implementation is slightly different from
its original in handling the last characters.
The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85
>>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
'Man is distinguished'
>>> ascii85decode('E,9)oF*2M7/c~>')
@ -35,7 +36,7 @@ def ascii85decode(data):
n += 1
b = b*85+(ord(c)-33)
if n == 5:
out += struct.pack('>L',b)
out += struct.pack('>L', b)
n = b = 0
elif c == 'z':
assert n == 0
@ -44,13 +45,15 @@ def ascii85decode(data):
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack('>L',b)[:n-1]
out += struct.pack('>L', b)[:n-1]
break
return out
# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
@ -60,7 +63,7 @@ def asciihexdecode(data):
EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
>>> asciihexdecode('61 62 2e6364 65')
'ab.cde'
>>> asciihexdecode('61 62 2e6364 657>')

View File

@ -29,7 +29,7 @@ class BitParser(object):
for i in xrange(len(bits)):
if 0 < i:
if p[b] is None:
p[b] = [None,None]
p[b] = [None, None]
p = p[b]
if bits[i] == '1':
b = 1
@ -41,7 +41,7 @@ class BitParser(object):
def feedbytes(self, data):
for c in data:
b = ord(c)
for m in (128,64,32,16,8,4,2,1):
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m)
return
@ -62,7 +62,7 @@ class BitParser(object):
##
class CCITTG4Parser(BitParser):
MODE = [None,None]
MODE = [None, None]
BitParser.add(MODE, 0, '1')
BitParser.add(MODE, +1, '011')
BitParser.add(MODE, -1, '010')
@ -82,7 +82,7 @@ class CCITTG4Parser(BitParser):
BitParser.add(MODE, 'x7', '0000001110')
BitParser.add(MODE, 'e', '000000000001000000000001')
WHITE = [None,None]
WHITE = [None, None]
BitParser.add(WHITE, 0 , '00110101')
BitParser.add(WHITE, 1 , '000111')
BitParser.add(WHITE, 2 , '0111')
@ -188,7 +188,7 @@ class CCITTG4Parser(BitParser):
BitParser.add(WHITE, 2496, '000000011110')
BitParser.add(WHITE, 2560, '000000011111')
BLACK = [None,None]
BLACK = [None, None]
BitParser.add(BLACK, 0 , '0000110111')
BitParser.add(BLACK, 1 , '010')
BitParser.add(BLACK, 2 , '11')
@ -294,25 +294,30 @@ class CCITTG4Parser(BitParser):
BitParser.add(BLACK, 2496, '000000011110')
BitParser.add(BLACK, 2560, '000000011111')
UNCOMPRESSED = [None,None]
BitParser.add(UNCOMPRESSED, '1' , '1')
BitParser.add(UNCOMPRESSED, '01' , '01')
BitParser.add(UNCOMPRESSED, '001' , '001')
BitParser.add(UNCOMPRESSED, '0001' , '0001')
BitParser.add(UNCOMPRESSED, '00001' , '00001')
BitParser.add(UNCOMPRESSED, '00000' , '000001')
BitParser.add(UNCOMPRESSED, 'T00' , '00000011')
BitParser.add(UNCOMPRESSED, 'T10' , '00000010')
BitParser.add(UNCOMPRESSED, 'T000' , '000000011')
BitParser.add(UNCOMPRESSED, 'T100' , '000000010')
BitParser.add(UNCOMPRESSED, 'T0000' , '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000' , '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000' , '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000' , '00000000010')
class EOFB(Exception): pass
class InvalidData(Exception): pass
class ByteSkip(Exception): pass
UNCOMPRESSED = [None, None]
BitParser.add(UNCOMPRESSED, '1', '1')
BitParser.add(UNCOMPRESSED, '01', '01')
BitParser.add(UNCOMPRESSED, '001', '001')
BitParser.add(UNCOMPRESSED, '0001', '0001')
BitParser.add(UNCOMPRESSED, '00001', '00001')
BitParser.add(UNCOMPRESSED, '00000', '000001')
BitParser.add(UNCOMPRESSED, 'T00', '00000011')
BitParser.add(UNCOMPRESSED, 'T10', '00000010')
BitParser.add(UNCOMPRESSED, 'T000', '000000011')
BitParser.add(UNCOMPRESSED, 'T100', '000000010')
BitParser.add(UNCOMPRESSED, 'T0000', '0000000011')
BitParser.add(UNCOMPRESSED, 'T1000', '0000000010')
BitParser.add(UNCOMPRESSED, 'T00000', '00000000011')
BitParser.add(UNCOMPRESSED, 'T10000', '00000000010')
class EOFB(Exception):
pass
class InvalidData(Exception):
pass
class ByteSkip(Exception):
pass
def __init__(self, width, bytealign=False):
BitParser.__init__(self)
@ -325,7 +330,7 @@ class CCITTG4Parser(BitParser):
for c in data:
b = ord(c)
try:
for m in (128,64,32,16,8,4,2,1):
for m in (128, 64, 32, 16, 8, 4, 2, 1):
self._parse_bit(b & m)
except self.ByteSkip:
self._accept = self._parse_mode
@ -359,7 +364,8 @@ class CCITTG4Parser(BitParser):
raise self.InvalidData(mode)
def _parse_horiz1(self, n):
if n is None: raise self.InvalidData
if n is None:
raise self.InvalidData
self._n1 += n
if n < 64:
self._n2 = 0
@ -371,7 +377,8 @@ class CCITTG4Parser(BitParser):
return self.BLACK
def _parse_horiz2(self, n):
if n is None: raise self.InvalidData
if n is None:
raise self.InvalidData
self._n2 += n
if n < 64:
self._color = 1-self._color
@ -385,9 +392,10 @@ class CCITTG4Parser(BitParser):
return self.BLACK
def _parse_uncompressed(self, bits):
if not bits: raise self.InvalidData
if not bits:
raise self.InvalidData
if bits.startswith('T'):
self._accept = self._parse_mode
self._accept = self._parse_mode
self._color = int(bits[1])
self._do_uncompressed(bits[2:])
return self.MODE
@ -396,17 +404,17 @@ class CCITTG4Parser(BitParser):
return self.UNCOMPRESSED
def _get_bits(self):
return ''.join( str(b) for b in self._curline[:self._curpos] )
return ''.join(str(b) for b in self._curline[:self._curpos])
def _get_refline(self, i):
if i < 0:
return '[]'+''.join( str(b) for b in self._refline )
return '[]'+''.join(str(b) for b in self._refline)
elif len(self._refline) <= i:
return ''.join( str(b) for b in self._refline )+'[]'
return ''.join(str(b) for b in self._refline)+'[]'
else:
return (''.join( str(b) for b in self._refline[:i] )+
'['+str(self._refline[i])+']'+
''.join( str(b) for b in self._refline[i+1:] ))
return (''.join(str(b) for b in self._refline[:i]) +
'['+str(self._refline[i])+']' +
''.join(str(b) for b in self._refline[i+1:]))
def reset(self):
self._y = 0
@ -417,16 +425,16 @@ class CCITTG4Parser(BitParser):
return
def output_line(self, y, bits):
print y, ''.join( str(b) for b in bits )
print y, ''.join(str(b) for b in bits)
return
def _reset_line(self):
self._refline = self._curline
self._curline = array.array('b', [1]*self.width)
self._curpos = -1
self._color = 1
return
def _flush_line(self):
if self.width <= self._curpos:
self.output_line(self._y, self._curline)
@ -442,12 +450,13 @@ class CCITTG4Parser(BitParser):
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and
self._refline[x1] != self._color): break
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break
self._refline[x1] != self._color):
break
x1 += 1
x1 += dx
x0 = max(0, self._curpos)
@ -461,50 +470,54 @@ class CCITTG4Parser(BitParser):
self._curpos = x1
self._color = 1-self._color
return
def _do_pass(self):
#print '* pass: curpos=%r, color=%r' % (self._curpos, self._color)
#print ' refline:', self._get_refline(self._curpos+1)
x1 = self._curpos+1
while 1:
if x1 == 0:
if (self._color == 1 and
self._refline[x1] != self._color): break
if (self._color == 1 and self._refline[x1] != self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] == self._color and
self._refline[x1] != self._color): break
self._refline[x1] != self._color):
break
x1 += 1
while 1:
if x1 == 0:
if (self._color == 0 and
self._refline[x1] == self._color): break
if (self._color == 0 and self._refline[x1] == self._color):
break
elif x1 == len(self._refline):
break
elif (self._refline[x1-1] != self._color and
self._refline[x1] == self._color): break
self._refline[x1] == self._color):
break
x1 += 1
for x in xrange(self._curpos, x1):
self._curline[x] = self._color
self._curpos = x1
return
def _do_horizontal(self, n1, n2):
#print '* horizontal(%d,%d): curpos=%r, color=%r' % (n1, n2, self._curpos, self._color)
if self._curpos < 0:
self._curpos = 0
x = self._curpos
for _ in xrange(n1):
if len(self._curline) <= x: break
if len(self._curline) <= x:
break
self._curline[x] = self._color
x += 1
for _ in xrange(n2):
if len(self._curline) <= x: break
if len(self._curline) <= x:
break
self._curline[x] = 1-self._color
x += 1
self._curpos = x
return
def _do_uncompressed(self, bits):
#print '* uncompressed(%r): curpos=%r' % (bits, self._curpos)
for c in bits:
@ -513,15 +526,16 @@ class CCITTG4Parser(BitParser):
self._flush_line()
return
import unittest
## Test cases
##
import unittest
class TestCCITTG4Parser(unittest.TestCase):
def get_parser(self, bits):
parser = CCITTG4Parser(len(bits))
parser._curline = [ int(c) for c in bits ]
parser._curline = [int(c) for c in bits]
parser._reset_line()
return parser
@ -656,7 +670,7 @@ class TestCCITTG4Parser(unittest.TestCase):
parser._do_vertical(-1)
parser._do_vertical(-1)
parser._do_vertical(1)
parser._do_horizontal(1,1)
parser._do_horizontal(1, 1)
self.assertEqual(parser._get_bits(), '011101')
return
@ -673,23 +687,23 @@ class TestCCITTG4Parser(unittest.TestCase):
## CCITTFaxDecoder
##
class CCITTFaxDecoder(CCITTG4Parser):
def __init__(self, width, bytealign=False, reversed=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.reversed = reversed
self._buf = ''
return
def close(self):
return self._buf
def output_line(self, y, bits):
bytes = array.array('B', [0]*((len(bits)+7)/8))
if self.reversed:
bits = [ 1-b for b in bits ]
for (i,b) in enumerate(bits):
bits = [1-b for b in bits]
for (i, b) in enumerate(bits):
if b:
bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8]
bytes[i/8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
self._buf += bytes.tostring()
return
@ -705,35 +719,39 @@ def ccittfaxdecode(data, params):
raise ValueError(K)
parser.feedbytes(data)
return parser.close()
# test
def main(argv):
import pygame
if not argv[1:]:
return unittest.main()
class Parser(CCITTG4Parser):
def __init__(self, width, bytealign=False):
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
self.img = pygame.Surface((self.width,1000))
self.img = pygame.Surface((self.width, 1000))
return
def output_line(self, y, bits):
for (x,b) in enumerate(bits):
for (x, b) in enumerate(bits):
if b:
self.img.set_at((x,y), (255,255,255))
self.img.set_at((x, y), (255, 255, 255))
else:
self.img.set_at((x,y), (0,0,0))
self.img.set_at((x, y), (0, 0, 0))
return
def close(self):
pygame.image.save(self.img, 'out.bmp')
return
for path in argv[1:]:
fp = file(path,'rb')
(_,_,k,w,h,_) = path.split('.')
fp = file(path, 'rb')
(_, _, k, w, h, _) = path.split('.')
parser = Parser(int(w))
parser.feedbytes(fp.read())
parser.close()
fp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -25,7 +25,8 @@ from encodingdb import name2unicode
from utils import choplist, nunpack
class CMapError(Exception): pass
class CMapError(Exception):
pass
## CMap
@ -43,8 +44,9 @@ class CMap(object):
def use_cmap(self, cmap):
assert isinstance(cmap, CMap)
def copy(dst, src):
for (k,v) in src.iteritems():
for (k, v) in src.iteritems():
if isinstance(v, dict):
d = {}
dst[k] = d
@ -73,14 +75,14 @@ class CMap(object):
if code2cid is None:
code2cid = self.code2cid
code = ()
for (k,v) in sorted(code2cid.iteritems()):
for (k, v) in sorted(code2cid.iteritems()):
c = code+(k,)
if isinstance(v, int):
out.write('code %r = cid %d\n' % (c,v))
out.write('code %r = cid %d\n' % (c, v))
else:
self.dump(out=out, code2cid=v, code=c)
return
## IdentityCMap
##
@ -99,8 +101,7 @@ class IdentityCMap(object):
return struct.unpack('>%dH' % n, code)
else:
return ()
## UnicodeMap
##
@ -118,8 +119,8 @@ class UnicodeMap(object):
return self.cid2unichr[cid]
def dump(self, out=sys.stdout):
for (k,v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k,v))
for (k, v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k, v))
return
@ -152,7 +153,7 @@ class FileCMap(CMap):
else:
t = {}
d[c] = t
d =t
d = t
c = ord(code[-1])
d[c] = cid
return
@ -161,7 +162,7 @@ class FileCMap(CMap):
## FileUnicodeMap
##
class FileUnicodeMap(UnicodeMap):
def __init__(self):
UnicodeMap.__init__(self)
self.attrs = {}
@ -204,12 +205,12 @@ class PyCMap(CMap):
def is_vertical(self):
return self._is_vertical
## PyUnicodeMap
##
class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical):
if vertical:
cid2unichr = module.CID2UNICHR_V
@ -230,18 +231,17 @@ class CMapDB(object):
debug = 0
_cmap_cache = {}
_umap_cache = {}
class CMapNotFound(CMapError): pass
class CMapNotFound(CMapError):
pass
@classmethod
def _load_data(klass, name):
filename = '%s.pickle.gz' % name
if klass.debug:
print >>sys.stderr, 'loading:', name
cmap_paths = (
os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),
)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),)
for directory in cmap_paths:
path = os.path.join(directory, filename)
if os.path.exists(path):
@ -305,11 +305,12 @@ class CMapParser(PSStackParser):
elif name == 'endcmap':
self._in_cmap = False
return
if not self._in_cmap: return
if not self._in_cmap:
return
#
if name == 'def':
try:
((_,k),(_,v)) = self.pop(2)
((_, k), (_, v)) = self.pop(2)
self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError:
pass
@ -317,7 +318,7 @@ class CMapParser(PSStackParser):
if name == 'usecmap':
try:
((_,cmapname),) = self.pop(1)
((_, cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
@ -336,13 +337,15 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs):
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
eprefix = e[:-4]
if sprefix != eprefix: continue
if sprefix != eprefix:
continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
@ -350,7 +353,7 @@ class CMapParser(PSStackParser):
vlen = len(svar)
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+struct.pack('>L',s1+i)[-vlen:]
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
return
@ -358,8 +361,8 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid))
return
@ -368,10 +371,11 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs):
objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
len(s) != len(e)):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1
@ -384,7 +388,7 @@ class CMapParser(PSStackParser):
prefix = code[:-4]
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+struct.pack('>L',base+i)[-vlen:]
x = prefix+struct.pack('>L', base+i)[-vlen:]
self.cmap.add_cid2unichr(s1+i, x)
return
@ -392,8 +396,8 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unichr(nunpack(cid), code)
return
@ -408,6 +412,7 @@ class CMapParser(PSStackParser):
self.push((pos, token))
return
# test
def main(argv):
args = argv[1:]
@ -420,4 +425,5 @@ def main(argv):
cmap.dump()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -21,9 +21,9 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return
def begin_page(self, page, ctm):
(x0,y0,x1,y1) = page.mediabox
(x0,y0) = apply_matrix_pt(ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(ctm, (x1,y1))
(x0, y0, x1, y1) = page.mediabox
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
self.cur_item = LTPage(self.pageno, mediabox)
return
@ -61,26 +61,26 @@ class PDFLayoutAnalyzer(PDFTextDevice):
shape = ''.join(x[0] for x in path)
if shape == 'ml':
# horizontal/vertical line
(_,x0,y0) = path[0]
(_,x1,y1) = path[1]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(_, x0, y0) = path[0]
(_, x1, y1) = path[1]
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
if x0 == x1 or y0 == y1:
self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
return
if shape == 'mlllh':
# rectangle
(_,x0,y0) = path[0]
(_,x1,y1) = path[1]
(_,x2,y2) = path[2]
(_,x3,y3) = path[3]
(x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
(x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
(x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
(x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
(_, x0, y0) = path[0]
(_, x1, y1) = path[1]
(_, x2, y2) = path[2]
(_, x3, y3) = path[3]
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
return
# other shapes
pts = []
@ -119,7 +119,7 @@ class PDFPageAggregator(PDFLayoutAnalyzer):
PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
self.result = None
return
def receive_layout(self, ltpage):
self.result = ltpage
return
@ -137,7 +137,7 @@ class PDFConverter(PDFLayoutAnalyzer):
self.outfp = outfp
self.codec = codec
return
## TextConverter
##
@ -176,10 +176,11 @@ class TextConverter(PDFConverter):
# is text. This stops all the image and drawing output from being
# recorded and taking up RAM.
def render_image(self, name, stream):
if self.imagewriter is None: return
if self.imagewriter is None:
return
PDFConverter.render_image(self, name, stream)
return
def paint_path(self, gstate, stroke, fill, evenodd, path):
return
@ -196,18 +197,18 @@ class HTMLConverter(PDFConverter):
'textgroup': 'red',
'curve': 'black',
'page': 'gray',
}
}
TEXT_COLORS = {
'textbox': 'blue',
'char': 'black',
}
}
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
pagemargin=50, imagewriter=None,
rect_colors={'curve':'black', 'page':'gray'},
text_colors={'char':'black'}):
rect_colors={'curve': 'black', 'page': 'gray'},
text_colors={'char': 'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.scale = scale
self.fontscale = fontscale
@ -238,7 +239,7 @@ class HTMLConverter(PDFConverter):
def write_footer(self):
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
self.write('</body></html>\n')
return
@ -295,7 +296,7 @@ class HTMLConverter(PDFConverter):
self._font = self._fontstack.pop()
self.write('</div>')
return
def put_text(self, text, fontname, fontsize):
font = (fontname, fontsize)
if font != self._font:
@ -318,6 +319,7 @@ class HTMLConverter(PDFConverter):
for child in item:
show_group(child)
return
def render(item):
if isinstance(item, LTPage):
self._yoffset += item.y1
@ -399,7 +401,7 @@ class XMLConverter(PDFConverter):
def write_footer(self):
self.outfp.write('</pages>\n')
return
def write_text(self, text):
self.outfp.write(enc(text, self.codec))
return
@ -415,6 +417,7 @@ class XMLConverter(PDFConverter):
show_group(child)
self.outfp.write('</textgroup>\n')
return
def render(item):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %

View File

@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
from latin_enc import ENCODING
STRIP_NAME = re.compile(r'[0-9]+')
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in glyphname2unicode:
return glyphname2unicode[name]
m = STRIP_NAME.search(name)
if not m: raise KeyError(name)
if not m:
raise KeyError(name)
return unichr(int(m.group(0)))
@ -26,19 +29,23 @@ class EncodingDB(object):
mac2unicode = {}
win2unicode = {}
pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING:
for (name, std, mac, win, pdf) in ENCODING:
c = name2unicode(name)
if std: std2unicode[std] = c
if mac: mac2unicode[mac] = c
if win: win2unicode[win] = c
if pdf: pdf2unicode[pdf] = c
if std:
std2unicode[std] = c
if mac:
mac2unicode[mac] = c
if win:
win2unicode[win] = c
if pdf:
pdf2unicode[pdf] = c
encodings = {
'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode,
}
'StandardEncoding': std2unicode,
'MacRomanEncoding': mac2unicode,
'WinAnsiEncoding': win2unicode,
'PDFDocEncoding': pdf2unicode,
}
@classmethod
def get_encoding(klass, name, diff=None):

View File

@ -8,7 +8,7 @@ written with a proportional font.
The following data were extracted from the AFM files:
http://www.ctan.org/tex-archive/fonts/adobe/afm/
"""
### BEGIN Verbatim copy of the license part

View File

@ -5,9 +5,11 @@ import os, os.path
from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
def align32(x):
    """Round the integer x up to the nearest multiple of 4.

    Floor division (``//``) is used explicitly so that the result is an
    exact integer multiple of 4 even where ``/`` means true division
    (Python 3, or ``from __future__ import division``).  For integer
    input under classic Python 2 division the result is unchanged.
    """
    return ((x + 3) // 4) * 4
## BMPWriter
##
class BMPWriter(object):
@ -36,12 +38,12 @@ class BMPWriter(object):
self.fp.write(info)
if ncols == 2:
# B&W color table
for i in (0,255):
self.fp.write(struct.pack('BBBx', i,i,i))
for i in (0, 255):
self.fp.write(struct.pack('BBBx', i, i, i))
elif ncols == 256:
# grayscale color table
for i in xrange(256):
self.fp.write(struct.pack('BBBx', i,i,i))
self.fp.write(struct.pack('BBBx', i, i, i))
self.pos0 = self.fp.tell()
self.pos1 = self.pos0 + self.datasize
return
@ -68,7 +70,7 @@ class ImageWriter(object):
(width, height) = image.srcsize
if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
ext = '.jpg'
elif (image.bits == 1 or
elif (image.bits == 1 or
image.bits == 8 and image.colorspace in (LITERAL_DEVICE_RGB, LITERAL_DEVICE_GRAY)):
ext = '.%dx%d.bmp' % (width, height)
else:
@ -82,7 +84,7 @@ class ImageWriter(object):
from PIL import Image
from PIL import ImageChops
ifp = cStringIO.StringIO(raw_data)
i = Image.open(ifp)
i = Image.open(ifp)
i = ImageChops.invert(i)
i = i.convert('RGB')
i.save(fp, 'JPEG')

View File

@ -81,7 +81,7 @@ class LTComponent(LTItem):
return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox)))
def set_bbox(self, (x0,y0,x1,y1)):
def set_bbox(self, (x0, y0, x1, y1)):
self.x0 = x0
self.y0 = y0
self.x1 = x1
@ -93,7 +93,7 @@ class LTComponent(LTItem):
def is_empty(self):
return self.width <= 0 or self.height <= 0
def is_hoverlap(self, obj):
assert isinstance(obj, LTComponent)
return obj.x0 <= self.x1 and self.x0 <= obj.x1
@ -142,7 +142,7 @@ class LTCurve(LTComponent):
return
def get_pts(self):
return ','.join( '%.3f,%.3f' % p for p in self.pts )
return ','.join('%.3f,%.3f' % p for p in self.pts)
## LTLine
@ -158,8 +158,8 @@ class LTLine(LTCurve):
##
class LTRect(LTCurve):
def __init__(self, linewidth, (x0,y0,x1,y1)):
LTCurve.__init__(self, linewidth, [(x0,y0), (x1,y0), (x1,y1), (x0,y1)])
def __init__(self, linewidth, (x0, y0, x1, y1)):
LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
return
@ -212,7 +212,7 @@ class LTChar(LTComponent, LTText):
if font.is_vertical():
# vertical
width = font.get_width() * fontsize
(vx,vy) = textdisp
(vx, vy) = textdisp
if vx is None:
vx = width/2
else:
@ -229,15 +229,15 @@ class LTChar(LTComponent, LTText):
ty = descent + rise
bll = (0, ty)
bur = (self.adv, ty+height)
(a,b,c,d,e,f) = self.matrix
(a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0)
(x0,y0) = apply_matrix_pt(self.matrix, bll)
(x1,y1) = apply_matrix_pt(self.matrix, bur)
(x0, y0) = apply_matrix_pt(self.matrix, bll)
(x1, y1) = apply_matrix_pt(self.matrix, bur)
if x1 < x0:
(x0,x1) = (x1,x0)
(x0, x1) = (x1, x0)
if y1 < y0:
(y0,y1) = (y1,y0)
LTComponent.__init__(self, (x0,y0,x1,y1))
(y0, y1) = (y1, y0)
LTComponent.__init__(self, (x0, y0, x1, y1))
if font.is_vertical():
self.size = self.width
else:
@ -246,7 +246,7 @@ class LTChar(LTComponent, LTText):
def __repr__(self):
return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
(self.__class__.__name__, bbox2str(self.bbox),
(self.__class__.__name__, bbox2str(self.bbox),
matrix2str(self.matrix), self.fontname, self.adv,
self.get_text()))
@ -257,7 +257,7 @@ class LTChar(LTComponent, LTText):
"""Returns True if two characters can coexist in the same line."""
return True
## LTContainer
##
class LTContainer(LTComponent):
@ -286,14 +286,14 @@ class LTContainer(LTComponent):
for obj in self._objs:
obj.analyze(laparams)
return
## LTExpandableContainer
##
class LTExpandableContainer(LTContainer):
def __init__(self):
LTContainer.__init__(self, (+INF,+INF,-INF,-INF))
LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
return
def add(self, obj):
@ -313,8 +313,8 @@ class LTTextContainer(LTExpandableContainer, LTText):
return
def get_text(self):
return ''.join( obj.get_text() for obj in self if isinstance(obj, LTText) )
return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
## LTTextLine
##
@ -338,6 +338,7 @@ class LTTextLine(LTTextContainer):
def find_neighbors(self, plane, ratio):
raise NotImplementedError
class LTTextLineHorizontal(LTTextLine):
def __init__(self, word_margin):
@ -357,12 +358,13 @@ class LTTextLineHorizontal(LTTextLine):
def find_neighbors(self, plane, ratio):
d = ratio*self.height
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
return [ obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and
abs(obj.height-self.height) < d and
(abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d)) ]
return [obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and
abs(obj.height-self.height) < d and
(abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d))]
class LTTextLineVertical(LTTextLine):
def __init__(self, word_margin):
@ -378,16 +380,16 @@ class LTTextLineVertical(LTTextLine):
self._y0 = obj.y0
LTTextLine.add(self, obj)
return
def find_neighbors(self, plane, ratio):
d = ratio*self.width
objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
return [ obj for obj in objs
if (isinstance(obj, LTTextLineVertical) and
abs(obj.width-self.width) < d and
(abs(obj.y0-self.y0) < d or
abs(obj.y1-self.y1) < d)) ]
return [obj for obj in objs
if (isinstance(obj, LTTextLineVertical) and
abs(obj.width-self.width) < d and
(abs(obj.y0-self.y0) < d or
abs(obj.y1-self.y1) < d))]
## LTTextBox
##
@ -406,8 +408,9 @@ class LTTextBox(LTTextContainer):
(self.__class__.__name__,
self.index, bbox2str(self.bbox), self.get_text()))
class LTTextBoxHorizontal(LTTextBox):
def analyze(self, laparams):
LTTextBox.analyze(self, laparams)
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
@ -416,6 +419,7 @@ class LTTextBoxHorizontal(LTTextBox):
def get_writing_mode(self):
return 'lr-tb'
class LTTextBoxVertical(LTTextBox):
def analyze(self, laparams):
@ -436,8 +440,9 @@ class LTTextGroup(LTTextContainer):
self.extend(objs)
return
class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams):
LTTextGroup.analyze(self, laparams)
# reorder the objects from top-left to bottom-right.
@ -446,14 +451,15 @@ class LTTextGroupLRTB(LTTextGroup):
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
return
class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams):
LTTextGroup.analyze(self, laparams)
# reorder the objects from top-right to bottom-left.
self._objs = csort(self._objs, key=lambda obj:
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
-(1-laparams.boxes_flow)*(obj.y1))
- (1-laparams.boxes_flow)*(obj.y1))
return
@ -465,14 +471,14 @@ class LTLayoutContainer(LTContainer):
LTContainer.__init__(self, bbox)
self.groups = None
return
def get_textlines(self, laparams, objs):
obj0 = None
line = None
for obj1 in objs:
if obj0 is not None:
k = 0
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin):
# obj0 and obj1 are horizontally aligned:
@ -487,7 +493,7 @@ class LTLayoutContainer(LTContainer):
# (char_margin)
k |= 1
if (laparams.detect_vertical and
obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin):
# obj0 and obj1 are vertically aligned:
@ -505,8 +511,8 @@ class LTLayoutContainer(LTContainer):
# |<-->|
# (line_overlap)
k |= 2
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical)) ):
if ((k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical))):
line.add(obj1)
elif line is not None:
yield line
@ -554,7 +560,8 @@ class LTLayoutContainer(LTContainer):
done = set()
for line in lines:
box = boxes[line]
if box in done: continue
if box in done:
continue
done.add(box)
if not box.is_empty():
yield box
@ -562,32 +569,34 @@ class LTLayoutContainer(LTContainer):
def group_textboxes(self, laparams, boxes):
assert boxes
def dist(obj1, obj2):
"""A distance function between two TextBoxes.
Consider the bounding rectangle for obj1 and obj2.
Return its area less the areas of obj1 and obj2,
Return its area less the areas of obj1 and obj2,
shown as 'www' below. This value may be negative.
+------+..........+ (x1,y1)
+------+..........+ (x1, y1)
| obj1 |wwwwwwwwww:
+------+www+------+
:wwwwwwwwww| obj2 |
(x0,y0) +..........+------+
(x0, y0) +..........+------+
"""
x0 = min(obj1.x0,obj2.x0)
y0 = min(obj1.y0,obj2.y0)
x1 = max(obj1.x1,obj2.x1)
y1 = max(obj1.y1,obj2.y1)
x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
def isany(obj1, obj2):
"""Check if there's any other object between obj1 and obj2.
"""
x0 = min(obj1.x0,obj2.x0)
y0 = min(obj1.y0,obj2.y0)
x1 = max(obj1.x1,obj2.x1)
y1 = max(obj1.y1,obj2.y1)
objs = set(plane.find((x0,y0,x1,y1)))
return objs.difference((obj1,obj2))
x0 = min(obj1.x0, obj2.x0)
y0 = min(obj1.y0, obj2.y0)
x1 = max(obj1.x1, obj2.x1)
y1 = max(obj1.y1, obj2.y1)
objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2))
# XXX this still takes O(n^2) :(
dists = []
for i in xrange(len(boxes)):
@ -599,49 +608,50 @@ class LTLayoutContainer(LTContainer):
plane = Plane(self.bbox)
plane.extend(boxes)
while dists:
(c,d,obj1,obj2) = dists.pop(0)
(c, d, obj1, obj2) = dists.pop(0)
if c == 0 and isany(obj1, obj2):
dists.append((1,d,obj1,obj2))
dists.append((1, d, obj1, obj2))
continue
if (isinstance(obj1, LTTextBoxVertical) or
isinstance(obj1, LTTextGroupTBRL) or
isinstance(obj2, LTTextBoxVertical) or
isinstance(obj2, LTTextGroupTBRL)):
group = LTTextGroupTBRL([obj1,obj2])
group = LTTextGroupTBRL([obj1, obj2])
else:
group = LTTextGroupLRTB([obj1,obj2])
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
# this line is optimized -- don't change without profiling
dists = [ n for n in dists if n[2] in plane._objs and n[3] in plane._objs ]
dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
for other in plane:
dists.append((0, dist(group,other), group, other))
dists.append((0, dist(group, other), group, other))
dists.sort()
plane.add(group)
assert len(plane) == 1
return list(plane)
def analyze(self, laparams):
# textobjs is a list of LTChar objects, i.e.
# it has all the individual characters in the page.
(textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
for obj in otherobjs:
obj.analyze(laparams)
if not textobjs: return
if not textobjs:
return
textlines = list(self.get_textlines(laparams, textobjs))
assert len(textobjs) <= sum( len(line._objs) for line in textlines )
assert len(textobjs) <= sum(len(line._objs) for line in textlines)
(empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
for obj in empties:
obj.analyze(laparams)
textboxes = list(self.get_textboxes(laparams, textlines))
assert len(textlines) == sum( len(box._objs) for box in textboxes )
assert len(textlines) == sum(len(box._objs) for box in textboxes)
if textboxes:
self.groups = self.group_textboxes(laparams, textboxes)
assigner = IndexAssigner()
for group in self.groups:
group.analyze(laparams)
assigner.run(group)
textboxes.sort(key=lambda box:box.index)
textboxes.sort(key=lambda box: box.index)
self._objs = textboxes + otherobjs + empties
return
@ -653,9 +663,9 @@ class LTFigure(LTLayoutContainer):
def __init__(self, name, bbox, matrix):
self.name = name
self.matrix = matrix
(x,y,w,h) = bbox
bbox = get_bound( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
(x, y, w, h) = bbox
bbox = get_bound(apply_matrix_pt(matrix, (p, q))
for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
LTLayoutContainer.__init__(self, bbox)
return
@ -665,9 +675,10 @@ class LTFigure(LTLayoutContainer):
bbox2str(self.bbox), matrix2str(self.matrix)))
def analyze(self, laparams):
if not laparams.all_texts: return
if not laparams.all_texts:
return
LTLayoutContainer.analyze(self, laparams)
return
return
## LTPage

View File

@ -34,17 +34,18 @@ class LZWDecoder(object):
# |-----8-bits-----|
# |-bpos-|-bits-| |
# | |----r----|
v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
self.bpos += bits
break
else:
# |-----8-bits-----|
# |-bpos-|---bits----...
# | |----r----|
v = (v<<r) | (self.buff & ((1<<r)-1))
v = (v << r) | (self.buff & ((1 << r)-1))
bits -= r
x = self.fp.read(1)
if not x: raise EOFError
if not x:
raise EOFError
self.buff = ord(x)
self.bpos = 0
return v
@ -52,9 +53,9 @@ class LZWDecoder(object):
def feed(self, code):
x = ''
if code == 256:
self.table = [ chr(c) for c in xrange(256) ] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.table = [chr(c) for c in xrange(256)] # 0-255
self.table.append(None) # 256
self.table.append(None) # 257
self.prevbuf = ''
self.nbits = 9
elif code == 257:
@ -97,6 +98,7 @@ class LZWDecoder(object):
(self.nbits, code, x, self.table[258:]))
return
# lzwdecode
def lzwdecode(data):
"""

View File

@ -8,6 +8,7 @@ LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
class PDFColorSpace(object):
def __init__(self, name, ncomponents):
@ -20,14 +21,14 @@ class PDFColorSpace(object):
PREDEFINED_COLORSPACE = dict(
(name, PDFColorSpace(name,n)) for (name,n) in {
'CalRGB': 3,
'CalGray': 1,
'Lab': 3,
'DeviceRGB': 3,
'DeviceCMYK': 4,
'DeviceGray': 1,
'Separation': 1,
'Indexed': 1,
'Pattern': 1,
}.iteritems())
(name, PDFColorSpace(name, n)) for (name, n) in {
'CalRGB': 3,
'CalGray': 1,
'Lab': 3,
'DeviceRGB': 3,
'DeviceCMYK': 4,
'DeviceGray': 1,
'Separation': 1,
'Indexed': 1,
'Pattern': 1,
}.iteritems())

View File

@ -27,24 +27,31 @@ class PDFDevice(object):
def begin_tag(self, tag, props=None):
return
def end_tag(self):
return
def do_tag(self, tag, props=None):
return
def begin_page(self, page, ctm):
return
def end_page(self, page):
return
def begin_figure(self, name, bbox, matrix):
return
def end_figure(self, name):
return
def paint_path(self, graphicstate, stroke, fill, evenodd, path):
return
def render_image(self, name, stream):
return
def render_string(self, textstate, seq):
return
@ -73,8 +80,8 @@ class PDFTextDevice(PDFDevice):
seq, matrix, textstate.linematrix, font, fontsize,
scaling, charspace, wordspace, rise, dxscale)
return
def render_string_horizontal(self, seq, matrix, (x,y),
def render_string_horizontal(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False
for obj in seq:
@ -85,14 +92,14 @@ class PDFTextDevice(PDFDevice):
for cid in font.decode(obj):
if needcharspace:
x += charspace
x += self.render_char(translate_matrix(matrix, (x,y)),
x += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
x += wordspace
needcharspace = True
return (x, y)
def render_string_vertical(self, seq, matrix, (x,y),
def render_string_vertical(self, seq, matrix, (x, y),
font, fontsize, scaling, charspace, wordspace, rise, dxscale):
needcharspace = False
for obj in seq:
@ -103,7 +110,7 @@ class PDFTextDevice(PDFDevice):
for cid in font.decode(obj):
if needcharspace:
y += charspace
y += self.render_char(translate_matrix(matrix, (x,y)),
y += self.render_char(translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace:
y += wordspace
@ -131,7 +138,8 @@ class TagExtractor(PDFDevice):
font = textstate.font
text = ''
for obj in seq:
if not isinstance(obj, str): continue
if not isinstance(obj, str):
continue
chars = font.decode(obj)
for cid in chars:
try:
@ -155,8 +163,8 @@ class TagExtractor(PDFDevice):
def begin_tag(self, tag, props=None):
s = ''
if isinstance(props, dict):
s = ''.join( ' %s="%s"' % (enc(k), enc(str(v))) for (k,v)
in sorted(props.iteritems()) )
s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
in sorted(props.iteritems()))
self.outfp.write('<%s%s>' % (enc(tag.name), s))
self._stack.append(tag)
return

View File

@ -23,11 +23,24 @@ from utils import decode_text
## Exceptions
##
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFNoValidXRef(PDFSyntaxError):
pass
class PDFNoOutlines(PDFException):
pass
class PDFDestinationNotFound(PDFException):
pass
class PDFEncryptionError(PDFException):
pass
class PDFPasswordIncorrect(PDFEncryptionError):
pass
# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
@ -68,7 +81,8 @@ class PDFXRef(PDFBaseXRef):
while 1:
try:
(pos, line) = parser.nextline()
if not line.strip(): continue
if not line.strip():
continue
except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line:
@ -92,7 +106,8 @@ class PDFXRef(PDFBaseXRef):
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
if use != 'n': continue
if use != 'n':
continue
self.offsets[objid] = (None, long(pos), int(genno))
if 1 <= debug:
print >>sys.stderr, 'xref objects:', self.offsets
@ -100,16 +115,17 @@ class PDFXRef(PDFBaseXRef):
return
KEYWORD_TRAILER = KWD('trailer')
def load_trailer(self, parser):
try:
(_,kwd) = parser.nexttoken()
(_, kwd) = parser.nexttoken()
assert kwd is self.KEYWORD_TRAILER
(_,dic) = parser.nextobject()
(_, dic) = parser.nextobject()
except PSEOF:
x = parser.pop(1)
if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0]
(_, dic) = x[0]
self.trailer.update(dict_value(dic))
return
@ -134,6 +150,7 @@ class PDFXRefFallback(PDFXRef):
return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load(self, parser, debug=0):
parser.seek(0)
while 1:
@ -148,14 +165,15 @@ class PDFXRefFallback(PDFXRef):
print >>sys.stderr, 'trailer: %r' % self.get_trailer()
break
m = self.PDFOBJ_CUE.match(line)
if not m: continue
if not m:
continue
(objid, genno) = m.groups()
objid = int(objid)
genno = int(genno)
self.offsets[objid] = (None, pos, genno)
# expand ObjStm.
parser.seek(pos)
(_,obj) = parser.nextobject()
(_, obj) = parser.nextobject()
if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
stream = stream_value(obj)
try:
@ -168,7 +186,7 @@ class PDFXRefFallback(PDFXRef):
objs = []
try:
while 1:
(_,obj) = parser1.nextobject()
(_, obj) = parser1.nextobject()
objs.append(obj)
except PSEOF:
pass
@ -193,14 +211,14 @@ class PDFXRefStream(PDFBaseXRef):
return '<PDFXRefStream: ranges=%r>' % (self.ranges)
def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
(_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken()
(_, stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream['Size']
index_array = stream.get('Index', (1,size))
index_array = stream.get('Index', (1, size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.ranges.extend(choplist(2, index_array))
@ -210,22 +228,22 @@ class PDFXRefStream(PDFBaseXRef):
self.trailer = stream.attrs
if 1 <= debug:
print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3))
(', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3))
return
def get_trailer(self):
return self.trailer
def get_objids(self):
for (start,nobjs) in self.ranges:
for (start, nobjs) in self.ranges:
for i in xrange(nobjs):
yield start+i
return
def get_pos(self, objid):
index = 0
for (start,nobjs) in self.ranges:
for (start, nobjs) in self.ranges:
if start <= objid and objid < start+nobjs:
index += objid - start
else:
@ -260,7 +278,7 @@ class PDFDocument(object):
doc = PDFDocument(parser)
doc.initialize(password)
obj = doc.getobj(objid)
"""
debug = 0
@ -292,7 +310,8 @@ class PDFDocument(object):
self.xrefs.append(xref)
for xref in self.xrefs:
trailer = xref.get_trailer()
if not trailer: continue
if not trailer:
continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer:
#assert not self.encryption
@ -316,6 +335,7 @@ class PDFDocument(object):
# This step is mandatory even if there's no password associated
# with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''):
if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True
@ -326,9 +346,9 @@ class PDFDocument(object):
V = int_value(param.get('V', 0))
if not (V == 1 or V == 2):
raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
length = int_value(param.get('Length', 40)) # Key length (bits)
length = int_value(param.get('Length', 40)) # Key length (bits)
O = str_value(param['O'])
R = int_value(param['R']) # Revision
R = int_value(param['R']) # Revision
if 5 <= R:
raise PDFEncryptionError('Unknown revision: %r' % R)
U = str_value(param['U'])
@ -337,11 +357,11 @@ class PDFDocument(object):
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
# Algorithm 3.2
password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5
password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5
if 4 <= R:
# 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
@ -355,13 +375,13 @@ class PDFDocument(object):
u1 = Arcfour(key).process(self.PASSWORD_PADDING)
elif R == 3:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1):
k = ''.join( chr(ord(c) ^ i) for c in key )
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1, 19+1):
k = ''.join(chr(ord(c) ^ i) for c in key)
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
u1 = x+x # 32bytes total
if R == 2:
is_authenticated = (u1 == U)
else:
@ -373,18 +393,18 @@ class PDFDocument(object):
return
def decrypt_rc4(self, objid, genno, data):
key = self.decrypt_key + struct.pack('<L',objid)[:3]+struct.pack('<L',genno)[:2]
key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
hash = md5.md5(key)
key = hash.digest()[:min(len(key),16)]
key = hash.digest()[:min(len(key), 16)]
return Arcfour(key).process(data)
def _getobj_objstm(self, stream, index, objid):
if stream.objid in self._parsed_objs:
(objs,n) = self._parsed_objs[stream.objid]
(objs, n) = self._parsed_objs[stream.objid]
else:
(objs,n) = self._get_objects(stream)
(objs, n) = self._get_objects(stream)
if self.caching:
self._parsed_objs[stream.objid] = (objs,n)
self._parsed_objs[stream.objid] = (objs, n)
i = n*2+index
try:
obj = objs[i]
@ -407,25 +427,26 @@ class PDFDocument(object):
objs = []
try:
while 1:
(_,obj) = parser.nextobject()
(_, obj) = parser.nextobject()
objs.append(obj)
except PSEOF:
pass
return (objs, n)
KEYWORD_OBJ = KWD('obj')
def _getobj_parse(self, pos, objid):
self._parser.seek(pos)
(_,objid1) = self._parser.nexttoken() # objid
(_, objid1) = self._parser.nexttoken() # objid
if objid1 != objid:
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
(_,genno) = self._parser.nexttoken() # genno
(_,kwd) = self._parser.nexttoken()
(_, genno) = self._parser.nexttoken() # genno
(_, kwd) = self._parser.nexttoken()
if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
(_,obj) = self._parser.nextobject()
(_, obj) = self._parser.nextobject()
return obj
# can raise PDFObjectNotFound
def getobj(self, objid):
assert objid != 0
@ -465,6 +486,7 @@ class PDFDocument(object):
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
@ -487,13 +509,15 @@ class PDFDocument(object):
try:
names = dict_value(self.catalog['Names'])
except (PDFTypeError, KeyError):
raise KeyError((cat,key))
raise KeyError((cat, key))
# may raise KeyError
d0 = dict_value(names[cat])
def lookup(d):
if 'Limits' in d:
(k1,k2) = list_value(d['Limits'])
if key < k1 or k2 < key: return None
(k1, k2) = list_value(d['Limits'])
if key < k1 or k2 < key:
return None
if 'Names' in d:
objs = list_value(d['Names'])
names = dict(choplist(2, objs))
@ -501,8 +525,9 @@ class PDFDocument(object):
if 'Kids' in d:
for c in list_value(d['Kids']):
v = lookup(dict_value(c))
if v: return v
raise KeyError((cat,key))
if v:
return v
raise KeyError((cat, key))
return lookup(d0)
def get_dest(self, name):
@ -528,7 +553,8 @@ class PDFDocument(object):
line = line.strip()
if 2 <= self.debug:
print >>sys.stderr, 'find_xref: %r' % line
if line == 'startxref': break
if line == 'startxref':
break
if line:
prev = line
else:

View File

@ -25,13 +25,13 @@ def get_widths(seq):
if isinstance(v, list):
if r:
char1 = r[-1]
for (i,w) in enumerate(v):
for (i, w) in enumerate(v):
widths[char1+i] = w
r = []
elif isinstance(v, int):
r.append(v)
if len(r) == 3:
(char1,char2,w) = r
(char1, char2, w) = r
for i in xrange(char1, char2+1):
widths[i] = w
r = []
@ -40,6 +40,7 @@ def get_widths(seq):
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
def get_widths2(seq):
widths = {}
r = []
@ -47,20 +48,20 @@ def get_widths2(seq):
if isinstance(v, list):
if r:
char1 = r[-1]
for (i,(w,vx,vy)) in enumerate(choplist(3,v)):
widths[char1+i] = (w,(vx,vy))
for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
widths[char1+i] = (w, (vx, vy))
r = []
elif isinstance(v, int):
r.append(v)
if len(r) == 5:
(char1,char2,w,vx,vy) = r
(char1, char2, w, vx, vy) = r
for i in xrange(char1, char2+1):
widths[i] = (w,(vx,vy))
widths[i] = (w, (vx, vy))
r = []
return widths
#assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))}
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
## FontMetricsDB
@ -94,7 +95,7 @@ class Type1FontHeaderParser(PSStackParser):
def get_encoding(self):
while 1:
try:
(cid,name) = self.nextobject()
(cid, name) = self.nextobject()
except PSEOF:
break
try:
@ -102,28 +103,31 @@ class Type1FontHeaderParser(PSStackParser):
except KeyError:
pass
return self._cid2unicode
def do_keyword(self, pos, token):
if token is self.KEYWORD_PUT:
((_,key),(_,value)) = self.pop(2)
((_, key), (_, value)) = self.pop(2)
if (isinstance(key, int) and
isinstance(value, PSLiteral)):
self.add_results((key, literal_name(value)))
return
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
def getdict(data):
d = {}
fp = StringIO(data)
stack = []
while 1:
c = fp.read(1)
if not c: break
if not c:
break
b0 = ord(c)
if b0 <= 21:
d[b0] = stack
@ -145,19 +149,21 @@ def getdict(data):
else:
b1 = ord(fp.read(1))
if 247 <= b0 and b0 <= 250:
value = ((b0-247)<<8)+b1+108
value = ((b0-247) << 8)+b1+108
elif 251 <= b0 and b0 <= 254:
value = -((b0-251)<<8)-b1-108
value = -((b0-251) << 8)-b1-108
else:
b2 = ord(fp.read(1))
if 128 <= b1: b1 -= 256
if 128 <= b1:
b1 -= 256
if b0 == 28:
value = b1<<8 | b2
value = b1 << 8 | b2
else:
value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0]
value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
stack.append(value)
return d
class CFFFont(object):
STANDARD_STRINGS = (
@ -239,7 +245,7 @@ class CFFFont(object):
'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
'001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
)
)
class INDEX(object):
@ -264,13 +270,13 @@ class CFFFont(object):
return self.fp.read(self.offsets[i+1]-self.offsets[i])
def __iter__(self):
return iter( self[i] for i in xrange(len(self)) )
return iter(self[i] for i in xrange(len(self)))
def __init__(self, name, fp):
self.name = name
self.fp = fp
# Header
(_major,_minor,hdrsize,offsize) = struct.unpack('BBBB', self.fp.read(4))
(_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
self.fp.read(hdrsize-4)
# Name INDEX
self.name_index = self.INDEX(self.fp)
@ -297,7 +303,7 @@ class CFFFont(object):
if format == '\x00':
# Format 0
(n,) = struct.unpack('B', self.fp.read(1))
for (code,gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
self.code2gid[code] = gid
self.gid2code[gid] = code
elif format == '\x01':
@ -305,8 +311,8 @@ class CFFFont(object):
(n,) = struct.unpack('B', self.fp.read(1))
code = 0
for i in xrange(n):
(first,nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first,first+nleft+1):
(first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first, first+nleft+1):
self.code2gid[code] = gid
self.gid2code[gid] = code
code += 1
@ -320,7 +326,7 @@ class CFFFont(object):
if format == '\x00':
# Format 0
n = self.nglyphs-1
for (gid,sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
gid += 1
name = self.getstr(sid)
self.name2gid[name] = gid
@ -330,8 +336,8 @@ class CFFFont(object):
(n,) = struct.unpack('B', self.fp.read(1))
sid = 0
for i in xrange(n):
(first,nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first,first+nleft+1):
(first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first, first+nleft+1):
name = self.getstr(sid)
self.name2gid[name] = gid
self.gid2name[gid] = name
@ -356,7 +362,8 @@ class CFFFont(object):
##
class TrueTypeFont(object):
class CMapNotFound(Exception): pass
class CMapNotFound(Exception):
pass
def __init__(self, name, fp):
self.name = name
@ -389,15 +396,16 @@ class TrueTypeFont(object):
elif fmttype == 2:
subheaderkeys = struct.unpack('>256H', fp.read(512))
firstbytes = [0]*8192
for (i,k) in enumerate(subheaderkeys):
for (i, k) in enumerate(subheaderkeys):
firstbytes[k/8] = i
nhdrs = max(subheaderkeys)/8 + 1
hdrs = []
for i in xrange(nhdrs):
(firstcode,entcount,delta,offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
for (i,firstcode,entcount,delta,pos) in hdrs:
if not entcount: continue
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
for (i, firstcode, entcount, delta, pos) in hdrs:
if not entcount:
continue
first = firstcode + (firstbytes[i] << 8)
fp.seek(pos)
for c in xrange(entcount):
@ -414,7 +422,7 @@ class TrueTypeFont(object):
idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
pos = fp.tell()
idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
if idr:
fp.seek(pos+idr)
for c in xrange(sc, ec+1):
@ -426,16 +434,19 @@ class TrueTypeFont(object):
assert 0
# create unicode map
unicode_map = FileUnicodeMap()
for (char,gid) in char2gid.iteritems():
for (char, gid) in char2gid.iteritems():
unicode_map.add_cid2unichr(gid, char)
return unicode_map
## Fonts
##
class PDFFontError(PDFException):
pass
class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass
class PDFUnicodeNotDefined(PDFFontError):
pass
LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
LITERAL_TYPE1C = LIT('Type1C')
@ -456,7 +467,7 @@ class PDFFont(object):
self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
self.leading = num_value(descriptor.get('Leading', 0))
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
self.hscale = self.vscale = .001
return
@ -474,6 +485,7 @@ class PDFFont(object):
def get_ascent(self):
return self.ascent * self.vscale
def get_descent(self):
return self.descent * self.vscale
@ -482,6 +494,7 @@ class PDFFont(object):
if w == 0:
w = -self.default_width
return w * self.hscale
def get_height(self):
h = self.bbox[3]-self.bbox[1]
if h == 0:
@ -501,7 +514,7 @@ class PDFFont(object):
return 0
def string_width(self, s):
return sum( self.char_width(cid) for cid in self.decode(s) )
return sum(self.char_width(cid) for cid in self.decode(s))
# PDFSimpleFont
@ -540,6 +553,7 @@ class PDFSimpleFont(PDFFont):
except KeyError:
raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
@ -557,7 +571,7 @@ class PDFType1Font(PDFSimpleFont):
firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 255))
widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
PDFSimpleFont.__init__(self, descriptor, widths, spec)
if 'Encoding' not in spec and 'FontFile' in descriptor:
# try to recover the missing encoding info from the font file.
@ -571,12 +585,14 @@ class PDFType1Font(PDFSimpleFont):
def __repr__(self):
return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
def __repr__(self):
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
@ -584,16 +600,16 @@ class PDFType3Font(PDFSimpleFont):
firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 0))
widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
descriptor = {'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
descriptor = {'Ascent': 0, 'Descent': 0,
'FontBBox': spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec)
self.matrix = tuple(list_value(spec.get('FontMatrix')))
(_,self.descent,_,self.ascent) = self.bbox
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
(_, self.descent, _, self.ascent) = self.bbox
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
return
def __repr__(self):
@ -657,10 +673,10 @@ class PDFCIDFont(PDFFont):
if self.vertical:
# writing mode: vertical
widths = get_widths2(list_value(spec.get('W2', [])))
self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.iteritems() )
(vy,w) = spec.get('DW2', [880, -1000])
self.default_disp = (None,vy)
widths = dict( (cid,w) for (cid,(w,_)) in widths.iteritems() )
self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
(vy, w) = spec.get('DW2', [880, -1000])
self.default_disp = (None, vy)
widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
default_width = w
else:
# writing mode: horizontal
@ -689,7 +705,8 @@ class PDFCIDFont(PDFFont):
def to_unichr(self, cid):
try:
if not self.unicode_map: raise KeyError(cid)
if not self.unicode_map:
raise KeyError(cid)
return self.unicode_map.get_unichr(cid)
except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)
@ -705,4 +722,5 @@ def main(argv):
fp.close()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -26,8 +26,12 @@ from utils import mult_matrix, MATRIX_IDENTITY
## Exceptions
##
class PDFResourceError(PDFException): pass
class PDFInterpreterError(PDFException): pass
class PDFResourceError(PDFException):
pass
class PDFInterpreterError(PDFException):
pass
## Constants
@ -116,12 +120,13 @@ class PDFGraphicState(object):
(self.linewidth, self.linecap, self.linejoin,
self.miterlimit, self.dash, self.intent, self.flatness))
## Resource Manager
##
class PDFResourceManager(object):
"""Repository of shared resources.
ResourceManager facilitates reuse of shared resources
such as fonts and images so that large objects are not
allocated multiple times.
@ -148,7 +153,8 @@ class PDFResourceManager(object):
try:
return CMapDB.get_cmap(cmapname)
except CMapDB.CMapNotFound:
if strict: raise
if strict:
raise
return CMap()
def get_font(self, objid, spec):
@ -191,7 +197,7 @@ class PDFResourceManager(object):
else:
if STRICT:
raise PDFFontError('Invalid Font spec: %r' % spec)
font = PDFType1Font(self, spec) # this is so wrong!
font = PDFType1Font(self, spec) # this is so wrong!
if objid and self.caching:
self._cached_fonts[objid] = font
return font
@ -223,12 +229,14 @@ class PDFContentParser(PSStackParser):
return
def fillbuf(self):
if self.charpos < len(self.buf): return
if self.charpos < len(self.buf):
return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf: break
if self.buf:
break
self.fp = None
self.charpos = 0
return
@ -259,7 +267,7 @@ class PDFContentParser(PSStackParser):
except ValueError:
data += self.buf[self.charpos:]
self.charpos = len(self.buf)
data = data[:-(len(target)+1)] # strip the last part
data = data[:-(len(target)+1)] # strip the last part
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
return (pos, data)
@ -270,6 +278,7 @@ class PDFContentParser(PSStackParser):
KEYWORD_BI = KWD('BI')
KEYWORD_ID = KWD('ID')
KEYWORD_EI = KWD('EI')
def do_keyword(self, pos, token):
if token is self.KEYWORD_BI:
# inline image within a content stream
@ -279,13 +288,14 @@ class PDFContentParser(PSStackParser):
(_, objs) = self.end_type('inline')
if len(objs) % 2 != 0:
raise PSTypeError('Invalid dictionary construct: %r' % objs)
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs))
(pos, data) = self.get_inline_data(pos+len('ID '))
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
else:
self.push((pos, token))
return
@ -312,7 +322,9 @@ class PDFPageInterpreter(object):
self.fontmap = {}
self.xobjmap = {}
self.csmap = PREDEFINED_COLORSPACE.copy()
if not resources: return
if not resources:
return
def get_colorspace(spec):
if isinstance(spec, list):
name = literal_name(spec[0])
@ -324,23 +336,23 @@ class PDFPageInterpreter(object):
return PDFColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE.get(name)
for (k,v) in dict_value(resources).iteritems():
for (k, v) in dict_value(resources).iteritems():
if 2 <= self.debug:
print >>sys.stderr, 'Resource: %r: %r' % (k,v)
print >>sys.stderr, 'Resource: %r: %r' % (k, v)
if k == 'Font':
for (fontid,spec) in dict_value(v).iteritems():
for (fontid, spec) in dict_value(v).iteritems():
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
elif k == 'ColorSpace':
for (csid,spec) in dict_value(v).iteritems():
for (csid, spec) in dict_value(v).iteritems():
self.csmap[csid] = get_colorspace(resolve1(spec))
elif k == 'ProcSet':
self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid,xobjstrm) in dict_value(v).iteritems():
for (xobjid, xobjstrm) in dict_value(v).iteritems():
self.xobjmap[xobjid] = xobjstrm
return
@ -367,7 +379,8 @@ class PDFPageInterpreter(object):
return
def pop(self, n):
if n == 0: return []
if n == 0:
return []
x = self.argstack[-n:]
self.argstack = self.argstack[:-n]
return x
@ -384,6 +397,7 @@ class PDFPageInterpreter(object):
def do_q(self):
self.gstack.append(self.get_current_state())
return
# grestore
def do_Q(self):
if self.gstack:
@ -392,7 +406,7 @@ class PDFPageInterpreter(object):
# concat-matrix
def do_cm(self, a1, b1, c1, d1, e1, f1):
self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
self.device.set_ctm(self.ctm)
return
@ -400,30 +414,37 @@ class PDFPageInterpreter(object):
def do_w(self, linewidth):
self.graphicstate.linewidth = linewidth
return
# setlinecap
def do_J(self, linecap):
self.graphicstate.linecap = linecap
return
# setlinejoin
def do_j(self, linejoin):
self.graphicstate.linejoin = linejoin
return
# setmiterlimit
def do_M(self, miterlimit):
self.graphicstate.miterlimit = miterlimit
return
# setdash
def do_d(self, dash, phase):
self.graphicstate.dash = (dash, phase)
return
# setintent
def do_ri(self, intent):
self.graphicstate.intent = intent
return
# setflatness
def do_i(self, flatness):
self.graphicstate.flatness = flatness
return
# load-gstate
def do_gs(self, name):
#XXX
@ -431,34 +452,40 @@ class PDFPageInterpreter(object):
# moveto
def do_m(self, x, y):
self.curpath.append(('m',x,y))
self.curpath.append(('m', x, y))
return
# lineto
def do_l(self, x, y):
self.curpath.append(('l',x,y))
self.curpath.append(('l', x, y))
return
# curveto
def do_c(self, x1, y1, x2, y2, x3, y3):
self.curpath.append(('c',x1,y1,x2,y2,x3,y3))
self.curpath.append(('c', x1, y1, x2, y2, x3, y3))
return
# urveto
def do_v(self, x2, y2, x3, y3):
self.curpath.append(('v',x2,y2,x3,y3))
self.curpath.append(('v', x2, y2, x3, y3))
return
# rveto
def do_y(self, x1, y1, x3, y3):
self.curpath.append(('y',x1,y1,x3,y3))
self.curpath.append(('y', x1, y1, x3, y3))
return
# closepath
def do_h(self):
self.curpath.append(('h',))
return
# rectangle
def do_re(self, x, y, w, h):
self.curpath.append(('m',x,y))
self.curpath.append(('l',x+w,y))
self.curpath.append(('l',x+w,y+h))
self.curpath.append(('l',x,y+h))
self.curpath.append(('m', x, y))
self.curpath.append(('l', x+w, y))
self.curpath.append(('l', x+w, y+h))
self.curpath.append(('l', x, y+h))
self.curpath.append(('h',))
return
@ -467,11 +494,13 @@ class PDFPageInterpreter(object):
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = []
return
# close-and-stroke
def do_s(self):
self.do_h()
self.do_S()
return
# fill
def do_f(self):
self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
@ -479,68 +508,85 @@ class PDFPageInterpreter(object):
return
# fill (obsolete)
do_F = do_f
# fill-even-odd
def do_f_a(self):
self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
self.curpath = []
return
# fill-and-stroke
def do_B(self):
self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
self.curpath = []
return
# fill-and-stroke-even-odd
def do_B_a(self):
self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
self.curpath = []
return
# close-fill-and-stroke
def do_b(self):
self.do_h()
self.do_B()
return
# close-fill-and-stroke-even-odd
def do_b_a(self):
self.do_h()
self.do_B_a()
return
# close-only
def do_n(self):
self.curpath = []
return
# clip
def do_W(self): return
def do_W(self):
return
# clip-even-odd
def do_W_a(self): return
def do_W_a(self):
return
# setcolorspace-stroking
def do_CS(self, name):
self.scs = self.csmap[literal_name(name)]
return
# setcolorspace-non-strokine
def do_cs(self, name):
self.ncs = self.csmap[literal_name(name)]
return
# setgray-stroking
def do_G(self, gray):
#self.do_CS(LITERAL_DEVICE_GRAY)
return
# setgray-non-stroking
def do_g(self, gray):
#self.do_cs(LITERAL_DEVICE_GRAY)
return
# setrgb-stroking
def do_RG(self, r, g, b):
#self.do_CS(LITERAL_DEVICE_RGB)
return
# setrgb-non-stroking
def do_rg(self, r, g, b):
#self.do_cs(LITERAL_DEVICE_RGB)
return
# setcmyk-stroking
def do_K(self, c, m, y, k):
#self.do_CS(LITERAL_DEVICE_CMYK)
return
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
#self.do_cs(LITERAL_DEVICE_CMYK)
@ -556,6 +602,7 @@ class PDFPageInterpreter(object):
n = 1
self.pop(n)
return
def do_scn(self):
if self.ncs:
n = self.ncs.ncomponents
@ -565,42 +612,53 @@ class PDFPageInterpreter(object):
n = 1
self.pop(n)
return
def do_SC(self):
self.do_SCN()
return
def do_sc(self):
self.do_scn()
return
# sharing-name
def do_sh(self, name): return
def do_sh(self, name):
return
# begin-text
def do_BT(self):
self.textstate.reset()
return
# end-text
def do_ET(self):
return
# begin-compat
def do_BX(self): return
def do_BX(self):
return
# end-compat
def do_EX(self): return
def do_EX(self):
return
# marked content operators
def do_MP(self, tag):
self.device.do_tag(tag)
return
def do_DP(self, tag, props):
self.device.do_tag(tag, props)
return
def do_BMC(self, tag):
self.device.begin_tag(tag)
return
def do_BDC(self, tag, props):
self.device.begin_tag(tag, props)
return
def do_EMC(self):
self.device.end_tag()
return
@ -609,18 +667,22 @@ class PDFPageInterpreter(object):
def do_Tc(self, space):
self.textstate.charspace = space
return
# setwordspace
def do_Tw(self, space):
self.textstate.wordspace = space
return
# textscale
def do_Tz(self, scale):
self.textstate.scaling = scale
return
# setleading
def do_TL(self, leading):
self.textstate.leading = -leading
return
# selectfont
def do_Tf(self, fontid, fontsize):
try:
@ -631,10 +693,12 @@ class PDFPageInterpreter(object):
self.textstate.font = self.rsrcmgr.get_font(None, {})
self.textstate.fontsize = fontsize
return
# setrendering
def do_Tr(self, render):
self.textstate.render = render
return
# settextrise
def do_Ts(self, rise):
self.textstate.rise = rise
@ -642,49 +706,55 @@ class PDFPageInterpreter(object):
# text-move
def do_Td(self, tx, ty):
(a,b,c,d,e,f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate)
#print >>sys.stderr, 'Td(%r,%r): %r' % (tx, ty, self.textstate)
return
# text-move
def do_TD(self, tx, ty):
(a,b,c,d,e,f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
self.textstate.leading = ty
self.textstate.linematrix = (0, 0)
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate)
#print >>sys.stderr, 'TD(%r,%r): %r' % (tx, ty, self.textstate)
return
# textmatrix
def do_Tm(self, a,b,c,d,e,f):
self.textstate.matrix = (a,b,c,d,e,f)
def do_Tm(self, a, b, c, d, e, f):
self.textstate.matrix = (a, b, c, d, e, f)
self.textstate.linematrix = (0, 0)
return
# nextline
def do_T_a(self):
(a,b,c,d,e,f) = self.textstate.matrix
self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f)
(a, b, c, d, e, f) = self.textstate.matrix
self.textstate.matrix = (a, b, c, d, self.textstate.leading*c+e, self.textstate.leading*d+f)
self.textstate.linematrix = (0, 0)
return
# show-pos
def do_TJ(self, seq):
#print >>sys.stderr, 'TJ(%r): %r' % (seq,self.textstate)
#print >>sys.stderr, 'TJ(%r): %r' % (seq, self.textstate)
if self.textstate.font is None:
if STRICT:
raise PDFInterpreterError('No font specified!')
return
self.device.render_string(self.textstate, seq)
return
# show
def do_Tj(self, s):
self.do_TJ([s])
return
# quote
def do__q(self, s):
self.do_T_a()
self.do_TJ([s])
return
# doublequote
def do__w(self, aw, ac, s):
self.do_Tw(aw)
@ -693,14 +763,16 @@ class PDFPageInterpreter(object):
return
# inline image
def do_BI(self): # never called
def do_BI(self): # never called
return
def do_ID(self): # never called
def do_ID(self): # never called
return
def do_EI(self, obj):
if 'W' in obj and 'H' in obj:
iobjid = str(id(obj))
self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY)
self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(iobjid, obj)
self.device.end_figure(iobjid)
return
@ -721,7 +793,7 @@ class PDFPageInterpreter(object):
interpreter = self.dup()
bbox = list_value(xobj['BBox'])
matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
# According to PDF reference 1.7 section 4.9.1, XObjects in
# According to PDF reference 1.7 section 4.9.1, XObjects in
# earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry.
resources = dict_value(xobj.get('Resources')) or self.resources.copy()
@ -729,7 +801,7 @@ class PDFPageInterpreter(object):
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid)
elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
self.device.render_image(xobjid, xobj)
self.device.end_figure(xobjid)
else:
@ -740,15 +812,15 @@ class PDFPageInterpreter(object):
def process_page(self, page):
if 1 <= self.debug:
print >>sys.stderr, 'Processing page: %r' % page
(x0,y0,x1,y1) = page.mediabox
(x0, y0, x1, y1) = page.mediabox
if page.rotate == 90:
ctm = (0,-1,1,0, -y0,x1)
ctm = (0, -1, 1, 0, -y0, x1)
elif page.rotate == 180:
ctm = (-1,0,0,-1, x1,y1)
ctm = (-1, 0, 0, -1, x1, y1)
elif page.rotate == 270:
ctm = (0,1,-1,0, y1,-x0)
ctm = (0, 1, -1, 0, y1, -x0)
else:
ctm = (1,0,0,1, -x0,-y0)
ctm = (1, 0, 0, 1, -x0, -y0)
self.device.begin_page(page, ctm)
self.render_contents(page.resources, page.contents, ctm=ctm)
self.device.end_page(page)
@ -760,7 +832,7 @@ class PDFPageInterpreter(object):
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
if 1 <= self.debug:
print >>sys.stderr, ('render_contents: resources=%r, streams=%r, ctm=%r' %
(resources, streams, ctm))
(resources, streams, ctm))
self.init_resources(resources)
self.init_state(ctm)
self.execute(list_value(streams))
@ -774,12 +846,12 @@ class PDFPageInterpreter(object):
return
while 1:
try:
(_,obj) = parser.nextobject()
(_, obj) = parser.nextobject()
except PSEOF:
break
if isinstance(obj, PSKeyword):
name = keyword_name(obj)
method = 'do_%s' % name.replace('*','_a').replace('"','_w').replace("'",'_q')
method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
if hasattr(self, method):
func = getattr(self, method)
nargs = func.func_code.co_argcount-1

View File

@ -39,7 +39,7 @@ class PDFPage(object):
def __init__(self, doc, pageid, attrs):
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
attrs: a dictionary of page attributes.
@ -62,7 +62,7 @@ class PDFPage(object):
else:
contents = []
if not isinstance(contents, list):
contents = [ contents ]
contents = [contents]
self.contents = contents
return
@ -70,6 +70,7 @@ class PDFPage(object):
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod
def create_pages(klass, document, debug=0):
def search(obj, parent):
@ -79,7 +80,7 @@ class PDFPage(object):
else:
objid = obj.objid
tree = dict_value(obj).copy()
for (k,v) in parent.iteritems():
for (k, v) in parent.iteritems():
if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
@ -94,7 +95,7 @@ class PDFPage(object):
yield (objid, tree)
pages = False
if 'Pages' in document.catalog:
for (objid,tree) in search(document.catalog['Pages'], document.catalog):
for (objid, tree) in search(document.catalog['Pages'], document.catalog):
yield klass(document, objid, tree)
pages = True
if not pages:
@ -109,7 +110,8 @@ class PDFPage(object):
pass
return
class PDFTextExtractionNotAllowed(PDFEncryptionError): pass
class PDFTextExtractionNotAllowed(PDFEncryptionError):
pass
@classmethod
def get_pages(klass, fp,
@ -126,8 +128,10 @@ class PDFPage(object):
if check_extractable and not doc.is_extractable:
raise klass.PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# Process each page contained in the document.
for (pageno,page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos): continue
for (pageno, page) in enumerate(klass.create_pages(doc)):
if pagenos and (pageno not in pagenos):
continue
yield page
if maxpages and maxpages <= pageno+1: break
if maxpages and maxpages <= pageno+1:
break
return

View File

@ -15,7 +15,8 @@ from pdftypes import dict_value
## Exceptions
##
class PDFSyntaxError(PDFException): pass
class PDFSyntaxError(PDFException):
pass
## PDFParser
@ -35,7 +36,7 @@ class PDFParser(PSStackParser):
parser.set_document(doc)
parser.seek(offset)
parser.nextobject()
"""
def __init__(self, fp):
@ -55,12 +56,13 @@ class PDFParser(PSStackParser):
KEYWORD_STREAM = KWD('stream')
KEYWORD_XREF = KWD('xref')
KEYWORD_STARTXREF = KWD('startxref')
def do_keyword(self, pos, token):
"""Handles PDF-related keywords."""
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
self.add_results(*self.pop(1))
elif token is self.KEYWORD_ENDOBJ:
self.add_results(*self.pop(4))
@ -71,7 +73,7 @@ class PDFParser(PSStackParser):
elif token is self.KEYWORD_R:
# reference to indirect object
try:
((_,objid), (_,genno)) = self.pop(2)
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
@ -80,7 +82,7 @@ class PDFParser(PSStackParser):
elif token is self.KEYWORD_STREAM:
# stream object
((_,dic),) = self.pop(1)
((_, dic),) = self.pop(1)
dic = dict_value(dic)
objlen = 0
if not self.fallback:
@ -118,14 +120,14 @@ class PDFParser(PSStackParser):
# XXX limit objlen not to exceed object boundary
if 2 <= self.debug:
print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
(pos, objlen, dic, data[:10])
(pos, objlen, dic, data[:10])
obj = PDFStream(dic, data, self.doc.decipher)
self.push((pos, obj))
else:
# others
self.push((pos, token))
return
@ -153,7 +155,7 @@ class PDFStreamParser(PDFParser):
if token is self.KEYWORD_R:
# reference to indirect object
try:
((_,objid), (_,genno)) = self.pop(2)
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno))
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))

View File

@ -22,13 +22,28 @@ LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
## PDF Objects
##
class PDFObject(PSObject): pass
class PDFObject(PSObject):
pass
class PDFException(PSException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFObjectNotFound(PDFException): pass
class PDFNotImplementedError(PDFException): pass
class PDFException(PSException):
pass
class PDFTypeError(PDFException):
pass
class PDFValueError(PDFException):
pass
class PDFObjectNotFound(PDFException):
pass
class PDFNotImplementedError(PDFException):
pass
## PDFObjRef
@ -65,33 +80,36 @@ def resolve1(x, default=None):
x = x.resolve(default=default)
return x
def resolve_all(x, default=None):
"""Recursively resolves the given object and all the internals.
Make sure there is no indirect reference within the nested object.
This procedure might be slow.
"""
while isinstance(x, PDFObjRef):
x = x.resolve(default=default)
if isinstance(x, list):
x = [ resolve_all(v, default=default) for v in x ]
x = [resolve_all(v, default=default) for v in x]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
for (k, v) in x.iteritems():
x[k] = resolve_all(v, default=default)
return x
def decipher_all(decipher, objid, genno, x):
"""Recursively deciphers the given object.
"""
if isinstance(x, str):
return decipher(objid, genno, x)
if isinstance(x, list):
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
x = [decipher_all(decipher, objid, genno, v) for v in x]
elif isinstance(x, dict):
for (k,v) in x.iteritems():
for (k, v) in x.iteritems():
x[k] = decipher_all(decipher, objid, genno, v)
return x
# Type cheking
def int_value(x):
x = resolve1(x)
@ -101,6 +119,7 @@ def int_value(x):
return 0
return x
def float_value(x):
x = resolve1(x)
if not isinstance(x, float):
@ -109,6 +128,7 @@ def float_value(x):
return 0.0
return x
def num_value(x):
x = resolve1(x)
if not (isinstance(x, int) or isinstance(x, float)):
@ -117,6 +137,7 @@ def num_value(x):
return 0
return x
def str_value(x):
x = resolve1(x)
if not isinstance(x, str):
@ -125,6 +146,7 @@ def str_value(x):
return ''
return x
def list_value(x):
x = resolve1(x)
if not (isinstance(x, list) or isinstance(x, tuple)):
@ -133,6 +155,7 @@ def list_value(x):
return []
return x
def dict_value(x):
x = resolve1(x)
if not isinstance(x, dict):
@ -141,6 +164,7 @@ def dict_value(x):
return {}
return x
def stream_value(x):
x = resolve1(x)
if not isinstance(x, PDFStream):
@ -179,13 +203,13 @@ class PDFStream(PDFObject):
def __contains__(self, name):
return name in self.attrs
def __getitem__(self, name):
return self.attrs[name]
def get(self, name, default=None):
return self.attrs.get(name, default)
def get_any(self, names, default=None):
for name in names:
if name in self.attrs:
@ -194,12 +218,14 @@ class PDFStream(PDFObject):
def get_filters(self):
filters = self.get_any(('F', 'Filter'))
if not filters: return []
if isinstance(filters, list): return filters
return [ filters ]
if not filters:
return []
if isinstance(filters, list):
return filters
return [filters]
def decode(self):
assert self.data is None and self.rawdata != None
assert self.data is None and self.rawdata is not None
data = self.rawdata
if self.decipher:
# Handle encryption

View File

@ -8,11 +8,24 @@ STRICT = 0
## PS Exceptions
##
class PSException(Exception): pass
class PSEOF(PSException): pass
class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass
class PSValueError(PSException): pass
class PSException(Exception):
pass
class PSEOF(PSException):
pass
class PSSyntaxError(PSException):
pass
class PSTypeError(PSException):
pass
class PSValueError(PSException):
pass
## Basic PostScript Types
@ -32,7 +45,7 @@ class PSObject(object):
class PSLiteral(PSObject):
"""A class that represents a PostScript literal.
Postscript literals are used as identifiers, such as
variable names, property names and dictionary keys.
Literals are case sensitive and denoted by a preceding
@ -55,11 +68,11 @@ class PSLiteral(PSObject):
class PSKeyword(PSObject):
"""A class that represents a PostScript keyword.
PostScript keywords are a dozen of predefined words.
Commands and directives in PostScript are expressed by keywords.
They are also used to denote the content boundaries.
Note: Do not create an instance of PSKeyword directly.
Always use PSKeywordTable.intern().
"""
@ -80,7 +93,7 @@ class PSSymbolTable(object):
Interned objects can be checked its identity with "is" operator.
"""
def __init__(self, klass):
self.dict = {}
self.klass = klass
@ -114,6 +127,7 @@ def literal_name(x):
return str(x)
return x.name
def keyword_name(x):
if not isinstance(x, PSKeyword):
if STRICT:
@ -136,7 +150,9 @@ END_NUMBER = re.compile(r'[^0-9]')
END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
ESC_STRING = {'b': 8, 't': 9, 'n': 10, 'f': 12, 'r': 13, '(': 40, ')': 41, '\\': 92}
class PSBaseParser(object):
"""Most basic PostScript parser that performs only tokenization.
@ -190,7 +206,8 @@ class PSBaseParser(object):
return
def fillbuf(self):
if self.charpos < len(self.buf): return
if self.charpos < len(self.buf):
return
# fetch next chunk.
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
@ -242,7 +259,8 @@ class PSBaseParser(object):
pos = max(0, pos-self.BUFSIZ)
self.fp.seek(pos)
s = self.fp.read(prevpos-pos)
if not s: break
if not s:
break
while 1:
n = max(s.rfind('\r'), s.rfind('\n'))
if n == -1:
@ -357,7 +375,7 @@ class PSBaseParser(object):
pass
self._parse1 = self._parse_main
return j
def _parse_float(self, s, i):
m = END_NUMBER.search(s, i)
if not m:
@ -407,7 +425,7 @@ class PSBaseParser(object):
return j+1
if c == ')':
self.paren -= 1
if self.paren: # WTF, they said balanced parens need no special treatment.
if self.paren: # WTF, they said balanced parens need no special treatment.
self._curtoken += c
return j+1
self._add_token(self._curtoken)
@ -493,17 +511,17 @@ class PSStackParser(PSBaseParser):
def push(self, *objs):
self.curstack.extend(objs)
return
def pop(self, n):
objs = self.curstack[-n:]
self.curstack[-n:] = []
return objs
def popall(self):
objs = self.curstack
self.curstack = []
return objs
def add_results(self, *objs):
if 2 <= self.debug:
print >>sys.stderr, 'add_results: %r' % (objs,)
@ -516,11 +534,11 @@ class PSStackParser(PSBaseParser):
if 2 <= self.debug:
print >>sys.stderr, 'start_type: pos=%r, type=%r' % (pos, type)
return
def end_type(self, type):
if self.curtype != type:
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
objs = [ obj for (_,obj) in self.curstack ]
objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop()
if 2 <= self.debug:
print >>sys.stderr, 'end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)
@ -553,7 +571,8 @@ class PSStackParser(PSBaseParser):
try:
self.push(self.end_type('a'))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
elif token == KEYWORD_DICT_BEGIN:
# begin dictionary
self.start_type(pos, 'd')
@ -564,10 +583,11 @@ class PSStackParser(PSBaseParser):
if len(objs) % 2 != 0:
raise PSSyntaxError('Invalid dictionary construct: %r' % objs)
# construct a Python dictionary.
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
self.push((pos, d))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
elif token == KEYWORD_PROC_BEGIN:
# begin proc
self.start_type(pos, 'p')
@ -576,7 +596,8 @@ class PSStackParser(PSBaseParser):
try:
self.push(self.end_type('p'))
except PSTypeError:
if STRICT: raise
if STRICT:
raise
else:
if 2 <= self.debug:
print >>sys.stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
@ -592,9 +613,11 @@ class PSStackParser(PSBaseParser):
return obj
import unittest
## Simplistic Test cases
##
import unittest
class TestPSBaseParser(unittest.TestCase):
TESTDATA = r'''%!PS
@ -630,7 +653,7 @@ func/a/b{(c)do*}def
(242, KWD('def')), (246, KWD('[')), (248, 1), (250, 'z'), (254, KWD('!')),
(256, KWD(']')), (258, KWD('<<')), (261, LIT('foo')), (266, 'bar'),
(272, KWD('>>'))
]
]
OBJS = [
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
@ -641,10 +664,11 @@ func/a/b{(c)do*}def
(191, ''), (194, ' '), (199, '@@ '), (211, '\xab\xcd\x00\x124\x05'),
(230, LIT('a')), (232, LIT('b')), (234, ['c']), (246, [1, 'z']),
(258, {'foo': 'bar'}),
]
]
def get_tokens(self, s):
import StringIO
class MyParser(PSBaseParser):
def flush(self):
self.add_results(*self.popall())
@ -659,6 +683,7 @@ func/a/b{(c)do*}def
def get_objects(self, s):
import StringIO
class MyParser(PSStackParser):
def flush(self):
self.add_results(*self.popall())
@ -683,4 +708,5 @@ func/a/b{(c)do*}def
self.assertEqual(objs, self.OBJS)
return
if __name__ == '__main__': unittest.main()
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,7 @@ def rldecode(data):
'1234567777777abcde'
"""
decoded = []
i=0
i = 0
while i < len(data):
#print "data[%d]=:%d:" % (i,ord(data[i]))
length = ord(data[i])

View File

@ -32,13 +32,13 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
buf += chr(c)
elif pred == '\x02':
# PNG up
for (a,b) in zip(line0,line1):
for (a, b) in zip(line0, line1):
c = (ord(a)+ord(b)) & 255
buf += chr(c)
elif pred == '\x03':
# PNG average (UNTESTED)
c = 0
for (a,b) in zip(line0,line1):
for (a, b) in zip(line0, line1):
c = ((c+ord(a)+ord(b))/2) & 255
buf += chr(c)
else:
@ -52,21 +52,25 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
"""Returns the multiplication of two matrices."""
return (a0*a1+c0*b1, b0*a1+d0*b1,
a0*c1+c0*d1, b0*c1+d0*d1,
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix((a,b,c,d,e,f), (x,y)):
"""Translates a matrix by (x,y)."""
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
def translate_matrix((a, b, c, d, e, f), (x, y)):
"""Translates a matrix by (x, y)."""
return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
"""Applies a matrix to a point."""
return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
"""Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
return (a*p+c*q, b*p+d*q)
@ -79,17 +83,20 @@ def uniq(objs):
"""Eliminates duplicated elements."""
done = set()
for obj in objs:
if obj in done: continue
if obj in done:
continue
done.add(obj)
yield obj
return
# csort
def csort(objs, key=lambda x:x):
def csort(objs, key=lambda x: x):
"""Order-preserving sorting function."""
idxs = dict( (obj,i) for (i,obj) in enumerate(objs) )
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
# fsplit
def fsplit(pred, objs):
"""Split a list into two classes according to the predicate."""
@ -100,7 +107,8 @@ def fsplit(pred, objs):
t.append(obj)
else:
f.append(obj)
return (t,f)
return (t, f)
# drange
def drange(v0, v1, d):
@ -108,16 +116,18 @@ def drange(v0, v1, d):
assert v0 < v1
return xrange(int(v0)/d, int(v1+d)/d)
# get_bound
def get_bound(pts):
"""Compute a minimal rectangle that covers all the points."""
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
for (x,y) in pts:
for (x, y) in pts:
x0 = min(x0, x)
y0 = min(y0, y)
x1 = max(x1, x)
y1 = max(y1, y)
return (x0,y0,x1,y1)
return (x0, y0, x1, y1)
# pick
def pick(seq, func, maxobj=None):
@ -126,9 +136,10 @@ def pick(seq, func, maxobj=None):
for obj in seq:
score = func(obj)
if maxscore is None or maxscore < score:
(maxscore,maxobj) = (score,obj)
(maxscore, maxobj) = (score, obj)
return maxobj
# choplist
def choplist(n, seq):
"""Groups every n elements of the list."""
@ -140,6 +151,7 @@ def choplist(n, seq):
r = []
return
# nunpack
def nunpack(s, default=0):
"""Unpacks 1 to 4 byte integers (big endian)."""
@ -157,59 +169,65 @@ def nunpack(s, default=0):
else:
raise TypeError('invalid length: %d' % l)
# decode_text
PDFDocEncoding = ''.join( unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
PDFDocEncoding = ''.join(unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
"""Decodes a PDFDocEncoding string to Unicode."""
if s.startswith('\xfe\xff'):
return unicode(s[2:], 'utf-16be', 'ignore')
else:
return ''.join( PDFDocEncoding[ord(c)] for c in s )
return ''.join(PDFDocEncoding[ord(c)] for c in s)
# enc
def enc(x, codec='ascii'):
"""Encodes a string for SGML/XML/HTML"""
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
return x.encode(codec, 'xmlcharrefreplace')
def bbox2str((x0,y0,x1,y1)):
def bbox2str((x0, y0, x1, y1)):
return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
def matrix2str((a,b,c,d,e,f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a,b,c,d,e,f)
def matrix2str((a, b, c, d, e, f)):
return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
## Plane
@ -240,14 +258,14 @@ class Plane(object):
def __contains__(self, obj):
return obj in self._objs
def _getrange(self, (x0,y0,x1,y1)):
def _getrange(self, (x0, y0, x1, y1)):
x0 = max(self.x0, x0)
y0 = max(self.y0, y0)
x1 = min(self.x1, x1)
y1 = min(self.y1, y1)
for y in drange(y0, y1, self.gridsize):
for x in drange(x0, x1, self.gridsize):
yield (x,y)
yield (x, y)
return
# extend(objs)
@ -255,7 +273,7 @@ class Plane(object):
for obj in objs:
self.add(obj)
return
# add(obj): place an object.
def add(self, obj):
for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
@ -279,14 +297,17 @@ class Plane(object):
return
# find(): finds objects that are in a certain area.
def find(self, (x0,y0,x1,y1)):
def find(self, (x0, y0, x1, y1)):
done = set()
for k in self._getrange((x0,y0,x1,y1)):
if k not in self._grid: continue
for k in self._getrange((x0, y0, x1, y1)):
if k not in self._grid:
continue
for obj in self._grid[k]:
if obj in done: continue
if obj in done:
continue
done.add(obj)
if (obj.x1 <= x0 or x1 <= obj.x0 or
obj.y1 <= y0 or y1 <= obj.y0): continue
obj.y1 <= y0 or y1 <= obj.y0):
continue
yield obj
return

View File

@ -7,9 +7,9 @@ setup(
version=__version__,
description='PDF parser and analyzer',
long_description='''PDFMiner is a tool for extracting information from PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting
Unlike other PDF-related tools, it focuses entirely on getting
and analyzing text data. PDFMiner allows to obtain
the exact location of texts in a page, as well as
the exact location of texts in a page, as well as
other information such as fonts or lines.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible

View File

@ -50,7 +50,7 @@ class CMapConverter(object):
assert values[0] == 'CID'
encs = values
continue
def put(dmap, code, cid, force=False):
for b in code[:-1]:
b = ord(b)
@ -64,7 +64,7 @@ class CMapConverter(object):
if force or ((b not in dmap) or dmap[b] == cid):
dmap[b] = cid
return
def add(unimap, enc, code):
try:
codec = self.enc2codec[enc]
@ -78,20 +78,20 @@ class CMapConverter(object):
except UnicodeError:
pass
return
def pick(unimap):
    """Return the key of unimap that has the largest value.

    unimap maps single characters to numbers.  When several characters
    share the largest value, the one with the smallest code point wins
    (the sort key is (value, -ord(char)), taken in descending order).
    Raises IndexError if unimap is empty.
    """
    # sorted() accepts the items view returned by dict.items() on
    # Python 3 as well as the list returned on Python 2, and the key
    # function indexes the pair instead of using a tuple-parameter
    # lambda, which Python 3 rejects (PEP 3113).
    ranked = sorted(unimap.items(),
                    key=lambda item: (item[1], -ord(item[0])),
                    reverse=True)
    return ranked[0][0]
cid = int(values[0])
unimap_h = {}
unimap_v = {}
for (enc,value) in zip(encs, values):
if enc == 'CID': continue
if value == '*': continue
# hcodes, vcodes: encoded bytes for each writing mode.
hcodes = []
vcodes = []
@ -121,7 +121,7 @@ class CMapConverter(object):
for code in hcodes:
put(hmap, code, cid)
put(vmap, code, cid)
# Determine the "most popular" candidate.
if unimap_h:
self.cid2unichr_h[cid] = pick(unimap_h)
@ -137,7 +137,7 @@ class CMapConverter(object):
)
fp.write(pickle.dumps(data))
return
def dump_unicodemap(self, fp):
data = dict(
CID2UNICHR_H=self.cid2unichr_h,
@ -151,7 +151,7 @@ def main(argv):
import getopt
import gzip
import os.path
def usage():
    """Print the command-line synopsis for argv[0] and return 100."""
    # A parenthesized single-argument print behaves identically as a
    # Python 2 statement and as a Python 3 function call.
    print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]' % argv[0])
    return 100

View File

@ -25,7 +25,7 @@ def dumpxml(out, obj, codec=None):
if obj is None:
out.write('<null />')
return
if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems():
@ -179,7 +179,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
out.write(fileobj.get_data())
out.close()
return
fp = file(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)

View File

@ -14,7 +14,7 @@ This is an in-house mapping table for some Latin-1 characters
LATIN2ASCII = {
#0x00a0: '',
#0x00a7: '',
# iso-8859-1
0x00c0: 'A`',
0x00c1: "A'",

View File

@ -159,7 +159,7 @@ class WebApp(object):
def convert(self):
self.form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
if (self.method != 'POST' or
if (self.method != 'POST' or
'c' not in self.form or
'f' not in self.form):
self.response_200()