various bugfixes

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@56 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-01-05 04:40:50 +00:00
parent 71be16febe
commit 24bdd33557
12 changed files with 162 additions and 114 deletions

View File

@ -1,7 +1,7 @@
# Makefile for pdfminer # Makefile for pdfminer
PACKAGE=pdfminer PACKAGE=pdfminer
VERSION=20080906 VERSION=20081228
GNUTAR=tar GNUTAR=tar
SVN=svn SVN=svn
PYTHON=python PYTHON=python

View File

@ -14,7 +14,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sat Sep 6 13:52:10 JST 2008 Last Modified: Sun Dec 28 20:11:59 JST 2008
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -245,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2008/12/28: Better handling of word spacing. Thanks to
<li> 2008/09/06: A sample pdf2html webapp added. <li> 2008/09/06: A sample pdf2html webapp added.
<li> 2008/08/30: ASCII85 encoding filter support. <li> 2008/08/30: ASCII85 encoding filter support.
<li> 2008/07/27: Tagged contents extraction support. <li> 2008/07/27: Tagged contents extraction support.

View File

@ -2,8 +2,8 @@
import sys import sys
stderr = sys.stderr stderr = sys.stderr
from struct import pack, unpack from struct import pack, unpack
from utils import choplist, nunpack from pdflib.utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \ PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser PSStackParser
try: try:
@ -18,9 +18,10 @@ class CMapError(Exception): pass
## CMap ## CMap
## ##
class CMap(object): class CMap(object):
debug = 0
def __init__(self, debug=0): def __init__(self):
self.debug = debug
self.code2cid = {} self.code2cid = {}
self.cid2code = {} self.cid2code = {}
self.attrs = {} self.attrs = {}
@ -90,8 +91,8 @@ class CMap(object):
## ##
class CDBCMap(CMap): class CDBCMap(CMap):
def __init__(self, cdbname, debug=0): def __init__(self, cdbname):
CMap.__init__(self, debug=debug) CMap.__init__(self)
self.cdbname = cdbname self.cdbname = cdbname
self.db = cdb.init(cdbname) self.db = cdb.init(cdbname)
return return
@ -176,10 +177,9 @@ class CMapDB(object):
cmapdb = {} cmapdb = {}
@classmethod @classmethod
def initialize(klass, dirname, cdbdirname=None, debug=0): def initialize(klass, dirname, cdbdirname=None):
klass.dirname = dirname klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname klass.cdbdirname = cdbdirname or dirname
klass.debug = debug
return return
@classmethod @classmethod
@ -200,7 +200,7 @@ class CMapDB(object):
print >>stderr, 'Reading: CMap %r...' % fname print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap() cmap = CMap()
fp = file(fname, 'rb') fp = file(fname, 'rb')
CMapParser(cmap, fp, debug=klass.debug).run() CMapParser(cmap, fp).run()
fp.close() fp.close()
elif not strict: elif not strict:
cmap = CMap() # just create empty cmap cmap = CMap() # just create empty cmap
@ -214,8 +214,8 @@ class CMapDB(object):
## ##
class CMapParser(PSStackParser): class CMapParser(PSStackParser):
def __init__(self, cmap, fp, debug=0): def __init__(self, cmap, fp):
PSStackParser.__init__(self, fp, debug=debug) PSStackParser.__init__(self, fp)
self.cmap = cmap self.cmap = cmap
self.in_cmap = False self.in_cmap = False
return return

View File

@ -5,10 +5,11 @@ stderr = sys.stderr
## LZWDecoder ## LZWDecoder
## ##
class LZWDecoder(object): class LZWDecoder(object):
debug = 0
def __init__(self, fp, debug=0): def __init__(self, fp):
self.fp = fp self.fp = fp
self.debug = debug
self.buff = 0 self.buff = 0
self.bpos = 8 self.bpos = 8
self.nbits = 9 self.nbits = 9
@ -88,7 +89,8 @@ def main(argv):
input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01' input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
fp = StringIO.StringIO(input) fp = StringIO.StringIO(input)
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
output = ''.join(LZWDecoder(fp, debug=1).run()) LZWDecoder.debug = 1
output = ''.join(LZWDecoder(fp).run())
print (input, expected, output) print (input, expected, output)
print output == expected print output == expected
return 0 return 0

View File

@ -2,8 +2,8 @@
import sys import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \ from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
## PageItem ## PageItem
@ -46,6 +46,7 @@ class TextItem(object):
self.origin = (tx,ty) self.origin = (tx,ty)
self.direction = 0 self.direction = 0
self.text = '' self.text = ''
scaling *= .01
if not self.font.is_vertical(): if not self.font.is_vertical():
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
self.direction = 1 self.direction = 1
@ -62,12 +63,12 @@ class TextItem(object):
self.text += char self.text += char
prev = char prev = char
dx = 0 dx = 0
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01 w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
else: else:
dx -= t dx -= t
w += t * fontsize * .001 * scaling * .01 w += t * fontsize * .001 * scaling
self.adv = (w, 0)
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize)) (w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
self.adv = (w, 0)
self.bbox = (tx, ty, tx+w, ty+h) self.bbox = (tx, ty, tx+w, ty+h)
else: else:
self.direction = 2 self.direction = 2
@ -78,33 +79,33 @@ class TextItem(object):
(disp,char) = t (disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01 h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
break break
for t in text: for t in text:
if isinstance(t, tuple): if isinstance(t, tuple):
(_,char) = t (_,char) = t
self.text += char self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01 h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
self.adv = (0, h)
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h)) (w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
tx -= w/2 tx -= w/2
ty += disp ty += disp
self.adv = (0, h)
self.bbox = (tx, ty+h, tx+w, ty) self.bbox = (tx, ty+h, tx+w, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize))) self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
return return
def __repr__(self): def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' % return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
(self.matrix, self.font, self.fontsize, self.bbox, self.text)) (self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))
## PageAggregator ## PageAggregator
## ##
class PageAggregator(PDFDevice): class PageAggregator(PDFDevice):
def __init__(self, rsrc, debug=0): def __init__(self, rsrc, pageno=1):
PDFDevice.__init__(self, rsrc, debug=debug) PDFDevice.__init__(self, rsrc)
self.pageno = 0 self.pageno = pageno
self.stack = [] self.stack = []
return return
@ -138,6 +139,7 @@ class PageAggregator(PDFDevice):
def render_string(self, textstate, textmatrix, seq): def render_string(self, textstate, textmatrix, seq):
font = textstate.font font = textstate.font
text = [] text = []
textmatrix = mult_matrix(textmatrix, self.ctm)
for x in seq: for x in seq:
if isinstance(x, int) or isinstance(x, float): if isinstance(x, int) or isinstance(x, float):
text.append(x) text.append(x)
@ -154,15 +156,13 @@ class PageAggregator(PDFDevice):
text.append(unc) text.append(unc)
if cid == 32 and not font.is_multibyte(): if cid == 32 and not font.is_multibyte():
if text: if text:
item = TextItem(mult_matrix(textmatrix, self.ctm), item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item) self.cur_item.add(item)
(dx,dy) = item.adv (dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01 dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy)) textmatrix = translate_matrix(textmatrix, (dx, dy))
text = [] text = []
if text: if text:
item = TextItem(mult_matrix(textmatrix, self.ctm), item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item) self.cur_item.add(item)
return return

View File

@ -2,11 +2,11 @@
import sys import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFDevice, PDFResourceManager, \ from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined PDFPageInterpreter, PDFUnicodeNotDefined
from cmap import CMapDB from pdflib.cmap import CMapDB
from page import PageItem, FigureItem, TextItem, PageAggregator from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
def enc(x, codec): def enc(x, codec):
@ -21,8 +21,8 @@ def encprops(props, codec):
## TextConverter ## TextConverter
class TextConverter(PageAggregator): class TextConverter(PageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', debug=0): def __init__(self, rsrc, outfp, codec='ascii'):
PageAggregator.__init__(self, rsrc, debug=debug) PageAggregator.__init__(self, rsrc)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
## ##
class HTMLConverter(TextConverter): class HTMLConverter(TextConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, debug=0): def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
TextConverter.__init__(self, rsrc, outfp, codec=codec, debug=debug) TextConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum self.pagenum = pagenum
self.pagepad = pagepad self.pagepad = pagepad
self.scale = scale self.scale = scale
@ -110,8 +110,8 @@ class HTMLConverter(TextConverter):
## ##
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
def __init__(self, rsrc, outfp, codec='utf-8', debug=0): def __init__(self, rsrc, outfp, codec='utf-8'):
PDFDevice.__init__(self, rsrc, debug=debug) PDFDevice.__init__(self, rsrc)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.pageno = 0 self.pageno = 0
@ -166,18 +166,18 @@ class TagExtractor(PDFDevice):
# pdf2txt # pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass class TextExtractionNotAllowed(RuntimeError): pass
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0): def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
doc = PDFDocument(debug=debug) doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp)
try: try:
doc.initialize(password) doc.initialize(password)
except PDFPasswordIncorrect: except PDFPasswordIncorrect:
raise TextExtractionNotAllowed('Incorrect password') raise TextExtractionNotAllowed('Incorrect password')
if not doc.is_extractable: if not doc.is_extractable:
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
interpreter = PDFPageInterpreter(rsrc, device, debug=debug) interpreter = PDFPageInterpreter(rsrc, device)
for (pageno,page) in enumerate(doc.get_pages(debug=debug)): for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page) interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break if maxpages and maxpages <= pageno+1: break
@ -217,19 +217,25 @@ def main(argv):
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
# #
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) CMapDB.debug = debug
rsrc = PDFResourceManager(debug=debug) PDFResourceManager.debug = debug
PDFDocument.debug = debug
PDFParser.debug = debug
PDFPageInterpreter.debug = debug
#
CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager()
if outtype == 'sgml': if outtype == 'sgml':
device = SGMLConverter(rsrc, outfp, codec, debug=debug) device = SGMLConverter(rsrc, outfp, codec)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec, debug=debug) device = HTMLConverter(rsrc, outfp, codec)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec, debug=debug) device = TagExtractor(rsrc, outfp, codec)
else: else:
return usage() return usage()
for fname in args: for fname in args:
convert(rsrc, device, fname, pagenos, convert(rsrc, device, fname, pagenos,
maxpages=maxpages, password=password, debug=debug) maxpages=maxpages, password=password)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -6,14 +6,14 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSStackParser, PSLiteral, PSKeyword, STRICT, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \ from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
int_value, float_value, num_value, \ int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import choplist from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
## Exceptions ## Exceptions
@ -65,25 +65,6 @@ PREDEFINED_COLORSPACE = dict(
}.iteritems()) }.iteritems())
## Matrix operations
##
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Each matrix is a 6-element sequence (a, b, c, d, e, f). Applying
    such a matrix to a point (x, y) yields (a*x+c*y+e, b*x+d*y+f).
    The returned 6-tuple has the same effect on a point as applying
    m1 first and then m0.
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    '''Returns matrix m with its translation part shifted by v.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a pair (x, y).
    Only the last two components change: the result is
    (a, b, c, d, e+x, f+y).
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, e+x, f+y)
def apply_matrix(m, v):
    '''Applies a matrix to coordinates.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a point (x, y).
    Returns the transformed point (a*x+c*y+e, b*x+d*y+f).
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''Applies a matrix to a vector, ignoring the translation part.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a pair (x, y).
    Returns (a*x+c*y, b*x+d*y); the e and f components are not used.
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y, b*x+d*y)
## Fonts ## Fonts
## ##
@ -410,9 +391,9 @@ class PDFResourceManager(object):
such as fonts, images and cmaps so that large objects are not such as fonts, images and cmaps so that large objects are not
allocated multiple times. allocated multiple times.
''' '''
debug = 0
def __init__(self, debug=0): def __init__(self):
self.debug = debug
self.fonts = {} self.fonts = {}
return return
@ -477,10 +458,11 @@ class PDFResourceManager(object):
## PDFDevice ## PDFDevice
## ##
class PDFDevice(object): class PDFDevice(object):
debug = 0
def __init__(self, rsrc, debug=0): def __init__(self, rsrc):
self.rsrc = rsrc self.rsrc = rsrc
self.debug = debug
self.ctm = None self.ctm = None
return return
@ -520,10 +502,10 @@ class PDFDevice(object):
## ##
class PDFContentParser(PSStackParser): class PDFContentParser(PSStackParser):
def __init__(self, streams, debug=0): def __init__(self, streams):
self.streams = streams self.streams = streams
self.istream = 0 self.istream = 0
PSStackParser.__init__(self, None, debug=debug) PSStackParser.__init__(self, None)
return return
def fillfp(self): def fillfp(self):
@ -607,6 +589,8 @@ class PDFContentParser(PSStackParser):
## Interpreter ## Interpreter
## ##
class PDFPageInterpreter(object): class PDFPageInterpreter(object):
debug = 0
class TextState(object): class TextState(object):
def __init__(self): def __init__(self):
@ -632,14 +616,13 @@ class PDFPageInterpreter(object):
self.linematrix = (0, 0) self.linematrix = (0, 0)
return return
def __init__(self, rsrc, device, debug=0): def __init__(self, rsrc, device):
self.rsrc = rsrc self.rsrc = rsrc
self.device = device self.device = device
self.debug = debug
return return
def dup(self): def dup(self):
return PDFPageInterpreter(self.rsrc, self.device, debug=self.debug) return PDFPageInterpreter(self.rsrc, self.device)
def init_resources(self, resources): def init_resources(self, resources):
self.fontmap = {} self.fontmap = {}
@ -940,8 +923,8 @@ class PDFPageInterpreter(object):
def do_TJ(self, seq): def do_TJ(self, seq):
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
textstate = self.textstate textstate = self.textstate
matrix = translate_matrix(textstate.matrix, textstate.linematrix) textmatrix = translate_matrix(textstate.matrix, textstate.linematrix)
self.device.render_string(textstate, matrix, seq) self.device.render_string(textstate, textmatrix, seq)
font = textstate.font font = textstate.font
s = ''.join( x for x in seq if isinstance(x, str) ) s = ''.join( x for x in seq if isinstance(x, str) )
n = sum( x for x in seq if not isinstance(x, str) ) n = sum( x for x in seq if not isinstance(x, str) )
@ -1030,7 +1013,7 @@ class PDFPageInterpreter(object):
def execute(self, streams): def execute(self, streams):
try: try:
parser = PDFContentParser(streams, debug=self.debug) parser = PDFContentParser(streams)
except PSEOF: except PSEOF:
# empty page # empty page
return return

View File

@ -410,9 +410,10 @@ class PDFXRefStream(object):
## A PDF parser is associated with the document. ## A PDF parser is associated with the document.
## ##
class PDFDocument(object): class PDFDocument(object):
debug = 0
def __init__(self, debug=0): def __init__(self):
self.debug = debug
self.xrefs = [] self.xrefs = []
self.objs = {} self.objs = {}
self.parsed_objs = {} self.parsed_objs = {}
@ -569,7 +570,7 @@ class PDFDocument(object):
if strmid in self.parsed_objs: if strmid in self.parsed_objs:
objs = self.parsed_objs[stream] objs = self.parsed_objs[stream]
else: else:
parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug) parser = PDFObjStrmParser(self, stream.get_data())
objs = [] objs = []
try: try:
while 1: while 1:
@ -601,7 +602,7 @@ class PDFDocument(object):
return obj return obj
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self, debug=0): def get_pages(self):
if not self.ready: if not self.ready:
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
#assert self.xrefs #assert self.xrefs
@ -611,13 +612,13 @@ class PDFDocument(object):
if k in self.INHERITABLE_ATTRS and k not in tree: if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree: if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
if 1 <= debug: if 1 <= self.debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids'] print >>stderr, 'Pages: Kids=%r' % tree['Kids']
for c in tree['Kids']: for c in tree['Kids']:
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree.get('Type') == LITERAL_PAGE: elif tree.get('Type') == LITERAL_PAGE:
if 1 <= debug: if 1 <= self.debug:
print >>stderr, 'Page: %r' % tree print >>stderr, 'Page: %r' % tree
yield (obj.objid, tree) yield (obj.objid, tree)
if 'Pages' not in self.catalog: return if 'Pages' not in self.catalog: return
@ -673,8 +674,8 @@ class PDFDocument(object):
## ##
class PDFParser(PSStackParser): class PDFParser(PSStackParser):
def __init__(self, doc, fp, debug=0): def __init__(self, doc, fp):
PSStackParser.__init__(self, fp, debug=debug) PSStackParser.__init__(self, fp)
self.doc = doc self.doc = doc
self.doc.set_parser(self) self.doc.set_parser(self)
return return
@ -837,12 +838,13 @@ class PDFParser(PSStackParser):
## PDFObjStrmParser ## PDFObjStrmParser
## ##
class PDFObjStrmParser(PDFParser): class PDFObjStrmParser(PDFParser):
def __init__(self, doc, data, debug=0):
def __init__(self, doc, data):
try: try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
PDFParser.__init__(self, doc, StringIO(data), debug=debug) PDFParser.__init__(self, doc, StringIO(data))
return return
def flush(self): def flush(self):

View File

@ -121,10 +121,11 @@ class PSBaseParser(object):
Most basic PostScript parser that performs only basic tokenization. Most basic PostScript parser that performs only basic tokenization.
''' '''
BUFSIZ = 4096 BUFSIZ = 4096
debug = 0
def __init__(self, fp, debug=0): def __init__(self, fp):
self.fp = fp self.fp = fp
self.debug = debug
self.seek(0) self.seek(0)
return return
@ -421,8 +422,8 @@ class PSBaseParser(object):
## ##
class PSStackParser(PSBaseParser): class PSStackParser(PSBaseParser):
def __init__(self, fp, debug=0): def __init__(self, fp):
PSBaseParser.__init__(self,fp, debug=debug) PSBaseParser.__init__(self, fp)
self.reset() self.reset()
return return
@ -582,7 +583,7 @@ func/a/b{(c)do*}def
class MyParser(PSBaseParser): class MyParser(PSBaseParser):
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
parser = MyParser(StringIO.StringIO(s), debug=1) parser = MyParser(StringIO.StringIO(s))
r = [] r = []
try: try:
while 1: while 1:
@ -596,7 +597,7 @@ func/a/b{(c)do*}def
class MyParser(PSStackParser): class MyParser(PSStackParser):
def flush(self): def flush(self):
self.add_results(*self.popall()) self.add_results(*self.popall())
parser = MyParser(StringIO.StringIO(s), debug=1) parser = MyParser(StringIO.StringIO(s))
r = [] r = []
try: try:
while 1: while 1:

View File

@ -1,5 +1,26 @@
#!/usr/bin/env python #!/usr/bin/env python
from struct import pack, unpack from struct import unpack
## Matrix operations
##
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Each matrix is a 6-element sequence (a, b, c, d, e, f). Applying
    such a matrix to a point (x, y) yields (a*x+c*y+e, b*x+d*y+f).
    The returned 6-tuple has the same effect on a point as applying
    m1 first and then m0.
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def translate_matrix(m, v):
    '''Returns matrix m with its translation part shifted by v.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a pair (x, y).
    Only the last two components change: the result is
    (a, b, c, d, e+x, f+y).
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, e+x, f+y)
def apply_matrix(m, v):
    '''Applies a matrix to coordinates.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a point (x, y).
    Returns the transformed point (a*x+c*y+e, b*x+d*y+f).
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def apply_matrix_norm(m, v):
    '''Applies a matrix to a vector, ignoring the translation part.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a pair (p, q).
    Returns (a*p+c*q, b*p+d*q), i.e. the equivalent of
    apply_matrix(M, (p,q)) - apply_matrix(M, (0,0)); the e and f
    components are not used.
    '''
    # Unpacked explicitly: tuple parameters in a signature are
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    (p, q) = v
    return (a*p+c*q, b*p+d*q)
## Utilities ## Utilities
## ##

View File

@ -15,7 +15,7 @@ stderr = sys.stderr
ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]') ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]')
def esc(s): def esc(s):
return ESC_PAT.sub(lambda m:'\\x%02x' % ord(m.group(0)), s) return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
# dumpxml # dumpxml
@ -96,10 +96,10 @@ def dumpallobjs(out, doc, codec=None):
# dumpoutline # dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='', def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, debug=0): dumpall=False, codec=None):
doc = PDFDocument(debug=debug) doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp)
doc.initialize(password) doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) ) pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
for (level,title,dest,a,se) in doc.get_outlines(): for (level,title,dest,a,se) in doc.get_outlines():
@ -116,10 +116,10 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, debug=0): dumpall=False, codec=None):
doc = PDFDocument(debug=debug) doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp)
doc.initialize(password) doc.initialize(password)
if objids: if objids:
for objid in objids: for objid in objids:
@ -174,9 +174,12 @@ def main(argv):
elif k == '-T': proc = dumpoutline elif k == '-T': proc = dumpoutline
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
# #
PDFDocument.debug = debug
PDFParser.debug = debug
#
for fname in args: for fname in args:
proc(outfp, fname, objids, pagenos, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec, debug=debug) dumpall=dumpall, codec=codec)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

29
tools/prof.py Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
import sys
def prof_main(argv):
    '''Profiles a function with hotshot, or prints a saved profile.

    Command line: argv[0] output.prof mod.func [args ...]

    When extra arguments follow mod.func, the named function is imported
    and called once as func([argv[0]] + args) under a hotshot profiler
    that writes to output.prof. Its return value is discarded.
    When no extra arguments are given, output.prof is loaded instead and
    up to 1000 entries are printed, sorted by time and then call count.

    Returns 100 after printing a usage message when the arguments are
    malformed (fewer than two arguments, or a target that is not of the
    form mod.func); otherwise returns None.
    '''
    def usage():
        print('usage: %s output.prof mod.func [args ...]' % argv[0])
        return 100
    args = argv[1:]
    if len(args) < 2: return usage()
    profname = args.pop(0)
    name = args.pop(0)
    # Split on the last dot; a missing dot or an empty half is a usage
    # error rather than an uncaught ValueError/AttributeError.
    (modname, _, funcname) = name.rpartition('.')
    if not modname or not funcname: return usage()
    # Imported only once the arguments are known to be valid.
    import hotshot, hotshot.stats
    # A non-empty fromlist makes __import__ return the named submodule
    # itself instead of the top-level package.
    func = getattr(__import__(modname, fromlist=[modname]), funcname)
    if args:
        # The profiled function receives an argv-style list.
        args.insert(0, argv[0])
        profiler = hotshot.Profile(profname)
        try:
            profiler.runcall(lambda : func(args))
        finally:
            # Close the profile log even if the profiled call raises.
            profiler.close()
    else:
        stats = hotshot.stats.load(profname)
        stats.strip_dirs()
        stats.sort_stats('time', 'calls')
        stats.print_stats(1000)
    return
if __name__ == '__main__': sys.exit(prof_main(sys.argv))