diff --git a/Makefile b/Makefile
index 5a0c224..c2b674b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Makefile for pdfminer
PACKAGE=pdfminer
-VERSION=20080906
+VERSION=20081228
GNUTAR=tar
SVN=svn
PYTHON=python
diff --git a/README.html b/README.html
index 7d8e6a4..e8badfd 100644
--- a/README.html
+++ b/README.html
@@ -14,7 +14,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Sep 6 13:52:10 JST 2008
+Last Modified: Sun Dec 28 20:11:59 JST 2008
@@ -245,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+- 2008/12/28: Better handling of word spacing. Thanks to
- 2008/09/06: A sample pdf2html webapp added.
- 2008/08/30: ASCII85 encoding filter support.
- 2008/07/27: Tagged contents extraction support.
diff --git a/pdflib/cmap.py b/pdflib/cmap.py
index 404714e..41d0f7b 100644
--- a/pdflib/cmap.py
+++ b/pdflib/cmap.py
@@ -2,8 +2,8 @@
import sys
stderr = sys.stderr
from struct import pack, unpack
-from utils import choplist, nunpack
-from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+from pdflib.utils import choplist, nunpack
+from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser
try:
@@ -18,9 +18,10 @@ class CMapError(Exception): pass
## CMap
##
class CMap(object):
+
+ debug = 0
- def __init__(self, debug=0):
- self.debug = debug
+ def __init__(self):
self.code2cid = {}
self.cid2code = {}
self.attrs = {}
@@ -90,8 +91,8 @@ class CMap(object):
##
class CDBCMap(CMap):
- def __init__(self, cdbname, debug=0):
- CMap.__init__(self, debug=debug)
+ def __init__(self, cdbname):
+ CMap.__init__(self)
self.cdbname = cdbname
self.db = cdb.init(cdbname)
return
@@ -176,10 +177,9 @@ class CMapDB(object):
cmapdb = {}
@classmethod
- def initialize(klass, dirname, cdbdirname=None, debug=0):
+ def initialize(klass, dirname, cdbdirname=None):
klass.dirname = dirname
klass.cdbdirname = cdbdirname or dirname
- klass.debug = debug
return
@classmethod
@@ -200,7 +200,7 @@ class CMapDB(object):
print >>stderr, 'Reading: CMap %r...' % fname
cmap = CMap()
fp = file(fname, 'rb')
- CMapParser(cmap, fp, debug=klass.debug).run()
+ CMapParser(cmap, fp).run()
fp.close()
elif not strict:
cmap = CMap() # just create empty cmap
@@ -214,8 +214,8 @@ class CMapDB(object):
##
class CMapParser(PSStackParser):
- def __init__(self, cmap, fp, debug=0):
- PSStackParser.__init__(self, fp, debug=debug)
+ def __init__(self, cmap, fp):
+ PSStackParser.__init__(self, fp)
self.cmap = cmap
self.in_cmap = False
return
diff --git a/pdflib/lzw.py b/pdflib/lzw.py
index d5f3b4e..5013dfd 100755
--- a/pdflib/lzw.py
+++ b/pdflib/lzw.py
@@ -5,10 +5,11 @@ stderr = sys.stderr
## LZWDecoder
##
class LZWDecoder(object):
+
+ debug = 0
- def __init__(self, fp, debug=0):
+ def __init__(self, fp):
self.fp = fp
- self.debug = debug
self.buff = 0
self.bpos = 8
self.nbits = 9
@@ -88,7 +89,8 @@ def main(argv):
input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
fp = StringIO.StringIO(input)
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
- output = ''.join(LZWDecoder(fp, debug=1).run())
+ LZWDecoder.debug = 1
+ output = ''.join(LZWDecoder(fp).run())
print (input, expected, output)
print output == expected
return 0
diff --git a/pdflib/page.py b/pdflib/page.py
index d33dd0c..ad75bdd 100644
--- a/pdflib/page.py
+++ b/pdflib/page.py
@@ -2,8 +2,8 @@
import sys
stdout = sys.stdout
stderr = sys.stderr
-from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
- mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
+from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
+from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
## PageItem
@@ -46,6 +46,7 @@ class TextItem(object):
self.origin = (tx,ty)
self.direction = 0
self.text = ''
+ scaling *= .01
if not self.font.is_vertical():
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
self.direction = 1
@@ -62,12 +63,12 @@ class TextItem(object):
self.text += char
prev = char
dx = 0
- w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+ w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
else:
dx -= t
- w += t * fontsize * .001 * scaling * .01
- self.adv = (w, 0)
+ w += t * fontsize * .001 * scaling
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
+ self.adv = (w, 0)
self.bbox = (tx, ty, tx+w, ty+h)
else:
self.direction = 2
@@ -78,33 +79,33 @@ class TextItem(object):
(disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char
- h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
+ h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
break
for t in text:
if isinstance(t, tuple):
(_,char) = t
self.text += char
- h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
- self.adv = (0, h)
+ h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
tx -= w/2
ty += disp
+ self.adv = (0, h)
self.bbox = (tx, ty+h, tx+w, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
return
def __repr__(self):
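- return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
- (self.matrix, self.font, self.fontsize, self.bbox, self.text))
+ return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
+ (self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))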
## PageAggregator
##
class PageAggregator(PDFDevice):
- def __init__(self, rsrc, debug=0):
- PDFDevice.__init__(self, rsrc, debug=debug)
- self.pageno = 0
+ def __init__(self, rsrc, pageno=1):
+ PDFDevice.__init__(self, rsrc)
+ self.pageno = pageno
self.stack = []
return
@@ -138,6 +139,7 @@ class PageAggregator(PDFDevice):
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
text = []
+ textmatrix = mult_matrix(textmatrix, self.ctm)
for x in seq:
if isinstance(x, int) or isinstance(x, float):
text.append(x)
@@ -154,15 +156,13 @@ class PageAggregator(PDFDevice):
text.append(unc)
if cid == 32 and not font.is_multibyte():
if text:
- item = TextItem(mult_matrix(textmatrix, self.ctm),
- font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
+ item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item)
(dx,dy) = item.adv
dx += textstate.wordspace * textstate.scaling * .01
textmatrix = translate_matrix(textmatrix, (dx, dy))
text = []
if text:
- item = TextItem(mult_matrix(textmatrix, self.ctm),
- font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
+ item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
self.cur_item.add(item)
return
diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py
index f557a21..85bb638 100755
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@@ -2,11 +2,11 @@
import sys
stdout = sys.stdout
stderr = sys.stderr
-from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdfinterp import PDFDevice, PDFResourceManager, \
+from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
+from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined
-from cmap import CMapDB
-from page import PageItem, FigureItem, TextItem, PageAggregator
+from pdflib.cmap import CMapDB
+from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
def enc(x, codec):
@@ -21,8 +21,8 @@ def encprops(props, codec):
## TextConverter
class TextConverter(PageAggregator):
- def __init__(self, rsrc, outfp, codec='ascii', debug=0):
- PageAggregator.__init__(self, rsrc, debug=debug)
+ def __init__(self, rsrc, outfp, codec='ascii'):
+ PageAggregator.__init__(self, rsrc)
self.outfp = outfp
self.codec = codec
return
@@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
##
class HTMLConverter(TextConverter):
- def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, debug=0):
- TextConverter.__init__(self, rsrc, outfp, codec=codec, debug=debug)
+ def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
+ TextConverter.__init__(self, rsrc, outfp, codec=codec)
self.pagenum = pagenum
self.pagepad = pagepad
self.scale = scale
@@ -110,8 +110,8 @@ class HTMLConverter(TextConverter):
##
class TagExtractor(PDFDevice):
- def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
- PDFDevice.__init__(self, rsrc, debug=debug)
+ def __init__(self, rsrc, outfp, codec='utf-8'):
+ PDFDevice.__init__(self, rsrc)
self.outfp = outfp
self.codec = codec
self.pageno = 0
@@ -166,18 +166,18 @@ class TagExtractor(PDFDevice):
# pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass
-def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
- doc = PDFDocument(debug=debug)
+def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
+ doc = PDFDocument()
fp = file(fname, 'rb')
- parser = PDFParser(doc, fp, debug=debug)
+ parser = PDFParser(doc, fp)
try:
doc.initialize(password)
except PDFPasswordIncorrect:
raise TextExtractionNotAllowed('Incorrect password')
if not doc.is_extractable:
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
- interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
- for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
+ interpreter = PDFPageInterpreter(rsrc, device)
+ for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= pageno+1: break
@@ -217,19 +217,25 @@ def main(argv):
elif k == '-t': outtype = v
elif k == '-o': outfp = file(v, 'wb')
#
- CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
- rsrc = PDFResourceManager(debug=debug)
+ CMapDB.debug = debug
+ PDFResourceManager.debug = debug
+ PDFDocument.debug = debug
+ PDFParser.debug = debug
+ PDFPageInterpreter.debug = debug
+ #
+ CMapDB.initialize(cmapdir, cdbcmapdir)
+ rsrc = PDFResourceManager()
if outtype == 'sgml':
- device = SGMLConverter(rsrc, outfp, codec, debug=debug)
+ device = SGMLConverter(rsrc, outfp, codec)
elif outtype == 'html':
- device = HTMLConverter(rsrc, outfp, codec, debug=debug)
+ device = HTMLConverter(rsrc, outfp, codec)
elif outtype == 'tag':
- device = TagExtractor(rsrc, outfp, codec, debug=debug)
+ device = TagExtractor(rsrc, outfp, codec)
else:
return usage()
for fname in args:
convert(rsrc, device, fname, pagenos,
- maxpages=maxpages, password=password, debug=debug)
+ maxpages=maxpages, password=password)
return
if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py
index 7bf3cfc..220743c 100644
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -6,14 +6,14 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
-from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
+from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
-from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
-from utils import choplist
+from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
## Exceptions
@@ -65,25 +65,6 @@ PREDEFINED_COLORSPACE = dict(
}.iteritems())
-## Matrix operations
-##
-def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
- '''Multiplies two matrices.'''
- return (a0*a1+c0*b1, b0*a1+d0*b1,
- a0*c1+c0*d1, b0*c1+d0*d1,
- a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
-
-def translate_matrix((a,b,c,d,e,f), (x,y)):
- return (a,b,c,d,e+x,f+y)
-
-def apply_matrix((a,b,c,d,e,f), (x,y)):
- '''Applies a matrix to coordinates.'''
- return (a*x+c*y+e, b*x+d*y+f)
-
-def apply_matrix_norm((a,b,c,d,e,f), (x,y)):
- return (a*x+c*y, b*x+d*y)
-
-
## Fonts
##
@@ -410,9 +391,9 @@ class PDFResourceManager(object):
such as fonts, images and cmaps so that large objects are not
allocated multiple times.
'''
+ debug = 0
- def __init__(self, debug=0):
- self.debug = debug
+ def __init__(self):
self.fonts = {}
return
@@ -477,10 +458,11 @@ class PDFResourceManager(object):
## PDFDevice
##
class PDFDevice(object):
+
+ debug = 0
- def __init__(self, rsrc, debug=0):
+ def __init__(self, rsrc):
self.rsrc = rsrc
- self.debug = debug
self.ctm = None
return
@@ -520,10 +502,10 @@ class PDFDevice(object):
##
class PDFContentParser(PSStackParser):
- def __init__(self, streams, debug=0):
+ def __init__(self, streams):
self.streams = streams
self.istream = 0
- PSStackParser.__init__(self, None, debug=debug)
+ PSStackParser.__init__(self, None)
return
def fillfp(self):
@@ -607,6 +589,8 @@ class PDFContentParser(PSStackParser):
## Interpreter
##
class PDFPageInterpreter(object):
+
+ debug = 0
class TextState(object):
def __init__(self):
@@ -632,14 +616,13 @@ class PDFPageInterpreter(object):
self.linematrix = (0, 0)
return
- def __init__(self, rsrc, device, debug=0):
+ def __init__(self, rsrc, device):
self.rsrc = rsrc
self.device = device
- self.debug = debug
return
def dup(self):
- return PDFPageInterpreter(self.rsrc, self.device, debug=self.debug)
+ return PDFPageInterpreter(self.rsrc, self.device)
def init_resources(self, resources):
self.fontmap = {}
@@ -940,8 +923,8 @@ class PDFPageInterpreter(object):
def do_TJ(self, seq):
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
textstate = self.textstate
- matrix = translate_matrix(textstate.matrix, textstate.linematrix)
- self.device.render_string(textstate, matrix, seq)
+ textmatrix = translate_matrix(textstate.matrix, textstate.linematrix)
+ self.device.render_string(textstate, textmatrix, seq)
font = textstate.font
s = ''.join( x for x in seq if isinstance(x, str) )
n = sum( x for x in seq if not isinstance(x, str) )
@@ -1030,7 +1013,7 @@ class PDFPageInterpreter(object):
def execute(self, streams):
try:
- parser = PDFContentParser(streams, debug=self.debug)
+ parser = PDFContentParser(streams)
except PSEOF:
# empty page
return
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py
index 49367c7..3319b21 100755
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -410,9 +410,10 @@ class PDFXRefStream(object):
## A PDF parser is associated with the document.
##
class PDFDocument(object):
+
+ debug = 0
- def __init__(self, debug=0):
- self.debug = debug
+ def __init__(self):
self.xrefs = []
self.objs = {}
self.parsed_objs = {}
@@ -569,7 +570,7 @@ class PDFDocument(object):
if strmid in self.parsed_objs:
objs = self.parsed_objs[stream]
else:
- parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
+ parser = PDFObjStrmParser(self, stream.get_data())
objs = []
try:
while 1:
@@ -601,7 +602,7 @@ class PDFDocument(object):
return obj
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
- def get_pages(self, debug=0):
+ def get_pages(self):
if not self.ready:
raise PDFException('PDFDocument is not initialized')
#assert self.xrefs
@@ -611,13 +612,13 @@ class PDFDocument(object):
if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
- if 1 <= debug:
+ if 1 <= self.debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
for c in tree['Kids']:
for x in search(c, tree):
yield x
elif tree.get('Type') == LITERAL_PAGE:
- if 1 <= debug:
+ if 1 <= self.debug:
print >>stderr, 'Page: %r' % tree
yield (obj.objid, tree)
if 'Pages' not in self.catalog: return
@@ -673,8 +674,8 @@ class PDFDocument(object):
##
class PDFParser(PSStackParser):
- def __init__(self, doc, fp, debug=0):
- PSStackParser.__init__(self, fp, debug=debug)
+ def __init__(self, doc, fp):
+ PSStackParser.__init__(self, fp)
self.doc = doc
self.doc.set_parser(self)
return
@@ -837,12 +838,13 @@ class PDFParser(PSStackParser):
## PDFObjStrmParser
##
class PDFObjStrmParser(PDFParser):
- def __init__(self, doc, data, debug=0):
+
+ def __init__(self, doc, data):
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
- PDFParser.__init__(self, doc, StringIO(data), debug=debug)
+ PDFParser.__init__(self, doc, StringIO(data))
return
def flush(self):
diff --git a/pdflib/psparser.py b/pdflib/psparser.py
index d5773c8..2c5660d 100644
--- a/pdflib/psparser.py
+++ b/pdflib/psparser.py
@@ -121,10 +121,11 @@ class PSBaseParser(object):
Most basic PostScript parser that performs only basic tokenization.
'''
BUFSIZ = 4096
+
+ debug = 0
- def __init__(self, fp, debug=0):
+ def __init__(self, fp):
self.fp = fp
- self.debug = debug
self.seek(0)
return
@@ -421,8 +422,8 @@ class PSBaseParser(object):
##
class PSStackParser(PSBaseParser):
- def __init__(self, fp, debug=0):
- PSBaseParser.__init__(self,fp, debug=debug)
+ def __init__(self, fp):
+ PSBaseParser.__init__(self, fp)
self.reset()
return
@@ -582,7 +583,7 @@ func/a/b{(c)do*}def
class MyParser(PSBaseParser):
def flush(self):
self.add_results(*self.popall())
- parser = MyParser(StringIO.StringIO(s), debug=1)
+ parser = MyParser(StringIO.StringIO(s))
r = []
try:
while 1:
@@ -596,7 +597,7 @@ func/a/b{(c)do*}def
class MyParser(PSStackParser):
def flush(self):
self.add_results(*self.popall())
- parser = MyParser(StringIO.StringIO(s), debug=1)
+ parser = MyParser(StringIO.StringIO(s))
r = []
try:
while 1:
diff --git a/pdflib/utils.py b/pdflib/utils.py
index 6364875..2bf0c7a 100644
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@@ -1,5 +1,26 @@
#!/usr/bin/env python
-from struct import pack, unpack
+from struct import unpack
+
+
+## Matrix operations
+## A matrix is the 6-tuple (a,b,c,d,e,f).
+def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
+ '''Composes two matrices: the result applies the first argument, then the second.'''
+ return (a0*a1+c0*b1, b0*a1+d0*b1,
+ a0*c1+c0*d1, b0*c1+d0*d1,
+ a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
+
+def translate_matrix((a,b,c,d,e,f), (x,y)):
+ return (a,b,c,d,e+x,f+y) # only the offset (e,f) is shifted; a,b,c,d are kept as-is
+
+def apply_matrix((a,b,c,d,e,f), (x,y)):
+ '''Applies a matrix to the point (x,y), returning (a*x+c*y+e, b*x+d*y+f).'''
+ return (a*x+c*y+e, b*x+d*y+f)
+
+def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
+ '''Applies a matrix to (p,q) without the offset (e,f); equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))'''
+ return (a*p+c*q, b*p+d*q)
+
## Utilities
##
diff --git a/tools/dumppdf.py b/tools/dumppdf.py
index 0bb0a4a..b385980 100755
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@@ -15,7 +15,7 @@ stderr = sys.stderr
ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]')
def esc(s):
- return ESC_PAT.sub(lambda m:'\\x%02x' % ord(m.group(0)), s)
+ return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
# dumpxml
@@ -96,10 +96,10 @@ def dumpallobjs(out, doc, codec=None):
# dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
- dumpall=False, codec=None, debug=0):
- doc = PDFDocument(debug=debug)
+ dumpall=False, codec=None):
+ doc = PDFDocument()
fp = file(fname, 'rb')
- parser = PDFParser(doc, fp, debug=debug)
+ parser = PDFParser(doc, fp)
doc.initialize(password)
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
for (level,title,dest,a,se) in doc.get_outlines():
@@ -116,10 +116,10 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
# dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='',
- dumpall=False, codec=None, debug=0):
- doc = PDFDocument(debug=debug)
+ dumpall=False, codec=None):
+ doc = PDFDocument()
fp = file(fname, 'rb')
- parser = PDFParser(doc, fp, debug=debug)
+ parser = PDFParser(doc, fp)
doc.initialize(password)
if objids:
for objid in objids:
@@ -174,9 +174,12 @@ def main(argv):
elif k == '-T': proc = dumpoutline
elif k == '-o': outfp = file(v, 'wb')
#
+ PDFDocument.debug = debug
+ PDFParser.debug = debug
+ #
for fname in args:
proc(outfp, fname, objids, pagenos, password=password,
- dumpall=dumpall, codec=codec, debug=debug)
+ dumpall=dumpall, codec=codec)
return
if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/tools/prof.py b/tools/prof.py
new file mode 100644
index 0000000..34746db
--- /dev/null
+++ b/tools/prof.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+import sys
+
+def prof_main(argv):
+    '''Profiles a function with hotshot, or prints a previously saved profile.
+
+    argv is [progname, output.prof, mod.func, args...].  When extra args are
+    given, mod.func is imported and called as func([progname] + args) under
+    the profiler, which writes to output.prof.  With no extra args, the
+    statistics stored in output.prof are loaded and printed instead.
+
+    Returns 100 after printing the usage message when the arguments are
+    malformed, and None otherwise.
+    '''
+    def usage():
+        print 'usage: %s output.prof mod.func [args ...]' % argv[0]
+        return 100
+    args = argv[1:]
+    if len(args) < 2: return usage()
+    profname = args.pop(0)
+    name = args.pop(0)
+    # Without a dot there is no module part to import.
+    if '.' not in name: return usage()
+    (modname, funcname) = name.rsplit('.', 1)
+    import hotshot, hotshot.stats
+    # A non-empty fromlist makes __import__ return the innermost module
+    # instead of the top-level package.
+    func = getattr(__import__(modname, fromlist=[modname]), funcname)
+    if args:
+        args.insert(0, argv[0])
+        prof = hotshot.Profile(profname)
+        try:
+            prof.runcall(lambda : func(args))
+        finally:
+            # Close the profile log even if the profiled function raises.
+            prof.close()
+    else:
+        stats = hotshot.stats.load(profname)
+        stats.strip_dirs()
+        stats.sort_stats('time', 'calls')
+        stats.print_stats(1000)
+    return
+
+if __name__ == '__main__': sys.exit(prof_main(sys.argv))