various bugfixes
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@56 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
71be16febe
commit
24bdd33557
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
||||||
# Makefile for pdfminer
|
# Makefile for pdfminer
|
||||||
|
|
||||||
PACKAGE=pdfminer
|
PACKAGE=pdfminer
|
||||||
VERSION=20080906
|
VERSION=20081228
|
||||||
GNUTAR=tar
|
GNUTAR=tar
|
||||||
SVN=svn
|
SVN=svn
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
|
|
|
@ -14,7 +14,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat Sep 6 13:52:10 JST 2008
|
Last Modified: Sun Dec 28 20:11:59 JST 2008
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -245,6 +245,7 @@ no stream header is displayed for the ease of saving it to a file.
|
||||||
<hr noshade>
|
<hr noshade>
|
||||||
<h2>Changes</h2>
|
<h2>Changes</h2>
|
||||||
<ul>
|
<ul>
|
||||||
|
<li> 2008/12/28: Better handling of word spacing. Thanks to
|
||||||
<li> 2008/09/06: A sample pdf2html webapp added.
|
<li> 2008/09/06: A sample pdf2html webapp added.
|
||||||
<li> 2008/08/30: ASCII85 encoding filter support.
|
<li> 2008/08/30: ASCII85 encoding filter support.
|
||||||
<li> 2008/07/27: Tagged contents extraction support.
|
<li> 2008/07/27: Tagged contents extraction support.
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
import sys
|
import sys
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
from utils import choplist, nunpack
|
from pdflib.utils import choplist, nunpack
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||||
PSStackParser
|
PSStackParser
|
||||||
try:
|
try:
|
||||||
|
@ -18,9 +18,10 @@ class CMapError(Exception): pass
|
||||||
## CMap
|
## CMap
|
||||||
##
|
##
|
||||||
class CMap(object):
|
class CMap(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self):
|
||||||
self.debug = debug
|
|
||||||
self.code2cid = {}
|
self.code2cid = {}
|
||||||
self.cid2code = {}
|
self.cid2code = {}
|
||||||
self.attrs = {}
|
self.attrs = {}
|
||||||
|
@ -90,8 +91,8 @@ class CMap(object):
|
||||||
##
|
##
|
||||||
class CDBCMap(CMap):
|
class CDBCMap(CMap):
|
||||||
|
|
||||||
def __init__(self, cdbname, debug=0):
|
def __init__(self, cdbname):
|
||||||
CMap.__init__(self, debug=debug)
|
CMap.__init__(self)
|
||||||
self.cdbname = cdbname
|
self.cdbname = cdbname
|
||||||
self.db = cdb.init(cdbname)
|
self.db = cdb.init(cdbname)
|
||||||
return
|
return
|
||||||
|
@ -176,10 +177,9 @@ class CMapDB(object):
|
||||||
cmapdb = {}
|
cmapdb = {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def initialize(klass, dirname, cdbdirname=None, debug=0):
|
def initialize(klass, dirname, cdbdirname=None):
|
||||||
klass.dirname = dirname
|
klass.dirname = dirname
|
||||||
klass.cdbdirname = cdbdirname or dirname
|
klass.cdbdirname = cdbdirname or dirname
|
||||||
klass.debug = debug
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -200,7 +200,7 @@ class CMapDB(object):
|
||||||
print >>stderr, 'Reading: CMap %r...' % fname
|
print >>stderr, 'Reading: CMap %r...' % fname
|
||||||
cmap = CMap()
|
cmap = CMap()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
CMapParser(cmap, fp, debug=klass.debug).run()
|
CMapParser(cmap, fp).run()
|
||||||
fp.close()
|
fp.close()
|
||||||
elif not strict:
|
elif not strict:
|
||||||
cmap = CMap() # just create empty cmap
|
cmap = CMap() # just create empty cmap
|
||||||
|
@ -214,8 +214,8 @@ class CMapDB(object):
|
||||||
##
|
##
|
||||||
class CMapParser(PSStackParser):
|
class CMapParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, cmap, fp, debug=0):
|
def __init__(self, cmap, fp):
|
||||||
PSStackParser.__init__(self, fp, debug=debug)
|
PSStackParser.__init__(self, fp)
|
||||||
self.cmap = cmap
|
self.cmap = cmap
|
||||||
self.in_cmap = False
|
self.in_cmap = False
|
||||||
return
|
return
|
||||||
|
|
|
@ -5,10 +5,11 @@ stderr = sys.stderr
|
||||||
## LZWDecoder
|
## LZWDecoder
|
||||||
##
|
##
|
||||||
class LZWDecoder(object):
|
class LZWDecoder(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.debug = debug
|
|
||||||
self.buff = 0
|
self.buff = 0
|
||||||
self.bpos = 8
|
self.bpos = 8
|
||||||
self.nbits = 9
|
self.nbits = 9
|
||||||
|
@ -88,7 +89,8 @@ def main(argv):
|
||||||
input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
|
input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01'
|
||||||
fp = StringIO.StringIO(input)
|
fp = StringIO.StringIO(input)
|
||||||
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
||||||
output = ''.join(LZWDecoder(fp, debug=1).run())
|
LZWDecoder.debug = 1
|
||||||
|
output = ''.join(LZWDecoder(fp).run())
|
||||||
print (input, expected, output)
|
print (input, expected, output)
|
||||||
print output == expected
|
print output == expected
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
import sys
|
import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \
|
from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
|
||||||
mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
||||||
|
|
||||||
|
|
||||||
## PageItem
|
## PageItem
|
||||||
|
@ -46,6 +46,7 @@ class TextItem(object):
|
||||||
self.origin = (tx,ty)
|
self.origin = (tx,ty)
|
||||||
self.direction = 0
|
self.direction = 0
|
||||||
self.text = ''
|
self.text = ''
|
||||||
|
scaling *= .01
|
||||||
if not self.font.is_vertical():
|
if not self.font.is_vertical():
|
||||||
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
|
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
|
@ -62,12 +63,12 @@ class TextItem(object):
|
||||||
self.text += char
|
self.text += char
|
||||||
prev = char
|
prev = char
|
||||||
dx = 0
|
dx = 0
|
||||||
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
||||||
else:
|
else:
|
||||||
dx -= t
|
dx -= t
|
||||||
w += t * fontsize * .001 * scaling * .01
|
w += t * fontsize * .001 * scaling
|
||||||
self.adv = (w, 0)
|
|
||||||
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
|
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
|
||||||
|
self.adv = (w, 0)
|
||||||
self.bbox = (tx, ty, tx+w, ty+h)
|
self.bbox = (tx, ty, tx+w, ty+h)
|
||||||
else:
|
else:
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
|
@ -78,33 +79,33 @@ class TextItem(object):
|
||||||
(disp,char) = t
|
(disp,char) = t
|
||||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||||
self.text += char
|
self.text += char
|
||||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
||||||
break
|
break
|
||||||
for t in text:
|
for t in text:
|
||||||
if isinstance(t, tuple):
|
if isinstance(t, tuple):
|
||||||
(_,char) = t
|
(_,char) = t
|
||||||
self.text += char
|
self.text += char
|
||||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01
|
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
||||||
self.adv = (0, h)
|
|
||||||
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
|
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
|
||||||
tx -= w/2
|
tx -= w/2
|
||||||
ty += disp
|
ty += disp
|
||||||
|
self.adv = (0, h)
|
||||||
self.bbox = (tx, ty+h, tx+w, ty)
|
self.bbox = (tx, ty+h, tx+w, ty)
|
||||||
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
|
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r>' %
|
return ('<text matrix=%r font=%r fontsize=%r bbox=%r text=%r adv=%r>' %
|
||||||
(self.matrix, self.font, self.fontsize, self.bbox, self.text))
|
(self.matrix, self.font, self.fontsize, self.bbox, self.text, self.adv))
|
||||||
|
|
||||||
|
|
||||||
## PageAggregator
|
## PageAggregator
|
||||||
##
|
##
|
||||||
class PageAggregator(PDFDevice):
|
class PageAggregator(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, debug=0):
|
def __init__(self, rsrc, pageno=1):
|
||||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.pageno = 0
|
self.pageno = pageno
|
||||||
self.stack = []
|
self.stack = []
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -138,6 +139,7 @@ class PageAggregator(PDFDevice):
|
||||||
def render_string(self, textstate, textmatrix, seq):
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
text = []
|
text = []
|
||||||
|
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||||
for x in seq:
|
for x in seq:
|
||||||
if isinstance(x, int) or isinstance(x, float):
|
if isinstance(x, int) or isinstance(x, float):
|
||||||
text.append(x)
|
text.append(x)
|
||||||
|
@ -154,15 +156,13 @@ class PageAggregator(PDFDevice):
|
||||||
text.append(unc)
|
text.append(unc)
|
||||||
if cid == 32 and not font.is_multibyte():
|
if cid == 32 and not font.is_multibyte():
|
||||||
if text:
|
if text:
|
||||||
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||||
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
(dx,dy) = item.adv
|
(dx,dy) = item.adv
|
||||||
dx += textstate.wordspace * textstate.scaling * .01
|
dx += textstate.wordspace * textstate.scaling * .01
|
||||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||||
text = []
|
text = []
|
||||||
if text:
|
if text:
|
||||||
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||||
font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
|
||||||
self.cur_item.add(item)
|
self.cur_item.add(item)
|
||||||
return
|
return
|
||||||
|
|
|
@ -2,11 +2,11 @@
|
||||||
import sys
|
import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||||
from pdfinterp import PDFDevice, PDFResourceManager, \
|
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
|
||||||
PDFPageInterpreter, PDFUnicodeNotDefined
|
PDFPageInterpreter, PDFUnicodeNotDefined
|
||||||
from cmap import CMapDB
|
from pdflib.cmap import CMapDB
|
||||||
from page import PageItem, FigureItem, TextItem, PageAggregator
|
from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
|
||||||
|
|
||||||
|
|
||||||
def enc(x, codec):
|
def enc(x, codec):
|
||||||
|
@ -21,8 +21,8 @@ def encprops(props, codec):
|
||||||
## TextConverter
|
## TextConverter
|
||||||
class TextConverter(PageAggregator):
|
class TextConverter(PageAggregator):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='ascii', debug=0):
|
def __init__(self, rsrc, outfp, codec='ascii'):
|
||||||
PageAggregator.__init__(self, rsrc, debug=debug)
|
PageAggregator.__init__(self, rsrc)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
@ -60,8 +60,8 @@ class SGMLConverter(TextConverter):
|
||||||
##
|
##
|
||||||
class HTMLConverter(TextConverter):
|
class HTMLConverter(TextConverter):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, debug=0):
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1):
|
||||||
TextConverter.__init__(self, rsrc, outfp, codec=codec, debug=debug)
|
TextConverter.__init__(self, rsrc, outfp, codec=codec)
|
||||||
self.pagenum = pagenum
|
self.pagenum = pagenum
|
||||||
self.pagepad = pagepad
|
self.pagepad = pagepad
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
|
@ -110,8 +110,8 @@ class HTMLConverter(TextConverter):
|
||||||
##
|
##
|
||||||
class TagExtractor(PDFDevice):
|
class TagExtractor(PDFDevice):
|
||||||
|
|
||||||
def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
|
def __init__(self, rsrc, outfp, codec='utf-8'):
|
||||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
PDFDevice.__init__(self, rsrc)
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
self.pageno = 0
|
self.pageno = 0
|
||||||
|
@ -166,18 +166,18 @@ class TagExtractor(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
class TextExtractionNotAllowed(RuntimeError): pass
|
class TextExtractionNotAllowed(RuntimeError): pass
|
||||||
|
|
||||||
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0):
|
def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp)
|
||||||
try:
|
try:
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
except PDFPasswordIncorrect:
|
except PDFPasswordIncorrect:
|
||||||
raise TextExtractionNotAllowed('Incorrect password')
|
raise TextExtractionNotAllowed('Incorrect password')
|
||||||
if not doc.is_extractable:
|
if not doc.is_extractable:
|
||||||
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
||||||
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
interpreter = PDFPageInterpreter(rsrc, device)
|
||||||
for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
|
for (pageno,page) in enumerate(doc.get_pages()):
|
||||||
if pagenos and (pageno not in pagenos): continue
|
if pagenos and (pageno not in pagenos): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
if maxpages and maxpages <= pageno+1: break
|
if maxpages and maxpages <= pageno+1: break
|
||||||
|
@ -217,19 +217,25 @@ def main(argv):
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
#
|
#
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
CMapDB.debug = debug
|
||||||
rsrc = PDFResourceManager(debug=debug)
|
PDFResourceManager.debug = debug
|
||||||
|
PDFDocument.debug = debug
|
||||||
|
PDFParser.debug = debug
|
||||||
|
PDFPageInterpreter.debug = debug
|
||||||
|
#
|
||||||
|
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||||
|
rsrc = PDFResourceManager()
|
||||||
if outtype == 'sgml':
|
if outtype == 'sgml':
|
||||||
device = SGMLConverter(rsrc, outfp, codec, debug=debug)
|
device = SGMLConverter(rsrc, outfp, codec)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrc, outfp, codec, debug=debug)
|
device = HTMLConverter(rsrc, outfp, codec)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrc, outfp, codec, debug=debug)
|
device = TagExtractor(rsrc, outfp, codec)
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
for fname in args:
|
for fname in args:
|
||||||
convert(rsrc, device, fname, pagenos,
|
convert(rsrc, device, fname, pagenos,
|
||||||
maxpages=maxpages, password=password, debug=debug)
|
maxpages=maxpages, password=password)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -6,14 +6,14 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||||
from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
|
from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
|
||||||
int_value, float_value, num_value, \
|
int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||||
from utils import choplist
|
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
|
@ -65,25 +65,6 @@ PREDEFINED_COLORSPACE = dict(
|
||||||
}.iteritems())
|
}.iteritems())
|
||||||
|
|
||||||
|
|
||||||
## Matrix operations
|
|
||||||
##
|
|
||||||
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
|
||||||
'''Multiplies two matrices.'''
|
|
||||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
|
||||||
a0*c1+c0*d1, b0*c1+d0*d1,
|
|
||||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
|
||||||
|
|
||||||
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
|
||||||
return (a,b,c,d,e+x,f+y)
|
|
||||||
|
|
||||||
def apply_matrix((a,b,c,d,e,f), (x,y)):
|
|
||||||
'''Applies a matrix to coordinates.'''
|
|
||||||
return (a*x+c*y+e, b*x+d*y+f)
|
|
||||||
|
|
||||||
def apply_matrix_norm((a,b,c,d,e,f), (x,y)):
|
|
||||||
return (a*x+c*y, b*x+d*y)
|
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
## Fonts
|
||||||
##
|
##
|
||||||
|
|
||||||
|
@ -410,9 +391,9 @@ class PDFResourceManager(object):
|
||||||
such as fonts, images and cmaps so that large objects are not
|
such as fonts, images and cmaps so that large objects are not
|
||||||
allocated multiple times.
|
allocated multiple times.
|
||||||
'''
|
'''
|
||||||
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self):
|
||||||
self.debug = debug
|
|
||||||
self.fonts = {}
|
self.fonts = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -477,10 +458,11 @@ class PDFResourceManager(object):
|
||||||
## PDFDevice
|
## PDFDevice
|
||||||
##
|
##
|
||||||
class PDFDevice(object):
|
class PDFDevice(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, rsrc, debug=0):
|
def __init__(self, rsrc):
|
||||||
self.rsrc = rsrc
|
self.rsrc = rsrc
|
||||||
self.debug = debug
|
|
||||||
self.ctm = None
|
self.ctm = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -520,10 +502,10 @@ class PDFDevice(object):
|
||||||
##
|
##
|
||||||
class PDFContentParser(PSStackParser):
|
class PDFContentParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, streams, debug=0):
|
def __init__(self, streams):
|
||||||
self.streams = streams
|
self.streams = streams
|
||||||
self.istream = 0
|
self.istream = 0
|
||||||
PSStackParser.__init__(self, None, debug=debug)
|
PSStackParser.__init__(self, None)
|
||||||
return
|
return
|
||||||
|
|
||||||
def fillfp(self):
|
def fillfp(self):
|
||||||
|
@ -607,6 +589,8 @@ class PDFContentParser(PSStackParser):
|
||||||
## Interpreter
|
## Interpreter
|
||||||
##
|
##
|
||||||
class PDFPageInterpreter(object):
|
class PDFPageInterpreter(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
class TextState(object):
|
class TextState(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -632,14 +616,13 @@ class PDFPageInterpreter(object):
|
||||||
self.linematrix = (0, 0)
|
self.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __init__(self, rsrc, device, debug=0):
|
def __init__(self, rsrc, device):
|
||||||
self.rsrc = rsrc
|
self.rsrc = rsrc
|
||||||
self.device = device
|
self.device = device
|
||||||
self.debug = debug
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def dup(self):
|
def dup(self):
|
||||||
return PDFPageInterpreter(self.rsrc, self.device, debug=self.debug)
|
return PDFPageInterpreter(self.rsrc, self.device)
|
||||||
|
|
||||||
def init_resources(self, resources):
|
def init_resources(self, resources):
|
||||||
self.fontmap = {}
|
self.fontmap = {}
|
||||||
|
@ -940,8 +923,8 @@ class PDFPageInterpreter(object):
|
||||||
def do_TJ(self, seq):
|
def do_TJ(self, seq):
|
||||||
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
#print >>stderr, 'TJ(%r): %r' % (seq,self.textstate)
|
||||||
textstate = self.textstate
|
textstate = self.textstate
|
||||||
matrix = translate_matrix(textstate.matrix, textstate.linematrix)
|
textmatrix = translate_matrix(textstate.matrix, textstate.linematrix)
|
||||||
self.device.render_string(textstate, matrix, seq)
|
self.device.render_string(textstate, textmatrix, seq)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
s = ''.join( x for x in seq if isinstance(x, str) )
|
s = ''.join( x for x in seq if isinstance(x, str) )
|
||||||
n = sum( x for x in seq if not isinstance(x, str) )
|
n = sum( x for x in seq if not isinstance(x, str) )
|
||||||
|
@ -1030,7 +1013,7 @@ class PDFPageInterpreter(object):
|
||||||
|
|
||||||
def execute(self, streams):
|
def execute(self, streams):
|
||||||
try:
|
try:
|
||||||
parser = PDFContentParser(streams, debug=self.debug)
|
parser = PDFContentParser(streams)
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
# empty page
|
# empty page
|
||||||
return
|
return
|
||||||
|
|
|
@ -410,9 +410,10 @@ class PDFXRefStream(object):
|
||||||
## A PDF parser is associated with the document.
|
## A PDF parser is associated with the document.
|
||||||
##
|
##
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self):
|
||||||
self.debug = debug
|
|
||||||
self.xrefs = []
|
self.xrefs = []
|
||||||
self.objs = {}
|
self.objs = {}
|
||||||
self.parsed_objs = {}
|
self.parsed_objs = {}
|
||||||
|
@ -569,7 +570,7 @@ class PDFDocument(object):
|
||||||
if strmid in self.parsed_objs:
|
if strmid in self.parsed_objs:
|
||||||
objs = self.parsed_objs[stream]
|
objs = self.parsed_objs[stream]
|
||||||
else:
|
else:
|
||||||
parser = PDFObjStrmParser(self, stream.get_data(), debug=self.debug)
|
parser = PDFObjStrmParser(self, stream.get_data())
|
||||||
objs = []
|
objs = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -601,7 +602,7 @@ class PDFDocument(object):
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
def get_pages(self, debug=0):
|
def get_pages(self):
|
||||||
if not self.ready:
|
if not self.ready:
|
||||||
raise PDFException('PDFDocument is not initialized')
|
raise PDFException('PDFDocument is not initialized')
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
|
@ -611,13 +612,13 @@ class PDFDocument(object):
|
||||||
if k in self.INHERITABLE_ATTRS and k not in tree:
|
if k in self.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
|
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
|
||||||
if 1 <= debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
||||||
for c in tree['Kids']:
|
for c in tree['Kids']:
|
||||||
for x in search(c, tree):
|
for x in search(c, tree):
|
||||||
yield x
|
yield x
|
||||||
elif tree.get('Type') == LITERAL_PAGE:
|
elif tree.get('Type') == LITERAL_PAGE:
|
||||||
if 1 <= debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Page: %r' % tree
|
print >>stderr, 'Page: %r' % tree
|
||||||
yield (obj.objid, tree)
|
yield (obj.objid, tree)
|
||||||
if 'Pages' not in self.catalog: return
|
if 'Pages' not in self.catalog: return
|
||||||
|
@ -673,8 +674,8 @@ class PDFDocument(object):
|
||||||
##
|
##
|
||||||
class PDFParser(PSStackParser):
|
class PDFParser(PSStackParser):
|
||||||
|
|
||||||
def __init__(self, doc, fp, debug=0):
|
def __init__(self, doc, fp):
|
||||||
PSStackParser.__init__(self, fp, debug=debug)
|
PSStackParser.__init__(self, fp)
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.doc.set_parser(self)
|
self.doc.set_parser(self)
|
||||||
return
|
return
|
||||||
|
@ -837,12 +838,13 @@ class PDFParser(PSStackParser):
|
||||||
## PDFObjStrmParser
|
## PDFObjStrmParser
|
||||||
##
|
##
|
||||||
class PDFObjStrmParser(PDFParser):
|
class PDFObjStrmParser(PDFParser):
|
||||||
def __init__(self, doc, data, debug=0):
|
|
||||||
|
def __init__(self, doc, data):
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
PDFParser.__init__(self, doc, StringIO(data), debug=debug)
|
PDFParser.__init__(self, doc, StringIO(data))
|
||||||
return
|
return
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
|
|
|
@ -121,10 +121,11 @@ class PSBaseParser(object):
|
||||||
Most basic PostScript parser that performs only basic tokenization.
|
Most basic PostScript parser that performs only basic tokenization.
|
||||||
'''
|
'''
|
||||||
BUFSIZ = 4096
|
BUFSIZ = 4096
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
self.debug = debug
|
|
||||||
self.seek(0)
|
self.seek(0)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -421,8 +422,8 @@ class PSBaseParser(object):
|
||||||
##
|
##
|
||||||
class PSStackParser(PSBaseParser):
|
class PSStackParser(PSBaseParser):
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp):
|
||||||
PSBaseParser.__init__(self,fp, debug=debug)
|
PSBaseParser.__init__(self, fp)
|
||||||
self.reset()
|
self.reset()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -582,7 +583,7 @@ func/a/b{(c)do*}def
|
||||||
class MyParser(PSBaseParser):
|
class MyParser(PSBaseParser):
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
parser = MyParser(StringIO.StringIO(s), debug=1)
|
parser = MyParser(StringIO.StringIO(s))
|
||||||
r = []
|
r = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -596,7 +597,7 @@ func/a/b{(c)do*}def
|
||||||
class MyParser(PSStackParser):
|
class MyParser(PSStackParser):
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
parser = MyParser(StringIO.StringIO(s), debug=1)
|
parser = MyParser(StringIO.StringIO(s))
|
||||||
r = []
|
r = []
|
||||||
try:
|
try:
|
||||||
while 1:
|
while 1:
|
||||||
|
|
|
@ -1,5 +1,26 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from struct import pack, unpack
|
from struct import unpack
|
||||||
|
|
||||||
|
|
||||||
|
## Matrix operations
|
||||||
|
##
|
||||||
|
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Each matrix is a 6-element sequence (a, b, c, d, e, f).  With the
    convention used by apply_matrix in this module, the result is the
    matrix that applies m1 first and then m0.

    The arguments are unpacked explicitly instead of in the signature:
    tuple parameters were removed from the language by PEP 3113.
    '''
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||||
|
|
||||||
|
def translate_matrix(m, v):
    '''Returns a copy of matrix m with its offset shifted by v.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a pair (x, y).
    The pair is added directly to the e and f components; it is not
    transformed by the a, b, c, d components first.

    The arguments are unpacked explicitly instead of in the signature:
    tuple parameters were removed from the language by PEP 3113.
    '''
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, e+x, f+y)
|
||||||
|
|
||||||
|
def apply_matrix(m, v):
    '''Applies a matrix to coordinates.

    m is a 6-element sequence (a, b, c, d, e, f) and v is a point
    (x, y).  Returns the pair (a*x+c*y+e, b*x+d*y+f).

    The arguments are unpacked explicitly instead of in the signature:
    tuple parameters were removed from the language by PEP 3113.
    '''
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y+e, b*x+d*y+f)
|
||||||
|
|
||||||
|
def apply_matrix_norm(m, v):
    '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))

    m is a 6-element sequence (a, b, c, d, e, f) and v is a pair
    (p, q).  Only a, b, c, d are used; the offset components e and f
    are ignored.  Returns the pair (a*p+c*q, b*p+d*q).

    The arguments are unpacked explicitly instead of in the signature:
    tuple parameters were removed from the language by PEP 3113.
    '''
    (a, b, c, d, e, f) = m
    (p, q) = v
    return (a*p+c*q, b*p+d*q)
|
||||||
|
|
||||||
|
|
||||||
## Utilities
|
## Utilities
|
||||||
##
|
##
|
||||||
|
|
|
@ -15,7 +15,7 @@ stderr = sys.stderr
|
||||||
|
|
||||||
ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]')
|
ESC_PAT = re.compile(r'[\000-\037&<>\042\047\134\177-\377]')
|
||||||
def esc(s):
|
def esc(s):
|
||||||
return ESC_PAT.sub(lambda m:'\\x%02x' % ord(m.group(0)), s)
|
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
|
||||||
|
|
||||||
|
|
||||||
# dumpxml
|
# dumpxml
|
||||||
|
@ -96,10 +96,10 @@ def dumpallobjs(out, doc, codec=None):
|
||||||
|
|
||||||
# dumpoutline
|
# dumpoutline
|
||||||
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None, debug=0):
|
dumpall=False, codec=None):
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
||||||
for (level,title,dest,a,se) in doc.get_outlines():
|
for (level,title,dest,a,se) in doc.get_outlines():
|
||||||
|
@ -116,10 +116,10 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
|
|
||||||
# dumppdf
|
# dumppdf
|
||||||
def dumppdf(outfp, fname, objids, pagenos, password='',
|
def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None, debug=0):
|
dumpall=False, codec=None):
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument()
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp)
|
||||||
doc.initialize(password)
|
doc.initialize(password)
|
||||||
if objids:
|
if objids:
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
|
@ -174,9 +174,12 @@ def main(argv):
|
||||||
elif k == '-T': proc = dumpoutline
|
elif k == '-T': proc = dumpoutline
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
#
|
#
|
||||||
|
PDFDocument.debug = debug
|
||||||
|
PDFParser.debug = debug
|
||||||
|
#
|
||||||
for fname in args:
|
for fname in args:
|
||||||
proc(outfp, fname, objids, pagenos, password=password,
|
proc(outfp, fname, objids, pagenos, password=password,
|
||||||
dumpall=dumpall, codec=codec, debug=debug)
|
dumpall=dumpall, codec=codec)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def prof_main(argv):
    '''Profiles a function with hotshot, or prints a saved profile.

    Usage: argv[0] output.prof mod.func [args ...]

    If extra args follow mod.func, mod.func is called once under the
    profiler with a single list argument, [argv[0]] + args, and the
    profile data is written to output.prof.  With no extra args, the
    existing output.prof is loaded and its statistics are printed,
    sorted by time and then by call count.

    Returns 100 after printing the usage message when the arguments are
    malformed; otherwise returns None.
    '''
    def usage():
        # Parenthesized single-argument form prints the same text
        # whether print is a statement or a function.
        print('usage: %s output.prof mod.func [args ...]' % argv[0])
        return 100
    args = argv[1:]
    if len(args) < 2: return usage()
    profname = args.pop(0)
    name = args.pop(0)
    # A target without a dot cannot be split into module and function;
    # report it as a usage error instead of failing in rindex().
    if '.' not in name: return usage()
    i = name.rindex('.')
    (modname, funcname) = (name[:i], name[i+1:])
    # Imported only after the arguments have been validated, so that the
    # usage message does not depend on the profiler being available.
    import hotshot, hotshot.stats
    # A non-empty fromlist makes __import__ return the named (possibly
    # dotted) module itself rather than its top-level package.
    func = getattr(__import__(modname, fromlist=[modname]), funcname)
    if args:
        args.insert(0, argv[0])
        profiler = hotshot.Profile(profname)
        try:
            profiler.runcall(func, args)
        finally:
            # Close the profile log even if the profiled call raises.
            profiler.close()
    else:
        stats = hotshot.stats.load(profname)
        stats.strip_dirs()
        stats.sort_stats('time', 'calls')
        stats.print_stats(1000)
    return
|
||||||
|
|
||||||
|
if __name__ == '__main__': sys.exit(prof_main(sys.argv))
|
Loading…
Reference in New Issue