Code cleanup: removed some debug flags.

pull/1/head
Yusuke Shinyama 2014-06-14 15:43:10 +09:00
parent d9680fca7e
commit 1384a3fe8d
11 changed files with 45 additions and 68 deletions

View File

@@ -215,7 +215,6 @@ class PyUnicodeMap(UnicodeMap):
## ##
class CMapDB(object): class CMapDB(object):
debug = 0
_cmap_cache = {} _cmap_cache = {}
_umap_cache = {} _umap_cache = {}
@@ -225,7 +224,6 @@ class CMapDB(object):
@classmethod @classmethod
def _load_data(klass, name): def _load_data(klass, name):
filename = '%s.pickle.gz' % name filename = '%s.pickle.gz' % name
if klass.debug:
logging.info('loading: %r' % name) logging.info('loading: %r' % name)
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'), cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),) os.path.join(os.path.dirname(__file__), 'cmap'),)

View File

@@ -104,7 +104,6 @@ class PDFLayoutAnalyzer(PDFTextDevice):
return item.adv return item.adv
def handle_undefined_char(self, font, cid): def handle_undefined_char(self, font, cid):
if self.debug:
logging.info('undefined: %r, %r' % (font, cid)) logging.info('undefined: %r, %r' % (font, cid))
return '(cid:%d)' % cid return '(cid:%d)' % cid
@@ -207,7 +206,7 @@ class HTMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=1.0, layoutmode='normal', showpageno=True, scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
pagemargin=50, imagewriter=None, pagemargin=50, imagewriter=None, debug=0,
rect_colors={'curve': 'black', 'page': 'gray'}, rect_colors={'curve': 'black', 'page': 'gray'},
text_colors={'char': 'black'}): text_colors={'char': 'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
@@ -219,7 +218,7 @@ class HTMLConverter(PDFConverter):
self.imagewriter = imagewriter self.imagewriter = imagewriter
self.rect_colors = rect_colors self.rect_colors = rect_colors
self.text_colors = text_colors self.text_colors = text_colors
if self.debug: if debug:
self.rect_colors.update(self.RECT_COLORS) self.rect_colors.update(self.RECT_COLORS)
self.text_colors.update(self.TEXT_COLORS) self.text_colors.update(self.TEXT_COLORS)
self._yoffset = self.pagemargin self._yoffset = self.pagemargin

View File

@@ -607,6 +607,7 @@ class LTLayoutContainer(LTContainer):
y1 = max(obj1.y1, obj2.y1) y1 = max(obj1.y1, obj2.y1)
objs = set(plane.find((x0, y0, x1, y1))) objs = set(plane.find((x0, y0, x1, y1)))
return objs.difference((obj1, obj2)) return objs.difference((obj1, obj2))
# XXX this still takes O(n^2) :( # XXX this still takes O(n^2) :(
dists = [] dists = []
for i in xrange(len(boxes)): for i in xrange(len(boxes)):

View File

@@ -15,8 +15,6 @@ class CorruptDataError(Exception):
## ##
class LZWDecoder(object): class LZWDecoder(object):
debug = 0
def __init__(self, fp): def __init__(self, fp):
self.fp = fp self.fp = fp
self.buff = 0 self.buff = 0
@@ -94,9 +92,8 @@ class LZWDecoder(object):
# just ignore corrupt data and stop yielding there # just ignore corrupt data and stop yielding there
break break
yield x yield x
if self.debug: #logging.debug('nbits=%d, code=%d, output=%r, table=%r' %
logging.debug('nbits=%d, code=%d, output=%r, table=%r' % # (self.nbits, code, x, self.table[258:]))
(self.nbits, code, x, self.table[258:]))
return return

View File

@@ -8,8 +8,6 @@ from pdffont import PDFUnicodeNotDefined
## ##
class PDFDevice(object): class PDFDevice(object):
debug = 0
def __init__(self, rsrcmgr): def __init__(self, rsrcmgr):
self.rsrcmgr = rsrcmgr self.rsrcmgr = rsrcmgr
self.ctm = None self.ctm = None
@@ -125,11 +123,10 @@ class PDFTextDevice(PDFDevice):
## ##
class TagExtractor(PDFDevice): class TagExtractor(PDFDevice):
def __init__(self, rsrcmgr, outfp, codec='utf-8', debug=0): def __init__(self, rsrcmgr, outfp, codec='utf-8'):
PDFDevice.__init__(self, rsrcmgr) PDFDevice.__init__(self, rsrcmgr)
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
self.debug = debug
self.pageno = 0 self.pageno = 0
self._stack = [] self._stack = []
return return

View File

@@ -84,7 +84,7 @@ class PDFXRef(PDFBaseXRef):
def __repr__(self): def __repr__(self):
return '<PDFXRef: offsets=%r>' % (self.offsets.keys()) return '<PDFXRef: offsets=%r>' % (self.offsets.keys())
def load(self, parser, debug=0): def load(self, parser):
while 1: while 1:
try: try:
(pos, line) = parser.nextline() (pos, line) = parser.nextline()
@@ -116,7 +116,6 @@ class PDFXRef(PDFBaseXRef):
if use != 'n': if use != 'n':
continue continue
self.offsets[objid] = (None, long(pos), int(genno)) self.offsets[objid] = (None, long(pos), int(genno))
if 1 <= debug:
logging.info('xref objects: %r' % self.offsets) logging.info('xref objects: %r' % self.offsets)
self.load_trailer(parser) self.load_trailer(parser)
return return
@@ -158,7 +157,7 @@ class PDFXRefFallback(PDFXRef):
PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
def load(self, parser, debug=0): def load(self, parser):
parser.seek(0) parser.seek(0)
while 1: while 1:
try: try:
@@ -168,7 +167,6 @@ class PDFXRefFallback(PDFXRef):
if line.startswith('trailer'): if line.startswith('trailer'):
parser.seek(pos) parser.seek(pos)
self.load_trailer(parser) self.load_trailer(parser)
if 1 <= debug:
logging.info('trailer: %r' % self.get_trailer()) logging.info('trailer: %r' % self.get_trailer())
break break
m = self.PDFOBJ_CUE.match(line) m = self.PDFOBJ_CUE.match(line)
@@ -218,7 +216,7 @@ class PDFXRefStream(PDFBaseXRef):
def __repr__(self): def __repr__(self):
return '<PDFXRefStream: ranges=%r>' % (self.ranges) return '<PDFXRefStream: ranges=%r>' % (self.ranges)
def load(self, parser, debug=0): def load(self, parser):
(_, objid) = parser.nexttoken() # ignored (_, objid) = parser.nexttoken() # ignored
(_, genno) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored
(_, kwd) = parser.nexttoken() (_, kwd) = parser.nexttoken()
@@ -234,7 +232,6 @@ class PDFXRefStream(PDFBaseXRef):
self.data = stream.get_data() self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3 self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.attrs self.trailer = stream.attrs
if 1 <= debug:
logging.info('xref stream: objid=%s, fields=%d,%d,%d' % logging.info('xref stream: objid=%s, fields=%d,%d,%d' %
(', '.join(map(repr, self.ranges)), (', '.join(map(repr, self.ranges)),
self.fl1, self.fl2, self.fl3)) self.fl1, self.fl2, self.fl3))
@@ -635,7 +632,7 @@ class PDFDocument(object):
assert objid != 0 assert objid != 0
if not self.xrefs: if not self.xrefs:
raise PDFException('PDFDocument is not initialized') raise PDFException('PDFDocument is not initialized')
if 2 <= self.debug: if self.debug:
logging.debug('getobj: objid=%r' % objid) logging.debug('getobj: objid=%r' % objid)
if objid in self._cached_objs: if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid] (obj, genno) = self._cached_objs[objid]
@@ -661,7 +658,7 @@ class PDFDocument(object):
continue continue
else: else:
raise PDFObjectNotFound(objid) raise PDFObjectNotFound(objid)
if 2 <= self.debug: if self.debug:
logging.debug('register: objid=%r: %r' % (objid, obj)) logging.debug('register: objid=%r: %r' % (objid, obj))
if self.caching: if self.caching:
self._cached_objs[objid] = (obj, genno) self._cached_objs[objid] = (obj, genno)
@@ -735,7 +732,7 @@ class PDFDocument(object):
prev = None prev = None
for line in parser.revreadlines(): for line in parser.revreadlines():
line = line.strip() line = line.strip()
if 2 <= self.debug: if self.debug:
logging.debug('find_xref: %r' % line) logging.debug('find_xref: %r' % line)
if line == 'startxref': if line == 'startxref':
break break
@@ -743,7 +740,6 @@ class PDFDocument(object):
prev = line prev = line
else: else:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if 1 <= self.debug:
logging.info('xref found: pos=%r' % prev) logging.info('xref found: pos=%r' % prev)
return long(prev) return long(prev)
@@ -756,22 +752,20 @@ class PDFDocument(object):
(pos, token) = parser.nexttoken() (pos, token) = parser.nexttoken()
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if 1 <= self.debug:
logging.info('read_xref_from: start=%d, token=%r' % (start, token)) logging.info('read_xref_from: start=%d, token=%r' % (start, token))
if isinstance(token, int): if isinstance(token, int):
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
parser.seek(pos) parser.seek(pos)
parser.reset() parser.reset()
xref = PDFXRefStream() xref = PDFXRefStream()
xref.load(parser, debug=self.debug) xref.load(parser)
else: else:
if token is parser.KEYWORD_XREF: if token is parser.KEYWORD_XREF:
parser.nextline() parser.nextline()
xref = PDFXRef() xref = PDFXRef()
xref.load(parser, debug=self.debug) xref.load(parser)
xrefs.append(xref) xrefs.append(xref)
trailer = xref.get_trailer() trailer = xref.get_trailer()
if 1 <= self.debug:
logging.info('trailer: %r' % trailer) logging.info('trailer: %r' % trailer)
if 'XRefStm' in trailer: if 'XRefStm' in trailer:
pos = int_value(trailer['XRefStm']) pos = int_value(trailer['XRefStm'])

View File

@@ -131,7 +131,6 @@ class PDFResourceManager(object):
such as fonts and images so that large objects are not such as fonts and images so that large objects are not
allocated multiple times. allocated multiple times.
""" """
debug = 0
def __init__(self, caching=True): def __init__(self, caching=True):
self.caching = caching self.caching = caching
@@ -161,7 +160,6 @@ class PDFResourceManager(object):
if objid and objid in self._cached_fonts: if objid and objid in self._cached_fonts:
font = self._cached_fonts[objid] font = self._cached_fonts[objid]
else: else:
if 1 <= self.debug:
logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec))
if STRICT: if STRICT:
if spec['Type'] is not LITERAL_FONT: if spec['Type'] is not LITERAL_FONT:
@@ -337,7 +335,7 @@ class PDFPageInterpreter(object):
else: else:
return PREDEFINED_COLORSPACE.get(name) return PREDEFINED_COLORSPACE.get(name)
for (k, v) in dict_value(resources).iteritems(): for (k, v) in dict_value(resources).iteritems():
if 2 <= self.debug: if self.debug:
logging.debug('Resource: %r: %r' % (k, v)) logging.debug('Resource: %r: %r' % (k, v))
if k == 'Font': if k == 'Font':
for (fontid, spec) in dict_value(v).iteritems(): for (fontid, spec) in dict_value(v).iteritems():
@@ -794,7 +792,6 @@ class PDFPageInterpreter(object):
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return return
if 1 <= self.debug:
logging.info('Processing xobj: %r' % xobj) logging.info('Processing xobj: %r' % xobj)
subtype = xobj.get('Subtype') subtype = xobj.get('Subtype')
if subtype is LITERAL_FORM and 'BBox' in xobj: if subtype is LITERAL_FORM and 'BBox' in xobj:
@@ -818,7 +815,6 @@ class PDFPageInterpreter(object):
return return
def process_page(self, page): def process_page(self, page):
if 1 <= self.debug:
logging.info('Processing page: %r' % page) logging.info('Processing page: %r' % page)
(x0, y0, x1, y1) = page.mediabox (x0, y0, x1, y1) = page.mediabox
if page.rotate == 90: if page.rotate == 90:
@@ -838,7 +834,6 @@ class PDFPageInterpreter(object):
# Render the content streams. # Render the content streams.
# This method may be called recursively. # This method may be called recursively.
def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
if 1 <= self.debug:
logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % logging.info('render_contents: resources=%r, streams=%r, ctm=%r' %
(resources, streams, ctm)) (resources, streams, ctm))
self.init_resources(resources) self.init_resources(resources)
@@ -865,12 +860,12 @@ class PDFPageInterpreter(object):
nargs = func.func_code.co_argcount-1 nargs = func.func_code.co_argcount-1
if nargs: if nargs:
args = self.pop(nargs) args = self.pop(nargs)
if 2 <= self.debug: if self.debug:
logging.debug('exec: %s %r' % (name, args)) logging.debug('exec: %s %r' % (name, args))
if len(args) == nargs: if len(args) == nargs:
func(*args) func(*args)
else: else:
if 2 <= self.debug: if self.debug:
logging.debug('exec: %s' % name) logging.debug('exec: %s' % name)
func() func()
else: else:

View File

@@ -74,7 +74,7 @@ class PDFPage(object):
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
@classmethod @classmethod
def create_pages(klass, document, debug=0): def create_pages(klass, document):
def search(obj, parent): def search(obj, parent):
if isinstance(obj, int): if isinstance(obj, int):
objid = obj objid = obj
@@ -86,13 +86,11 @@ class PDFPage(object):
if k in klass.INHERITABLE_ATTRS and k not in tree: if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
if 1 <= debug:
logging.info('Pages: Kids=%r' % tree['Kids']) logging.info('Pages: Kids=%r' % tree['Kids'])
for c in list_value(tree['Kids']): for c in list_value(tree['Kids']):
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree.get('Type') is LITERAL_PAGE: elif tree.get('Type') is LITERAL_PAGE:
if 1 <= debug:
logging.info('Page: %r' % tree) logging.info('Page: %r' % tree)
yield (objid, tree) yield (objid, tree)
pages = False pages = False

View File

@@ -121,7 +121,7 @@ class PDFParser(PSStackParser):
data += line data += line
self.seek(pos+objlen) self.seek(pos+objlen)
# XXX limit objlen not to exceed object boundary # XXX limit objlen not to exceed object boundary
if 2 <= self.debug: if self.debug:
logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
(pos, objlen, dic, data[:10])) (pos, objlen, dic, data[:10]))
obj = PDFStream(dic, data, self.doc.decipher) obj = PDFStream(dic, data, self.doc.decipher)

View File

@@ -192,7 +192,7 @@ class PSBaseParser(object):
def seek(self, pos): def seek(self, pos):
"""Seeks the parser to the given position. """Seeks the parser to the given position.
""" """
if 2 <= self.debug: if self.debug:
logging.debug('seek: %r' % pos) logging.debug('seek: %r' % pos)
self.fp.seek(pos) self.fp.seek(pos)
# reset the status for nextline() # reset the status for nextline()
@@ -243,7 +243,7 @@ class PSBaseParser(object):
else: else:
linebuf += self.buf[self.charpos:] linebuf += self.buf[self.charpos:]
self.charpos = len(self.buf) self.charpos = len(self.buf)
if 2 <= self.debug: if self.debug:
logging.debug('nextline: %r, %r' % (linepos, linebuf)) logging.debug('nextline: %r, %r' % (linepos, linebuf))
return (linepos, linebuf) return (linepos, linebuf)
@@ -483,7 +483,7 @@ class PSBaseParser(object):
self.fillbuf() self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos) self.charpos = self._parse1(self.buf, self.charpos)
token = self._tokens.pop(0) token = self._tokens.pop(0)
if 2 <= self.debug: if self.debug:
logging.debug('nexttoken: %r' % token) logging.debug('nexttoken: %r' % token)
return token return token
@@ -524,7 +524,7 @@ class PSStackParser(PSBaseParser):
return objs return objs
def add_results(self, *objs): def add_results(self, *objs):
if 2 <= self.debug: if self.debug:
logging.debug('add_results: %r' % objs) logging.debug('add_results: %r' % objs)
self.results.extend(objs) self.results.extend(objs)
return return
@@ -532,7 +532,7 @@ class PSStackParser(PSBaseParser):
def start_type(self, pos, type): def start_type(self, pos, type):
self.context.append((pos, self.curtype, self.curstack)) self.context.append((pos, self.curtype, self.curstack))
(self.curtype, self.curstack) = (type, []) (self.curtype, self.curstack) = (type, [])
if 2 <= self.debug: if self.debug:
logging.debug('start_type: pos=%r, type=%r' % (pos, type)) logging.debug('start_type: pos=%r, type=%r' % (pos, type))
return return
@@ -541,7 +541,7 @@ class PSStackParser(PSBaseParser):
raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
objs = [obj for (_, obj) in self.curstack] objs = [obj for (_, obj) in self.curstack]
(pos, self.curtype, self.curstack) = self.context.pop() (pos, self.curtype, self.curstack) = self.context.pop()
if 2 <= self.debug: if self.debug:
logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)) logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs))
return (pos, objs) return (pos, objs)
@@ -596,7 +596,7 @@ class PSStackParser(PSBaseParser):
if STRICT: if STRICT:
raise raise
else: else:
if 2 <= self.debug: if self.debug:
logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % \ logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % \
(pos, token, self.curstack)) (pos, token, self.curstack))
self.do_keyword(pos, token) self.do_keyword(pos, token)
@@ -605,7 +605,7 @@ class PSStackParser(PSBaseParser):
else: else:
self.flush() self.flush()
obj = self.results.pop(0) obj = self.results.pop(0)
if 2 <= self.debug: if self.debug:
logging.debug('nextobject: %r' % obj) logging.debug('nextobject: %r' % obj)
return obj return obj

View File

@@ -67,9 +67,7 @@ def main(argv):
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
CMapDB.debug = debug CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
# #
rsrcmgr = PDFResourceManager(caching=caching) rsrcmgr = PDFResourceManager(caching=caching)
if not outtype: if not outtype:
@@ -94,7 +92,7 @@ def main(argv):
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter, debug=debug)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: else: