diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 9f39997..742a32e 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -215,7 +215,6 @@ class PyUnicodeMap(UnicodeMap): ## class CMapDB(object): - debug = 0 _cmap_cache = {} _umap_cache = {} @@ -225,8 +224,7 @@ class CMapDB(object): @classmethod def _load_data(klass, name): filename = '%s.pickle.gz' % name - if klass.debug: - logging.info('loading: %r' % name) + logging.info('loading: %r' % name) cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'), os.path.join(os.path.dirname(__file__), 'cmap'),) for directory in cmap_paths: diff --git a/pdfminer/converter.py b/pdfminer/converter.py index aacd4f0..463433c 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -104,8 +104,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): return item.adv def handle_undefined_char(self, font, cid): - if self.debug: - logging.info('undefined: %r, %r' % (font, cid)) + logging.info('undefined: %r, %r' % (font, cid)) return '(cid:%d)' % cid def receive_layout(self, ltpage): @@ -207,7 +206,7 @@ class HTMLConverter(PDFConverter): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, scale=1, fontscale=1.0, layoutmode='normal', showpageno=True, - pagemargin=50, imagewriter=None, + pagemargin=50, imagewriter=None, debug=0, rect_colors={'curve': 'black', 'page': 'gray'}, text_colors={'char': 'black'}): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) @@ -219,7 +218,7 @@ class HTMLConverter(PDFConverter): self.imagewriter = imagewriter self.rect_colors = rect_colors self.text_colors = text_colors - if self.debug: + if debug: self.rect_colors.update(self.RECT_COLORS) self.text_colors.update(self.TEXT_COLORS) self._yoffset = self.pagemargin diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 0980144..9bb836d 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -607,6 +607,7 @@ class LTLayoutContainer(LTContainer): y1 = max(obj1.y1, obj2.y1) objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) + # XXX this still takes O(n^2) :( dists = [] for i in xrange(len(boxes)): diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index 4c96c4e..3259c30 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -15,8 +15,6 @@ class CorruptDataError(Exception): ## class LZWDecoder(object): - debug = 0 - def __init__(self, fp): self.fp = fp self.buff = 0 @@ -94,9 +92,8 @@ class LZWDecoder(object): # just ignore corrupt data and stop yielding there break yield x - if self.debug: - logging.debug('nbits=%d, code=%d, output=%r, table=%r' % - (self.nbits, code, x, self.table[258:])) + #logging.debug('nbits=%d, code=%d, output=%r, table=%r' % + # (self.nbits, code, x, self.table[258:])) return diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index a11cd9e..15fe80a 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -8,8 +8,6 @@ from pdffont import PDFUnicodeNotDefined ## class PDFDevice(object): - debug = 0 - def __init__(self, rsrcmgr): self.rsrcmgr = rsrcmgr self.ctm = None @@ -125,11 +123,10 @@ class PDFTextDevice(PDFDevice): ## class TagExtractor(PDFDevice): - def __init__(self, rsrcmgr, outfp, codec='utf-8', debug=0): + def __init__(self, rsrcmgr, outfp, codec='utf-8'): PDFDevice.__init__(self, rsrcmgr) self.outfp = outfp self.codec = codec - self.debug = debug self.pageno = 0 self._stack = [] return diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 2f8bb72..762a9cc 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -84,7 +84,7 @@ class PDFXRef(PDFBaseXRef): def __repr__(self): return '' % (self.offsets.keys()) - def load(self, parser, debug=0): + def load(self, parser): while 1: try: (pos, line) = parser.nextline() @@ -116,8 +116,7 @@ class PDFXRef(PDFBaseXRef): if use != 'n': continue self.offsets[objid] = (None, long(pos), int(genno)) - if 1 <= debug: - logging.info('xref objects: %r' % self.offsets) + logging.info('xref objects: %r' % self.offsets) self.load_trailer(parser) return @@ -158,7 +157,7 @@ class PDFXRefFallback(PDFXRef): PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') - def load(self, parser, debug=0): + def load(self, parser): parser.seek(0) while 1: try: @@ -168,8 +167,7 @@ class PDFXRefFallback(PDFXRef): if line.startswith('trailer'): parser.seek(pos) self.load_trailer(parser) - if 1 <= debug: - logging.info('trailer: %r' % self.get_trailer()) + logging.info('trailer: %r' % self.get_trailer()) break m = self.PDFOBJ_CUE.match(line) if not m: @@ -218,7 +216,7 @@ class PDFXRefStream(PDFBaseXRef): def __repr__(self): return '' % (self.ranges) - def load(self, parser, debug=0): + def load(self, parser): (_, objid) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() @@ -234,10 +232,9 @@ class PDFXRefStream(PDFBaseXRef): self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.attrs - if 1 <= debug: - logging.info('xref stream: objid=%s, fields=%d,%d,%d' % - (', '.join(map(repr, self.ranges)), - self.fl1, self.fl2, self.fl3)) + logging.info('xref stream: objid=%s, fields=%d,%d,%d' % + (', '.join(map(repr, self.ranges)), + self.fl1, self.fl2, self.fl3)) return def get_trailer(self): @@ -635,7 +632,7 @@ class PDFDocument(object): assert objid != 0 if not self.xrefs: raise PDFException('PDFDocument is not initialized') - if 2 <= self.debug: + if self.debug: logging.debug('getobj: objid=%r' % objid) if objid in self._cached_objs: (obj, genno) = self._cached_objs[objid] @@ -661,7 +658,7 @@ class PDFDocument(object): continue else: raise PDFObjectNotFound(objid) - if 2 <= self.debug: + if self.debug: logging.debug('register: objid=%r: %r' % (objid, obj)) if self.caching: self._cached_objs[objid] = (obj, genno) @@ -735,7 +732,7 @@ class PDFDocument(object): prev = None for line in parser.revreadlines(): line = line.strip() - if 2 <= self.debug: + if self.debug: logging.debug('find_xref: %r' % line) if line == 'startxref': break @@ -743,8 +740,7 @@ class PDFDocument(object): prev = line else: raise PDFNoValidXRef('Unexpected EOF') - if 1 <= self.debug: - logging.info('xref found: pos=%r' % prev) + logging.info('xref found: pos=%r' % prev) return long(prev) # read xref table @@ -756,23 +752,21 @@ class PDFDocument(object): (pos, token) = parser.nexttoken() except PSEOF: raise PDFNoValidXRef('Unexpected EOF') - if 1 <= self.debug: - logging.info('read_xref_from: start=%d, token=%r' % (start, token)) + logging.info('read_xref_from: start=%d, token=%r' % (start, token)) if isinstance(token, int): # XRefStream: PDF-1.5 parser.seek(pos) parser.reset() xref = PDFXRefStream() - xref.load(parser, debug=self.debug) + xref.load(parser) else: if token is parser.KEYWORD_XREF: parser.nextline() xref = PDFXRef() - xref.load(parser, debug=self.debug) + xref.load(parser) xrefs.append(xref) trailer = xref.get_trailer() - if 1 <= self.debug: - logging.info('trailer: %r' % trailer) + logging.info('trailer: %r' % trailer) if 'XRefStm' in trailer: pos = int_value(trailer['XRefStm']) self.read_xref_from(parser, pos, xrefs) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index d31c0c8..c734c68 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -131,7 +131,6 @@ class PDFResourceManager(object): such as fonts and images so that large objects are not allocated multiple times. """ - debug = 0 def __init__(self, caching=True): self.caching = caching @@ -161,8 +160,7 @@ class PDFResourceManager(object): if objid and objid in self._cached_fonts: font = self._cached_fonts[objid] else: - if 1 <= self.debug: - logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) + logging.info('get_font: create: objid=%r, spec=%r' % (objid, spec)) if STRICT: if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') @@ -337,7 +335,7 @@ class PDFPageInterpreter(object): else: return PREDEFINED_COLORSPACE.get(name) for (k, v) in dict_value(resources).iteritems(): - if 2 <= self.debug: + if self.debug: logging.debug('Resource: %r: %r' % (k, v)) if k == 'Font': for (fontid, spec) in dict_value(v).iteritems(): @@ -794,8 +792,7 @@ class PDFPageInterpreter(object): if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return - if 1 <= self.debug: - logging.info('Processing xobj: %r' % xobj) + logging.info('Processing xobj: %r' % xobj) subtype = xobj.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj: interpreter = self.dup() @@ -818,8 +815,7 @@ class PDFPageInterpreter(object): return def process_page(self, page): - if 1 <= self.debug: - logging.info('Processing page: %r' % page) + logging.info('Processing page: %r' % page) (x0, y0, x1, y1) = page.mediabox if page.rotate == 90: ctm = (0, -1, 1, 0, -y0, x1) @@ -838,9 +834,8 @@ class PDFPageInterpreter(object): # Render the content streams. # This method may be called recursively. def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY): - if 1 <= self.debug: - logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % - (resources, streams, ctm)) + logging.info('render_contents: resources=%r, streams=%r, ctm=%r' % + (resources, streams, ctm)) self.init_resources(resources) self.init_state(ctm) self.execute(list_value(streams)) @@ -865,12 +860,12 @@ class PDFPageInterpreter(object): nargs = func.func_code.co_argcount-1 if nargs: args = self.pop(nargs) - if 2 <= self.debug: + if self.debug: logging.debug('exec: %s %r' % (name, args)) if len(args) == nargs: func(*args) else: - if 2 <= self.debug: + if self.debug: logging.debug('exec: %s' % name) func() else: diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py index 65251b7..a8a746d 100644 --- a/pdfminer/pdfpage.py +++ b/pdfminer/pdfpage.py @@ -74,7 +74,7 @@ class PDFPage(object): INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) @classmethod - def create_pages(klass, document, debug=0): + def create_pages(klass, document): def search(obj, parent): if isinstance(obj, int): objid = obj @@ -86,14 +86,12 @@ class PDFPage(object): if k in klass.INHERITABLE_ATTRS and k not in tree: tree[k] = v if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree: - if 1 <= debug: - logging.info('Pages: Kids=%r' % tree['Kids']) + logging.info('Pages: Kids=%r' % tree['Kids']) for c in list_value(tree['Kids']): for x in search(c, tree): yield x elif tree.get('Type') is LITERAL_PAGE: - if 1 <= debug: - logging.info('Page: %r' % tree) + logging.info('Page: %r' % tree) yield (objid, tree) pages = False if 'Pages' in document.catalog: diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 7b40898..7516f3c 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -121,7 +121,7 @@ class PDFParser(PSStackParser): data += line self.seek(pos+objlen) # XXX limit objlen not to exceed object boundary - if 2 <= self.debug: + if self.debug: logging.debug('Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \ (pos, objlen, dic, data[:10])) obj = PDFStream(dic, data, self.doc.decipher) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 0d05319..2810cfa 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -192,7 +192,7 @@ class PSBaseParser(object): def seek(self, pos): """Seeks the parser to the given position. """ - if 2 <= self.debug: + if self.debug: logging.debug('seek: %r' % pos) self.fp.seek(pos) # reset the status for nextline() @@ -243,7 +243,7 @@ class PSBaseParser(object): else: linebuf += self.buf[self.charpos:] self.charpos = len(self.buf) - if 2 <= self.debug: + if self.debug: logging.debug('nextline: %r, %r' % (linepos, linebuf)) return (linepos, linebuf) @@ -483,7 +483,7 @@ class PSBaseParser(object): self.fillbuf() self.charpos = self._parse1(self.buf, self.charpos) token = self._tokens.pop(0) - if 2 <= self.debug: + if self.debug: logging.debug('nexttoken: %r' % token) return token @@ -524,7 +524,7 @@ class PSStackParser(PSBaseParser): return objs def add_results(self, *objs): - if 2 <= self.debug: + if self.debug: logging.debug('add_results: %r' % objs) self.results.extend(objs) return @@ -532,7 +532,7 @@ class PSStackParser(PSBaseParser): def start_type(self, pos, type): self.context.append((pos, self.curtype, self.curstack)) (self.curtype, self.curstack) = (type, []) - if 2 <= self.debug: + if self.debug: logging.debug('start_type: pos=%r, type=%r' % (pos, type)) return @@ -541,7 +541,7 @@ class PSStackParser(PSBaseParser): raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) objs = [obj for (_, obj) in self.curstack] (pos, self.curtype, self.curstack) = self.context.pop() - if 2 <= self.debug: + if self.debug: logging.debug('end_type: pos=%r, type=%r, objs=%r' % (pos, type, objs)) return (pos, objs) @@ -596,7 +596,7 @@ class PSStackParser(PSBaseParser): if STRICT: raise else: - if 2 <= self.debug: + if self.debug: logging.debug('do_keyword: pos=%r, token=%r, stack=%r' % \ (pos, token, self.curstack)) self.do_keyword(pos, token) @@ -605,7 +605,7 @@ class PSStackParser(PSBaseParser): else: self.flush() obj = self.results.pop(0) - if 2 <= self.debug: + if self.debug: logging.debug('nextobject: %r' % obj) return obj diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 06fbdac..27d85d5 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -67,9 +67,7 @@ def main(argv): PDFDocument.debug = debug PDFParser.debug = debug CMapDB.debug = debug - PDFResourceManager.debug = debug PDFPageInterpreter.debug = debug - PDFDevice.debug = debug # rsrcmgr = PDFResourceManager(caching=caching) if not outtype: @@ -94,7 +92,7 @@ def main(argv): elif outtype == 'html': device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, - imagewriter=imagewriter) + imagewriter=imagewriter, debug=debug) elif outtype == 'tag': device = TagExtractor(rsrcmgr, outfp, codec=codec) else: