diff --git a/README.html b/README.html index 8772b56..c31ea29 100644 --- a/README.html +++ b/README.html @@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }

PDFMiner

-Last Modified: Tue Jul 1 00:02:48 JST 2008 +Last Modified: Thu Jul 10 00:14:07 JST 2008
@@ -135,13 +135,13 @@ Unicode Standard.

Examples:

-$ ./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf
+$ python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf
 (extract text as an HTML file whose filename is output.html)
 
-$ ./pdf2txt.py -c euc-jp samples/jo.pdf
+$ python -m tools.pdf2txt -c euc-jp samples/jo.pdf
 (extract Japanese texts in vertical writing, CMap is required)
 
-$ ./pdf2txt.py -P mypassword secret.pdf
+$ python -m tools.pdf2txt -P mypassword secret.pdf
 (extract texts from an encrypted PDF file with a password)
 
@@ -181,10 +181,13 @@ but it's also possible to extract some meaningful contents

Examples:

-$ ./dumppdf.py -a foo.pdf
+$ python -m tools.dumppdf -a foo.pdf
 (dump all the headers and contents, except stream objects)
 
-$ ./dumppdf.py -r -i6 foo.pdf > pic.jpeg
+$ python -m tools.dumppdf -T foo.pdf
+(dump the table of contents)
+
+$ python -m tools.dumppdf -r -i6 foo.pdf > pic.jpeg
 (extract a JPEG image)
 
diff --git a/pdflib/arcfour.py b/pdflib/arcfour.py index c9c13a8..188acac 100755 --- a/pdflib/arcfour.py +++ b/pdflib/arcfour.py @@ -4,7 +4,7 @@ # * public domain * # -class Arcfour: +class Arcfour(object): def __init__(self, key): s = range(256) diff --git a/pdflib/cmap.py b/pdflib/cmap.py index 05461ad..404714e 100644 --- a/pdflib/cmap.py +++ b/pdflib/cmap.py @@ -17,7 +17,7 @@ class CMapError(Exception): pass ## CMap ## -class CMap: +class CMap(object): def __init__(self, debug=0): self.debug = debug @@ -163,7 +163,7 @@ class CDBCMap(CMap): ## CMapDB ## -class CMapDB: +class CMapDB(object): class CMapNotFound(CMapError): pass @@ -340,7 +340,7 @@ class CMapParser(PSStackParser): ## FontMetricsDB ## -class FontMetricsDB: +class FontMetricsDB(object): from fontmetrics import FONT_METRICS @classmethod @@ -350,7 +350,7 @@ class FontMetricsDB: ## EncodingDB ## -class EncodingDB: +class EncodingDB(object): from glyphlist import charname2unicode from latin_enc import ENCODING diff --git a/pdflib/lzw.py b/pdflib/lzw.py index 0dde44d..d5f3b4e 100755 --- a/pdflib/lzw.py +++ b/pdflib/lzw.py @@ -4,7 +4,7 @@ stderr = sys.stderr ## LZWDecoder ## -class LZWDecoder: +class LZWDecoder(object): def __init__(self, fp, debug=0): self.fp = fp diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index d045769..7e3da6c 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -9,7 +9,7 @@ except ImportError: from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name -from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \ +from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \ int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB @@ -26,7 +26,7 @@ class PDFUnicodeNotDefined(PDFFontError): pass ## ColorSpace ## -class ColorSpace: +class ColorSpace(object): def __init__(self, name, ncomponents): self.name = name self.ncomponents = ncomponents @@ -82,7 +82,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)): ## # PDFFont -class PDFFont: +class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor @@ -208,7 +208,7 @@ class PDFType3Font(PDFSimpleFont): ## TrueTypeFont ## -class TrueTypeFont: +class TrueTypeFont(object): class CMapNotFound(Exception): pass @@ -391,7 +391,7 @@ class PDFCIDFont(PDFFont): ## Resource Manager ## -class PDFResourceManager: +class PDFResourceManager(object): ''' ResourceManager facilitates reuse of shared resources @@ -464,7 +464,7 @@ class PDFResourceManager: ## PDFDevice ## -class PDFDevice: +class PDFDevice(object): def __init__(self, rsrc, debug=0): self.rsrc = rsrc @@ -587,9 +587,9 @@ class PDFContentParser(PSStackParser): ## Interpreter ## -class PDFPageInterpreter: +class PDFPageInterpreter(object): - class TextState: + class TextState(object): def __init__(self): self.font = None self.fontsize = 0 diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 8a4be0f..39a286f 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -11,7 +11,7 @@ from utils import choplist, nunpack from arcfour import Arcfour from lzw import LZWDecoder from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ - PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ + PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, \ PSStackParser, STRICT @@ -46,10 +46,12 @@ KEYWORD_TRAILER = PSKeywordTable.intern('trailer') KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' +class PDFObject(PSObject): pass + ## PDFObjRef ## -class PDFObjRef: +class PDFObjRef(PDFObject): def __init__(self, doc, objid, _): if objid == 0: @@ -165,7 +167,7 @@ def stream_value(x): ## PDFStream type ## -class PDFStream: +class PDFStream(PDFObject): def __init__(self, dic, rawdata, decipher=None): self.dic = dic @@ -247,11 +249,11 @@ class PDFStream: ## PDFPage ## -class PDFPage: +class PDFPage(object): - def __init__(self, doc, pageidx, attrs): + def __init__(self, doc, pageid, attrs): self.doc = doc - self.pageid = pageidx + self.pageid = pageid self.attrs = dict_value(attrs) self.lastmod = self.attrs.get('LastModified') self.resources = resolve1(self.attrs['Resources']) @@ -397,7 +399,7 @@ class PDFXRefStream(object): ## at once. Rather it is parsed dynamically as processing goes. ## A PDF parser is associated with the document. ## -class PDFDocument: +class PDFDocument(object): def __init__(self, debug=0): self.debug = debug @@ -453,7 +455,6 @@ class PDFDocument: if self.catalog.get('Type') != LITERAL_CATALOG: if STRICT: raise PDFValueError('Catalog not found!') - self.outline = self.catalog.get('Outline') return # initialize(password='') @@ -608,11 +609,54 @@ class PDFDocument: elif tree.get('Type') == LITERAL_PAGE: if 1 <= debug: print >>stderr, 'Page: %r' % tree - yield tree + yield (obj.objid, tree) if 'Pages' not in self.catalog: return - for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)): - yield PDFPage(self, i, tree) - return + for (pageid,tree) in search(self.catalog['Pages'], self.catalog): + yield PDFPage(self, pageid, tree) + return + + def get_outlines(self): + if 'Outlines' not in self.catalog: + raise PDFException('no /Outlines defined!') + def search(entry, level): + entry = dict_value(entry) + if 'Title' in entry: + if 'A' in entry or 'Dest' in entry: + title = unicode(str_value(entry['Title']), 'utf-8', 'ignore') + dest = entry.get('Dest') + action = entry.get('A') + se = entry.get('SE') + yield (level, title, dest, action, se) + if 'First' in entry and 'Last' in entry: + for x in search(entry['First'], level+1): + yield x + if 'Next' in entry: + for x in search(entry['Next'], level): + yield x + return + return search(self.catalog['Outlines'], 0) + + def lookup_name(self, cat, key): + try: + names = dict_value(self.catalog['Names']) + except (PDFTypeError, KeyError): + raise KeyError((cat,key)) + # may raise KeyError + d0 = dict_value(names[cat]) + def lookup(d): + if 'Limits' in d: + (k1,k2) = list_value(d['Limits']) + if key < k1 or k2 < key: return None + if 'Names' in d: + objs = list_value(d['Names']) + names = dict(choplist(2, objs)) + return names[key] + if 'Kids' in d: + for c in list_value(d['Kids']): + v = lookup(dict_value(c)) + if v: return v + raise KeyError((cat,key)) + return lookup(d0) ## PDFParser diff --git a/pdflib/psparser.py b/pdflib/psparser.py index d814575..8450c14 100644 --- a/pdflib/psparser.py +++ b/pdflib/psparser.py @@ -19,7 +19,9 @@ class PSValueError(PSException): pass ## # PSLiteral -class PSLiteral: +class PSObject(object): pass + +class PSLiteral(PSObject): ''' PS literals (e.g. "/Name"). @@ -35,7 +37,7 @@ class PSLiteral: return '/%s' % self.name # PSKeyword -class PSKeyword: +class PSKeyword(PSObject): ''' PS keywords (e.g. "showpage"). @@ -51,7 +53,7 @@ class PSKeyword: return self.name # PSSymbolTable -class PSSymbolTable: +class PSSymbolTable(object): ''' Symbol table that stores PSLiteral or PSKeyword. @@ -113,7 +115,7 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') END_STRING = re.compile(r'[()\134]') OCT_STRING = re.compile(r'[0-7]') ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } -class PSBaseParser: +class PSBaseParser(object): ''' Most basic PostScript parser that performs only basic tokenization. @@ -129,6 +131,13 @@ class PSBaseParser: def __repr__(self): return '' % (self.fp, self.bufpos) + def flush(self): + return + + def close(self): + self.flush() + return + def tell(self): return self.fp.tell() @@ -463,8 +472,6 @@ class PSStackParser(PSBaseParser): def do_keyword(self, pos, token): return - def flush(self): - return def nextobject(self): ''' diff --git a/pdflib/pycdb.py b/pdflib/pycdb.py index e1a4944..71bf664 100755 --- a/pdflib/pycdb.py +++ b/pdflib/pycdb.py @@ -51,7 +51,7 @@ def cdbiter(fp, eod): # CDBReader -class CDBReader: +class CDBReader(object): def __init__(self, cdbname, docache=1): self.name = cdbname @@ -59,7 +59,7 @@ class CDBReader: hash0 = decode(self._fp.read(2048)) self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ] self._hash1 = [ None ] * 256 - self._eod = self._hash0[0] + self._eod = hash0[0] self._docache = docache self._cache = {} self._keyiter = None @@ -149,7 +149,7 @@ class CDBReader: # CDBMaker -class CDBMaker: +class CDBMaker(object): def __init__(self, cdbname, tmpname): self.fn = cdbname diff --git a/tools/dumppdf.py b/tools/dumppdf.py index fb0f8a6..a42a91a 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -8,7 +8,7 @@ # import sys, re from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \ - PDFObjRef, PSKeyword, PSLiteral + PDFObjRef, PSKeyword, PSLiteral, resolve1 stdout = sys.stdout stderr = sys.stderr @@ -94,8 +94,28 @@ def dumpallobjs(out, doc, codec=None): out.write('') return +# dumpoutline +def dumpoutline(outfp, fname, objids, pagenos, password='', + dumpall=False, codec=None, debug=0): + doc = PDFDocument(debug=debug) + fp = file(fname, 'rb') + parser = PDFParser(doc, fp, debug=debug) + doc.initialize(password) + pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) ) + for (level,title,dest,a,se) in doc.get_outlines(): + pageno = None + if dest: + dest = resolve1( doc.lookup_name('Dests', dest) ) + if isinstance(dest, dict): + dest = dest['D'] + pageno = pages[dest[0].objid] + outfp.write(repr((level,title,dest,pageno))+'\n') + parser.close() + fp.close() + return + # dumppdf -def dumppdf(outfp, fname, objids, pageids, password='', +def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, debug=0): doc = PDFDocument(debug=debug) fp = file(fname, 'rb') @@ -110,13 +130,13 @@ def dumppdf(outfp, fname, objids, pageids, password='', outfp.write(obj.get_data()) else: dumpxml(outfp, obj, codec=codec) - if pageids: - for page in doc.get_pages(): - if page.pageid in pageids: + if pagenos: + for (pageno,page) in enumerate(doc.get_pages()): + if pageno in pagenos: dumpxml(outfp, page.attrs) if dumpall: dumpallobjs(outfp, doc, codec=codec) - if (not objids) and (not pageids) and (not dumpall): + if (not objids) and (not pagenos) and (not dumpall): dumptrailers(outfp, doc) fp.close() outfp.write('\n') @@ -127,34 +147,36 @@ def dumppdf(outfp, fname, objids, pageids, password='', def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-i objid] file ...' % argv[0] + print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbti:') + (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:') except getopt.GetoptError: return usage() if not args: return usage() debug = 0 objids = [] - pageids = set() + pagenos = set() codec = None password = '' dumpall = False + proc = dumppdf outfp = stdout for (k, v) in opts: if k == '-d': debug += 1 elif k == '-i': objids.extend( int(x) for x in v.split(',') ) - elif k == '-p': pageids.update( int(x)-1 for x in v.split(',') ) + elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-P': password = v elif k == '-a': dumpall = True elif k == '-r': codec = 'raw' elif k == '-b': codec = 'binary' elif k == '-t': codec = 'text' + elif k == '-T': proc = dumpoutline elif k == '-o': outfp = file(v, 'wb') # for fname in args: - dumppdf(outfp, fname, objids, pageids, password=password, - dumpall=dumpall, codec=codec, debug=debug) + proc(outfp, fname, objids, pagenos, password=password, + dumpall=dumpall, codec=codec, debug=debug) return if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 33d652b..bcba45f 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -93,7 +93,7 @@ class TextConverter(PDFDevice): return def begin_page(self, page): - self.context = PageItem(str(page.pageid+1), page.mediabox, page.rotate) + self.context = PageItem(len(self.pages), page.mediabox, page.rotate) return def end_page(self, _): assert not self.stack @@ -205,7 +205,7 @@ class TextConverter(PDFDevice): # pdf2txt class TextExtractionNotAllowed(RuntimeError): pass -def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0): +def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password='', debug=0): device = TextConverter(rsrc, debug=debug) doc = PDFDocument(debug=debug) fp = file(fname, 'rb') @@ -218,10 +218,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password=' raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname) interpreter = PDFPageInterpreter(rsrc, device, debug=debug) device.reset() - for (i,page) in enumerate(doc.get_pages(debug=debug)): - if pages and (i not in pages): continue + for (pageno,page) in enumerate(doc.get_pages(debug=debug)): + if pagenos and (pageno not in pagenos): continue interpreter.process_page(page) - if maxpages and maxpages <= i+1: break + if maxpages and maxpages <= pageno+1: break if html: device.dump_html(outfp, codec) else: @@ -235,7 +235,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password=' def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] + print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] return 100 try: (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:') @@ -246,14 +246,14 @@ def main(argv): cmapdir = 'CMap' cdbcmapdir = 'CDBCMap' codec = 'ascii' - pages = set() + pagenos = set() maxpages = 0 html = False password = '' outfp = stdout for (k, v) in opts: if k == '-d': debug += 1 - elif k == '-p': pages.update( int(x)-1 for x in v.split(',') ) + elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-P': password = v elif k == '-c': codec = v elif k == '-m': maxpages = int(v) @@ -265,7 +265,7 @@ def main(argv): CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) rsrc = PDFResourceManager(debug=debug) for fname in args: - pdf2txt(outfp, rsrc, fname, pages, codec, + pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=maxpages, html=html, password=password, debug=debug) return