diff --git a/README.html b/README.html index 8772b56..c31ea29 100644 --- a/README.html +++ b/README.html @@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
Examples:
@@ -181,10 +181,13 @@ but it's also possible to extract some meaningful contents-$ ./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf +$ python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf (extract text as an HTML file whose filename is output.html) -$ ./pdf2txt.py -c euc-jp samples/jo.pdf +$ python -m tools.pdf2txt -c euc-jp samples/jo.pdf (extract Japanese texts in vertical writing, CMap is required) -$ ./pdf2txt.py -P mypassword secret.pdf +$ python -m tools.pdf2txt -P mypassword secret.pdf (extract texts from an encrypted PDF file with a password)
Examples:
diff --git a/pdflib/arcfour.py b/pdflib/arcfour.py index c9c13a8..188acac 100755 --- a/pdflib/arcfour.py +++ b/pdflib/arcfour.py @@ -4,7 +4,7 @@ # * public domain * # -class Arcfour: +class Arcfour(object): def __init__(self, key): s = range(256) diff --git a/pdflib/cmap.py b/pdflib/cmap.py index 05461ad..404714e 100644 --- a/pdflib/cmap.py +++ b/pdflib/cmap.py @@ -17,7 +17,7 @@ class CMapError(Exception): pass ## CMap ## -class CMap: +class CMap(object): def __init__(self, debug=0): self.debug = debug @@ -163,7 +163,7 @@ class CDBCMap(CMap): ## CMapDB ## -class CMapDB: +class CMapDB(object): class CMapNotFound(CMapError): pass @@ -340,7 +340,7 @@ class CMapParser(PSStackParser): ## FontMetricsDB ## -class FontMetricsDB: +class FontMetricsDB(object): from fontmetrics import FONT_METRICS @classmethod @@ -350,7 +350,7 @@ class FontMetricsDB: ## EncodingDB ## -class EncodingDB: +class EncodingDB(object): from glyphlist import charname2unicode from latin_enc import ENCODING diff --git a/pdflib/lzw.py b/pdflib/lzw.py index 0dde44d..d5f3b4e 100755 --- a/pdflib/lzw.py +++ b/pdflib/lzw.py @@ -4,7 +4,7 @@ stderr = sys.stderr ## LZWDecoder ## -class LZWDecoder: +class LZWDecoder(object): def __init__(self, fp, debug=0): self.fp = fp diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index d045769..7e3da6c 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -9,7 +9,7 @@ except ImportError: from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name -from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \ +from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \ int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB @@ -26,7 +26,7 @@ class PDFUnicodeNotDefined(PDFFontError): pass ## ColorSpace ## -class ColorSpace: +class ColorSpace(object): def __init__(self, name, ncomponents): self.name = name self.ncomponents = ncomponents @@ -82,7 +82,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)): ## # PDFFont -class PDFFont: +class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor @@ -208,7 +208,7 @@ class PDFType3Font(PDFSimpleFont): ## TrueTypeFont ## -class TrueTypeFont: +class TrueTypeFont(object): class CMapNotFound(Exception): pass @@ -391,7 +391,7 @@ class PDFCIDFont(PDFFont): ## Resource Manager ## -class PDFResourceManager: +class PDFResourceManager(object): ''' ResourceManager facilitates reuse of shared resources @@ -464,7 +464,7 @@ class PDFResourceManager: ## PDFDevice ## -class PDFDevice: +class PDFDevice(object): def __init__(self, rsrc, debug=0): self.rsrc = rsrc @@ -587,9 +587,9 @@ class PDFContentParser(PSStackParser): ## Interpreter ## -class PDFPageInterpreter: +class PDFPageInterpreter(object): - class TextState: + class TextState(object): def __init__(self): self.font = None self.fontsize = 0 diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 8a4be0f..39a286f 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -11,7 +11,7 @@ from utils import choplist, nunpack from arcfour import Arcfour from lzw import LZWDecoder from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ - PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ + PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, \ PSStackParser, STRICT @@ -46,10 +46,12 @@ KEYWORD_TRAILER = PSKeywordTable.intern('trailer') KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' +class PDFObject(PSObject): pass + ## PDFObjRef ## -class PDFObjRef: +class PDFObjRef(PDFObject): def __init__(self, doc, objid, _): if objid == 0: @@ -165,7 +167,7 @@ def stream_value(x): ## PDFStream type ## -class PDFStream: +class PDFStream(PDFObject): def __init__(self, dic, rawdata, decipher=None): self.dic = dic @@ -247,11 +249,11 @@ class PDFStream: ## PDFPage ## -class PDFPage: +class PDFPage(object): - def __init__(self, doc, pageidx, attrs): + def __init__(self, doc, pageid, attrs): self.doc = doc - self.pageid = pageidx + self.pageid = pageid self.attrs = dict_value(attrs) self.lastmod = self.attrs.get('LastModified') self.resources = resolve1(self.attrs['Resources']) @@ -397,7 +399,7 @@ class PDFXRefStream(object): ## at once. Rather it is parsed dynamically as processing goes. ## A PDF parser is associated with the document. ## -class PDFDocument: +class PDFDocument(object): def __init__(self, debug=0): self.debug = debug @@ -453,7 +455,6 @@ class PDFDocument: if self.catalog.get('Type') != LITERAL_CATALOG: if STRICT: raise PDFValueError('Catalog not found!') - self.outline = self.catalog.get('Outline') return # initialize(password='') @@ -608,11 +609,54 @@ class PDFDocument: elif tree.get('Type') == LITERAL_PAGE: if 1 <= debug: print >>stderr, 'Page: %r' % tree - yield tree + yield (obj.objid, tree) if 'Pages' not in self.catalog: return - for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)): - yield PDFPage(self, i, tree) - return + for (pageid,tree) in search(self.catalog['Pages'], self.catalog): + yield PDFPage(self, pageid, tree) + return + + def get_outlines(self): + if 'Outlines' not in self.catalog: + raise PDFException('no /Outlines defined!') + def search(entry, level): + entry = dict_value(entry) + if 'Title' in entry: + if 'A' in entry or 'Dest' in entry: + title = unicode(str_value(entry['Title']), 'utf-8', 'ignore') + dest = entry.get('Dest') + action = entry.get('A') + se = entry.get('SE') + yield (level, title, dest, action, se) + if 'First' in entry and 'Last' in entry: + for x in search(entry['First'], level+1): + yield x + if 'Next' in entry: + for x in search(entry['Next'], level): + yield x + return + return search(self.catalog['Outlines'], 0) + + def lookup_name(self, cat, key): + try: + names = dict_value(self.catalog['Names']) + except (PDFTypeError, KeyError): + raise KeyError((cat,key)) + # may raise KeyError + d0 = dict_value(names[cat]) + def lookup(d): + if 'Limits' in d: + (k1,k2) = list_value(d['Limits']) + if key < k1 or k2 < key: return None + if 'Names' in d: + objs = list_value(d['Names']) + names = dict(choplist(2, objs)) + return names[key] + if 'Kids' in d: + for c in list_value(d['Kids']): + v = lookup(dict_value(c)) + if v: return v + raise KeyError((cat,key)) + return lookup(d0) ## PDFParser diff --git a/pdflib/psparser.py b/pdflib/psparser.py index d814575..8450c14 100644 --- a/pdflib/psparser.py +++ b/pdflib/psparser.py @@ -19,7 +19,9 @@ class PSValueError(PSException): pass ## # PSLiteral -class PSLiteral: +class PSObject(object): pass + +class PSLiteral(PSObject): ''' PS literals (e.g. "/Name"). @@ -35,7 +37,7 @@ class PSLiteral: return '/%s' % self.name # PSKeyword -class PSKeyword: +class PSKeyword(PSObject): ''' PS keywords (e.g. "showpage"). @@ -51,7 +53,7 @@ class PSKeyword: return self.name # PSSymbolTable -class PSSymbolTable: +class PSSymbolTable(object): ''' Symbol table that stores PSLiteral or PSKeyword. @@ -113,7 +115,7 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') END_STRING = re.compile(r'[()\134]') OCT_STRING = re.compile(r'[0-7]') ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } -class PSBaseParser: +class PSBaseParser(object): ''' Most basic PostScript parser that performs only basic tokenization. @@ -129,6 +131,13 @@ class PSBaseParser: def __repr__(self): return '-$ ./dumppdf.py -a foo.pdf +$ python -m tools.dumppdf -a foo.pdf (dump all the headers and contents, except stream objects) -$ ./dumppdf.py -r -i6 foo.pdf > pic.jpeg +$ python -m tools.dumppdf -T foo.pdf +(dump the table of contents) + +$ python -m tools.dumppdf -r -i6 foo.pdf > pic.jpeg (extract a JPEG image)