outline (TOC) extraction supported.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@42 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-07-09 15:15:32 +00:00
parent cb02051481
commit 9740f26cec
10 changed files with 138 additions and 62 deletions

View File

@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
<h1>PDFMiner</h1> <h1>PDFMiner</h1>
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Tue Jul 1 00:02:48 JST 2008 Last Modified: Thu Jul 10 00:14:07 JST 2008
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -135,13 +135,13 @@ Unicode Standard.
<p> <p>
Examples: Examples:
<blockquote><pre> <blockquote><pre>
$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong> $ <strong>python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf</strong>
(extract text as an HTML file whose filename is output.html) (extract text as an HTML file whose filename is output.html)
$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong> $ <strong>python -m tools.pdf2txt -c euc-jp samples/jo.pdf</strong>
(extract Japanese texts in vertical writing, CMap is required) (extract Japanese texts in vertical writing, CMap is required)
$ <strong>./pdf2txt.py -P mypassword secret.pdf</strong> $ <strong>python -m tools.pdf2txt -P mypassword secret.pdf</strong>
(extract texts from an encrypted PDF file with a password) (extract texts from an encrypted PDF file with a password)
</pre></blockquote> </pre></blockquote>
@ -181,10 +181,13 @@ but it's also possible to extract some meaningful contents
<p> <p>
Examples: Examples:
<blockquote><pre> <blockquote><pre>
$ <strong>./dumppdf.py -a foo.pdf</strong> $ <strong>python -m tools.dumppdf -a foo.pdf</strong>
(dump all the headers and contents, except stream objects) (dump all the headers and contents, except stream objects)
$ <strong>./dumppdf.py -r -i6 foo.pdf &gt; pic.jpeg</strong> $ <strong>python -m tools.dumppdf -T foo.pdf</strong>
(dump the table of contents)
$ <strong>python -m tools.dumppdf -r -i6 foo.pdf &gt; pic.jpeg</strong>
(extract a JPEG image) (extract a JPEG image)
</pre></blockquote> </pre></blockquote>

View File

@ -4,7 +4,7 @@
# * public domain * # * public domain *
# #
class Arcfour: class Arcfour(object):
def __init__(self, key): def __init__(self, key):
s = range(256) s = range(256)

View File

@ -17,7 +17,7 @@ class CMapError(Exception): pass
## CMap ## CMap
## ##
class CMap: class CMap(object):
def __init__(self, debug=0): def __init__(self, debug=0):
self.debug = debug self.debug = debug
@ -163,7 +163,7 @@ class CDBCMap(CMap):
## CMapDB ## CMapDB
## ##
class CMapDB: class CMapDB(object):
class CMapNotFound(CMapError): pass class CMapNotFound(CMapError): pass
@ -340,7 +340,7 @@ class CMapParser(PSStackParser):
## FontMetricsDB ## FontMetricsDB
## ##
class FontMetricsDB: class FontMetricsDB(object):
from fontmetrics import FONT_METRICS from fontmetrics import FONT_METRICS
@classmethod @classmethod
@ -350,7 +350,7 @@ class FontMetricsDB:
## EncodingDB ## EncodingDB
## ##
class EncodingDB: class EncodingDB(object):
from glyphlist import charname2unicode from glyphlist import charname2unicode
from latin_enc import ENCODING from latin_enc import ENCODING

View File

@ -4,7 +4,7 @@ stderr = sys.stderr
## LZWDecoder ## LZWDecoder
## ##
class LZWDecoder: class LZWDecoder(object):
def __init__(self, fp, debug=0): def __init__(self, fp, debug=0):
self.fp = fp self.fp = fp

View File

@ -9,7 +9,7 @@ except ImportError:
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSStackParser, PSLiteral, PSKeyword, STRICT, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \ from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
int_value, float_value, num_value, \ int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
@ -26,7 +26,7 @@ class PDFUnicodeNotDefined(PDFFontError): pass
## ColorSpace ## ColorSpace
## ##
class ColorSpace: class ColorSpace(object):
def __init__(self, name, ncomponents): def __init__(self, name, ncomponents):
self.name = name self.name = name
self.ncomponents = ncomponents self.ncomponents = ncomponents
@ -82,7 +82,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
## ##
# PDFFont # PDFFont
class PDFFont: class PDFFont(object):
def __init__(self, descriptor, widths, default_width=None): def __init__(self, descriptor, widths, default_width=None):
self.descriptor = descriptor self.descriptor = descriptor
@ -208,7 +208,7 @@ class PDFType3Font(PDFSimpleFont):
## TrueTypeFont ## TrueTypeFont
## ##
class TrueTypeFont: class TrueTypeFont(object):
class CMapNotFound(Exception): pass class CMapNotFound(Exception): pass
@ -391,7 +391,7 @@ class PDFCIDFont(PDFFont):
## Resource Manager ## Resource Manager
## ##
class PDFResourceManager: class PDFResourceManager(object):
''' '''
ResourceManager facilitates reuse of shared resources ResourceManager facilitates reuse of shared resources
@ -464,7 +464,7 @@ class PDFResourceManager:
## PDFDevice ## PDFDevice
## ##
class PDFDevice: class PDFDevice(object):
def __init__(self, rsrc, debug=0): def __init__(self, rsrc, debug=0):
self.rsrc = rsrc self.rsrc = rsrc
@ -587,9 +587,9 @@ class PDFContentParser(PSStackParser):
## Interpreter ## Interpreter
## ##
class PDFPageInterpreter: class PDFPageInterpreter(object):
class TextState: class TextState(object):
def __init__(self): def __init__(self):
self.font = None self.font = None
self.fontsize = 0 self.fontsize = 0

View File

@ -11,7 +11,7 @@ from utils import choplist, nunpack
from arcfour import Arcfour from arcfour import Arcfour
from lzw import LZWDecoder from lzw import LZWDecoder
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \ literal_name, keyword_name, \
PSStackParser, STRICT PSStackParser, STRICT
@ -46,10 +46,12 @@ KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
class PDFObject(PSObject): pass
## PDFObjRef ## PDFObjRef
## ##
class PDFObjRef: class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _): def __init__(self, doc, objid, _):
if objid == 0: if objid == 0:
@ -165,7 +167,7 @@ def stream_value(x):
## PDFStream type ## PDFStream type
## ##
class PDFStream: class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None): def __init__(self, dic, rawdata, decipher=None):
self.dic = dic self.dic = dic
@ -247,11 +249,11 @@ class PDFStream:
## PDFPage ## PDFPage
## ##
class PDFPage: class PDFPage(object):
def __init__(self, doc, pageidx, attrs): def __init__(self, doc, pageid, attrs):
self.doc = doc self.doc = doc
self.pageid = pageidx self.pageid = pageid
self.attrs = dict_value(attrs) self.attrs = dict_value(attrs)
self.lastmod = self.attrs.get('LastModified') self.lastmod = self.attrs.get('LastModified')
self.resources = resolve1(self.attrs['Resources']) self.resources = resolve1(self.attrs['Resources'])
@ -397,7 +399,7 @@ class PDFXRefStream(object):
## at once. Rather it is parsed dynamically as processing goes. ## at once. Rather it is parsed dynamically as processing goes.
## A PDF parser is associated with the document. ## A PDF parser is associated with the document.
## ##
class PDFDocument: class PDFDocument(object):
def __init__(self, debug=0): def __init__(self, debug=0):
self.debug = debug self.debug = debug
@ -453,7 +455,6 @@ class PDFDocument:
if self.catalog.get('Type') != LITERAL_CATALOG: if self.catalog.get('Type') != LITERAL_CATALOG:
if STRICT: if STRICT:
raise PDFValueError('Catalog not found!') raise PDFValueError('Catalog not found!')
self.outline = self.catalog.get('Outline')
return return
# initialize(password='') # initialize(password='')
@ -608,11 +609,54 @@ class PDFDocument:
elif tree.get('Type') == LITERAL_PAGE: elif tree.get('Type') == LITERAL_PAGE:
if 1 <= debug: if 1 <= debug:
print >>stderr, 'Page: %r' % tree print >>stderr, 'Page: %r' % tree
yield tree yield (obj.objid, tree)
if 'Pages' not in self.catalog: return if 'Pages' not in self.catalog: return
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)): for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, i, tree) yield PDFPage(self, pageid, tree)
return return
def get_outlines(self):
    '''
    Iterate over the document outline (table of contents).

    Returns a generator of (level, title, dest, action, se) tuples in
    document order.  Only entries that have a /Title and either an /A
    or a /Dest key are reported.  The /Outlines root is visited at
    level 0, so its direct children are reported at level 1.

    Raises PDFException immediately (not lazily) if the catalog has no
    /Outlines entry.
    '''
    if 'Outlines' not in self.catalog:
        raise PDFException('no /Outlines defined!')
    def decode_title(s):
        # PDF text strings that start with the byte order mark FE FF are
        # UTF-16BE; anything else keeps the previous lenient UTF-8 decoding.
        if s.startswith('\xfe\xff'):
            return unicode(s[2:], 'utf-16-be', 'ignore')
        return unicode(s, 'utf-8', 'ignore')
    def search(entry, level):
        # Siblings (/Next) are walked with a loop instead of recursion, so
        # the generator nesting depth follows the outline depth rather than
        # the number of items on one level.
        while True:
            entry = dict_value(entry)
            if 'Title' in entry:
                if 'A' in entry or 'Dest' in entry:
                    title = decode_title(str_value(entry['Title']))
                    dest = entry.get('Dest')
                    action = entry.get('A')
                    se = entry.get('SE')
                    yield (level, title, dest, action, se)
            # Descend only into nodes that declare both ends of a child list.
            if 'First' in entry and 'Last' in entry:
                for x in search(entry['First'], level+1):
                    yield x
            if 'Next' not in entry:
                break
            entry = entry['Next']
        return
    return search(self.catalog['Outlines'], 0)
def lookup_name(self, cat, key):
    '''
    Look up key in the name tree stored under /Names/<cat> in the catalog.

    Returns the value associated with key.
    Raises KeyError((cat,key)) whenever the name cannot be found: no
    usable /Names dictionary, no such category, key outside a node's
    /Limits, or key absent from every reachable /Names array.
    '''
    try:
        names = dict_value(self.catalog['Names'])
    except (PDFTypeError, KeyError):
        raise KeyError((cat,key))
    if cat not in names:
        raise KeyError((cat,key))
    d0 = dict_value(names[cat])
    def lookup(d):
        # Returns the value stored for key in the subtree d, or None when
        # the subtree does not contain it.  Misses are reported as None
        # (never raised) so the remaining /Kids are still searched.
        if 'Limits' in d:
            (k1,k2) = list_value(d['Limits'])
            if key < k1 or k2 < key: return None
        if 'Names' in d:
            # /Names is consumed as a flat [key1, value1, key2, value2, ...] list.
            objs = list_value(d['Names'])
            entries = dict(choplist(2, objs))
            return entries.get(key)
        if 'Kids' in d:
            for c in list_value(d['Kids']):
                v = lookup(dict_value(c))
                # Compare against None so a falsy stored value still counts.
                if v is not None: return v
        return None
    v = lookup(d0)
    if v is None:
        raise KeyError((cat,key))
    return v
## PDFParser ## PDFParser

View File

@ -19,7 +19,9 @@ class PSValueError(PSException): pass
## ##
# PSLiteral # PSLiteral
class PSLiteral: class PSObject(object): pass
class PSLiteral(PSObject):
''' '''
PS literals (e.g. "/Name"). PS literals (e.g. "/Name").
@ -35,7 +37,7 @@ class PSLiteral:
return '/%s' % self.name return '/%s' % self.name
# PSKeyword # PSKeyword
class PSKeyword: class PSKeyword(PSObject):
''' '''
PS keywords (e.g. "showpage"). PS keywords (e.g. "showpage").
@ -51,7 +53,7 @@ class PSKeyword:
return self.name return self.name
# PSSymbolTable # PSSymbolTable
class PSSymbolTable: class PSSymbolTable(object):
''' '''
Symbol table that stores PSLiteral or PSKeyword. Symbol table that stores PSLiteral or PSKeyword.
@ -113,7 +115,7 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]') END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]') OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
class PSBaseParser: class PSBaseParser(object):
''' '''
Most basic PostScript parser that performs only basic tokenization. Most basic PostScript parser that performs only basic tokenization.
@ -129,6 +131,13 @@ class PSBaseParser:
def __repr__(self): def __repr__(self):
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos) return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
def flush(self):
return
def close(self):
self.flush()
return
def tell(self): def tell(self):
return self.fp.tell() return self.fp.tell()
@ -463,8 +472,6 @@ class PSStackParser(PSBaseParser):
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
return return
def flush(self):
return
def nextobject(self): def nextobject(self):
''' '''

View File

@ -51,7 +51,7 @@ def cdbiter(fp, eod):
# CDBReader # CDBReader
class CDBReader: class CDBReader(object):
def __init__(self, cdbname, docache=1): def __init__(self, cdbname, docache=1):
self.name = cdbname self.name = cdbname
@ -59,7 +59,7 @@ class CDBReader:
hash0 = decode(self._fp.read(2048)) hash0 = decode(self._fp.read(2048))
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ] self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
self._hash1 = [ None ] * 256 self._hash1 = [ None ] * 256
self._eod = self._hash0[0] self._eod = hash0[0]
self._docache = docache self._docache = docache
self._cache = {} self._cache = {}
self._keyiter = None self._keyiter = None
@ -149,7 +149,7 @@ class CDBReader:
# CDBMaker # CDBMaker
class CDBMaker: class CDBMaker(object):
def __init__(self, cdbname, tmpname): def __init__(self, cdbname, tmpname):
self.fn = cdbname self.fn = cdbname

View File

@ -8,7 +8,7 @@
# #
import sys, re import sys, re
from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \ from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \
PDFObjRef, PSKeyword, PSLiteral PDFObjRef, PSKeyword, PSLiteral, resolve1
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
@ -94,8 +94,28 @@ def dumpallobjs(out, doc, codec=None):
out.write('</pdf>') out.write('</pdf>')
return return
# dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, debug=0):
    '''
    Write the outline (table of contents) of the PDF file fname to outfp.

    One line is written per outline entry: the repr() of
    (level, title, dest, pageno), where pageno is the zero-based index
    of the destination page in doc.get_pages(), or None when the entry
    has no /Dest.

    objids, pagenos, dumpall and codec are unused; they are accepted so
    that this function can be called with the same arguments as dumppdf.
    '''
    doc = PDFDocument(debug=debug)
    fp = file(fname, 'rb')
    try:
        parser = PDFParser(doc, fp, debug=debug)
        try:
            doc.initialize(password)
            # Map each page's object id to its position in the document.
            pages = dict( (page.pageid, pageno)
                          for (pageno,page) in enumerate(doc.get_pages()) )
            for (level,title,dest,a,se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(dest)
                    # Only a named destination (a string) has to be looked
                    # up in the /Dests name tree; an explicit destination
                    # array is used as it is.
                    if isinstance(dest, str):
                        dest = resolve1( doc.lookup_name('Dests', dest) )
                    if isinstance(dest, dict):
                        dest = dest['D']
                    # The first element of a destination refers to the page.
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level,title,dest,pageno))+'\n')
        finally:
            parser.close()
    finally:
        # Release the file even when parsing or the lookup fails.
        fp.close()
    return
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pageids, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, debug=0): dumpall=False, codec=None, debug=0):
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname, 'rb') fp = file(fname, 'rb')
@ -110,13 +130,13 @@ def dumppdf(outfp, fname, objids, pageids, password='',
outfp.write(obj.get_data()) outfp.write(obj.get_data())
else: else:
dumpxml(outfp, obj, codec=codec) dumpxml(outfp, obj, codec=codec)
if pageids: if pagenos:
for page in doc.get_pages(): for (pageno,page) in enumerate(doc.get_pages()):
if page.pageid in pageids: if pageno in pagenos:
dumpxml(outfp, page.attrs) dumpxml(outfp, page.attrs)
if dumpall: if dumpall:
dumpallobjs(outfp, doc, codec=codec) dumpallobjs(outfp, doc, codec=codec)
if (not objids) and (not pageids) and (not dumpall): if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc) dumptrailers(outfp, doc)
fp.close() fp.close()
outfp.write('\n') outfp.write('\n')
@ -127,34 +147,36 @@ def dumppdf(outfp, fname, objids, pageids, password='',
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-i objid] file ...' % argv[0] print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbti:') (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
debug = 0 debug = 0
objids = [] objids = []
pageids = set() pagenos = set()
codec = None codec = None
password = '' password = ''
dumpall = False dumpall = False
proc = dumppdf
outfp = stdout outfp = stdout
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pageids.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-a': dumpall = True elif k == '-a': dumpall = True
elif k == '-r': codec = 'raw' elif k == '-r': codec = 'raw'
elif k == '-b': codec = 'binary' elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text' elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = file(v, 'wb')
# #
for fname in args: for fname in args:
dumppdf(outfp, fname, objids, pageids, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec, debug=debug) dumpall=dumpall, codec=codec, debug=debug)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -93,7 +93,7 @@ class TextConverter(PDFDevice):
return return
def begin_page(self, page): def begin_page(self, page):
self.context = PageItem(str(page.pageid+1), page.mediabox, page.rotate) self.context = PageItem(len(self.pages), page.mediabox, page.rotate)
return return
def end_page(self, _): def end_page(self, _):
assert not self.stack assert not self.stack
@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
# pdf2txt # pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass class TextExtractionNotAllowed(RuntimeError): pass
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0): def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password='', debug=0):
device = TextConverter(rsrc, debug=debug) device = TextConverter(rsrc, debug=debug)
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname, 'rb') fp = file(fname, 'rb')
@ -218,10 +218,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname) raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
interpreter = PDFPageInterpreter(rsrc, device, debug=debug) interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
device.reset() device.reset()
for (i,page) in enumerate(doc.get_pages(debug=debug)): for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
if pages and (i not in pages): continue if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page) interpreter.process_page(page)
if maxpages and maxpages <= i+1: break if maxpages and maxpages <= pageno+1: break
if html: if html:
device.dump_html(outfp, codec) device.dump_html(outfp, codec)
else: else:
@ -235,7 +235,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
@ -246,14 +246,14 @@ def main(argv):
cmapdir = 'CMap' cmapdir = 'CMap'
cdbcmapdir = 'CDBCMap' cdbcmapdir = 'CDBCMap'
codec = 'ascii' codec = 'ascii'
pages = set() pagenos = set()
maxpages = 0 maxpages = 0
html = False html = False
password = '' password = ''
outfp = stdout outfp = stdout
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': debug += 1
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-m': maxpages = int(v) elif k == '-m': maxpages = int(v)
@ -265,7 +265,7 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
rsrc = PDFResourceManager(debug=debug) rsrc = PDFResourceManager(debug=debug)
for fname in args: for fname in args:
pdf2txt(outfp, rsrc, fname, pages, codec, pdf2txt(outfp, rsrc, fname, pagenos, codec,
maxpages=maxpages, html=html, password=password, debug=debug) maxpages=maxpages, html=html, password=password, debug=debug)
return return