outline (TOC) extraction supported.

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@42 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-07-09 15:15:32 +00:00
parent cb02051481
commit 9740f26cec
10 changed files with 138 additions and 62 deletions

View File

@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
<h1>PDFMiner</h1>
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Tue Jul 1 00:02:48 JST 2008
Last Modified: Thu Jul 10 00:14:07 JST 2008
<!-- hhmts end -->
</div>
@ -135,13 +135,13 @@ Unicode Standard.
<p>
Examples:
<blockquote><pre>
$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
$ <strong>python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf</strong>
(extract text as an HTML file whose filename is output.html)
$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
$ <strong>python -m tools.pdf2txt -c euc-jp samples/jo.pdf</strong>
(extract Japanese texts in vertical writing, CMap is required)
$ <strong>./pdf2txt.py -P mypassword secret.pdf</strong>
$ <strong>python -m tools.pdf2txt -P mypassword secret.pdf</strong>
(extract texts from an encrypted PDF file with a password)
</pre></blockquote>
@ -181,10 +181,13 @@ but it's also possible to extract some meaningful contents
<p>
Examples:
<blockquote><pre>
$ <strong>./dumppdf.py -a foo.pdf</strong>
$ <strong>python -m tools.dumppdf -a foo.pdf</strong>
(dump all the headers and contents, except stream objects)
$ <strong>./dumppdf.py -r -i6 foo.pdf &gt; pic.jpeg</strong>
$ <strong>python -m tools.dumppdf -T foo.pdf</strong>
(dump the table of contents)
$ <strong>python -m tools.dumppdf -r -i6 foo.pdf &gt; pic.jpeg</strong>
(extract a JPEG image)
</pre></blockquote>

View File

@ -4,7 +4,7 @@
# * public domain *
#
class Arcfour:
class Arcfour(object):
def __init__(self, key):
s = range(256)

View File

@ -17,7 +17,7 @@ class CMapError(Exception): pass
## CMap
##
class CMap:
class CMap(object):
def __init__(self, debug=0):
self.debug = debug
@ -163,7 +163,7 @@ class CDBCMap(CMap):
## CMapDB
##
class CMapDB:
class CMapDB(object):
class CMapNotFound(CMapError): pass
@ -340,7 +340,7 @@ class CMapParser(PSStackParser):
## FontMetricsDB
##
class FontMetricsDB:
class FontMetricsDB(object):
from fontmetrics import FONT_METRICS
@classmethod
@ -350,7 +350,7 @@ class FontMetricsDB:
## EncodingDB
##
class EncodingDB:
class EncodingDB(object):
from glyphlist import charname2unicode
from latin_enc import ENCODING

View File

@ -4,7 +4,7 @@ stderr = sys.stderr
## LZWDecoder
##
class LZWDecoder:
class LZWDecoder(object):
def __init__(self, fp, debug=0):
self.fp = fp

View File

@ -9,7 +9,7 @@ except ImportError:
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
@ -26,7 +26,7 @@ class PDFUnicodeNotDefined(PDFFontError): pass
## ColorSpace
##
class ColorSpace:
class ColorSpace(object):
def __init__(self, name, ncomponents):
self.name = name
self.ncomponents = ncomponents
@ -82,7 +82,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
##
# PDFFont
class PDFFont:
class PDFFont(object):
def __init__(self, descriptor, widths, default_width=None):
self.descriptor = descriptor
@ -208,7 +208,7 @@ class PDFType3Font(PDFSimpleFont):
## TrueTypeFont
##
class TrueTypeFont:
class TrueTypeFont(object):
class CMapNotFound(Exception): pass
@ -391,7 +391,7 @@ class PDFCIDFont(PDFFont):
## Resource Manager
##
class PDFResourceManager:
class PDFResourceManager(object):
'''
ResourceManager facilitates reuse of shared resources
@ -464,7 +464,7 @@ class PDFResourceManager:
## PDFDevice
##
class PDFDevice:
class PDFDevice(object):
def __init__(self, rsrc, debug=0):
self.rsrc = rsrc
@ -587,9 +587,9 @@ class PDFContentParser(PSStackParser):
## Interpreter
##
class PDFPageInterpreter:
class PDFPageInterpreter(object):
class TextState:
class TextState(object):
def __init__(self):
self.font = None
self.fontsize = 0

View File

@ -11,7 +11,7 @@ from utils import choplist, nunpack
from arcfour import Arcfour
from lzw import LZWDecoder
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, \
PSStackParser, STRICT
@ -46,10 +46,12 @@ KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
class PDFObject(PSObject): pass
## PDFObjRef
##
class PDFObjRef:
class PDFObjRef(PDFObject):
def __init__(self, doc, objid, _):
if objid == 0:
@ -165,7 +167,7 @@ def stream_value(x):
## PDFStream type
##
class PDFStream:
class PDFStream(PDFObject):
def __init__(self, dic, rawdata, decipher=None):
self.dic = dic
@ -247,11 +249,11 @@ class PDFStream:
## PDFPage
##
class PDFPage:
class PDFPage(object):
def __init__(self, doc, pageidx, attrs):
def __init__(self, doc, pageid, attrs):
self.doc = doc
self.pageid = pageidx
self.pageid = pageid
self.attrs = dict_value(attrs)
self.lastmod = self.attrs.get('LastModified')
self.resources = resolve1(self.attrs['Resources'])
@ -397,7 +399,7 @@ class PDFXRefStream(object):
## at once. Rather it is parsed dynamically as processing goes.
## A PDF parser is associated with the document.
##
class PDFDocument:
class PDFDocument(object):
def __init__(self, debug=0):
self.debug = debug
@ -453,7 +455,6 @@ class PDFDocument:
if self.catalog.get('Type') != LITERAL_CATALOG:
if STRICT:
raise PDFValueError('Catalog not found!')
self.outline = self.catalog.get('Outline')
return
# initialize(password='')
@ -608,11 +609,54 @@ class PDFDocument:
elif tree.get('Type') == LITERAL_PAGE:
if 1 <= debug:
print >>stderr, 'Page: %r' % tree
yield tree
yield (obj.objid, tree)
if 'Pages' not in self.catalog: return
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
yield PDFPage(self, i, tree)
return
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
yield PDFPage(self, pageid, tree)
return
def get_outlines(self):
    '''
    Iterate over the document outline (table of contents).

    Returns a generator of (level, title, dest, action, se) tuples,
    one for every outline entry that has a /Title and either an /A
    or a /Dest key. The /Outlines root is searched at level 0, its
    /First child at level 1, and so on; /Next siblings keep the
    level of the entry they follow. dest, action and se are the raw
    /Dest, /A and /SE values of the entry (None when absent).

    Raises PDFException immediately (not lazily) when the catalog
    has no /Outlines entry.
    '''
    if 'Outlines' not in self.catalog:
        raise PDFException('no /Outlines defined!')

    def decode_title(s):
        # A PDF text string that starts with the byte order mark
        # FE FF is UTF-16BE. Decoding such a string as UTF-8 with
        # 'ignore' leaves NUL-riddled garbage, so handle it first.
        if s.startswith('\xfe\xff'):
            return unicode(s[2:], 'utf-16-be', 'ignore')
        # Otherwise keep the previous behaviour.
        return unicode(s, 'utf-8', 'ignore')

    def search(entry, level):
        entry = dict_value(entry)
        if 'Title' in entry and ('A' in entry or 'Dest' in entry):
            title = decode_title(str_value(entry['Title']))
            yield (level, title,
                   entry.get('Dest'), entry.get('A'), entry.get('SE'))
        # Children first (one level deeper), then the next sibling.
        if 'First' in entry and 'Last' in entry:
            for x in search(entry['First'], level+1):
                yield x
        if 'Next' in entry:
            for x in search(entry['Next'], level):
                yield x
        return

    return search(self.catalog['Outlines'], 0)
def lookup_name(self, cat, key):
    '''
    Look up key in the name tree stored under /Names -> cat of the
    document catalog and return the associated value.

    Raises KeyError((cat, key)) when the catalog has no usable
    /Names dictionary or when the key is not present in the tree.
    A missing cat entry raises the KeyError of the dictionary
    lookup itself.
    '''
    try:
        names = dict_value(self.catalog['Names'])
    except (PDFTypeError, KeyError):
        raise KeyError((cat, key))
    # may raise KeyError
    root = dict_value(names[cat])

    def lookup(node):
        # Returns a (found, value) pair so that a legitimately falsy
        # value is not mistaken for "not found", and so that a miss
        # in one subtree does not abort the search of its siblings.
        if 'Limits' in node:
            (lo, hi) = list_value(node['Limits'])
            if key < lo or hi < key:
                return (False, None)
        if 'Names' in node:
            # Leaf node: flat [key1, value1, key2, value2, ...] list.
            pairs = dict(choplist(2, list_value(node['Names'])))
            if key in pairs:
                return (True, pairs[key])
            return (False, None)
        if 'Kids' in node:
            for kid in list_value(node['Kids']):
                result = lookup(dict_value(kid))
                if result[0]:
                    return result
        return (False, None)

    (found, value) = lookup(root)
    if not found:
        raise KeyError((cat, key))
    return value
## PDFParser

View File

@ -19,7 +19,9 @@ class PSValueError(PSException): pass
##
# PSLiteral
class PSLiteral:
class PSObject(object): pass
class PSLiteral(PSObject):
'''
PS literals (e.g. "/Name").
@ -35,7 +37,7 @@ class PSLiteral:
return '/%s' % self.name
# PSKeyword
class PSKeyword:
class PSKeyword(PSObject):
'''
PS keywords (e.g. "showpage").
@ -51,7 +53,7 @@ class PSKeyword:
return self.name
# PSSymbolTable
class PSSymbolTable:
class PSSymbolTable(object):
'''
Symbol table that stores PSLiteral or PSKeyword.
@ -113,7 +115,7 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
END_STRING = re.compile(r'[()\134]')
OCT_STRING = re.compile(r'[0-7]')
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
class PSBaseParser:
class PSBaseParser(object):
'''
Most basic PostScript parser that performs only basic tokenization.
@ -129,6 +131,13 @@ class PSBaseParser:
def __repr__(self):
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
def flush(self):
    '''
    Hook invoked by close(); does nothing in this implementation.
    '''
    return None
def close(self):
    '''
    Finish parsing by calling flush(). Returns None.
    '''
    self.flush()
    return None
def tell(self):
return self.fp.tell()
@ -463,8 +472,6 @@ class PSStackParser(PSBaseParser):
def do_keyword(self, pos, token):
return
def flush(self):
return
def nextobject(self):
'''

View File

@ -51,7 +51,7 @@ def cdbiter(fp, eod):
# CDBReader
class CDBReader:
class CDBReader(object):
def __init__(self, cdbname, docache=1):
self.name = cdbname
@ -59,7 +59,7 @@ class CDBReader:
hash0 = decode(self._fp.read(2048))
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
self._hash1 = [ None ] * 256
self._eod = self._hash0[0]
self._eod = hash0[0]
self._docache = docache
self._cache = {}
self._keyiter = None
@ -149,7 +149,7 @@ class CDBReader:
# CDBMaker
class CDBMaker:
class CDBMaker(object):
def __init__(self, cdbname, tmpname):
self.fn = cdbname

View File

@ -8,7 +8,7 @@
#
import sys, re
from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \
PDFObjRef, PSKeyword, PSLiteral
PDFObjRef, PSKeyword, PSLiteral, resolve1
stdout = sys.stdout
stderr = sys.stderr
@ -94,8 +94,28 @@ def dumpallobjs(out, doc, codec=None):
out.write('</pdf>')
return
# dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, debug=0):
    '''
    Write the outline (table of contents) of a PDF file to outfp.

    One line is written per outline entry: the repr() of the tuple
    (level, title, dest, pageno). pageno is the zero-based position
    of the destination page in doc.get_pages(), or None when the
    entry has no /Dest.

    objids, pagenos, dumpall and codec are not used here; they are
    accepted so that this function has the same signature as
    dumppdf.
    '''
    doc = PDFDocument(debug=debug)
    fp = file(fname, 'rb')
    try:
        parser = PDFParser(doc, fp, debug=debug)
        try:
            doc.initialize(password)
            # Map each page's object id to its position in the document.
            pages = dict( (page.pageid, pageno)
                          for (pageno,page) in enumerate(doc.get_pages()) )
            for (level,title,dest,a,se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1( doc.lookup_name('Dests', dest) )
                    # A named destination may be wrapped in a
                    # dictionary whose /D entry holds the array.
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level,title,dest,pageno))+'\n')
        finally:
            # Release the parser even when a lookup above fails.
            parser.close()
    finally:
        fp.close()
    return
# dumppdf
def dumppdf(outfp, fname, objids, pageids, password='',
def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, debug=0):
doc = PDFDocument(debug=debug)
fp = file(fname, 'rb')
@ -110,13 +130,13 @@ def dumppdf(outfp, fname, objids, pageids, password='',
outfp.write(obj.get_data())
else:
dumpxml(outfp, obj, codec=codec)
if pageids:
for page in doc.get_pages():
if page.pageid in pageids:
if pagenos:
for (pageno,page) in enumerate(doc.get_pages()):
if pageno in pagenos:
dumpxml(outfp, page.attrs)
if dumpall:
dumpallobjs(outfp, doc, codec=codec)
if (not objids) and (not pageids) and (not dumpall):
if (not objids) and (not pagenos) and (not dumpall):
dumptrailers(outfp, doc)
fp.close()
outfp.write('\n')
@ -127,34 +147,36 @@ def dumppdf(outfp, fname, objids, pageids, password='',
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-i objid] file ...' % argv[0]
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbti:')
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
except getopt.GetoptError:
return usage()
if not args: return usage()
debug = 0
objids = []
pageids = set()
pagenos = set()
codec = None
password = ''
dumpall = False
proc = dumppdf
outfp = stdout
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pageids.update( int(x)-1 for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v
elif k == '-a': dumpall = True
elif k == '-r': codec = 'raw'
elif k == '-b': codec = 'binary'
elif k == '-t': codec = 'text'
elif k == '-T': proc = dumpoutline
elif k == '-o': outfp = file(v, 'wb')
#
for fname in args:
dumppdf(outfp, fname, objids, pageids, password=password,
dumpall=dumpall, codec=codec, debug=debug)
proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec, debug=debug)
return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -93,7 +93,7 @@ class TextConverter(PDFDevice):
return
def begin_page(self, page):
self.context = PageItem(str(page.pageid+1), page.mediabox, page.rotate)
self.context = PageItem(len(self.pages), page.mediabox, page.rotate)
return
def end_page(self, _):
assert not self.stack
@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
# pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0):
def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password='', debug=0):
device = TextConverter(rsrc, debug=debug)
doc = PDFDocument(debug=debug)
fp = file(fname, 'rb')
@ -218,10 +218,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
device.reset()
for (i,page) in enumerate(doc.get_pages(debug=debug)):
if pages and (i not in pages): continue
for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
if pagenos and (pageno not in pagenos): continue
interpreter.process_page(page)
if maxpages and maxpages <= i+1: break
if maxpages and maxpages <= pageno+1: break
if html:
device.dump_html(outfp, codec)
else:
@ -235,7 +235,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
@ -246,14 +246,14 @@ def main(argv):
cmapdir = 'CMap'
cdbcmapdir = 'CDBCMap'
codec = 'ascii'
pages = set()
pagenos = set()
maxpages = 0
html = False
password = ''
outfp = stdout
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v
elif k == '-c': codec = v
elif k == '-m': maxpages = int(v)
@ -265,7 +265,7 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
rsrc = PDFResourceManager(debug=debug)
for fname in args:
pdf2txt(outfp, rsrc, fname, pages, codec,
pdf2txt(outfp, rsrc, fname, pagenos, codec,
maxpages=maxpages, html=html, password=password, debug=debug)
return