outline (TOC) extraction supported.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@42 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
cb02051481
commit
9740f26cec
15
README.html
15
README.html
|
@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
|
||||||
<h1>PDFMiner</h1>
|
<h1>PDFMiner</h1>
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Tue Jul 1 00:02:48 JST 2008
|
Last Modified: Thu Jul 10 00:14:07 JST 2008
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -135,13 +135,13 @@ Unicode Standard.
|
||||||
<p>
|
<p>
|
||||||
Examples:
|
Examples:
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf</strong>
|
$ <strong>python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf</strong>
|
||||||
(extract text as an HTML file whose filename is output.html)
|
(extract text as an HTML file whose filename is output.html)
|
||||||
|
|
||||||
$ <strong>./pdf2txt.py -c euc-jp samples/jo.pdf</strong>
|
$ <strong>python -m tools.pdf2txt -c euc-jp samples/jo.pdf</strong>
|
||||||
(extract Japanese texts in vertical writing, CMap is required)
|
(extract Japanese texts in vertical writing, CMap is required)
|
||||||
|
|
||||||
$ <strong>./pdf2txt.py -P mypassword secret.pdf</strong>
|
$ <strong>python -m tools.pdf2txt -P mypassword secret.pdf</strong>
|
||||||
(extract texts from an encrypted PDF file with a password)
|
(extract texts from an encrypted PDF file with a password)
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
|
||||||
|
@ -181,10 +181,13 @@ but it's also possible to extract some meaningful contents
|
||||||
<p>
|
<p>
|
||||||
Examples:
|
Examples:
|
||||||
<blockquote><pre>
|
<blockquote><pre>
|
||||||
$ <strong>./dumppdf.py -a foo.pdf</strong>
|
$ <strong>python -m tools.dumppdf -a foo.pdf</strong>
|
||||||
(dump all the headers and contents, except stream objects)
|
(dump all the headers and contents, except stream objects)
|
||||||
|
|
||||||
$ <strong>./dumppdf.py -r -i6 foo.pdf > pic.jpeg</strong>
|
$ <strong>python -m tools.dumppdf -T foo.pdf</strong>
|
||||||
|
(dump the table of contents)
|
||||||
|
|
||||||
|
$ <strong>python -m tools.dumppdf -r -i6 foo.pdf > pic.jpeg</strong>
|
||||||
(extract a JPEG image)
|
(extract a JPEG image)
|
||||||
</pre></blockquote>
|
</pre></blockquote>
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
# * public domain *
|
# * public domain *
|
||||||
#
|
#
|
||||||
|
|
||||||
class Arcfour:
|
class Arcfour(object):
|
||||||
|
|
||||||
def __init__(self, key):
|
def __init__(self, key):
|
||||||
s = range(256)
|
s = range(256)
|
||||||
|
|
|
@ -17,7 +17,7 @@ class CMapError(Exception): pass
|
||||||
|
|
||||||
## CMap
|
## CMap
|
||||||
##
|
##
|
||||||
class CMap:
|
class CMap(object):
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self, debug=0):
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
@ -163,7 +163,7 @@ class CDBCMap(CMap):
|
||||||
|
|
||||||
## CMapDB
|
## CMapDB
|
||||||
##
|
##
|
||||||
class CMapDB:
|
class CMapDB(object):
|
||||||
|
|
||||||
class CMapNotFound(CMapError): pass
|
class CMapNotFound(CMapError): pass
|
||||||
|
|
||||||
|
@ -340,7 +340,7 @@ class CMapParser(PSStackParser):
|
||||||
|
|
||||||
## FontMetricsDB
|
## FontMetricsDB
|
||||||
##
|
##
|
||||||
class FontMetricsDB:
|
class FontMetricsDB(object):
|
||||||
from fontmetrics import FONT_METRICS
|
from fontmetrics import FONT_METRICS
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -350,7 +350,7 @@ class FontMetricsDB:
|
||||||
|
|
||||||
## EncodingDB
|
## EncodingDB
|
||||||
##
|
##
|
||||||
class EncodingDB:
|
class EncodingDB(object):
|
||||||
|
|
||||||
from glyphlist import charname2unicode
|
from glyphlist import charname2unicode
|
||||||
from latin_enc import ENCODING
|
from latin_enc import ENCODING
|
||||||
|
|
|
@ -4,7 +4,7 @@ stderr = sys.stderr
|
||||||
|
|
||||||
## LZWDecoder
|
## LZWDecoder
|
||||||
##
|
##
|
||||||
class LZWDecoder:
|
class LZWDecoder(object):
|
||||||
|
|
||||||
def __init__(self, fp, debug=0):
|
def __init__(self, fp, debug=0):
|
||||||
self.fp = fp
|
self.fp = fp
|
||||||
|
|
|
@ -9,7 +9,7 @@ except ImportError:
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||||
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
from pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
|
||||||
int_value, float_value, num_value, \
|
int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||||
|
@ -26,7 +26,7 @@ class PDFUnicodeNotDefined(PDFFontError): pass
|
||||||
|
|
||||||
## ColorSpace
|
## ColorSpace
|
||||||
##
|
##
|
||||||
class ColorSpace:
|
class ColorSpace(object):
|
||||||
def __init__(self, name, ncomponents):
|
def __init__(self, name, ncomponents):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.ncomponents = ncomponents
|
self.ncomponents = ncomponents
|
||||||
|
@ -82,7 +82,7 @@ def apply_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
##
|
##
|
||||||
|
|
||||||
# PDFFont
|
# PDFFont
|
||||||
class PDFFont:
|
class PDFFont(object):
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, default_width=None):
|
def __init__(self, descriptor, widths, default_width=None):
|
||||||
self.descriptor = descriptor
|
self.descriptor = descriptor
|
||||||
|
@ -208,7 +208,7 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
|
|
||||||
## TrueTypeFont
|
## TrueTypeFont
|
||||||
##
|
##
|
||||||
class TrueTypeFont:
|
class TrueTypeFont(object):
|
||||||
|
|
||||||
class CMapNotFound(Exception): pass
|
class CMapNotFound(Exception): pass
|
||||||
|
|
||||||
|
@ -391,7 +391,7 @@ class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
## Resource Manager
|
## Resource Manager
|
||||||
##
|
##
|
||||||
class PDFResourceManager:
|
class PDFResourceManager(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
ResourceManager facilitates reuse of shared resources
|
ResourceManager facilitates reuse of shared resources
|
||||||
|
@ -464,7 +464,7 @@ class PDFResourceManager:
|
||||||
|
|
||||||
## PDFDevice
|
## PDFDevice
|
||||||
##
|
##
|
||||||
class PDFDevice:
|
class PDFDevice(object):
|
||||||
|
|
||||||
def __init__(self, rsrc, debug=0):
|
def __init__(self, rsrc, debug=0):
|
||||||
self.rsrc = rsrc
|
self.rsrc = rsrc
|
||||||
|
@ -587,9 +587,9 @@ class PDFContentParser(PSStackParser):
|
||||||
|
|
||||||
## Interpreter
|
## Interpreter
|
||||||
##
|
##
|
||||||
class PDFPageInterpreter:
|
class PDFPageInterpreter(object):
|
||||||
|
|
||||||
class TextState:
|
class TextState(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.font = None
|
self.font = None
|
||||||
self.fontsize = 0
|
self.fontsize = 0
|
||||||
|
|
|
@ -11,7 +11,7 @@ from utils import choplist, nunpack
|
||||||
from arcfour import Arcfour
|
from arcfour import Arcfour
|
||||||
from lzw import LZWDecoder
|
from lzw import LZWDecoder
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
literal_name, keyword_name, \
|
literal_name, keyword_name, \
|
||||||
PSStackParser, STRICT
|
PSStackParser, STRICT
|
||||||
|
|
||||||
|
@ -46,10 +46,12 @@ KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||||
|
|
||||||
|
class PDFObject(PSObject): pass
|
||||||
|
|
||||||
|
|
||||||
## PDFObjRef
|
## PDFObjRef
|
||||||
##
|
##
|
||||||
class PDFObjRef:
|
class PDFObjRef(PDFObject):
|
||||||
|
|
||||||
def __init__(self, doc, objid, _):
|
def __init__(self, doc, objid, _):
|
||||||
if objid == 0:
|
if objid == 0:
|
||||||
|
@ -165,7 +167,7 @@ def stream_value(x):
|
||||||
|
|
||||||
## PDFStream type
|
## PDFStream type
|
||||||
##
|
##
|
||||||
class PDFStream:
|
class PDFStream(PDFObject):
|
||||||
|
|
||||||
def __init__(self, dic, rawdata, decipher=None):
|
def __init__(self, dic, rawdata, decipher=None):
|
||||||
self.dic = dic
|
self.dic = dic
|
||||||
|
@ -247,11 +249,11 @@ class PDFStream:
|
||||||
|
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
class PDFPage:
|
class PDFPage(object):
|
||||||
|
|
||||||
def __init__(self, doc, pageidx, attrs):
|
def __init__(self, doc, pageid, attrs):
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.pageid = pageidx
|
self.pageid = pageid
|
||||||
self.attrs = dict_value(attrs)
|
self.attrs = dict_value(attrs)
|
||||||
self.lastmod = self.attrs.get('LastModified')
|
self.lastmod = self.attrs.get('LastModified')
|
||||||
self.resources = resolve1(self.attrs['Resources'])
|
self.resources = resolve1(self.attrs['Resources'])
|
||||||
|
@ -397,7 +399,7 @@ class PDFXRefStream(object):
|
||||||
## at once. Rather it is parsed dynamically as processing goes.
|
## at once. Rather it is parsed dynamically as processing goes.
|
||||||
## A PDF parser is associated with the document.
|
## A PDF parser is associated with the document.
|
||||||
##
|
##
|
||||||
class PDFDocument:
|
class PDFDocument(object):
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self, debug=0):
|
||||||
self.debug = debug
|
self.debug = debug
|
||||||
|
@ -453,7 +455,6 @@ class PDFDocument:
|
||||||
if self.catalog.get('Type') != LITERAL_CATALOG:
|
if self.catalog.get('Type') != LITERAL_CATALOG:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('Catalog not found!')
|
raise PDFValueError('Catalog not found!')
|
||||||
self.outline = self.catalog.get('Outline')
|
|
||||||
return
|
return
|
||||||
|
|
||||||
# initialize(password='')
|
# initialize(password='')
|
||||||
|
@ -608,11 +609,54 @@ class PDFDocument:
|
||||||
elif tree.get('Type') == LITERAL_PAGE:
|
elif tree.get('Type') == LITERAL_PAGE:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>stderr, 'Page: %r' % tree
|
print >>stderr, 'Page: %r' % tree
|
||||||
yield tree
|
yield (obj.objid, tree)
|
||||||
if 'Pages' not in self.catalog: return
|
if 'Pages' not in self.catalog: return
|
||||||
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
for (pageid,tree) in search(self.catalog['Pages'], self.catalog):
|
||||||
yield PDFPage(self, i, tree)
|
yield PDFPage(self, pageid, tree)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def get_outlines(self):
|
||||||
|
if 'Outlines' not in self.catalog:
|
||||||
|
raise PDFException('no /Outlines defined!')
|
||||||
|
def search(entry, level):
|
||||||
|
entry = dict_value(entry)
|
||||||
|
if 'Title' in entry:
|
||||||
|
if 'A' in entry or 'Dest' in entry:
|
||||||
|
title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
|
||||||
|
dest = entry.get('Dest')
|
||||||
|
action = entry.get('A')
|
||||||
|
se = entry.get('SE')
|
||||||
|
yield (level, title, dest, action, se)
|
||||||
|
if 'First' in entry and 'Last' in entry:
|
||||||
|
for x in search(entry['First'], level+1):
|
||||||
|
yield x
|
||||||
|
if 'Next' in entry:
|
||||||
|
for x in search(entry['Next'], level):
|
||||||
|
yield x
|
||||||
|
return
|
||||||
|
return search(self.catalog['Outlines'], 0)
|
||||||
|
|
||||||
|
def lookup_name(self, cat, key):
|
||||||
|
try:
|
||||||
|
names = dict_value(self.catalog['Names'])
|
||||||
|
except (PDFTypeError, KeyError):
|
||||||
|
raise KeyError((cat,key))
|
||||||
|
# may raise KeyError
|
||||||
|
d0 = dict_value(names[cat])
|
||||||
|
def lookup(d):
|
||||||
|
if 'Limits' in d:
|
||||||
|
(k1,k2) = list_value(d['Limits'])
|
||||||
|
if key < k1 or k2 < key: return None
|
||||||
|
if 'Names' in d:
|
||||||
|
objs = list_value(d['Names'])
|
||||||
|
names = dict(choplist(2, objs))
|
||||||
|
return names[key]
|
||||||
|
if 'Kids' in d:
|
||||||
|
for c in list_value(d['Kids']):
|
||||||
|
v = lookup(dict_value(c))
|
||||||
|
if v: return v
|
||||||
|
raise KeyError((cat,key))
|
||||||
|
return lookup(d0)
|
||||||
|
|
||||||
|
|
||||||
## PDFParser
|
## PDFParser
|
||||||
|
|
|
@ -19,7 +19,9 @@ class PSValueError(PSException): pass
|
||||||
##
|
##
|
||||||
|
|
||||||
# PSLiteral
|
# PSLiteral
|
||||||
class PSLiteral:
|
class PSObject(object): pass
|
||||||
|
|
||||||
|
class PSLiteral(PSObject):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
PS literals (e.g. "/Name").
|
PS literals (e.g. "/Name").
|
||||||
|
@ -35,7 +37,7 @@ class PSLiteral:
|
||||||
return '/%s' % self.name
|
return '/%s' % self.name
|
||||||
|
|
||||||
# PSKeyword
|
# PSKeyword
|
||||||
class PSKeyword:
|
class PSKeyword(PSObject):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
PS keywords (e.g. "showpage").
|
PS keywords (e.g. "showpage").
|
||||||
|
@ -51,7 +53,7 @@ class PSKeyword:
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
# PSSymbolTable
|
# PSSymbolTable
|
||||||
class PSSymbolTable:
|
class PSSymbolTable(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Symbol table that stores PSLiteral or PSKeyword.
|
Symbol table that stores PSLiteral or PSKeyword.
|
||||||
|
@ -113,7 +115,7 @@ END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]')
|
||||||
END_STRING = re.compile(r'[()\134]')
|
END_STRING = re.compile(r'[()\134]')
|
||||||
OCT_STRING = re.compile(r'[0-7]')
|
OCT_STRING = re.compile(r'[0-7]')
|
||||||
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 }
|
||||||
class PSBaseParser:
|
class PSBaseParser(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Most basic PostScript parser that performs only basic tokenization.
|
Most basic PostScript parser that performs only basic tokenization.
|
||||||
|
@ -129,6 +131,13 @@ class PSBaseParser:
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
|
return '<PSBaseParser: %r, bufpos=%d>' % (self.fp, self.bufpos)
|
||||||
|
|
||||||
|
def flush(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.flush()
|
||||||
|
return
|
||||||
|
|
||||||
def tell(self):
|
def tell(self):
|
||||||
return self.fp.tell()
|
return self.fp.tell()
|
||||||
|
|
||||||
|
@ -463,8 +472,6 @@ class PSStackParser(PSBaseParser):
|
||||||
|
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
return
|
return
|
||||||
def flush(self):
|
|
||||||
return
|
|
||||||
|
|
||||||
def nextobject(self):
|
def nextobject(self):
|
||||||
'''
|
'''
|
||||||
|
|
|
@ -51,7 +51,7 @@ def cdbiter(fp, eod):
|
||||||
|
|
||||||
|
|
||||||
# CDBReader
|
# CDBReader
|
||||||
class CDBReader:
|
class CDBReader(object):
|
||||||
|
|
||||||
def __init__(self, cdbname, docache=1):
|
def __init__(self, cdbname, docache=1):
|
||||||
self.name = cdbname
|
self.name = cdbname
|
||||||
|
@ -59,7 +59,7 @@ class CDBReader:
|
||||||
hash0 = decode(self._fp.read(2048))
|
hash0 = decode(self._fp.read(2048))
|
||||||
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
|
self._hash0 = [ (hash0[i], hash0[i+1]) for i in xrange(0, 512, 2) ]
|
||||||
self._hash1 = [ None ] * 256
|
self._hash1 = [ None ] * 256
|
||||||
self._eod = self._hash0[0]
|
self._eod = hash0[0]
|
||||||
self._docache = docache
|
self._docache = docache
|
||||||
self._cache = {}
|
self._cache = {}
|
||||||
self._keyiter = None
|
self._keyiter = None
|
||||||
|
@ -149,7 +149,7 @@ class CDBReader:
|
||||||
|
|
||||||
|
|
||||||
# CDBMaker
|
# CDBMaker
|
||||||
class CDBMaker:
|
class CDBMaker(object):
|
||||||
|
|
||||||
def __init__(self, cdbname, tmpname):
|
def __init__(self, cdbname, tmpname):
|
||||||
self.fn = cdbname
|
self.fn = cdbname
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
#
|
#
|
||||||
import sys, re
|
import sys, re
|
||||||
from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \
|
from pdflib.pdfparser import PDFDocument, PDFParser, PDFStream, \
|
||||||
PDFObjRef, PSKeyword, PSLiteral
|
PDFObjRef, PSKeyword, PSLiteral, resolve1
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
|
|
||||||
|
@ -94,8 +94,28 @@ def dumpallobjs(out, doc, codec=None):
|
||||||
out.write('</pdf>')
|
out.write('</pdf>')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# dumpoutline
|
||||||
|
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
|
dumpall=False, codec=None, debug=0):
|
||||||
|
doc = PDFDocument(debug=debug)
|
||||||
|
fp = file(fname, 'rb')
|
||||||
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
|
doc.initialize(password)
|
||||||
|
pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
|
||||||
|
for (level,title,dest,a,se) in doc.get_outlines():
|
||||||
|
pageno = None
|
||||||
|
if dest:
|
||||||
|
dest = resolve1( doc.lookup_name('Dests', dest) )
|
||||||
|
if isinstance(dest, dict):
|
||||||
|
dest = dest['D']
|
||||||
|
pageno = pages[dest[0].objid]
|
||||||
|
outfp.write(repr((level,title,dest,pageno))+'\n')
|
||||||
|
parser.close()
|
||||||
|
fp.close()
|
||||||
|
return
|
||||||
|
|
||||||
# dumppdf
|
# dumppdf
|
||||||
def dumppdf(outfp, fname, objids, pageids, password='',
|
def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None, debug=0):
|
dumpall=False, codec=None, debug=0):
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
|
@ -110,13 +130,13 @@ def dumppdf(outfp, fname, objids, pageids, password='',
|
||||||
outfp.write(obj.get_data())
|
outfp.write(obj.get_data())
|
||||||
else:
|
else:
|
||||||
dumpxml(outfp, obj, codec=codec)
|
dumpxml(outfp, obj, codec=codec)
|
||||||
if pageids:
|
if pagenos:
|
||||||
for page in doc.get_pages():
|
for (pageno,page) in enumerate(doc.get_pages()):
|
||||||
if page.pageid in pageids:
|
if pageno in pagenos:
|
||||||
dumpxml(outfp, page.attrs)
|
dumpxml(outfp, page.attrs)
|
||||||
if dumpall:
|
if dumpall:
|
||||||
dumpallobjs(outfp, doc, codec=codec)
|
dumpallobjs(outfp, doc, codec=codec)
|
||||||
if (not objids) and (not pageids) and (not dumpall):
|
if (not objids) and (not pagenos) and (not dumpall):
|
||||||
dumptrailers(outfp, doc)
|
dumptrailers(outfp, doc)
|
||||||
fp.close()
|
fp.close()
|
||||||
outfp.write('\n')
|
outfp.write('\n')
|
||||||
|
@ -127,34 +147,36 @@ def dumppdf(outfp, fname, objids, pageids, password='',
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-i objid] file ...' % argv[0]
|
print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbti:')
|
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
debug = 0
|
debug = 0
|
||||||
objids = []
|
objids = []
|
||||||
pageids = set()
|
pagenos = set()
|
||||||
codec = None
|
codec = None
|
||||||
password = ''
|
password = ''
|
||||||
dumpall = False
|
dumpall = False
|
||||||
|
proc = dumppdf
|
||||||
outfp = stdout
|
outfp = stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||||
elif k == '-p': pageids.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-a': dumpall = True
|
elif k == '-a': dumpall = True
|
||||||
elif k == '-r': codec = 'raw'
|
elif k == '-r': codec = 'raw'
|
||||||
elif k == '-b': codec = 'binary'
|
elif k == '-b': codec = 'binary'
|
||||||
elif k == '-t': codec = 'text'
|
elif k == '-t': codec = 'text'
|
||||||
|
elif k == '-T': proc = dumpoutline
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = file(v, 'wb')
|
||||||
#
|
#
|
||||||
for fname in args:
|
for fname in args:
|
||||||
dumppdf(outfp, fname, objids, pageids, password=password,
|
proc(outfp, fname, objids, pagenos, password=password,
|
||||||
dumpall=dumpall, codec=codec, debug=debug)
|
dumpall=dumpall, codec=codec, debug=debug)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -93,7 +93,7 @@ class TextConverter(PDFDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page):
|
def begin_page(self, page):
|
||||||
self.context = PageItem(str(page.pageid+1), page.mediabox, page.rotate)
|
self.context = PageItem(len(self.pages), page.mediabox, page.rotate)
|
||||||
return
|
return
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
assert not self.stack
|
assert not self.stack
|
||||||
|
@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
class TextExtractionNotAllowed(RuntimeError): pass
|
class TextExtractionNotAllowed(RuntimeError): pass
|
||||||
|
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0):
|
def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password='', debug=0):
|
||||||
device = TextConverter(rsrc, debug=debug)
|
device = TextConverter(rsrc, debug=debug)
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
|
@ -218,10 +218,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
|
||||||
raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
|
raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
|
||||||
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
||||||
device.reset()
|
device.reset()
|
||||||
for (i,page) in enumerate(doc.get_pages(debug=debug)):
|
for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
|
||||||
if pages and (i not in pages): continue
|
if pagenos and (pageno not in pagenos): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
if maxpages and maxpages <= i+1: break
|
if maxpages and maxpages <= pageno+1: break
|
||||||
if html:
|
if html:
|
||||||
device.dump_html(outfp, codec)
|
device.dump_html(outfp, codec)
|
||||||
else:
|
else:
|
||||||
|
@ -235,7 +235,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
|
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
|
||||||
|
@ -246,14 +246,14 @@ def main(argv):
|
||||||
cmapdir = 'CMap'
|
cmapdir = 'CMap'
|
||||||
cdbcmapdir = 'CDBCMap'
|
cdbcmapdir = 'CDBCMap'
|
||||||
codec = 'ascii'
|
codec = 'ascii'
|
||||||
pages = set()
|
pagenos = set()
|
||||||
maxpages = 0
|
maxpages = 0
|
||||||
html = False
|
html = False
|
||||||
password = ''
|
password = ''
|
||||||
outfp = stdout
|
outfp = stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-m': maxpages = int(v)
|
elif k == '-m': maxpages = int(v)
|
||||||
|
@ -265,7 +265,7 @@ def main(argv):
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
||||||
rsrc = PDFResourceManager(debug=debug)
|
rsrc = PDFResourceManager(debug=debug)
|
||||||
for fname in args:
|
for fname in args:
|
||||||
pdf2txt(outfp, rsrc, fname, pages, codec,
|
pdf2txt(outfp, rsrc, fname, pagenos, codec,
|
||||||
maxpages=maxpages, html=html, password=password, debug=debug)
|
maxpages=maxpages, html=html, password=password, debug=debug)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue