patch from Troy Bollinger.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@71 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
13a6603151
commit
b432a3f4ae
12
README.html
12
README.html
|
@ -164,19 +164,19 @@ $ <strong>python -m pdflib.pdf2txt -P mypassword secret.pdf</strong>
|
||||||
Options:
|
Options:
|
||||||
<dl>
|
<dl>
|
||||||
<dt> <code>-o <em>filename</em></code>
|
<dt> <code>-o <em>filename</em></code>
|
||||||
<dd> Speficies the output file name.
|
<dd> Specifies the output file name.
|
||||||
By default, it prints the extracted contents to stdout.
|
By default, it prints the extracted contents to stdout.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
||||||
<dd> Speficies the comma-separated list of the page numbers to be extracted.
|
<dd> Specifies the comma-separated list of the page numbers to be extracted.
|
||||||
Page numbers are starting from one.
|
Page numbers are starting from one.
|
||||||
By default, it extracts texts from all the pages.
|
By default, it extracts texts from all the pages.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-c <em>codec</em></code>
|
<dt> <code>-c <em>codec</em></code>
|
||||||
<dd> Speficies the output codec for non-ASCII texts.
|
<dd> Specifies the output codec for non-ASCII texts.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-t <em>type</em></code>
|
<dt> <code>-t <em>type</em></code>
|
||||||
<dd> Speficies the output format. The following formats are currently supported.
|
<dd> Specifies the output format. The following formats are currently supported.
|
||||||
<ul>
|
<ul>
|
||||||
<li> <code>html</code> : HTML format. (Default)
|
<li> <code>html</code> : HTML format. (Default)
|
||||||
<li> <code>sgml</code> : SGML format.
|
<li> <code>sgml</code> : SGML format.
|
||||||
|
@ -221,14 +221,14 @@ Options:
|
||||||
By default, it only prints the document trailer (like a header).
|
By default, it only prints the document trailer (like a header).
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno</em></code>
|
<dt> <code>-p <em>pageno</em></code>
|
||||||
<dd> Speficies the page number to be extracted.
|
<dd> Specifies the page number to be extracted.
|
||||||
Multiple <code>-p</code> options are allowed.
|
Multiple <code>-p</code> options are allowed.
|
||||||
Note that page numbers start from one.
|
Note that page numbers start from one.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-r</code> (raw)
|
<dt> <code>-r</code> (raw)
|
||||||
<dt> <code>-b</code> (binary)
|
<dt> <code>-b</code> (binary)
|
||||||
<dt> <code>-t</code> (text)
|
<dt> <code>-t</code> (text)
|
||||||
<dd> Speficies the output format of stream contents.
|
<dd> Specifies the output format of stream contents.
|
||||||
Because the contents of stream objects can be very large,
|
Because the contents of stream objects can be very large,
|
||||||
they are omitted when none of the options above is specified.
|
they are omitted when none of the options above is specified.
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -1,6 +1,32 @@
|
||||||
# Makefile for pdfminer
|
# Makefile for pdfminer
|
||||||
|
|
||||||
all:
|
DESTDIR=/usr/local/src/pdflib
|
||||||
|
|
||||||
|
PDFLIB = ${DESTDIR}/__init__.py \
|
||||||
|
${DESTDIR}/arcfour.py \
|
||||||
|
${DESTDIR}/ascii85.py \
|
||||||
|
${DESTDIR}/cmap.py \
|
||||||
|
${DESTDIR}/fontmetrics.py \
|
||||||
|
${DESTDIR}/glyphlist.py \
|
||||||
|
${DESTDIR}/latin_enc.py \
|
||||||
|
${DESTDIR}/lzw.py \
|
||||||
|
${DESTDIR}/pdf2txt.py \
|
||||||
|
${DESTDIR}/pdfcolor.py \
|
||||||
|
${DESTDIR}/pdfdevice.py \
|
||||||
|
${DESTDIR}/pdffont.py \
|
||||||
|
${DESTDIR}/pdfinterp.py \
|
||||||
|
${DESTDIR}/pdfparser.py \
|
||||||
|
${DESTDIR}/pdftypes.py \
|
||||||
|
${DESTDIR}/psparser.py \
|
||||||
|
${DESTDIR}/pycdb.py \
|
||||||
|
${DESTDIR}/rijndael.py \
|
||||||
|
${DESTDIR}/utils.py \
|
||||||
|
|
||||||
|
${DESTDIR}/%: %
|
||||||
|
cp $? $@
|
||||||
|
chmod 755 $@
|
||||||
|
|
||||||
|
all: ${PDFLIB}
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm *.pyc *.pyo
|
-rm *.pyc *.pyo
|
||||||
|
|
|
@ -59,9 +59,13 @@ class PDFBaseXRef(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def objids(self):
|
def objids(self):
|
||||||
for objid_range in self.objid_ranges:
|
if self.objid_ranges:
|
||||||
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
for objid_range in self.objid_ranges:
|
||||||
yield objid
|
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
||||||
|
yield objid
|
||||||
|
else:
|
||||||
|
for objid in self.offsets:
|
||||||
|
yield objid
|
||||||
return
|
return
|
||||||
|
|
||||||
## PDFXRef
|
## PDFXRef
|
||||||
|
@ -70,6 +74,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
PDFBaseXRef.__init__(self)
|
PDFBaseXRef.__init__(self)
|
||||||
self.offsets = None
|
self.offsets = None
|
||||||
|
self.trailer = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -81,6 +86,8 @@ class PDFXRef(PDFBaseXRef):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||||
if not line:
|
if not line:
|
||||||
|
@ -112,7 +119,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
print >>stderr, 'xref objects:', self.offsets
|
print >>stderr, 'xref objects:', self.offsets
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
return
|
return
|
||||||
|
|
||||||
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||||
def load_trailer(self, parser):
|
def load_trailer(self, parser):
|
||||||
try:
|
try:
|
||||||
|
@ -124,7 +131,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
if not x:
|
if not x:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||||
(_,dic) = x[0]
|
(_,dic) = x[0]
|
||||||
self.trailer = dict_value(dic)
|
self.trailer.update( dict_value(dic))
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
|
@ -199,7 +206,7 @@ class PDFXRefStream(PDFBaseXRef):
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
class PDFPage(object):
|
class PDFPage(object):
|
||||||
|
|
||||||
def __init__(self, doc, pageid, attrs):
|
def __init__(self, doc, pageid, attrs):
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.pageid = pageid
|
self.pageid = pageid
|
||||||
|
@ -237,7 +244,7 @@ class PDFPage(object):
|
||||||
class PDFDocument(object):
|
class PDFDocument(object):
|
||||||
|
|
||||||
debug = 0
|
debug = 0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.xrefs = []
|
self.xrefs = []
|
||||||
self.objs = {}
|
self.objs = {}
|
||||||
|
@ -257,7 +264,7 @@ class PDFDocument(object):
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
# The document is set to be temporarily ready during collecting
|
# The document is set to be temporarily ready during collecting
|
||||||
# all the basic information about the document, e.g.
|
# all the basic information about the document, e.g.
|
||||||
# the header, the encryption information, and the access rights
|
# the header, the encryption information, and the access rights
|
||||||
# for the document.
|
# for the document.
|
||||||
self.ready = True
|
self.ready = True
|
||||||
# Retrieve the information of each header that was appended
|
# Retrieve the information of each header that was appended
|
||||||
|
@ -292,7 +299,7 @@ class PDFDocument(object):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('Catalog not found!')
|
raise PDFSyntaxError('Catalog not found!')
|
||||||
return
|
return
|
||||||
|
|
||||||
# initialize(password='')
|
# initialize(password='')
|
||||||
# Perform the initialization with a given password.
|
# Perform the initialization with a given password.
|
||||||
# This step is mandatory even if there's no password associated
|
# This step is mandatory even if there's no password associated
|
||||||
|
@ -316,7 +323,7 @@ class PDFDocument(object):
|
||||||
raise PDFEncryptionError('Unknown revision: %r' % R)
|
raise PDFEncryptionError('Unknown revision: %r' % R)
|
||||||
U = str_value(param['U'])
|
U = str_value(param['U'])
|
||||||
P = int_value(param['P'])
|
P = int_value(param['P'])
|
||||||
self.is_printable = bool(P & 4)
|
self.is_printable = bool(P & 4)
|
||||||
self.is_modifiable = bool(P & 8)
|
self.is_modifiable = bool(P & 8)
|
||||||
self.is_extractable = bool(P & 16)
|
self.is_extractable = bool(P & 16)
|
||||||
# Algorithm 3.2
|
# Algorithm 3.2
|
||||||
|
@ -418,8 +425,18 @@ class PDFDocument(object):
|
||||||
self.parser.seek(index)
|
self.parser.seek(index)
|
||||||
(_,objid1) = self.parser.nexttoken() # objid
|
(_,objid1) = self.parser.nexttoken() # objid
|
||||||
(_,genno) = self.parser.nexttoken() # genno
|
(_,genno) = self.parser.nexttoken() # genno
|
||||||
#assert objid1 == objid, (objid, objid1)
|
|
||||||
(_,kwd) = self.parser.nexttoken()
|
(_,kwd) = self.parser.nexttoken()
|
||||||
|
# #### hack around malformed pdf files
|
||||||
|
# assert objid1 == objid, (objid, objid1)
|
||||||
|
if objid1 != objid:
|
||||||
|
x = []
|
||||||
|
while kwd is not self.KEYWORD_OBJ:
|
||||||
|
(_,kwd) = self.parser.nexttoken()
|
||||||
|
x.append(kwd)
|
||||||
|
if x:
|
||||||
|
objid1 = x[-2]
|
||||||
|
genno = x[-1]
|
||||||
|
# #### end hack around malformed pdf files
|
||||||
if kwd is not self.KEYWORD_OBJ:
|
if kwd is not self.KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||||
(_,obj) = self.parser.nextobject()
|
(_,obj) = self.parser.nextobject()
|
||||||
|
@ -431,7 +448,7 @@ class PDFDocument(object):
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
def get_pages(self):
|
def get_pages(self):
|
||||||
if not self.ready:
|
if not self.ready:
|
||||||
|
@ -526,7 +543,7 @@ class PDFParser(PSStackParser):
|
||||||
if token is self.KEYWORD_ENDOBJ:
|
if token is self.KEYWORD_ENDOBJ:
|
||||||
self.add_results(*self.pop(4))
|
self.add_results(*self.pop(4))
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_R:
|
if token is self.KEYWORD_R:
|
||||||
# reference to indirect object
|
# reference to indirect object
|
||||||
try:
|
try:
|
||||||
|
@ -537,7 +554,7 @@ class PDFParser(PSStackParser):
|
||||||
except PSSyntaxError:
|
except PSSyntaxError:
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
if token is self.KEYWORD_STREAM:
|
if token is self.KEYWORD_STREAM:
|
||||||
# stream object
|
# stream object
|
||||||
((_,dic),) = self.pop(1)
|
((_,dic),) = self.pop(1)
|
||||||
|
@ -580,7 +597,7 @@ class PDFParser(PSStackParser):
|
||||||
obj = PDFStream(dic, data, self.doc.decipher)
|
obj = PDFStream(dic, data, self.doc.decipher)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
return
|
return
|
||||||
|
|
||||||
# others
|
# others
|
||||||
self.push((pos, token))
|
self.push((pos, token))
|
||||||
return
|
return
|
||||||
|
@ -611,17 +628,15 @@ class PDFParser(PSStackParser):
|
||||||
raise PDFNoValidXRef('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
|
print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
|
||||||
if isinstance(token, int):
|
try:
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
self.reset()
|
self.reset()
|
||||||
xref = PDFXRefStream()
|
xref = PDFXRefStream()
|
||||||
xref.load(self, debug=self.debug)
|
xref.load(self, debug=self.debug)
|
||||||
else:
|
except:
|
||||||
if token is not self.KEYWORD_XREF:
|
if token is self.KEYWORD_XREF:
|
||||||
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
self.nextline()
|
||||||
(pos, token))
|
|
||||||
self.nextline()
|
|
||||||
xref = PDFXRef()
|
xref = PDFXRef()
|
||||||
xref.load(self, debug=self.debug)
|
xref.load(self, debug=self.debug)
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
|
@ -636,7 +651,7 @@ class PDFParser(PSStackParser):
|
||||||
pos = int_value(trailer['Prev'])
|
pos = int_value(trailer['Prev'])
|
||||||
self.read_xref_from(pos, xrefs)
|
self.read_xref_from(pos, xrefs)
|
||||||
return
|
return
|
||||||
|
|
||||||
# read xref tables and trailers
|
# read xref tables and trailers
|
||||||
def read_xref(self):
|
def read_xref(self):
|
||||||
xrefs = []
|
xrefs = []
|
||||||
|
@ -656,17 +671,17 @@ class PDFParser(PSStackParser):
|
||||||
(pos, line) = self.nextline()
|
(pos, line) = self.nextline()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
break
|
||||||
if line.startswith('trailer'): break
|
if line.startswith('trailer'):
|
||||||
|
xref.offsets = offsets
|
||||||
|
self.seek(pos)
|
||||||
|
xref.load_trailer(self)
|
||||||
|
if 1 <= self.debug:
|
||||||
|
print >>stderr, 'trailer: %r' % xref.trailer
|
||||||
|
continue
|
||||||
m = pat.match(line)
|
m = pat.match(line)
|
||||||
if not m: continue
|
if not m: continue
|
||||||
(objid, genno) = m.groups()
|
(objid, genno) = m.groups()
|
||||||
offsets[int(objid)] = (0, pos)
|
offsets[int(objid)] = (0, pos)
|
||||||
if not offsets: raise
|
|
||||||
xref.offsets = offsets
|
|
||||||
self.seek(pos)
|
|
||||||
xref.load_trailer(self)
|
|
||||||
if 1 <= self.debug:
|
|
||||||
print >>stderr, 'trailer: %r' % xref.trailer
|
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
return xrefs
|
return xrefs
|
||||||
|
|
||||||
|
@ -674,7 +689,7 @@ class PDFParser(PSStackParser):
|
||||||
## PDFObjStrmParser
|
## PDFObjStrmParser
|
||||||
##
|
##
|
||||||
class PDFObjStrmParser(PDFParser):
|
class PDFObjStrmParser(PDFParser):
|
||||||
|
|
||||||
def __init__(self, doc, data):
|
def __init__(self, doc, data):
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
@ -682,7 +697,7 @@ class PDFObjStrmParser(PDFParser):
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
PDFParser.__init__(self, doc, StringIO(data))
|
PDFParser.__init__(self, doc, StringIO(data))
|
||||||
return
|
return
|
||||||
|
|
||||||
def flush(self):
|
def flush(self):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
return
|
return
|
||||||
|
|
|
@ -159,6 +159,20 @@ class PDFStream(PDFObject):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||||
|
|
||||||
|
def decomp(self,data):
|
||||||
|
import zlib
|
||||||
|
buf = data
|
||||||
|
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
||||||
|
# end. remove chars from the end to try and decompress the buffer
|
||||||
|
while len(buf) > 10:
|
||||||
|
try:
|
||||||
|
# will get errors if the document is encrypted.
|
||||||
|
dco = zlib.decompressobj()
|
||||||
|
return dco.decompress(buf)
|
||||||
|
except:
|
||||||
|
buf = buf[:-1]
|
||||||
|
raise Exception, "zlib.error while decompressing data"
|
||||||
|
|
||||||
def decode(self):
|
def decode(self):
|
||||||
assert self.data == None and self.rawdata != None
|
assert self.data == None and self.rawdata != None
|
||||||
data = self.rawdata
|
data = self.rawdata
|
||||||
|
@ -175,7 +189,7 @@ class PDFStream(PDFObject):
|
||||||
for f in filters:
|
for f in filters:
|
||||||
if f in LITERALS_FLATE_DECODE:
|
if f in LITERALS_FLATE_DECODE:
|
||||||
# will get errors if the document is encrypted.
|
# will get errors if the document is encrypted.
|
||||||
data = zlib.decompress(data)
|
data = self.decomp(data)
|
||||||
elif f in LITERALS_LZW_DECODE:
|
elif f in LITERALS_LZW_DECODE:
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
|
|
@ -29,7 +29,7 @@ def dumpxml(out, obj, codec=None):
|
||||||
out.write('</value>\n')
|
out.write('</value>\n')
|
||||||
out.write('</dict>')
|
out.write('</dict>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, list):
|
if isinstance(obj, list):
|
||||||
out.write('<list size="%d">\n' % len(obj))
|
out.write('<list size="%d">\n' % len(obj))
|
||||||
for v in obj:
|
for v in obj:
|
||||||
|
@ -37,11 +37,11 @@ def dumpxml(out, obj, codec=None):
|
||||||
out.write('\n')
|
out.write('\n')
|
||||||
out.write('</list>')
|
out.write('</list>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, str):
|
if isinstance(obj, str):
|
||||||
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
|
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
out.write('<stream>\n<props>\n')
|
out.write('<stream>\n<props>\n')
|
||||||
dumpxml(out, obj.dic)
|
dumpxml(out, obj.dic)
|
||||||
|
@ -51,11 +51,11 @@ def dumpxml(out, obj, codec=None):
|
||||||
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
||||||
out.write('</stream>')
|
out.write('</stream>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFObjRef):
|
if isinstance(obj, PDFObjRef):
|
||||||
out.write('<ref id="%d"/>' % obj.objid)
|
out.write('<ref id="%d"/>' % obj.objid)
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PSKeyword):
|
if isinstance(obj, PSKeyword):
|
||||||
out.write('<keyword>%s</keyword>' % obj.name)
|
out.write('<keyword>%s</keyword>' % obj.name)
|
||||||
return
|
return
|
||||||
|
@ -63,7 +63,7 @@ def dumpxml(out, obj, codec=None):
|
||||||
if isinstance(obj, PSLiteral):
|
if isinstance(obj, PSLiteral):
|
||||||
out.write('<literal>%s</literal>' % obj.name)
|
out.write('<literal>%s</literal>' % obj.name)
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, int) or isinstance(obj, float):
|
if isinstance(obj, int) or isinstance(obj, float):
|
||||||
out.write('<number>%s</number>' % obj)
|
out.write('<number>%s</number>' % obj)
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue