patch from Troy Bollinger.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@71 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
13a6603151
commit
b432a3f4ae
12
README.html
12
README.html
|
@ -164,19 +164,19 @@ $ <strong>python -m pdflib.pdf2txt -P mypassword secret.pdf</strong>
|
||||||
Options:
|
Options:
|
||||||
<dl>
|
<dl>
|
||||||
<dt> <code>-o <em>filename</em></code>
|
<dt> <code>-o <em>filename</em></code>
|
||||||
<dd> Speficies the output file name.
|
<dd> Specifies the output file name.
|
||||||
By default, it prints the extracted contents to stdout.
|
By default, it prints the extracted contents to stdout.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
||||||
<dd> Speficies the comma-separated list of the page numbers to be extracted.
|
<dd> Specifies the comma-separated list of the page numbers to be extracted.
|
||||||
Page numbers are starting from one.
|
Page numbers are starting from one.
|
||||||
By default, it extracts texts from all the pages.
|
By default, it extracts texts from all the pages.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-c <em>codec</em></code>
|
<dt> <code>-c <em>codec</em></code>
|
||||||
<dd> Speficies the output codec for non-ASCII texts.
|
<dd> Specifies the output codec for non-ASCII texts.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-t <em>type</em></code>
|
<dt> <code>-t <em>type</em></code>
|
||||||
<dd> Speficies the output format. The following formats are currently supported.
|
<dd> Specifies the output format. The following formats are currently supported.
|
||||||
<ul>
|
<ul>
|
||||||
<li> <code>html</code> : HTML format. (Default)
|
<li> <code>html</code> : HTML format. (Default)
|
||||||
<li> <code>sgml</code> : SGML format.
|
<li> <code>sgml</code> : SGML format.
|
||||||
|
@ -221,14 +221,14 @@ Options:
|
||||||
By default, it only prints the document trailer (like a header).
|
By default, it only prints the document trailer (like a header).
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-p <em>pageno</em></code>
|
<dt> <code>-p <em>pageno</em></code>
|
||||||
<dd> Speficies the page number to be extracted.
|
<dd> Specifies the page number to be extracted.
|
||||||
Multiple <code>-p</code> options are allowed.
|
Multiple <code>-p</code> options are allowed.
|
||||||
Note that page numbers start from one.
|
Note that page numbers start from one.
|
||||||
<p>
|
<p>
|
||||||
<dt> <code>-r</code> (raw)
|
<dt> <code>-r</code> (raw)
|
||||||
<dt> <code>-b</code> (binary)
|
<dt> <code>-b</code> (binary)
|
||||||
<dt> <code>-t</code> (text)
|
<dt> <code>-t</code> (text)
|
||||||
<dd> Speficies the output format of stream contents.
|
<dd> Specifies the output format of stream contents.
|
||||||
Because the contents of stream objects can be very large,
|
Because the contents of stream objects can be very large,
|
||||||
they are omitted when none of the options above is specified.
|
they are omitted when none of the options above is specified.
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -1,6 +1,32 @@
|
||||||
# Makefile for pdfminer
|
# Makefile for pdfminer
|
||||||
|
|
||||||
all:
|
DESTDIR=/usr/local/src/pdflib
|
||||||
|
|
||||||
|
PDFLIB = ${DESTDIR}/__init__.py \
|
||||||
|
${DESTDIR}/arcfour.py \
|
||||||
|
${DESTDIR}/ascii85.py \
|
||||||
|
${DESTDIR}/cmap.py \
|
||||||
|
${DESTDIR}/fontmetrics.py \
|
||||||
|
${DESTDIR}/glyphlist.py \
|
||||||
|
${DESTDIR}/latin_enc.py \
|
||||||
|
${DESTDIR}/lzw.py \
|
||||||
|
${DESTDIR}/pdf2txt.py \
|
||||||
|
${DESTDIR}/pdfcolor.py \
|
||||||
|
${DESTDIR}/pdfdevice.py \
|
||||||
|
${DESTDIR}/pdffont.py \
|
||||||
|
${DESTDIR}/pdfinterp.py \
|
||||||
|
${DESTDIR}/pdfparser.py \
|
||||||
|
${DESTDIR}/pdftypes.py \
|
||||||
|
${DESTDIR}/psparser.py \
|
||||||
|
${DESTDIR}/pycdb.py \
|
||||||
|
${DESTDIR}/rijndael.py \
|
||||||
|
${DESTDIR}/utils.py \
|
||||||
|
|
||||||
|
${DESTDIR}/%: %
|
||||||
|
cp $? $@
|
||||||
|
chmod 755 $@
|
||||||
|
|
||||||
|
all: ${PDFLIB}
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
-rm *.pyc *.pyo
|
-rm *.pyc *.pyo
|
||||||
|
|
|
@ -59,9 +59,13 @@ class PDFBaseXRef(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def objids(self):
|
def objids(self):
|
||||||
|
if self.objid_ranges:
|
||||||
for objid_range in self.objid_ranges:
|
for objid_range in self.objid_ranges:
|
||||||
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
||||||
yield objid
|
yield objid
|
||||||
|
else:
|
||||||
|
for objid in self.offsets:
|
||||||
|
yield objid
|
||||||
return
|
return
|
||||||
|
|
||||||
## PDFXRef
|
## PDFXRef
|
||||||
|
@ -70,6 +74,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
PDFBaseXRef.__init__(self)
|
PDFBaseXRef.__init__(self)
|
||||||
self.offsets = None
|
self.offsets = None
|
||||||
|
self.trailer = {}
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -81,6 +86,8 @@ class PDFXRef(PDFBaseXRef):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||||
if not line:
|
if not line:
|
||||||
|
@ -124,7 +131,7 @@ class PDFXRef(PDFBaseXRef):
|
||||||
if not x:
|
if not x:
|
||||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||||
(_,dic) = x[0]
|
(_,dic) = x[0]
|
||||||
self.trailer = dict_value(dic)
|
self.trailer.update( dict_value(dic))
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
|
@ -418,8 +425,18 @@ class PDFDocument(object):
|
||||||
self.parser.seek(index)
|
self.parser.seek(index)
|
||||||
(_,objid1) = self.parser.nexttoken() # objid
|
(_,objid1) = self.parser.nexttoken() # objid
|
||||||
(_,genno) = self.parser.nexttoken() # genno
|
(_,genno) = self.parser.nexttoken() # genno
|
||||||
#assert objid1 == objid, (objid, objid1)
|
|
||||||
(_,kwd) = self.parser.nexttoken()
|
(_,kwd) = self.parser.nexttoken()
|
||||||
|
# #### hack around malformed pdf files
|
||||||
|
# assert objid1 == objid, (objid, objid1)
|
||||||
|
if objid1 != objid:
|
||||||
|
x = []
|
||||||
|
while kwd is not self.KEYWORD_OBJ:
|
||||||
|
(_,kwd) = self.parser.nexttoken()
|
||||||
|
x.append(kwd)
|
||||||
|
if x:
|
||||||
|
objid1 = x[-2]
|
||||||
|
genno = x[-1]
|
||||||
|
# #### end hack around malformed pdf files
|
||||||
if kwd is not self.KEYWORD_OBJ:
|
if kwd is not self.KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||||
(_,obj) = self.parser.nextobject()
|
(_,obj) = self.parser.nextobject()
|
||||||
|
@ -611,16 +628,14 @@ class PDFParser(PSStackParser):
|
||||||
raise PDFNoValidXRef('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
|
print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
|
||||||
if isinstance(token, int):
|
try:
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
self.reset()
|
self.reset()
|
||||||
xref = PDFXRefStream()
|
xref = PDFXRefStream()
|
||||||
xref.load(self, debug=self.debug)
|
xref.load(self, debug=self.debug)
|
||||||
else:
|
except:
|
||||||
if token is not self.KEYWORD_XREF:
|
if token is self.KEYWORD_XREF:
|
||||||
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
|
||||||
(pos, token))
|
|
||||||
self.nextline()
|
self.nextline()
|
||||||
xref = PDFXRef()
|
xref = PDFXRef()
|
||||||
xref.load(self, debug=self.debug)
|
xref.load(self, debug=self.debug)
|
||||||
|
@ -656,17 +671,17 @@ class PDFParser(PSStackParser):
|
||||||
(pos, line) = self.nextline()
|
(pos, line) = self.nextline()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
break
|
||||||
if line.startswith('trailer'): break
|
if line.startswith('trailer'):
|
||||||
m = pat.match(line)
|
|
||||||
if not m: continue
|
|
||||||
(objid, genno) = m.groups()
|
|
||||||
offsets[int(objid)] = (0, pos)
|
|
||||||
if not offsets: raise
|
|
||||||
xref.offsets = offsets
|
xref.offsets = offsets
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
xref.load_trailer(self)
|
xref.load_trailer(self)
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'trailer: %r' % xref.trailer
|
print >>stderr, 'trailer: %r' % xref.trailer
|
||||||
|
continue
|
||||||
|
m = pat.match(line)
|
||||||
|
if not m: continue
|
||||||
|
(objid, genno) = m.groups()
|
||||||
|
offsets[int(objid)] = (0, pos)
|
||||||
xrefs.append(xref)
|
xrefs.append(xref)
|
||||||
return xrefs
|
return xrefs
|
||||||
|
|
||||||
|
|
|
@ -159,6 +159,20 @@ class PDFStream(PDFObject):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||||
|
|
||||||
|
def decomp(self,data):
|
||||||
|
import zlib
|
||||||
|
buf = data
|
||||||
|
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
||||||
|
# end. remove chars from the end to try and decompress the buffer
|
||||||
|
while len(buf) > 10:
|
||||||
|
try:
|
||||||
|
# will get errors if the document is encrypted.
|
||||||
|
dco = zlib.decompressobj()
|
||||||
|
return dco.decompress(buf)
|
||||||
|
except:
|
||||||
|
buf = buf[:-1]
|
||||||
|
raise Exception, "zlib.error while decompressing data"
|
||||||
|
|
||||||
def decode(self):
|
def decode(self):
|
||||||
assert self.data == None and self.rawdata != None
|
assert self.data == None and self.rawdata != None
|
||||||
data = self.rawdata
|
data = self.rawdata
|
||||||
|
@ -175,7 +189,7 @@ class PDFStream(PDFObject):
|
||||||
for f in filters:
|
for f in filters:
|
||||||
if f in LITERALS_FLATE_DECODE:
|
if f in LITERALS_FLATE_DECODE:
|
||||||
# will get errors if the document is encrypted.
|
# will get errors if the document is encrypted.
|
||||||
data = zlib.decompress(data)
|
data = self.decomp(data)
|
||||||
elif f in LITERALS_LZW_DECODE:
|
elif f in LITERALS_LZW_DECODE:
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
|
|
Loading…
Reference in New Issue