patch from Troy Bollinger.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@71 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-02-28 05:44:08 +00:00
parent 13a6603151
commit b432a3f4ae
5 changed files with 100 additions and 45 deletions

View File

@ -164,19 +164,19 @@ $ <strong>python -m pdflib.pdf2txt -P mypassword secret.pdf</strong>
Options: Options:
<dl> <dl>
<dt> <code>-o <em>filename</em></code> <dt> <code>-o <em>filename</em></code>
<dd> Speficies the output file name. <dd> Specifies the output file name.
By default, it prints the extracted contents to stdout. By default, it prints the extracted contents to stdout.
<p> <p>
<dt> <code>-p <em>pageno[,pageno,...]</em></code> <dt> <code>-p <em>pageno[,pageno,...]</em></code>
<dd> Speficies the comma-separated list of the page numbers to be extracted. <dd> Specifies the comma-separated list of the page numbers to be extracted.
Page numbers are starting from one. Page numbers are starting from one.
By default, it extracts texts from all the pages. By default, it extracts texts from all the pages.
<p> <p>
<dt> <code>-c <em>codec</em></code> <dt> <code>-c <em>codec</em></code>
<dd> Speficies the output codec for non-ASCII texts. <dd> Specifies the output codec for non-ASCII texts.
<p> <p>
<dt> <code>-t <em>type</em></code> <dt> <code>-t <em>type</em></code>
<dd> Speficies the output format. The following formats are currently supported. <dd> Specifies the output format. The following formats are currently supported.
<ul> <ul>
<li> <code>html</code> : HTML format. (Default) <li> <code>html</code> : HTML format. (Default)
<li> <code>sgml</code> : SGML format. <li> <code>sgml</code> : SGML format.
@ -221,14 +221,14 @@ Options:
By default, it only prints the document trailer (like a header). By default, it only prints the document trailer (like a header).
<p> <p>
<dt> <code>-p <em>pageno</em></code> <dt> <code>-p <em>pageno</em></code>
<dd> Speficies the page number to be extracted. <dd> Specifies the page number to be extracted.
Multiple <code>-p</code> options are allowed. Multiple <code>-p</code> options are allowed.
Note that page numbers start from one. Note that page numbers start from one.
<p> <p>
<dt> <code>-r</code> (raw) <dt> <code>-r</code> (raw)
<dt> <code>-b</code> (binary) <dt> <code>-b</code> (binary)
<dt> <code>-t</code> (text) <dt> <code>-t</code> (text)
<dd> Speficies the output format of stream contents. <dd> Specifies the output format of stream contents.
Because the contents of stream objects can be very large, Because the contents of stream objects can be very large,
they are omitted when none of the options above is specified. they are omitted when none of the options above is specified.
<p> <p>

View File

@ -1,6 +1,32 @@
# Makefile for pdfminer # Makefile for pdfminer
all: DESTDIR=/usr/local/src/pdflib
PDFLIB = ${DESTDIR}/__init__.py \
${DESTDIR}/arcfour.py \
${DESTDIR}/ascii85.py \
${DESTDIR}/cmap.py \
${DESTDIR}/fontmetrics.py \
${DESTDIR}/glyphlist.py \
${DESTDIR}/latin_enc.py \
${DESTDIR}/lzw.py \
${DESTDIR}/pdf2txt.py \
${DESTDIR}/pdfcolor.py \
${DESTDIR}/pdfdevice.py \
${DESTDIR}/pdffont.py \
${DESTDIR}/pdfinterp.py \
${DESTDIR}/pdfparser.py \
${DESTDIR}/pdftypes.py \
${DESTDIR}/psparser.py \
${DESTDIR}/pycdb.py \
${DESTDIR}/rijndael.py \
${DESTDIR}/utils.py \
${DESTDIR}/%: %
cp $? $@
chmod 755 $@
all: ${PDFLIB}
clean: clean:
-rm *.pyc *.pyo -rm *.pyc *.pyo

View File

@ -59,9 +59,13 @@ class PDFBaseXRef(object):
return return
def objids(self): def objids(self):
for objid_range in self.objid_ranges: if self.objid_ranges:
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1): for objid_range in self.objid_ranges:
yield objid for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
yield objid
else:
for objid in self.offsets:
yield objid
return return
## PDFXRef ## PDFXRef
@ -70,6 +74,7 @@ class PDFXRef(PDFBaseXRef):
def __init__(self): def __init__(self):
PDFBaseXRef.__init__(self) PDFBaseXRef.__init__(self)
self.offsets = None self.offsets = None
self.trailer = {}
return return
def __repr__(self): def __repr__(self):
@ -81,6 +86,8 @@ class PDFXRef(PDFBaseXRef):
while 1: while 1:
try: try:
(pos, line) = parser.nextline() (pos, line) = parser.nextline()
if not line.strip():
continue
except PSEOF: except PSEOF:
raise PDFNoValidXRef('Unexpected EOF - file corrupted?') raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
if not line: if not line:
@ -124,7 +131,7 @@ class PDFXRef(PDFBaseXRef):
if not x: if not x:
raise PDFNoValidXRef('Unexpected EOF - file corrupted') raise PDFNoValidXRef('Unexpected EOF - file corrupted')
(_,dic) = x[0] (_,dic) = x[0]
self.trailer = dict_value(dic) self.trailer.update( dict_value(dic))
return return
def getpos(self, objid): def getpos(self, objid):
@ -418,8 +425,18 @@ class PDFDocument(object):
self.parser.seek(index) self.parser.seek(index)
(_,objid1) = self.parser.nexttoken() # objid (_,objid1) = self.parser.nexttoken() # objid
(_,genno) = self.parser.nexttoken() # genno (_,genno) = self.parser.nexttoken() # genno
#assert objid1 == objid, (objid, objid1)
(_,kwd) = self.parser.nexttoken() (_,kwd) = self.parser.nexttoken()
# #### hack around malformed pdf files
# assert objid1 == objid, (objid, objid1)
if objid1 != objid:
x = []
while kwd is not self.KEYWORD_OBJ:
(_,kwd) = self.parser.nexttoken()
x.append(kwd)
if x:
objid1 = x[-2]
genno = x[-1]
# #### end hack around malformed pdf files
if kwd is not self.KEYWORD_OBJ: if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index) raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
(_,obj) = self.parser.nextobject() (_,obj) = self.parser.nextobject()
@ -611,17 +628,15 @@ class PDFParser(PSStackParser):
raise PDFNoValidXRef('Unexpected EOF') raise PDFNoValidXRef('Unexpected EOF')
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token) print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
if isinstance(token, int): try:
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
self.seek(pos) self.seek(pos)
self.reset() self.reset()
xref = PDFXRefStream() xref = PDFXRefStream()
xref.load(self, debug=self.debug) xref.load(self, debug=self.debug)
else: except:
if token is not self.KEYWORD_XREF: if token is self.KEYWORD_XREF:
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % self.nextline()
(pos, token))
self.nextline()
xref = PDFXRef() xref = PDFXRef()
xref.load(self, debug=self.debug) xref.load(self, debug=self.debug)
xrefs.append(xref) xrefs.append(xref)
@ -656,17 +671,17 @@ class PDFParser(PSStackParser):
(pos, line) = self.nextline() (pos, line) = self.nextline()
except PSEOF: except PSEOF:
break break
if line.startswith('trailer'): break if line.startswith('trailer'):
xref.offsets = offsets
self.seek(pos)
xref.load_trailer(self)
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % xref.trailer
continue
m = pat.match(line) m = pat.match(line)
if not m: continue if not m: continue
(objid, genno) = m.groups() (objid, genno) = m.groups()
offsets[int(objid)] = (0, pos) offsets[int(objid)] = (0, pos)
if not offsets: raise
xref.offsets = offsets
self.seek(pos)
xref.load_trailer(self)
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % xref.trailer
xrefs.append(xref) xrefs.append(xref)
return xrefs return xrefs

View File

@ -159,6 +159,20 @@ class PDFStream(PDFObject):
def __repr__(self): def __repr__(self):
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic) return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
def decomp(self, data):
    """Inflate a FlateDecode buffer, tolerating junk after the stream.

    Some FlateDecode streams have garbage (newlines, etc.) appended to
    the end.  The whole buffer is tried first; every time zlib rejects
    it, one trailing byte is dropped and the attempt is repeated, until
    an attempt succeeds or no more than 10 bytes are left.

    data: the raw (still compressed) bytes of the stream.
    Returns whatever zlib produces for the first buffer it accepts.
    Raises Exception when no attempt succeeds (the original note says
    this happens when the document is encrypted).
    """
    import zlib
    buf = data
    while True:
        try:
            # Always try the buffer as given before looking at its
            # length, so short but valid streams are decoded too.
            # A fresh decompressor is used for every attempt.
            return zlib.decompressobj().decompress(buf)
        except zlib.error:
            # Only zlib failures lead to a retry; anything else
            # (KeyboardInterrupt, TypeError, ...) propagates unchanged.
            buf = buf[:-1]
            if len(buf) <= 10:
                # Too little left to be worth another attempt.
                break
    raise Exception("zlib.error while decompressing data")
def decode(self): def decode(self):
assert self.data == None and self.rawdata != None assert self.data == None and self.rawdata != None
data = self.rawdata data = self.rawdata
@ -175,7 +189,7 @@ class PDFStream(PDFObject):
for f in filters: for f in filters:
if f in LITERALS_FLATE_DECODE: if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
data = zlib.decompress(data) data = self.decomp(data)
elif f in LITERALS_LZW_DECODE: elif f in LITERALS_LZW_DECODE:
try: try:
from cStringIO import StringIO from cStringIO import StringIO