patch from Troy Bollinger.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@71 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
13a6603151
commit
b432a3f4ae
12
README.html
12
README.html
|
@ -164,19 +164,19 @@ $ <strong>python -m pdflib.pdf2txt -P mypassword secret.pdf</strong>
|
|||
Options:
|
||||
<dl>
|
||||
<dt> <code>-o <em>filename</em></code>
|
||||
<dd> Speficies the output file name.
|
||||
<dd> Specifies the output file name.
|
||||
By default, it prints the extracted contents to stdout.
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno[,pageno,...]</em></code>
|
||||
<dd> Speficies the comma-separated list of the page numbers to be extracted.
|
||||
<dd> Specifies the comma-separated list of the page numbers to be extracted.
|
||||
Page numbers are starting from one.
|
||||
By default, it extracts texts from all the pages.
|
||||
<p>
|
||||
<dt> <code>-c <em>codec</em></code>
|
||||
<dd> Speficies the output codec for non-ASCII texts.
|
||||
<dd> Specifies the output codec for non-ASCII texts.
|
||||
<p>
|
||||
<dt> <code>-t <em>type</em></code>
|
||||
<dd> Speficies the output format. The following formats are currently supported.
|
||||
<dd> Specifies the output format. The following formats are currently supported.
|
||||
<ul>
|
||||
<li> <code>html</code> : HTML format. (Default)
|
||||
<li> <code>sgml</code> : SGML format.
|
||||
|
@ -221,14 +221,14 @@ Options:
|
|||
By default, it only prints the document trailer (like a header).
|
||||
<p>
|
||||
<dt> <code>-p <em>pageno</em></code>
|
||||
<dd> Speficies the page number to be extracted.
|
||||
<dd> Specifies the page number to be extracted.
|
||||
Multiple <code>-p</code> options are allowed.
|
||||
Note that page numbers start from one.
|
||||
<p>
|
||||
<dt> <code>-r</code> (raw)
|
||||
<dt> <code>-b</code> (binary)
|
||||
<dt> <code>-t</code> (text)
|
||||
<dd> Speficies the output format of stream contents.
|
||||
<dd> Specifies the output format of stream contents.
|
||||
Because the contents of stream objects can be very large,
|
||||
they are omitted when none of the options above is specified.
|
||||
<p>
|
||||
|
|
|
@ -1,6 +1,32 @@
|
|||
# Makefile for pdfminer
|
||||
|
||||
all:
|
||||
DESTDIR=/usr/local/src/pdflib
|
||||
|
||||
PDFLIB = ${DESTDIR}/__init__.py \
|
||||
${DESTDIR}/arcfour.py \
|
||||
${DESTDIR}/ascii85.py \
|
||||
${DESTDIR}/cmap.py \
|
||||
${DESTDIR}/fontmetrics.py \
|
||||
${DESTDIR}/glyphlist.py \
|
||||
${DESTDIR}/latin_enc.py \
|
||||
${DESTDIR}/lzw.py \
|
||||
${DESTDIR}/pdf2txt.py \
|
||||
${DESTDIR}/pdfcolor.py \
|
||||
${DESTDIR}/pdfdevice.py \
|
||||
${DESTDIR}/pdffont.py \
|
||||
${DESTDIR}/pdfinterp.py \
|
||||
${DESTDIR}/pdfparser.py \
|
||||
${DESTDIR}/pdftypes.py \
|
||||
${DESTDIR}/psparser.py \
|
||||
${DESTDIR}/pycdb.py \
|
||||
${DESTDIR}/rijndael.py \
|
||||
${DESTDIR}/utils.py \
|
||||
|
||||
${DESTDIR}/%: %
|
||||
cp $? $@
|
||||
chmod 755 $@
|
||||
|
||||
all: ${PDFLIB}
|
||||
|
||||
clean:
|
||||
-rm *.pyc *.pyo
|
||||
|
|
|
@ -59,9 +59,13 @@ class PDFBaseXRef(object):
|
|||
return
|
||||
|
||||
def objids(self):
|
||||
for objid_range in self.objid_ranges:
|
||||
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
||||
yield objid
|
||||
if self.objid_ranges:
|
||||
for objid_range in self.objid_ranges:
|
||||
for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
|
||||
yield objid
|
||||
else:
|
||||
for objid in self.offsets:
|
||||
yield objid
|
||||
return
|
||||
|
||||
## PDFXRef
|
||||
|
@ -70,6 +74,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
def __init__(self):
|
||||
PDFBaseXRef.__init__(self)
|
||||
self.offsets = None
|
||||
self.trailer = {}
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -81,6 +86,8 @@ class PDFXRef(PDFBaseXRef):
|
|||
while 1:
|
||||
try:
|
||||
(pos, line) = parser.nextline()
|
||||
if not line.strip():
|
||||
continue
|
||||
except PSEOF:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
|
||||
if not line:
|
||||
|
@ -112,7 +119,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
print >>stderr, 'xref objects:', self.offsets
|
||||
self.load_trailer(parser)
|
||||
return
|
||||
|
||||
|
||||
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||
def load_trailer(self, parser):
|
||||
try:
|
||||
|
@ -124,7 +131,7 @@ class PDFXRef(PDFBaseXRef):
|
|||
if not x:
|
||||
raise PDFNoValidXRef('Unexpected EOF - file corrupted')
|
||||
(_,dic) = x[0]
|
||||
self.trailer = dict_value(dic)
|
||||
self.trailer.update( dict_value(dic))
|
||||
return
|
||||
|
||||
def getpos(self, objid):
|
||||
|
@ -199,7 +206,7 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):
|
||||
|
||||
|
||||
def __init__(self, doc, pageid, attrs):
|
||||
self.doc = doc
|
||||
self.pageid = pageid
|
||||
|
@ -237,7 +244,7 @@ class PDFPage(object):
|
|||
class PDFDocument(object):
|
||||
|
||||
debug = 0
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.xrefs = []
|
||||
self.objs = {}
|
||||
|
@ -257,7 +264,7 @@ class PDFDocument(object):
|
|||
self.parser = parser
|
||||
# The document is set to be temporarily ready during collecting
|
||||
# all the basic information about the document, e.g.
|
||||
# the header, the encryption information, and the access rights
|
||||
# the header, the encryption information, and the access rights
|
||||
# for the document.
|
||||
self.ready = True
|
||||
# Retrieve the information of each header that was appended
|
||||
|
@ -292,7 +299,7 @@ class PDFDocument(object):
|
|||
if STRICT:
|
||||
raise PDFSyntaxError('Catalog not found!')
|
||||
return
|
||||
|
||||
|
||||
# initialize(password='')
|
||||
# Perform the initialization with a given password.
|
||||
# This step is mandatory even if there's no password associated
|
||||
|
@ -316,7 +323,7 @@ class PDFDocument(object):
|
|||
raise PDFEncryptionError('Unknown revision: %r' % R)
|
||||
U = str_value(param['U'])
|
||||
P = int_value(param['P'])
|
||||
self.is_printable = bool(P & 4)
|
||||
self.is_printable = bool(P & 4)
|
||||
self.is_modifiable = bool(P & 8)
|
||||
self.is_extractable = bool(P & 16)
|
||||
# Algorithm 3.2
|
||||
|
@ -418,8 +425,18 @@ class PDFDocument(object):
|
|||
self.parser.seek(index)
|
||||
(_,objid1) = self.parser.nexttoken() # objid
|
||||
(_,genno) = self.parser.nexttoken() # genno
|
||||
#assert objid1 == objid, (objid, objid1)
|
||||
(_,kwd) = self.parser.nexttoken()
|
||||
# #### hack around malformed pdf files
|
||||
# assert objid1 == objid, (objid, objid1)
|
||||
if objid1 != objid:
|
||||
x = []
|
||||
while kwd is not self.KEYWORD_OBJ:
|
||||
(_,kwd) = self.parser.nexttoken()
|
||||
x.append(kwd)
|
||||
if x:
|
||||
objid1 = x[-2]
|
||||
genno = x[-1]
|
||||
# #### end hack around malformed pdf files
|
||||
if kwd is not self.KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||
(_,obj) = self.parser.nextobject()
|
||||
|
@ -431,7 +448,7 @@ class PDFDocument(object):
|
|||
if self.decipher:
|
||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||
return obj
|
||||
|
||||
|
||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||
def get_pages(self):
|
||||
if not self.ready:
|
||||
|
@ -526,7 +543,7 @@ class PDFParser(PSStackParser):
|
|||
if token is self.KEYWORD_ENDOBJ:
|
||||
self.add_results(*self.pop(4))
|
||||
return
|
||||
|
||||
|
||||
if token is self.KEYWORD_R:
|
||||
# reference to indirect object
|
||||
try:
|
||||
|
@ -537,7 +554,7 @@ class PDFParser(PSStackParser):
|
|||
except PSSyntaxError:
|
||||
pass
|
||||
return
|
||||
|
||||
|
||||
if token is self.KEYWORD_STREAM:
|
||||
# stream object
|
||||
((_,dic),) = self.pop(1)
|
||||
|
@ -580,7 +597,7 @@ class PDFParser(PSStackParser):
|
|||
obj = PDFStream(dic, data, self.doc.decipher)
|
||||
self.push((pos, obj))
|
||||
return
|
||||
|
||||
|
||||
# others
|
||||
self.push((pos, token))
|
||||
return
|
||||
|
@ -611,17 +628,15 @@ class PDFParser(PSStackParser):
|
|||
raise PDFNoValidXRef('Unexpected EOF')
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
|
||||
if isinstance(token, int):
|
||||
try:
|
||||
# XRefStream: PDF-1.5
|
||||
self.seek(pos)
|
||||
self.reset()
|
||||
xref = PDFXRefStream()
|
||||
xref.load(self, debug=self.debug)
|
||||
else:
|
||||
if token is not self.KEYWORD_XREF:
|
||||
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
||||
(pos, token))
|
||||
self.nextline()
|
||||
except:
|
||||
if token is self.KEYWORD_XREF:
|
||||
self.nextline()
|
||||
xref = PDFXRef()
|
||||
xref.load(self, debug=self.debug)
|
||||
xrefs.append(xref)
|
||||
|
@ -636,7 +651,7 @@ class PDFParser(PSStackParser):
|
|||
pos = int_value(trailer['Prev'])
|
||||
self.read_xref_from(pos, xrefs)
|
||||
return
|
||||
|
||||
|
||||
# read xref tables and trailers
|
||||
def read_xref(self):
|
||||
xrefs = []
|
||||
|
@ -656,17 +671,17 @@ class PDFParser(PSStackParser):
|
|||
(pos, line) = self.nextline()
|
||||
except PSEOF:
|
||||
break
|
||||
if line.startswith('trailer'): break
|
||||
if line.startswith('trailer'):
|
||||
xref.offsets = offsets
|
||||
self.seek(pos)
|
||||
xref.load_trailer(self)
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'trailer: %r' % xref.trailer
|
||||
continue
|
||||
m = pat.match(line)
|
||||
if not m: continue
|
||||
(objid, genno) = m.groups()
|
||||
offsets[int(objid)] = (0, pos)
|
||||
if not offsets: raise
|
||||
xref.offsets = offsets
|
||||
self.seek(pos)
|
||||
xref.load_trailer(self)
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'trailer: %r' % xref.trailer
|
||||
xrefs.append(xref)
|
||||
return xrefs
|
||||
|
||||
|
@ -674,7 +689,7 @@ class PDFParser(PSStackParser):
|
|||
## PDFObjStrmParser
|
||||
##
|
||||
class PDFObjStrmParser(PDFParser):
|
||||
|
||||
|
||||
def __init__(self, doc, data):
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
|
@ -682,7 +697,7 @@ class PDFObjStrmParser(PDFParser):
|
|||
from StringIO import StringIO
|
||||
PDFParser.__init__(self, doc, StringIO(data))
|
||||
return
|
||||
|
||||
|
||||
def flush(self):
|
||||
self.add_results(*self.popall())
|
||||
return
|
||||
|
|
|
@ -159,6 +159,20 @@ class PDFStream(PDFObject):
|
|||
def __repr__(self):
|
||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||
|
||||
def decomp(self,data):
|
||||
import zlib
|
||||
buf = data
|
||||
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
||||
# end. remove chars from the end to try and decompress the buffer
|
||||
while len(buf) > 10:
|
||||
try:
|
||||
# will get errors if the document is encrypted.
|
||||
dco = zlib.decompressobj()
|
||||
return dco.decompress(buf)
|
||||
except:
|
||||
buf = buf[:-1]
|
||||
raise Exception, "zlib.error while decompressing data"
|
||||
|
||||
def decode(self):
|
||||
assert self.data == None and self.rawdata != None
|
||||
data = self.rawdata
|
||||
|
@ -175,7 +189,7 @@ class PDFStream(PDFObject):
|
|||
for f in filters:
|
||||
if f in LITERALS_FLATE_DECODE:
|
||||
# will get errors if the document is encrypted.
|
||||
data = zlib.decompress(data)
|
||||
data = self.decomp(data)
|
||||
elif f in LITERALS_LZW_DECODE:
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
|
|
|
@ -29,7 +29,7 @@ def dumpxml(out, obj, codec=None):
|
|||
out.write('</value>\n')
|
||||
out.write('</dict>')
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, list):
|
||||
out.write('<list size="%d">\n' % len(obj))
|
||||
for v in obj:
|
||||
|
@ -37,11 +37,11 @@ def dumpxml(out, obj, codec=None):
|
|||
out.write('\n')
|
||||
out.write('</list>')
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, str):
|
||||
out.write('<string size="%d">%s</string>' % (len(obj), esc(obj)))
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, PDFStream):
|
||||
out.write('<stream>\n<props>\n')
|
||||
dumpxml(out, obj.dic)
|
||||
|
@ -51,11 +51,11 @@ def dumpxml(out, obj, codec=None):
|
|||
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
||||
out.write('</stream>')
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, PDFObjRef):
|
||||
out.write('<ref id="%d"/>' % obj.objid)
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, PSKeyword):
|
||||
out.write('<keyword>%s</keyword>' % obj.name)
|
||||
return
|
||||
|
@ -63,7 +63,7 @@ def dumpxml(out, obj, codec=None):
|
|||
if isinstance(obj, PSLiteral):
|
||||
out.write('<literal>%s</literal>' % obj.name)
|
||||
return
|
||||
|
||||
|
||||
if isinstance(obj, int) or isinstance(obj, float):
|
||||
out.write('<number>%s</number>' % obj)
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue