bug fixes
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@20 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
1300046181
commit
b1163b69bb
51
pdf2txt.py
51
pdf2txt.py
|
@ -7,31 +7,26 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
|
|||
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
||||
mult_matrix, apply_matrix
|
||||
from cmap import CMapDB
|
||||
from extent import Rect, ExtSet, ExtGrid
|
||||
|
||||
|
||||
## PageItem
|
||||
##
|
||||
class PageItem:
|
||||
|
||||
GRID_SIZE = 20
|
||||
|
||||
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
|
||||
self.id = id
|
||||
self.bbox = Rect(x0, y0, x1-x0, y1-y0)
|
||||
self.bbox = (x0, y0, x1-x0, y1-y0)
|
||||
self.rotate = rotate
|
||||
self.grid = ExtGrid(self.GRID_SIZE)
|
||||
self.objs = []
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
bbox = self.bbox
|
||||
return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' %
|
||||
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate))
|
||||
bbox = '%d,%d,%d,%d' % self.bbox
|
||||
return ('<page id=%r bbox="%s" rotate="%d">' %
|
||||
(self.id, bbox, self.rotate))
|
||||
|
||||
def add(self, obj):
|
||||
self.objs.append(obj)
|
||||
self.grid.add(obj.bbox, obj)
|
||||
return
|
||||
|
||||
def dump(self, outfp, codec):
|
||||
|
@ -41,23 +36,14 @@ class PageItem:
|
|||
outfp.write('</page>\n')
|
||||
return
|
||||
|
||||
def fuse(self):
|
||||
for obj1 in self.objs:
|
||||
f = (lambda obj: obj.bbox)
|
||||
for rect in obj1.search_range():
|
||||
neighbors = [ obj2 for obj2 in self.grid.get(rect, f) if obj2 is not obj1 ]
|
||||
#print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ]
|
||||
return
|
||||
|
||||
|
||||
## FigureItem
|
||||
##
|
||||
class FigureItem(PageItem):
|
||||
|
||||
def __repr__(self):
|
||||
bbox = self.bbox
|
||||
return ('<figure id=%r bbox="%d,%d,%d,%d">' %
|
||||
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1))
|
||||
bbox = '%d,%d,%d,%d' % self.bbox
|
||||
return ('<figure id=%r bbox="%s">' % (self.id, bbox))
|
||||
|
||||
def dump(self, outfp, codec):
|
||||
outfp.write(repr(self)+'\n')
|
||||
|
@ -66,9 +52,6 @@ class FigureItem(PageItem):
|
|||
outfp.write('</figure>\n')
|
||||
return
|
||||
|
||||
def search_range(self):
|
||||
return []
|
||||
|
||||
|
||||
## TextItem
|
||||
##
|
||||
|
@ -86,12 +69,12 @@ class TextItem:
|
|||
self.direction = 1
|
||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001))
|
||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
|
||||
self.bbox = Rect(tx, ty+descent, self.width, self.size)
|
||||
self.bbox = (tx, ty+descent, self.width, self.size)
|
||||
else:
|
||||
self.direction = 2
|
||||
mindisp = min( d for (d,_) in text )
|
||||
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
|
||||
self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width)
|
||||
self.bbox = (tx-mindisp, ty+self.width, self.size, self.width)
|
||||
self.text = ''.join( c for (_,c) in text )
|
||||
return
|
||||
|
||||
|
@ -107,12 +90,6 @@ class TextItem:
|
|||
outfp.write('</text>\n')
|
||||
return
|
||||
|
||||
def search_range(self):
|
||||
if self.direction == 1:
|
||||
return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ]
|
||||
else:
|
||||
return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ]
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
|
@ -120,6 +97,10 @@ class TextConverter(PDFDevice):
|
|||
|
||||
def __init__(self, rsrc, debug=0):
|
||||
PDFDevice.__init__(self, rsrc, debug=debug)
|
||||
self.reset()
|
||||
return
|
||||
|
||||
def reset(self):
|
||||
self.pages = []
|
||||
self.stack = []
|
||||
return
|
||||
|
@ -173,11 +154,8 @@ class TextConverter(PDFDevice):
|
|||
return
|
||||
|
||||
def dump(self, outfp, codec):
|
||||
outfp.write('<document>\n')
|
||||
for page in self.pages:
|
||||
#page.fuse()
|
||||
page.dump(outfp, codec)
|
||||
outfp.write('</document>\n')
|
||||
return
|
||||
|
||||
|
||||
|
@ -188,12 +166,15 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
|
|||
fp = file(fname)
|
||||
parser = PDFParser(doc, fp, debug=debug)
|
||||
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
||||
outfp.write('<document>\n')
|
||||
for (i,page) in enumerate(doc.get_pages(debug=debug)):
|
||||
if pages and (i not in pages): continue
|
||||
device.reset()
|
||||
interpreter.process_page(page)
|
||||
fp.close()
|
||||
device.dump(outfp, codec)
|
||||
fp.close()
|
||||
device.close()
|
||||
outfp.write('</document>\n')
|
||||
return
|
||||
|
||||
|
||||
|
|
21
pdfinterp.py
21
pdfinterp.py
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, re
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
try:
|
||||
|
@ -292,8 +292,18 @@ class PDFCIDFont(PDFFont):
|
|||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
||||
self.cidsysteminfo.get('Ordering', 'unknown'))
|
||||
try:
|
||||
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFFontError('cmap is missing')
|
||||
self.cmap = None
|
||||
try:
|
||||
descriptor = dict_value(spec['FontDescriptor'])
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFFontError('FontDescriptor is missing')
|
||||
descriptor = {}
|
||||
ttf = None
|
||||
if 'FontFile2' in descriptor:
|
||||
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
||||
|
@ -486,9 +496,6 @@ class PDFContentParser(PSStackParser):
|
|||
PSStackParser.__init__(self, None, debug=debug)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFParser: linepos=%d>' % self.linepos
|
||||
|
||||
def fillfp(self):
|
||||
if not self.fp:
|
||||
if self.istream < len(self.streams):
|
||||
|
@ -611,9 +618,9 @@ class PDFPageInterpreter:
|
|||
name = literal_name(spec[0])
|
||||
else:
|
||||
name = literal_name(spec)
|
||||
if name == 'ICCBased':
|
||||
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
|
||||
return ColorSpace(name, stream_value(spec[1]).dic['N'])
|
||||
elif name == 'DeviceN':
|
||||
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
|
||||
return ColorSpace(name, len(list_value(spec[1])))
|
||||
else:
|
||||
return PREDEFINED_COLORSPACE[name]
|
||||
|
@ -935,7 +942,7 @@ class PDFPageInterpreter:
|
|||
if STRICT:
|
||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||
return
|
||||
if xobj.dic['Subtype'] == LITERAL_FORM:
|
||||
if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Processing xobj: %r' % xobj
|
||||
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
||||
|
|
80
pdfparser.py
80
pdfparser.py
|
@ -30,6 +30,7 @@ class PDFEncryptionError(PDFException): pass
|
|||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
class PDFNotImplementedError(PSException): pass
|
||||
|
||||
|
||||
# some predefined literals and keywords.
|
||||
|
@ -40,11 +41,13 @@ LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
|||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
|
||||
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
|
||||
KEYWORD_R = PSKeywordTable.intern('R')
|
||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
|
@ -184,12 +187,13 @@ class PDFStream:
|
|||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFStream: %r>' % (self.dic)
|
||||
return '<PDFStream(%d): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||
|
||||
def decode(self):
|
||||
assert self.data == None and self.rawdata != None
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
data = self.decipher(self.objid, self.genno, data)
|
||||
if 'Filter' not in self.dic:
|
||||
self.data = data
|
||||
|
@ -203,13 +207,19 @@ class PDFStream:
|
|||
import zlib
|
||||
# will get errors if the document is encrypted.
|
||||
data = zlib.decompress(data)
|
||||
elif f == LITERAL_LZW_DECODE:
|
||||
raise PDFNotImplementedError('LZWDecode is currently unsupported.')
|
||||
elif f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError
|
||||
else:
|
||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||
# apply predictors
|
||||
params = self.dic.get('DecodeParms', {})
|
||||
if 'Predictor' in params:
|
||||
pred = int_value(params['Predictor'])
|
||||
if pred:
|
||||
if pred != 12:
|
||||
raise PDFValueError('Unsupported predictor: %r' % pred)
|
||||
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
||||
if 'Columns' not in params:
|
||||
raise PDFValueError('Columns undefined for predictor=12')
|
||||
columns = int_value(params['Columns'])
|
||||
|
@ -223,11 +233,6 @@ class PDFStream:
|
|||
buf += ent1
|
||||
ent0 = ent1
|
||||
data = buf
|
||||
if f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError
|
||||
else:
|
||||
if STRICT:
|
||||
raise PDFValueError('Invalid filter spec: %r' % f)
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
|
@ -274,18 +279,19 @@ class PDFXRef:
|
|||
|
||||
def __init__(self, parser):
|
||||
while 1:
|
||||
(_, line) = parser.nextline()
|
||||
(pos, line) = parser.nextline()
|
||||
if not line:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||
break
|
||||
line = line.strip()
|
||||
f = line.split(' ')
|
||||
if line.startswith('trailer'):
|
||||
parser.seek(pos)
|
||||
break
|
||||
f = line.strip().split(' ')
|
||||
if len(f) != 2:
|
||||
if line != 'trailer':
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
||||
break
|
||||
continue
|
||||
(start, nobjs) = map(long, f)
|
||||
self.objid0 = start
|
||||
self.objid1 = start+nobjs
|
||||
|
@ -300,6 +306,8 @@ class PDFXRef:
|
|||
(pos, genno, use) = f
|
||||
self.offsets.append((int(genno), long(pos), use))
|
||||
# read trailer
|
||||
(_,kwd) = parser.nexttoken()
|
||||
assert kwd == KEYWORD_TRAILER
|
||||
(_,dic) = parser.nextobject()
|
||||
self.trailer = dict_value(dic)
|
||||
return
|
||||
|
@ -319,9 +327,9 @@ class PDFXRef:
|
|||
class PDFXRefStream:
|
||||
|
||||
def __init__(self, parser):
|
||||
(_,objid) = parser.nextobject()
|
||||
(_,genno) = parser.nextobject()
|
||||
parser.nextobject()
|
||||
(_,objid) = parser.nexttoken()
|
||||
(_,genno) = parser.nexttoken()
|
||||
(_,kwd) = parser.nexttoken()
|
||||
(_,stream) = parser.nextobject()
|
||||
if STRICT:
|
||||
if stream.dic['Type'] != LITERAL_XREF:
|
||||
|
@ -367,6 +375,7 @@ class PDFDocument:
|
|||
self.parser = None
|
||||
self.encryption = None
|
||||
self.decipher = None
|
||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||
return
|
||||
|
||||
def set_parser(self, parser):
|
||||
|
@ -401,9 +410,9 @@ class PDFDocument:
|
|||
raise PDFEncryptionError('unknown revision: %r' % R)
|
||||
U = str_value(param['U'])
|
||||
P = int_value(param['P'])
|
||||
is_printable = bool(P & 4)
|
||||
is_modifiable = bool(P & 8)
|
||||
is_extractable = bool(P & 16)
|
||||
self.is_printable = bool(P & 4)
|
||||
self.is_modifiable = bool(P & 8)
|
||||
self.is_extractable = bool(P & 16)
|
||||
# Algorithm 3.2
|
||||
password = (password+PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
|
@ -411,7 +420,8 @@ class PDFDocument:
|
|||
hash.update(struct.pack('<L', P)) # 4
|
||||
hash.update(docid[0]) # 5
|
||||
if 4 <= R:
|
||||
raise NotImplementedError # 6
|
||||
# 6
|
||||
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
|
||||
if 3 <= R:
|
||||
# 8
|
||||
for _ in xrange(50):
|
||||
|
@ -429,8 +439,6 @@ class PDFDocument:
|
|||
k = ''.join( chr(c ^ i) for c in key )
|
||||
x = Arcfour(k).process(x)
|
||||
u1 = x+x # 32bytes total
|
||||
else:
|
||||
raise PDFEncryptionError('unknown revision: %r' % R)
|
||||
if R == 2:
|
||||
is_authenticated = (u1 == U)
|
||||
else:
|
||||
|
@ -485,10 +493,10 @@ class PDFDocument:
|
|||
obj.set_objid(objid, 0)
|
||||
else:
|
||||
self.parser.seek(index)
|
||||
(_,objid1) = self.parser.nextobject() # objid
|
||||
(_,genno) = self.parser.nextobject() # genno
|
||||
assert objid1 == objid
|
||||
(_,kwd) = self.parser.nextobject()
|
||||
(_,objid1) = self.parser.nexttoken() # objid
|
||||
(_,genno) = self.parser.nexttoken() # genno
|
||||
assert objid1 == objid, (objid, objid1)
|
||||
(_,kwd) = self.parser.nexttoken()
|
||||
if kwd != KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
|
||||
(_,obj) = self.parser.nextobject()
|
||||
|
@ -582,7 +590,11 @@ class PDFParser(PSStackParser):
|
|||
self.seek(pos+objlen)
|
||||
while 1:
|
||||
(linepos, line) = self.nextline()
|
||||
if line.startswith('endstream'): break
|
||||
if 'endstream' in line:
|
||||
i = line.index('endstream')
|
||||
objlen += i
|
||||
data += line[:i]
|
||||
break
|
||||
objlen += len(line)
|
||||
data += line
|
||||
if 1 <= self.debug:
|
||||
|
@ -597,7 +609,7 @@ class PDFParser(PSStackParser):
|
|||
return
|
||||
|
||||
def find_xref(self):
|
||||
# find the first xref table
|
||||
# search the last xref table by scanning the file backwards.
|
||||
prev = None
|
||||
for line in self.revreadlines():
|
||||
line = line.strip()
|
||||
|
@ -620,19 +632,19 @@ class PDFParser(PSStackParser):
|
|||
self.find_xref()
|
||||
while 1:
|
||||
# read xref table
|
||||
(linepos, line) = self.nextline()
|
||||
(pos, token) = self.nexttoken()
|
||||
if 2 <= self.debug:
|
||||
print >>stderr, 'read_xref: %r' % line
|
||||
if line[0].isdigit():
|
||||
print >>stderr, 'read_xref: %r' % token
|
||||
if isinstance(token, int):
|
||||
# XRefStream: PDF-1.5
|
||||
self.seek(linepos)
|
||||
self.seek(pos)
|
||||
self.reset()
|
||||
xref = PDFXRefStream(self)
|
||||
else:
|
||||
if line.strip() != 'xref':
|
||||
if token != KEYWORD_XREF:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
||||
(linepos, line))
|
||||
raise PDFSyntaxError('xref not found: pos=%d, token=%r' %
|
||||
(pos, token))
|
||||
xref = PDFXRef(self)
|
||||
yield xref
|
||||
trailer = xref.trailer
|
||||
|
|
|
@ -424,6 +424,11 @@ class PSStackParser(PSBaseParser):
|
|||
self.results = []
|
||||
return
|
||||
|
||||
def seek(self, pos):
|
||||
PSBaseParser.seek(self, pos)
|
||||
self.reset()
|
||||
return
|
||||
|
||||
def push(self, *objs):
|
||||
self.curstack.extend(objs)
|
||||
return
|
||||
|
|
Loading…
Reference in New Issue