Merge pull request #3 from Cybjit/master

Samples and latin1 passwords
pull/4/head
Philippe Guglielmetti 2014-09-17 07:22:52 +02:00
commit 0e40264071
10 changed files with 102 additions and 56 deletions

View File

@ -243,12 +243,17 @@ class HTMLConverter(PDFConverter):
return
def write(self, text):
if self.codec:
text = text.encode(self.codec)
self.outfp.write(text)
return
def write_header(self):
self.write('<html><head>\n')
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
if self.codec:
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
else:
self.write('<meta http-equiv="Content-Type" content="text/html">\n')
self.write('</head><body>\n')
return
@ -259,7 +264,7 @@ class HTMLConverter(PDFConverter):
return
def write_text(self, text):
self.write(enc(text, self.codec))
self.write(enc(text, None))
return
def place_rect(self, color, borderwidth, x, y, w, h):
@ -281,7 +286,7 @@ class HTMLConverter(PDFConverter):
name = self.imagewriter.export_image(item)
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' %
(enc(name), borderwidth,
(enc(name, None), borderwidth,
x*self.scale, (self._yoffset-y)*self.scale,
w*self.scale, h*self.scale))
return
@ -411,88 +416,97 @@ class XMLConverter(PDFConverter):
self.write_header()
return
def write(self, text):
if self.codec:
text = text.encode(self.codec)
self.outfp.write(text)
return
def write_header(self):
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
self.outfp.write('<pages>\n')
if self.codec:
self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
else:
self.write('<?xml version="1.0" ?>\n')
self.write('<pages>\n')
return
def write_footer(self):
self.outfp.write('</pages>\n')
self.write('</pages>\n')
return
def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub(u'', text)
self.outfp.write(enc(text, self.codec))
self.write(enc(text, None))
return
def receive_layout(self, ltpage):
def show_group(item):
if isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%d" bbox="%s" />\n' %
self.write('<textbox id="%d" bbox="%s" />\n' %
(item.index, bbox2str(item.bbox)))
elif isinstance(item, LTTextGroup):
self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
show_group(child)
self.outfp.write('</textgroup>\n')
self.write('</textgroup>\n')
return
def render(item):
if isinstance(item, LTPage):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
self.write('<page id="%s" bbox="%s" rotate="%d">\n' %
(item.pageid, bbox2str(item.bbox), item.rotate))
for child in item:
render(child)
if item.groups is not None:
self.outfp.write('<layout>\n')
self.write('<layout>\n')
for group in item.groups:
show_group(group)
self.outfp.write('</layout>\n')
self.outfp.write('</page>\n')
self.write('</layout>\n')
self.write('</page>\n')
elif isinstance(item, LTLine):
self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
self.write('<line linewidth="%d" bbox="%s" />\n' %
(item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTRect):
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
self.write('<rect linewidth="%d" bbox="%s" />\n' %
(item.linewidth, bbox2str(item.bbox)))
elif isinstance(item, LTCurve):
self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
elif isinstance(item, LTFigure):
self.outfp.write('<figure name="%s" bbox="%s">\n' %
self.write('<figure name="%s" bbox="%s">\n' %
(item.name, bbox2str(item.bbox)))
for child in item:
render(child)
self.outfp.write('</figure>\n')
self.write('</figure>\n')
elif isinstance(item, LTTextLine):
self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
for child in item:
render(child)
self.outfp.write('</textline>\n')
self.write('</textline>\n')
elif isinstance(item, LTTextBox):
wmode = ''
if isinstance(item, LTTextBoxVertical):
wmode = ' wmode="vertical"'
self.outfp.write('<textbox id="%d" bbox="%s"%s>\n' %
self.write('<textbox id="%d" bbox="%s"%s>\n' %
(item.index, bbox2str(item.bbox), wmode))
for child in item:
render(child)
self.outfp.write('</textbox>\n')
self.write('</textbox>\n')
elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
(enc(item.fontname), bbox2str(item.bbox), item.size))
self.write('<text font="%s" bbox="%s" size="%.3f">' %
(enc(item.fontname, None), bbox2str(item.bbox), item.size))
self.write_text(item.get_text())
self.outfp.write('</text>\n')
self.write('</text>\n')
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.get_text())
self.write('<text>%s</text>\n' % item.get_text())
elif isinstance(item, LTImage):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name), item.width, item.height))
self.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name, None), item.width, item.height))
else:
self.outfp.write('<image width="%d" height="%d" />\n' %
self.write('<image width="%d" height="%d" />\n' %
(item.width, item.height))
else:
assert 0, item

View File

@ -45,7 +45,7 @@ class BMPWriter(object):
self.fp.write(struct.pack('BBBx', i, i, i))
elif ncols == 256:
# grayscale color table
for i in xrange(256):
for i in range(256):
self.fp.write(struct.pack('BBBx', i, i, i))
self.pos0 = self.fp.tell()
self.pos1 = self.pos0 + self.datasize
@ -98,7 +98,7 @@ class ImageWriter(object):
data = stream.get_data()
i = 0
width = (width+7)//8
for y in xrange(height):
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
@ -106,14 +106,14 @@ class ImageWriter(object):
data = stream.get_data()
i = 0
width = width*3
for y in xrange(height):
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
bmp = BMPWriter(fp, 8, width, height)
data = stream.get_data()
i = 0
for y in xrange(height):
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
else:

View File

@ -208,7 +208,7 @@ class PDFXRefFallback(PDFXRef):
except PSEOF:
pass
n = min(n, len(objs)//2)
for index in xrange(n):
for index in range(n):
objid1 = objs[index*2]
self.offsets[objid1] = (objid, index, 0)
return
@ -254,7 +254,7 @@ class PDFXRefStream(PDFBaseXRef):
def get_objids(self):
for (start, nobjs) in self.ranges:
for i in xrange(nobjs):
for i in range(nobjs):
offset = self.entlen * i
ent = self.data[offset:offset+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
@ -294,7 +294,7 @@ class PDFStandardSecurityHandler(object):
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
supported_revisions = (2, 3)
def __init__(self, docid, param, password=b''):
def __init__(self, docid, param, password=''):
self.docid = docid
self.param = param
self.password = password
@ -366,6 +366,7 @@ class PDFStandardSecurityHandler(object):
return result[:n]
def authenticate(self, password):
password = password.encode("latin1")
key = self.authenticate_user_password(password)
if key is None:
key = self.authenticate_owner_password(password)
@ -401,7 +402,7 @@ class PDFStandardSecurityHandler(object):
else:
user_password = self.o
for i in range(19, -1, -1):
k = b''.join(chr(ord(c) ^ i) for c in key)
k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key))
user_password = ARC4.new(k).decrypt(user_password)
return self.authenticate_user_password(user_password)
@ -536,7 +537,7 @@ class PDFDocument(object):
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5
def __init__(self, parser, password=b'', caching=True, fallback=True):
def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."
self.caching = caching
self.xrefs = []
@ -587,7 +588,7 @@ class PDFDocument(object):
# _initialize_password(password=b'')
# Perform the initialization with a given password.
def _initialize_password(self, password=b''):
def _initialize_password(self, password=''):
(docid, param) = self.encryption
if literal_name(param.get('Filter')) != 'Standard':
raise PDFEncryptionError('Unknown filter: param=%r' % param)

View File

@ -805,7 +805,8 @@ class PDFPageInterpreter(object):
# According to PDF reference 1.7 section 4.9.1, XObjects in
# earlier PDFs (prior to v1.2) use the page's Resources entry
# instead of having their own Resources entry.
resources = dict_value(xobj.get('Resources')) or self.resources.copy()
xobjres = xobj.get('Resources')
resources = dict_value(xobjres) if xobjres else self.resources.copy()
self.device.begin_figure(xobjid, bbox, matrix)
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
self.device.end_figure(xobjid)

View File

@ -113,7 +113,7 @@ class PDFPage(object):
@classmethod
def get_pages(klass, fp,
pagenos=None, maxpages=0, password=b'',
pagenos=None, maxpages=0, password='',
caching=True, check_extractable=True):
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

View File

@ -806,7 +806,7 @@ def rijndaelSetupDecrypt(key, keybits):
j -= 4
# apply the inverse MixColumn transform to all round keys but the first and the last:
p = 0
for i in xrange(1, nrounds):
for i in range(1, nrounds):
p += 4
rk[p+0] = (
Td0[Te4[(rk[p+0] >> 24) ] & 0xff] ^

View File

@ -19,29 +19,37 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
line0 = b'\x00' * columns
for i in range(0, len(data), nbytes+1):
ft = data[i]
if six.PY2:
ft = six.byte2int(ft)
i += 1
line1 = data[i:i+nbytes]
line2 = b''
if ft == b'\x00':
if ft == 0:
# PNG none
line2 += line1
elif ft == b'\x01':
elif ft == 1:
# PNG sub (UNTESTED)
c = 0
for b in line1:
c = (c+ord(b)) & 255
line2 += chr(c)
elif ft == b'\x02':
if six.PY2:
b = six.byte2int(b)
c = (c+b) & 255
line2 += six.int2byte(c)
elif ft == 2:
# PNG up
for (a, b) in zip(line0, line1):
c = (ord(a)+ord(b)) & 255
line2 += chr(c)
elif ft == b'\x03':
if six.PY2:
a, b = six.byte2int(a), six.byte2int(b)
c = (a+b) & 255
line2 += six.int2byte(c)
elif ft == 3:
# PNG average (UNTESTED)
c = 0
for (a, b) in zip(line0, line1):
c = ((c+ord(a)+ord(b))//2) & 255
line2 += chr(c)
if six.PY2:
a, b = six.byte2int(a), six.byte2int(b)
c = ((c+a+b)//2) & 255
line2 += six.int2byte(c)
else:
# unsupported
raise ValueError(ft)
@ -233,7 +241,9 @@ def decode_text(s):
def enc(x, codec='ascii'):
    """Escape a string for embedding in SGML/XML/HTML.

    The characters ``&``, ``>``, ``<`` and ``"`` are replaced by their
    named entities.  ``&`` is replaced first so that the ampersands
    introduced by the other replacements are not escaped a second time.

    :param x: text string to escape.
    :param codec: name of the output encoding.  When given, the escaped
        text is encoded with it and any character the codec cannot
        represent is emitted as a numeric character reference.  When
        falsy (``None`` or ``''``) the escaped text is returned
        unencoded, leaving encoding to the caller.
    :return: encoded bytes when ``codec`` is truthy, otherwise the
        escaped text string.
    """
    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
    # Only encode when a codec was requested; callers that pass None do
    # their own encoding when writing to the output stream.
    if codec:
        x = x.encode(codec, 'xmlcharrefreplace')
    return x
def bbox2str(bbox):

View File

@ -4,8 +4,10 @@ RM=rm -f
CMP=:
ECHO=echo
PYTHON=python2
PYTHON3=python3
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V
PDF2TXT3=PYTHONPATH=.. $(PYTHON3) ../tools/pdf2txt.py -p1 -V
FREE= \
simple1 \
@ -45,6 +47,12 @@ tests:
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
$(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
$(PDF2TXT3) -t html -o $$i.html $$i.pdf || exit 1; \
$(CMP) $$i.html $$i.html.ref || exit 1; \
$(PDF2TXT3) -t xml -o $$i.xml $$i.pdf || exit 1; \
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
$(PDF2TXT3) -t text -o $$i.txt $$i.pdf || exit 1; \
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
done
crypts:
@ -54,6 +62,10 @@ crypts:
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
$(PDF2TXT) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
$(PDF2TXT3) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
$(PDF2TXT3) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
done
test:

View File

@ -241,7 +241,7 @@ def main(argv):
objids = []
pagenos = set()
codec = None
password = b''
password = ''
dumpall = False
proc = dumppdf
outfp = sys.stdout
@ -261,6 +261,9 @@ def main(argv):
extractdir = v
proc = extractembedded
if six.PY2 and sys.stdin.encoding:
password = password.decode(sys.stdin.encoding)
for fname in args:
proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec, extractdir=extractdir)

View File

@ -10,6 +10,7 @@ from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
import logging
import six
# main
def main(argv):
@ -27,7 +28,7 @@ def main(argv):
return usage()
if not args: return usage()
# input option
password = b''
password = ''
pagenos = set()
maxpages = 0
# output option
@ -65,6 +66,10 @@ def main(argv):
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
#
if six.PY2 and sys.stdin.encoding:
password = password.decode(sys.stdin.encoding)
rsrcmgr = PDFResourceManager(caching=caching)
if not outtype:
outtype = 'text'