commit
0e40264071
|
@ -243,12 +243,17 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def write(self, text):
    """Send text to the output stream, encoding it first when a codec is set.

    text -- the string to emit.

    When self.codec is truthy the string is encoded with that codec before
    being handed to self.outfp.write; otherwise it is passed through as-is.
    """
    payload = text.encode(self.codec) if self.codec else text
    self.outfp.write(payload)
    return
|
||||
|
||||
def write_header(self):
    """Emit the opening HTML boilerplate through self.write.

    The Content-Type <meta> tag names self.codec as the charset when a codec
    is configured.  Without a codec the charset parameter is omitted, so the
    page never advertises "charset=None".
    """
    self.write('<html><head>\n')
    if self.codec:
        self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
    else:
        self.write('<meta http-equiv="Content-Type" content="text/html">\n')
    self.write('</head><body>\n')
    return
|
||||
|
||||
|
@ -259,7 +264,7 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def write_text(self, text):
    """Write text to the output with markup characters escaped.

    text -- the string to emit.
    """
    # Pass codec=None so enc() only escapes and leaves the result unencoded;
    # self.write applies self.codec, and encoding here too would do it twice.
    self.write(enc(text, None))
    return
|
||||
|
||||
def place_rect(self, color, borderwidth, x, y, w, h):
|
||||
|
@ -281,7 +286,7 @@ class HTMLConverter(PDFConverter):
|
|||
name = self.imagewriter.export_image(item)
|
||||
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||
'width="%d" height="%d" />\n' %
|
||||
(enc(name), borderwidth,
|
||||
(enc(name, None), borderwidth,
|
||||
x*self.scale, (self._yoffset-y)*self.scale,
|
||||
w*self.scale, h*self.scale))
|
||||
return
|
||||
|
@ -411,88 +416,97 @@ class XMLConverter(PDFConverter):
|
|||
self.write_header()
|
||||
return
|
||||
|
||||
def write(self, text):
    """Write text to self.outfp, applying self.codec when one is configured.

    text -- the string to emit.

    A truthy self.codec means the string is encoded before it reaches the
    stream; a falsy one means the string is written unchanged.
    """
    encoded = text.encode(self.codec) if self.codec else text
    self.outfp.write(encoded)
    return
|
||||
|
||||
def write_header(self):
    """Emit the XML declaration and the opening <pages> tag via self.write.

    The declaration carries an encoding attribute only when self.codec is
    set; otherwise a bare version declaration is written.  Everything goes
    through self.write so the header is encoded like the rest of the output.
    """
    if self.codec:
        self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
    else:
        self.write('<?xml version="1.0" ?>\n')
    self.write('<pages>\n')
    return
|
||||
|
||||
def write_footer(self):
    """Emit the closing </pages> tag once, through self.write."""
    self.write('</pages>\n')
    return
|
||||
|
||||
def write_text(self, text):
    """Write text to the output with markup characters escaped.

    text -- the string to emit.

    When self.stripcontrol is set, every match of the self.CONTROL pattern is
    removed from the text before it is escaped.
    """
    if self.stripcontrol:
        text = self.CONTROL.sub(u'', text)
    # Pass codec=None so enc() only escapes; self.write applies self.codec,
    # which keeps the text from being emitted or encoded a second time.
    self.write(enc(text, None))
    return
|
||||
|
||||
def receive_layout(self, ltpage):
|
||||
def show_group(item):
    """Write the layout-group tree rooted at item.

    item -- a layout group node.

    An LTTextBox becomes a self-closing <textbox> element carrying its index
    and bounding box.  An LTTextGroup becomes a <textgroup> element whose
    children are written recursively.  Any other item produces no output.
    `self` is taken from the enclosing method's scope.
    """
    if isinstance(item, LTTextBox):
        self.write('<textbox id="%d" bbox="%s" />\n' %
                   (item.index, bbox2str(item.bbox)))
    elif isinstance(item, LTTextGroup):
        self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
        for child in item:
            show_group(child)
        self.write('</textgroup>\n')
    return
|
||||
|
||||
def render(item):
|
||||
if isinstance(item, LTPage):
|
||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
self.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
||||
(item.pageid, bbox2str(item.bbox), item.rotate))
|
||||
for child in item:
|
||||
render(child)
|
||||
if item.groups is not None:
|
||||
self.outfp.write('<layout>\n')
|
||||
self.write('<layout>\n')
|
||||
for group in item.groups:
|
||||
show_group(group)
|
||||
self.outfp.write('</layout>\n')
|
||||
self.outfp.write('</page>\n')
|
||||
self.write('</layout>\n')
|
||||
self.write('</page>\n')
|
||||
elif isinstance(item, LTLine):
|
||||
self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
|
||||
self.write('<line linewidth="%d" bbox="%s" />\n' %
|
||||
(item.linewidth, bbox2str(item.bbox)))
|
||||
elif isinstance(item, LTRect):
|
||||
self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
|
||||
self.write('<rect linewidth="%d" bbox="%s" />\n' %
|
||||
(item.linewidth, bbox2str(item.bbox)))
|
||||
elif isinstance(item, LTCurve):
|
||||
self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
|
||||
self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
|
||||
(item.linewidth, bbox2str(item.bbox), item.get_pts()))
|
||||
elif isinstance(item, LTFigure):
|
||||
self.outfp.write('<figure name="%s" bbox="%s">\n' %
|
||||
self.write('<figure name="%s" bbox="%s">\n' %
|
||||
(item.name, bbox2str(item.bbox)))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</figure>\n')
|
||||
self.write('</figure>\n')
|
||||
elif isinstance(item, LTTextLine):
|
||||
self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
|
||||
self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textline>\n')
|
||||
self.write('</textline>\n')
|
||||
elif isinstance(item, LTTextBox):
|
||||
wmode = ''
|
||||
if isinstance(item, LTTextBoxVertical):
|
||||
wmode = ' wmode="vertical"'
|
||||
self.outfp.write('<textbox id="%d" bbox="%s"%s>\n' %
|
||||
self.write('<textbox id="%d" bbox="%s"%s>\n' %
|
||||
(item.index, bbox2str(item.bbox), wmode))
|
||||
for child in item:
|
||||
render(child)
|
||||
self.outfp.write('</textbox>\n')
|
||||
self.write('</textbox>\n')
|
||||
elif isinstance(item, LTChar):
|
||||
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
|
||||
(enc(item.fontname), bbox2str(item.bbox), item.size))
|
||||
self.write('<text font="%s" bbox="%s" size="%.3f">' %
|
||||
(enc(item.fontname, None), bbox2str(item.bbox), item.size))
|
||||
self.write_text(item.get_text())
|
||||
self.outfp.write('</text>\n')
|
||||
self.write('</text>\n')
|
||||
elif isinstance(item, LTText):
|
||||
self.outfp.write('<text>%s</text>\n' % item.get_text())
|
||||
self.write('<text>%s</text>\n' % item.get_text())
|
||||
elif isinstance(item, LTImage):
|
||||
if self.imagewriter is not None:
|
||||
name = self.imagewriter.export_image(item)
|
||||
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
|
||||
(enc(name), item.width, item.height))
|
||||
self.write('<image src="%s" width="%d" height="%d" />\n' %
|
||||
(enc(name, None), item.width, item.height))
|
||||
else:
|
||||
self.outfp.write('<image width="%d" height="%d" />\n' %
|
||||
self.write('<image width="%d" height="%d" />\n' %
|
||||
(item.width, item.height))
|
||||
else:
|
||||
assert 0, item
|
||||
|
|
|
@ -45,7 +45,7 @@ class BMPWriter(object):
|
|||
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||
elif ncols == 256:
|
||||
# grayscale color table
|
||||
for i in xrange(256):
|
||||
for i in range(256):
|
||||
self.fp.write(struct.pack('BBBx', i, i, i))
|
||||
self.pos0 = self.fp.tell()
|
||||
self.pos1 = self.pos0 + self.datasize
|
||||
|
@ -98,7 +98,7 @@ class ImageWriter(object):
|
|||
data = stream.get_data()
|
||||
i = 0
|
||||
width = (width+7)//8
|
||||
for y in xrange(height):
|
||||
for y in range(height):
|
||||
bmp.write_line(y, data[i:i+width])
|
||||
i += width
|
||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_RGB:
|
||||
|
@ -106,14 +106,14 @@ class ImageWriter(object):
|
|||
data = stream.get_data()
|
||||
i = 0
|
||||
width = width*3
|
||||
for y in xrange(height):
|
||||
for y in range(height):
|
||||
bmp.write_line(y, data[i:i+width])
|
||||
i += width
|
||||
elif image.bits == 8 and image.colorspace is LITERAL_DEVICE_GRAY:
|
||||
bmp = BMPWriter(fp, 8, width, height)
|
||||
data = stream.get_data()
|
||||
i = 0
|
||||
for y in xrange(height):
|
||||
for y in range(height):
|
||||
bmp.write_line(y, data[i:i+width])
|
||||
i += width
|
||||
else:
|
||||
|
|
|
@ -208,7 +208,7 @@ class PDFXRefFallback(PDFXRef):
|
|||
except PSEOF:
|
||||
pass
|
||||
n = min(n, len(objs)//2)
|
||||
for index in xrange(n):
|
||||
for index in range(n):
|
||||
objid1 = objs[index*2]
|
||||
self.offsets[objid1] = (objid, index, 0)
|
||||
return
|
||||
|
@ -254,7 +254,7 @@ class PDFXRefStream(PDFBaseXRef):
|
|||
|
||||
def get_objids(self):
|
||||
for (start, nobjs) in self.ranges:
|
||||
for i in xrange(nobjs):
|
||||
for i in range(nobjs):
|
||||
offset = self.entlen * i
|
||||
ent = self.data[offset:offset+self.entlen]
|
||||
f1 = nunpack(ent[:self.fl1], 1)
|
||||
|
@ -294,7 +294,7 @@ class PDFStandardSecurityHandler(object):
|
|||
b'..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz')
|
||||
supported_revisions = (2, 3)
|
||||
|
||||
def __init__(self, docid, param, password=b''):
|
||||
def __init__(self, docid, param, password=''):
|
||||
self.docid = docid
|
||||
self.param = param
|
||||
self.password = password
|
||||
|
@ -366,6 +366,7 @@ class PDFStandardSecurityHandler(object):
|
|||
return result[:n]
|
||||
|
||||
def authenticate(self, password):
|
||||
password = password.encode("latin1")
|
||||
key = self.authenticate_user_password(password)
|
||||
if key is None:
|
||||
key = self.authenticate_owner_password(password)
|
||||
|
@ -401,7 +402,7 @@ class PDFStandardSecurityHandler(object):
|
|||
else:
|
||||
user_password = self.o
|
||||
for i in range(19, -1, -1):
|
||||
k = b''.join(chr(ord(c) ^ i) for c in key)
|
||||
k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key))
|
||||
user_password = ARC4.new(k).decrypt(user_password)
|
||||
return self.authenticate_user_password(user_password)
|
||||
|
||||
|
@ -536,7 +537,7 @@ class PDFDocument(object):
|
|||
if SHA256 is not None:
|
||||
security_handler_registry[5] = PDFStandardSecurityHandlerV5
|
||||
|
||||
def __init__(self, parser, password=b'', caching=True, fallback=True):
|
||||
def __init__(self, parser, password='', caching=True, fallback=True):
|
||||
"Set the document to use a given PDFParser object."
|
||||
self.caching = caching
|
||||
self.xrefs = []
|
||||
|
@ -587,7 +588,7 @@ class PDFDocument(object):
|
|||
|
||||
# _initialize_password(password='')
|
||||
# Perform the initialization with a given password.
|
||||
def _initialize_password(self, password=b''):
|
||||
def _initialize_password(self, password=''):
|
||||
(docid, param) = self.encryption
|
||||
if literal_name(param.get('Filter')) != 'Standard':
|
||||
raise PDFEncryptionError('Unknown filter: param=%r' % param)
|
||||
|
|
|
@ -805,7 +805,8 @@ class PDFPageInterpreter(object):
|
|||
# According to PDF reference 1.7 section 4.9.1, XObjects in
|
||||
# earlier PDFs (prior to v1.2) use the page's Resources entry
|
||||
# instead of having their own Resources entry.
|
||||
resources = dict_value(xobj.get('Resources')) or self.resources.copy()
|
||||
xobjres = xobj.get('Resources')
|
||||
resources = dict_value(xobjres) if xobjres else self.resources.copy()
|
||||
self.device.begin_figure(xobjid, bbox, matrix)
|
||||
interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
|
||||
self.device.end_figure(xobjid)
|
||||
|
|
|
@ -113,7 +113,7 @@ class PDFPage(object):
|
|||
|
||||
@classmethod
|
||||
def get_pages(klass, fp,
|
||||
pagenos=None, maxpages=0, password=b'',
|
||||
pagenos=None, maxpages=0, password='',
|
||||
caching=True, check_extractable=True):
|
||||
# Create a PDF parser object associated with the file object.
|
||||
parser = PDFParser(fp)
|
||||
|
|
|
@ -806,7 +806,7 @@ def rijndaelSetupDecrypt(key, keybits):
|
|||
j -= 4
|
||||
# apply the inverse MixColumn transform to all round keys but the first and the last:
|
||||
p = 0
|
||||
for i in xrange(1, nrounds):
|
||||
for i in range(1, nrounds):
|
||||
p += 4
|
||||
rk[p+0] = (
|
||||
Td0[Te4[(rk[p+0] >> 24) ] & 0xff] ^
|
||||
|
|
|
@ -19,29 +19,37 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
|||
line0 = b'\x00' * columns
|
||||
for i in range(0, len(data), nbytes+1):
|
||||
ft = data[i]
|
||||
if six.PY2:
|
||||
ft = six.byte2int(ft)
|
||||
i += 1
|
||||
line1 = data[i:i+nbytes]
|
||||
line2 = b''
|
||||
if ft == b'\x00':
|
||||
if ft == 0:
|
||||
# PNG none
|
||||
line2 += line1
|
||||
elif ft == b'\x01':
|
||||
elif ft == 1:
|
||||
# PNG sub (UNTESTED)
|
||||
c = 0
|
||||
for b in line1:
|
||||
c = (c+ord(b)) & 255
|
||||
line2 += chr(c)
|
||||
elif ft == b'\x02':
|
||||
if six.PY2:
|
||||
b = six.byte2int(b)
|
||||
c = (c+b) & 255
|
||||
line2 += six.int2byte(c)
|
||||
elif ft == 2:
|
||||
# PNG up
|
||||
for (a, b) in zip(line0, line1):
|
||||
c = (ord(a)+ord(b)) & 255
|
||||
line2 += chr(c)
|
||||
elif ft == b'\x03':
|
||||
if six.PY2:
|
||||
a, b = six.byte2int(a), six.byte2int(b)
|
||||
c = (a+b) & 255
|
||||
line2 += six.int2byte(c)
|
||||
elif ft == 3:
|
||||
# PNG average (UNTESTED)
|
||||
c = 0
|
||||
for (a, b) in zip(line0, line1):
|
||||
c = ((c+ord(a)+ord(b))//2) & 255
|
||||
line2 += chr(c)
|
||||
if six.PY2:
|
||||
a, b = six.byte2int(a), six.byte2int(b)
|
||||
c = ((c+a+b)//2) & 255
|
||||
line2 += six.int2byte(c)
|
||||
else:
|
||||
# unsupported
|
||||
raise ValueError(ft)
|
||||
|
@ -233,7 +241,9 @@ def decode_text(s):
|
|||
def enc(x, codec='ascii'):
    """Encodes a string for SGML/XML/HTML

    x     -- the text to escape.
    codec -- name of the codec to encode the escaped text with; a falsy value
             (None or '') skips encoding and returns the escaped text itself.

    The characters &, >, < and " are replaced by their character entity
    references.  When a codec is given, characters it cannot represent are
    emitted as numeric character references ('xmlcharrefreplace').
    """
    # '&' must be handled first, otherwise the ampersands introduced by the
    # other entities would themselves be escaped again.
    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
    if codec:
        x = x.encode(codec, 'xmlcharrefreplace')
    return x
|
||||
|
||||
|
||||
def bbox2str(bbox):
|
||||
|
|
|
@ -4,8 +4,10 @@ RM=rm -f
|
|||
CMP=:
|
||||
ECHO=echo
|
||||
PYTHON=python2
|
||||
PYTHON3=python3
|
||||
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) ../tools/pdf2txt.py -p1 -V
|
||||
PDF2TXT3=PYTHONPATH=.. $(PYTHON3) ../tools/pdf2txt.py -p1 -V
|
||||
|
||||
FREE= \
|
||||
simple1 \
|
||||
|
@ -45,6 +47,12 @@ tests:
|
|||
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
|
||||
$(PDF2TXT) -t text -o $$i.txt $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
|
||||
$(PDF2TXT3) -t html -o $$i.html $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.html $$i.html.ref || exit 1; \
|
||||
$(PDF2TXT3) -t xml -o $$i.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.xml $$i.xml.ref || exit 1; \
|
||||
$(PDF2TXT3) -t text -o $$i.txt $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.txt $$i.txt.ref || exit 1; \
|
||||
done
|
||||
|
||||
crypts:
|
||||
|
@ -54,6 +62,10 @@ crypts:
|
|||
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
$(PDF2TXT) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
$(PDF2TXT3) -t xml -P $(CRYPT_PASS1) -o $$i.1.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.1.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
$(PDF2TXT3) -t xml -P $(CRYPT_PASS2) -o $$i.2.xml $$i.pdf || exit 1; \
|
||||
$(CMP) $$i.2.xml $(CRYPT_BASE).xml || exit 1; \
|
||||
done
|
||||
|
||||
test:
|
||||
|
|
|
@ -241,7 +241,7 @@ def main(argv):
|
|||
objids = []
|
||||
pagenos = set()
|
||||
codec = None
|
||||
password = b''
|
||||
password = ''
|
||||
dumpall = False
|
||||
proc = dumppdf
|
||||
outfp = sys.stdout
|
||||
|
@ -261,6 +261,9 @@ def main(argv):
|
|||
extractdir = v
|
||||
proc = extractembedded
|
||||
|
||||
if six.PY2 and sys.stdin.encoding:
|
||||
password = password.decode(sys.stdin.encoding)
|
||||
|
||||
for fname in args:
|
||||
proc(outfp, fname, objids, pagenos, password=password,
|
||||
dumpall=dumpall, codec=codec, extractdir=extractdir)
|
||||
|
|
|
@ -10,6 +10,7 @@ from pdfminer.cmapdb import CMapDB
|
|||
from pdfminer.layout import LAParams
|
||||
from pdfminer.image import ImageWriter
|
||||
import logging
|
||||
import six
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
|
@ -27,7 +28,7 @@ def main(argv):
|
|||
return usage()
|
||||
if not args: return usage()
|
||||
# input option
|
||||
password = b''
|
||||
password = ''
|
||||
pagenos = set()
|
||||
maxpages = 0
|
||||
# output option
|
||||
|
@ -65,6 +66,10 @@ def main(argv):
|
|||
elif k == '-c': codec = v
|
||||
elif k == '-s': scale = float(v)
|
||||
#
|
||||
|
||||
if six.PY2 and sys.stdin.encoding:
|
||||
password = password.decode(sys.stdin.encoding)
|
||||
|
||||
rsrcmgr = PDFResourceManager(caching=caching)
|
||||
if not outtype:
|
||||
outtype = 'text'
|
||||
|
|
Loading…
Reference in New Issue