Merge pull request #160 from charlesreid1/fix-line-endings

apply dos2unix to files in pdfminer/tools dirs to remove \r\n windows line endings
pull/166/head
Tata Ganesh 2018-06-25 11:19:14 +05:30 committed by GitHub
commit 07eafe7b27
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 373 additions and 373 deletions

View File

@ -1,197 +1,197 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import six import six
from .pdffont import PDFUnicodeNotDefined from .pdffont import PDFUnicodeNotDefined
from . import utils from . import utils
## PDFDevice ## PDFDevice
## ##
class PDFDevice(object):
    """Base rendering device whose callbacks all do nothing.

    The interpreter-facing hooks (tags, pages, figures, paths, images and
    strings) are defined here as no-ops so that subclasses only override
    the ones they care about.  The device can be used as a context
    manager: leaving the ``with`` block calls :meth:`close`.
    """

    def __init__(self, rsrcmgr):
        """Remember the resource manager; no transformation matrix is set yet."""
        self.rsrcmgr = rsrcmgr
        self.ctm = None
        return

    def __repr__(self):
        return '<PDFDevice>'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returns None, so an exception raised inside the ``with`` body
        # still propagates after the device has been closed.
        self.close()

    def close(self):
        """Release the device; nothing to release in the base class."""
        return

    def set_ctm(self, ctm):
        """Store ``ctm`` as the current transformation matrix."""
        self.ctm = ctm
        return

    # --- rendering callbacks: intentionally empty in the base class ---

    def begin_tag(self, tag, props=None):
        return

    def end_tag(self):
        return

    def do_tag(self, tag, props=None):
        return

    def begin_page(self, page, ctm):
        return

    def end_page(self, page):
        return

    def begin_figure(self, name, bbox, matrix):
        return

    def end_figure(self, name):
        return

    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        return

    def render_image(self, name, stream):
        return

    def render_string(self, textstate, seq, ncs, graphicstate):
        return
## PDFTextDevice ## PDFTextDevice
## ##
class PDFTextDevice(PDFDevice):
    """Device that walks the operands of a text-showing operation.

    ``render_string`` derives the per-call parameters from the text state
    and hands the sequence to a horizontal or vertical walker, chosen by
    ``font.is_vertical()``.  Each walker calls :meth:`render_char` for every
    decoded character id and adds the value it returns to the running
    position.  ``render_char`` returns 0 here, so subclasses presumably
    override it to draw the glyph and report its advance.
    """

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Render ``seq`` and store the resulting position in ``textstate.linematrix``."""
        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # NOTE(review): the .01 factor suggests ``scaling`` is a percentage -- confirm.
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        if font.is_multibyte():
            # Word spacing is disabled for multibyte fonts.
            wordspace = 0
        # Scale applied to the numeric (adjustment) entries of ``seq``.
        dxscale = .001 * fontsize * scaling
        if font.is_vertical():
            textstate.linematrix = self.render_string_vertical(
                seq, matrix, textstate.linematrix, font, fontsize,
                scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
        else:
            textstate.linematrix = self.render_string_horizontal(
                seq, matrix, textstate.linematrix, font, fontsize,
                scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
        return

    def render_string_horizontal(self, seq, matrix, pos,
                                 font, fontsize, scaling, charspace, wordspace,
                                 rise, dxscale, ncs, graphicstate):
        """Walk ``seq`` advancing the x coordinate; return the final ``(x, y)``."""
        (x, y) = pos
        needcharspace = False
        for obj in seq:
            if utils.isnumber(obj):
                # Numeric entries move the position backwards by obj*dxscale.
                x -= obj*dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    # Character spacing is inserted before every character
                    # except the very first item of the sequence.
                    if needcharspace:
                        x += charspace
                    x += self.render_char(utils.translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid,
                                          ncs, graphicstate)
                    # NOTE(review): cid 32 is presumably the space character -- confirm.
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
        return (x, y)

    def render_string_vertical(self, seq, matrix, pos,
                               font, fontsize, scaling, charspace, wordspace,
                               rise, dxscale, ncs, graphicstate):
        """Walk ``seq`` advancing the y coordinate; return the final ``(x, y)``."""
        (x, y) = pos
        needcharspace = False
        for obj in seq:
            if utils.isnumber(obj):
                # Numeric entries move the position backwards by obj*dxscale.
                y -= obj*dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(utils.translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid,
                                          ncs, graphicstate)
                    # NOTE(review): cid 32 is presumably the space character -- confirm.
                    if cid == 32 and wordspace:
                        y += wordspace
                    needcharspace = True
        return (x, y)

    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
        """Render one character id and return its advance; the base returns 0."""
        return 0
## TagExtractor ## TagExtractor
## ##
class TagExtractor(PDFDevice):
    """Device that writes page/tag markup and decoded text to ``outfp``.

    Pages are wrapped in ``<page ...>`` / ``</page>`` and every tag reported
    through ``begin_tag`` / ``end_tag`` becomes an element of the same name.
    Open tags are tracked on ``self._stack`` so that ``end_tag`` knows which
    name to close.
    """

    def __init__(self, rsrcmgr, outfp, codec='utf-8'):
        """
        :param rsrcmgr: resource manager, passed on to :class:`PDFDevice`.
        :param outfp: object with a ``write`` method that receives the output.
        :param codec: codec handed to ``utils.enc`` when text is written.
        """
        PDFDevice.__init__(self, rsrcmgr)
        self.outfp = outfp
        self.codec = codec
        self.pageno = 0
        self._stack = []
        return

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Decode the string items of ``seq`` and write the resulting text."""
        font = textstate.font
        text = ''
        for obj in seq:
            if isinstance(obj, six.text_type):
                obj = utils.make_compat_bytes(obj)
            if not isinstance(obj, six.binary_type):
                # Anything that is not a string (e.g. a number) is skipped.
                continue
            chars = font.decode(obj)
            for cid in chars:
                try:
                    char = font.to_unichr(cid)
                    text += char
                except PDFUnicodeNotDefined:
                    # Characters without a unicode mapping are dropped.
                    # NOTE(review): this print looks like leftover debugging and
                    # goes to stdout, not ``outfp`` -- consider logging instead.
                    print(chars)
                    pass
        self.outfp.write(utils.enc(text, self.codec))
        return

    def begin_page(self, page, ctm):
        """Write the opening ``<page>`` element for ``page``."""
        output = '<page id="%s" bbox="%s" rotate="%d">' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
        self.outfp.write(utils.make_compat_bytes(output))
        return

    def end_page(self, page):
        """Close the current ``<page>`` element and advance the page counter."""
        self.outfp.write(utils.make_compat_bytes('</page>\n'))
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        """Write an opening element for ``tag`` and push it on the stack.

        When ``props`` is a dict its items are emitted as attributes in
        sorted key order; any other value yields no attributes.
        """
        s = ''
        if isinstance(props, dict):
            # ``items()`` exists on both Python 2 and 3 dicts, whereas the
            # previous ``iteritems()`` raised AttributeError on Python 3.
            s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
                        in sorted(props.items()))
        out_s = '<%s%s>' % (utils.enc(tag.name), s)
        self.outfp.write(utils.make_compat_bytes(out_s))
        self._stack.append(tag)
        return

    def end_tag(self):
        """Write the closing element for the most recently opened tag."""
        assert self._stack, str(self.pageno)
        tag = self._stack.pop(-1)
        out_s = '</%s>' % utils.enc(tag.name)
        self.outfp.write(utils.make_compat_bytes(out_s))
        return

    def do_tag(self, tag, props=None):
        """Write an opening element for ``tag`` without leaving it on the stack."""
        self.begin_tag(tag, props)
        # No closing element is written; the tag is only removed from the stack.
        self._stack.pop(-1)
        return

View File

@ -1,30 +1,30 @@
# -*- mode: python -*-
# Build description for the ``pdf2txt`` executable.
# NOTE(review): Analysis, PYZ and EXE are not imported in this file; they are
# presumably provided by PyInstaller when it executes the spec -- confirm.

block_cipher = None


# NOTE(review): ``pathex`` is an absolute path on one developer's Windows
# machine; builds from any other checkout location likely need it changed.
a = Analysis(['pdf2txt.py'],
             pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
             binaries=[],
             datas=[],
             hiddenimports=[],
             hookspath=[],
             runtime_hooks=[],
             excludes=['django','matplotlib','PIL','numpy','qt5'],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          name='pdf2txt',
          debug=False,
          strip=False,
          upx=True,
          runtime_tmpdir=None,
          console=True )

View File

@ -1,117 +1,117 @@
#!/usr/bin/env python #!/usr/bin/env python
""" """
compares rwo pdf files. compares rwo pdf files.
""" """
import sys import sys
import logging import logging
import six import six
import pdfminer.settings import pdfminer.settings
pdfminer.settings.STRICT = False pdfminer.settings.STRICT = False
import pdfminer.high_level import pdfminer.high_level
import pdfminer.layout import pdfminer.layout
def compare(file1, file2, **args):
    """Extract the text of two PDF files and return their differences.

    :param file1: path of the first PDF file.
    :param file2: path of the second PDF file.
    :param args: keyword options.  All of them are forwarded to
        ``pdfminer.high_level.extract_text_to_fp``.  This function itself
        reads ``laparams``, ``no_laparams``, the individual layout overrides
        (``all_texts``, ``detect_vertical``, ``word_margin``, ``char_margin``,
        ``line_margin``, ``boxes_flow``), ``outfile`` and ``context_lines``.
    :return: the string produced by ``difflib.HtmlDiff().make_file`` when the
        extension of ``outfile`` starts with ``htm``; otherwise the iterator
        returned by ``difflib.unified_diff``.
    :raises ValueError: if ``_py2_no_more_posargs`` is supplied.
    """
    import difflib
    import os.path

    if args.get('_py2_no_more_posargs', None) is not None:
        raise ValueError("Too many positional arguments passed.")

    # Build an LAParams object from the individual layout options unless the
    # caller supplied one already or explicitly asked for none
    # (``no_laparams``), in which case ``laparams`` stays as it was passed.
    if args.get('laparams', None) is None and not args.get('no_laparams', False):
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
            paramv = args.get(param, None)
            if paramv is not None:
                # The options are plain attributes of LAParams; item
                # assignment (``laparams[param] = ...``) is not supported.
                setattr(laparams, param, paramv)
        args['laparams'] = laparams

    # Extract both documents with identical options and split into lines.
    extracted = []
    for path in (file1, file2):
        buf = six.StringIO()
        with open(path, "rb") as fp:
            pdfminer.high_level.extract_text_to_fp(fp, buf, **args)
        buf.seek(0)
        extracted.append(buf.readlines())
    s1, s2 = extracted

    # Output names ending in .htm/.html select the side-by-side HTML report.
    outfile = args.get('outfile')
    if outfile:
        extension = os.path.splitext(outfile)[1][1:4]
        if extension.lower() == 'htm':
            return difflib.HtmlDiff().make_file(s1, s2)

    # 3 matches the command-line default for --context-lines, so calling
    # compare() directly without that option no longer raises KeyError.
    return difflib.unified_diff(s1, s2, n=args.get('context_lines', 3))
# main # main
def main(args=None):
    """Command-line entry point: parse options, compare two PDFs, write the diff.

    :param args: argument list handed to ``argparse``; ``None`` makes argparse
        read ``sys.argv``.
    :return: 0 on completion.
    """
    import argparse
    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
    P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
    # argparse collapses runs of whitespace when it formats help text, so
    # joining the sentences with single spaces renders the same message.
    P.add_argument("-o", "--outfile", type=str, default="-",
                   help="Output file (default/'-' is stdout) "
                        "if .htm or .html, create an HTML table (or a complete HTML file containing the table) "
                        "showing a side by side, line by line comparison of text with inter-line "
                        "and intra-line change highlights. "
                        "The table can be generated in either full or contextual difference mode."
                   )
    P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown")
    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    # params for pdf2txt
    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
    P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
    P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")
    P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)")
    P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
    P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
    P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
    P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")

    A = P.parse_args(args=args)

    # Page numbers are given 1-based on the command line and converted to
    # 0-based here.  --page-numbers takes precedence over --pagenos, as the
    # help text states (previously --pagenos silently overrode it).
    if A.page_numbers:
        A.page_numbers = set([x-1 for x in A.page_numbers])
    elif A.pagenos:
        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])

    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)

    # With the default output type, let the output file's suffix pick the type.
    if A.output_type == "text" and A.outfile != "-":
        for override, alttype in ( (".htm",  "html"),
                                   (".html", "html"),
                                   (".xml",  "xml" ),
                                   (".tag",  "tag" ) ):
            if A.outfile.endswith(override):
                A.output_type = alttype

    if A.outfile == "-":
        # stdout is not ours to close; closing it would break any caller
        # that keeps printing after main() returns.
        sys.stdout.writelines(compare(**vars(A)))
    else:
        # The context manager closes the file even when compare() raises.
        with open(A.outfile, "w", encoding='utf-8') as outfp:
            outfp.writelines(compare(**vars(A)))
    return 0
# Run the command-line interface only when executed as a script; the value
# returned by main() becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())

View File

@ -1,29 +1,29 @@
# -*- mode: python -*-
# Build description for the ``pdfdiff`` executable.
# NOTE(review): Analysis, PYZ and EXE are not imported in this file; they are
# presumably provided by PyInstaller when it executes the spec -- confirm.

block_cipher = None


# NOTE(review): ``pathex`` is an absolute path on one developer's Windows
# machine; builds from any other checkout location likely need it changed.
a = Analysis(['pdfdiff.py'],
             pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
             binaries=[],
             datas=[],
             hiddenimports=[],
             hookspath=[],
             runtime_hooks=[],
             excludes=['django','matplotlib','PIL','numpy','qt5'],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          name='pdfdiff',
          debug=False,
          strip=False,
          upx=True,
          runtime_tmpdir=None,
          console=True )