Merge remote-tracking branch 'upstream/master'

pull/142/head
Martin Wolf 2018-06-25 23:32:52 +02:00
commit eff3f19886
5 changed files with 180 additions and 178 deletions

View File

@@ -213,6 +213,8 @@ ENCODING = [
('six', 54, 54, 54, 54), ('six', 54, 54, 54, 54),
('slash', 47, 47, 47, 47), ('slash', 47, 47, 47, 47),
('space', 32, 32, 32, 32), ('space', 32, 32, 32, 32),
('space', None, 202, 160, None),
('space', None, 202, 173, None),
('sterling', 163, 163, 163, 163), ('sterling', 163, 163, 163, 163),
('t', 116, 116, 116, 116), ('t', 116, 116, 116, 116),
('thorn', None, None, 254, 254), ('thorn', None, None, 254, 254),

View File

@@ -58,7 +58,7 @@ class PDFDevice(object):
def render_image(self, name, stream): def render_image(self, name, stream):
return return
def render_string(self, textstate, seq): def render_string(self, textstate, seq, ncs, graphicstate):
return return
@@ -145,7 +145,7 @@ class TagExtractor(PDFDevice):
self._stack = [] self._stack = []
return return
def render_string(self, textstate, seq): def render_string(self, textstate, seq, ncs, graphicstate):
font = textstate.font font = textstate.font
text = '' text = ''
for obj in seq: for obj in seq:

View File

@@ -1,30 +1,30 @@
# -*- mode: python -*- # -*- mode: python -*-
block_cipher = None block_cipher = None
a = Analysis(['pdf2txt.py'], a = Analysis(['pdf2txt.py'],
pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
binaries=[], binaries=[],
datas=[], datas=[],
hiddenimports=[], hiddenimports=[],
hookspath=[], hookspath=[],
runtime_hooks=[], runtime_hooks=[],
excludes=['django','matplotlib','PIL','numpy','qt5'], excludes=['django','matplotlib','PIL','numpy','qt5'],
win_no_prefer_redirects=False, win_no_prefer_redirects=False,
win_private_assemblies=False, win_private_assemblies=False,
cipher=block_cipher) cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data, pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher) cipher=block_cipher)
exe = EXE(pyz, exe = EXE(pyz,
a.scripts, a.scripts,
a.binaries, a.binaries,
a.zipfiles, a.zipfiles,
a.datas, a.datas,
name='pdf2txt', name='pdf2txt',
debug=False, debug=False,
strip=False, strip=False,
upx=True, upx=True,
runtime_tmpdir=None, runtime_tmpdir=None,
console=True ) console=True )

View File

@@ -1,117 +1,117 @@
#!/usr/bin/env python #!/usr/bin/env python
""" """
compares rwo pdf files. compares rwo pdf files.
""" """
import sys import sys
import logging import logging
import six import six
import pdfminer.settings import pdfminer.settings
pdfminer.settings.STRICT = False pdfminer.settings.STRICT = False
import pdfminer.high_level import pdfminer.high_level
import pdfminer.layout import pdfminer.layout
def compare(file1,file2,**args): def compare(file1,file2,**args):
if args.get('_py2_no_more_posargs',None) is not None: if args.get('_py2_no_more_posargs',None) is not None:
raise ValueError("Too many positional arguments passed.") raise ValueError("Too many positional arguments passed.")
# If any LAParams group arguments were passed, create an LAParams object and # If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None. # populate with given args. Otherwise, set it to None.
if args.get('laparams',None) is None: if args.get('laparams',None) is None:
laparams = pdfminer.layout.LAParams() laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = args.get(param, None) paramv = args.get(param, None)
if paramv is not None: if paramv is not None:
laparams[param]=paramv laparams[param]=paramv
args['laparams']=laparams args['laparams']=laparams
s1=six.StringIO() s1=six.StringIO()
with open(file1, "rb") as fp: with open(file1, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp,s1, **args) pdfminer.high_level.extract_text_to_fp(fp,s1, **args)
s2=six.StringIO() s2=six.StringIO()
with open(file2, "rb") as fp: with open(file2, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp,s2, **args) pdfminer.high_level.extract_text_to_fp(fp,s2, **args)
import difflib import difflib
s1.seek(0) s1.seek(0)
s2.seek(0) s2.seek(0)
s1,s2=s1.readlines(), s2.readlines() s1,s2=s1.readlines(), s2.readlines()
import os.path import os.path
try: try:
extension = os.path.splitext(args['outfile'])[1][1:4] extension = os.path.splitext(args['outfile'])[1][1:4]
if extension.lower()=='htm': if extension.lower()=='htm':
return difflib.HtmlDiff().make_file(s1,s2) return difflib.HtmlDiff().make_file(s1,s2)
except KeyError: except KeyError:
pass pass
return difflib.unified_diff(s1,s2,n=args['context_lines']) return difflib.unified_diff(s1,s2,n=args['context_lines'])
# main # main
def main(args=None): def main(args=None):
import argparse import argparse
P = argparse.ArgumentParser(description=__doc__) P = argparse.ArgumentParser(description=__doc__)
P.add_argument("file1", type=str, default=None, help="File 1 to compare.") P.add_argument("file1", type=str, default=None, help="File 1 to compare.")
P.add_argument("file2", type=str, default=None, help="File 2 to compare.") P.add_argument("file2", type=str, default=None, help="File 2 to compare.")
P.add_argument("-o", "--outfile", type=str, default="-", P.add_argument("-o", "--outfile", type=str, default="-",
help="Output file (default/'-' is stdout) \ help="Output file (default/'-' is stdout) \
if .htm or .html, create an HTML table (or a complete HTML file containing the table) \ if .htm or .html, create an HTML table (or a complete HTML file containing the table) \
showing a side by side, line by line comparison of text with inter-line \ showing a side by side, line by line comparison of text with inter-line \
and intra-line change highlights. \ and intra-line change highlights. \
The table can be generated in either full or contextual difference mode." The table can be generated in either full or contextual difference mode."
) )
P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown") P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
# params for pdf2txt # params for pdf2txt
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs") P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs")
P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)") P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding") P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale") P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts") P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical") P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin") P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin") P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin") P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow") P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode") P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams") P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation") P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images") P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching") P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode") P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args) A = P.parse_args(args=args)
if A.page_numbers: if A.page_numbers:
A.page_numbers = set([x-1 for x in A.page_numbers]) A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos: if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
if six.PY2 and sys.stdin.encoding: if six.PY2 and sys.stdin.encoding:
A.password = A.password.decode(sys.stdin.encoding) A.password = A.password.decode(sys.stdin.encoding)
if A.output_type == "text" and A.outfile != "-": if A.output_type == "text" and A.outfile != "-":
for override, alttype in ( (".htm", "html"), for override, alttype in ( (".htm", "html"),
(".html", "html"), (".html", "html"),
(".xml", "xml" ), (".xml", "xml" ),
(".tag", "tag" ) ): (".tag", "tag" ) ):
if A.outfile.endswith(override): if A.outfile.endswith(override):
A.output_type = alttype A.output_type = alttype
if A.outfile == "-": if A.outfile == "-":
outfp = sys.stdout outfp = sys.stdout
else: else:
outfp = open(A.outfile, "w", encoding='utf-8') outfp = open(A.outfile, "w", encoding='utf-8')
outfp.writelines(compare(**vars(A))) outfp.writelines(compare(**vars(A)))
outfp.close() outfp.close()
return 0 return 0
if __name__ == '__main__': sys.exit(main()) if __name__ == '__main__': sys.exit(main())

View File

@@ -1,29 +1,29 @@
# -*- mode: python -*- # -*- mode: python -*-
block_cipher = None block_cipher = None
a = Analysis(['pdfdiff.py'], a = Analysis(['pdfdiff.py'],
pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'],
binaries=[], binaries=[],
datas=[], datas=[],
hiddenimports=[], hiddenimports=[],
hookspath=[], hookspath=[],
runtime_hooks=[], runtime_hooks=[],
excludes=['django','matplotlib','PIL','numpy','qt5'], excludes=['django','matplotlib','PIL','numpy','qt5'],
win_no_prefer_redirects=False, win_no_prefer_redirects=False,
win_private_assemblies=False, win_private_assemblies=False,
cipher=block_cipher) cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data, pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher) cipher=block_cipher)
exe = EXE(pyz, exe = EXE(pyz,
a.scripts, a.scripts,
a.binaries, a.binaries,
a.zipfiles, a.zipfiles,
a.datas, a.datas,
name='pdfdiff', name='pdfdiff',
debug=False, debug=False,
strip=False, strip=False,
upx=True, upx=True,
runtime_tmpdir=None, runtime_tmpdir=None,
console=True ) console=True )