Removed duplicated and therefore unused code from pdf2txt.py (#341)

pull/345/head
Martin Hasoň 2019-12-09 22:04:05 +01:00 committed by Pieter Marsman
parent 452f0b4ad0
commit 78f06225b6
1 changed file with 9 additions and 26 deletions

View File

@@ -10,6 +10,11 @@ from pdfminer.image import ImageWriter
logging.basicConfig() logging.basicConfig()
OUTPUT_TYPES = ((".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag"))
def extract_text(files=[], outfile='-', def extract_text(files=[], outfile='-',
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
@@ -39,15 +44,8 @@ def extract_text(files=[], outfile='-',
else: else:
laparams = None laparams = None
imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)
if output_type == "text" and outfile != "-": if output_type == "text" and outfile != "-":
for override, alttype in ( (".htm", "html"), for override, alttype in OUTPUT_TYPES:
(".html", "html"),
(".xml", "xml"),
(".tag", "tag") ):
if outfile.endswith(override): if outfile.endswith(override):
output_type = alttype output_type = alttype
@@ -58,7 +56,6 @@ def extract_text(files=[], outfile='-',
else: else:
outfp = open(outfile, "wb") outfp = open(outfile, "wb")
for fname in files: for fname in files:
with open(fname, "rb") as fp: with open(fname, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp, **locals()) pdfminer.high_level.extract_text_to_fp(fp, **locals())
@@ -145,33 +142,19 @@ def main(args=None):
if A.pagenos: if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
imagewriter = None
if A.output_dir:
imagewriter = ImageWriter(A.output_dir)
if six.PY2 and sys.stdin.encoding: if six.PY2 and sys.stdin.encoding:
A.password = A.password.decode(sys.stdin.encoding) A.password = A.password.decode(sys.stdin.encoding)
if A.output_type == "text" and A.outfile != "-": if A.output_type == "text" and A.outfile != "-":
for override, alttype in ( (".htm", "html"), for override, alttype in OUTPUT_TYPES:
(".html", "html"),
(".xml", "xml" ),
(".tag", "tag" ) ):
if A.outfile.endswith(override): if A.outfile.endswith(override):
A.output_type = alttype A.output_type = alttype
if A.outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
# Why ignore outfp.encoding? :-/ stupid cathal?
A.codec = 'utf-8'
else:
outfp = open(A.outfile, "wb")
## Test Code ## Test Code
outfp = extract_text(**vars(A)) outfp = extract_text(**vars(A))
outfp.close() outfp.close()
return 0 return 0
if __name__ == '__main__': sys.exit(main()) if __name__ == '__main__':
sys.exit(main())