Removed duplicated and therefore unused code from pdf2txt.py (#341)
parent
452f0b4ad0
commit
78f06225b6
|
@ -10,6 +10,11 @@ from pdfminer.image import ImageWriter
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
|
|
||||||
|
OUTPUT_TYPES = ((".htm", "html"),
|
||||||
|
(".html", "html"),
|
||||||
|
(".xml", "xml"),
|
||||||
|
(".tag", "tag"))
|
||||||
|
|
||||||
|
|
||||||
def extract_text(files=[], outfile='-',
|
def extract_text(files=[], outfile='-',
|
||||||
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
|
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
|
||||||
|
@ -39,15 +44,8 @@ def extract_text(files=[], outfile='-',
|
||||||
else:
|
else:
|
||||||
laparams = None
|
laparams = None
|
||||||
|
|
||||||
imagewriter = None
|
|
||||||
if output_dir:
|
|
||||||
imagewriter = ImageWriter(output_dir)
|
|
||||||
|
|
||||||
if output_type == "text" and outfile != "-":
|
if output_type == "text" and outfile != "-":
|
||||||
for override, alttype in ( (".htm", "html"),
|
for override, alttype in OUTPUT_TYPES:
|
||||||
(".html", "html"),
|
|
||||||
(".xml", "xml"),
|
|
||||||
(".tag", "tag") ):
|
|
||||||
if outfile.endswith(override):
|
if outfile.endswith(override):
|
||||||
output_type = alttype
|
output_type = alttype
|
||||||
|
|
||||||
|
@ -58,7 +56,6 @@ def extract_text(files=[], outfile='-',
|
||||||
else:
|
else:
|
||||||
outfp = open(outfile, "wb")
|
outfp = open(outfile, "wb")
|
||||||
|
|
||||||
|
|
||||||
for fname in files:
|
for fname in files:
|
||||||
with open(fname, "rb") as fp:
|
with open(fname, "rb") as fp:
|
||||||
pdfminer.high_level.extract_text_to_fp(fp, **locals())
|
pdfminer.high_level.extract_text_to_fp(fp, **locals())
|
||||||
|
@ -145,33 +142,19 @@ def main(args=None):
|
||||||
if A.pagenos:
|
if A.pagenos:
|
||||||
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
|
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
|
||||||
|
|
||||||
imagewriter = None
|
|
||||||
if A.output_dir:
|
|
||||||
imagewriter = ImageWriter(A.output_dir)
|
|
||||||
|
|
||||||
if six.PY2 and sys.stdin.encoding:
|
if six.PY2 and sys.stdin.encoding:
|
||||||
A.password = A.password.decode(sys.stdin.encoding)
|
A.password = A.password.decode(sys.stdin.encoding)
|
||||||
|
|
||||||
if A.output_type == "text" and A.outfile != "-":
|
if A.output_type == "text" and A.outfile != "-":
|
||||||
for override, alttype in ( (".htm", "html"),
|
for override, alttype in OUTPUT_TYPES:
|
||||||
(".html", "html"),
|
|
||||||
(".xml", "xml" ),
|
|
||||||
(".tag", "tag" ) ):
|
|
||||||
if A.outfile.endswith(override):
|
if A.outfile.endswith(override):
|
||||||
A.output_type = alttype
|
A.output_type = alttype
|
||||||
|
|
||||||
if A.outfile == "-":
|
|
||||||
outfp = sys.stdout
|
|
||||||
if outfp.encoding is not None:
|
|
||||||
# Why ignore outfp.encoding? :-/ stupid cathal?
|
|
||||||
A.codec = 'utf-8'
|
|
||||||
else:
|
|
||||||
outfp = open(A.outfile, "wb")
|
|
||||||
|
|
||||||
## Test Code
|
## Test Code
|
||||||
outfp = extract_text(**vars(A))
|
outfp = extract_text(**vars(A))
|
||||||
outfp.close()
|
outfp.close()
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main())
|
if __name__ == '__main__':
|
||||||
|
sys.exit(main())
|
||||||
|
|
Loading…
Reference in New Issue