2019-11-07 20:12:34 +00:00
|
|
|
"""A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags."""
|
2018-08-13 04:07:52 +00:00
|
|
|
import argparse
|
Many changes to make pdf2txt.py work better in Py3, some in that script, others in module!
Sorry, changes should have been more atomic.
*In pdf2txt.py:*
* Re-wrote main function to use argparse instead of optparse.
* Manually tested in Py2/Py3 to get partial consistency.
* Errors abound including Tags mode, but most modes weren't working at all in Py3 anyway.
* Py2 mode *probably* unchanged, cannot find any bugs yet...
* Kept old main function for posterity, for now.
*In utils:*
* Added a few compatibility functions (some string hax required chardet, new dependency):
- make_compat_bytes(in_str)-> (py3->bytes | py2->str)
- make_compat_str(in_str)-> (str)
- compatible_encode_method(bytesorstring, encoding, erraction)-> (str)
*In pdfdevice:*
* To handle different output filetypes in Py3, injected lots of calls to new utils methods,
as well as some six.PYX checks and logic. These changes are largely responsible for
enhanced Py2/Py3 consistency.
*In converter:*
* To handle output filetypes in Py2, injected a few checks and fixes particularly around the
py2 `str.encode` method and its *assumed* usual use-analogies in Py3.
2015-05-17 20:08:57 +00:00
|
|
|
import logging
|
2018-08-13 04:07:52 +00:00
|
|
|
import sys
|
2019-11-07 20:12:34 +00:00
|
|
|
import six
|
2019-11-06 20:47:19 +00:00
|
|
|
|
2015-05-30 16:03:55 +00:00
|
|
|
import pdfminer.high_level
|
|
|
|
import pdfminer.layout
|
2016-04-26 02:38:42 +00:00
|
|
|
from pdfminer.image import ImageWriter
|
2009-05-15 14:34:53 +00:00
|
|
|
|
2019-11-06 20:47:19 +00:00
|
|
|
# Configure the root logger with logging's default handler when this module is
# loaded, so log records emitted while the tool runs have somewhere to go.
logging.basicConfig()
|
|
|
|
|
2015-05-30 15:14:24 +00:00
|
|
|
|
|
|
|
def extract_text(files=None, outfile='-',
                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
                 output_type='text', codec='utf-8', strip_control=False,
                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
                 layoutmode='normal', output_dir=None, debug=False,
                 disable_caching=False, **kwargs):
    """Extract every PDF in ``files`` into a single output stream.

    :param files: Iterable of paths to PDF files; must be non-empty.
    :param outfile: Path of the output file, or ``"-"`` for ``sys.stdout``.
    :param no_laparams: If true, no ``LAParams`` object is created and
        ``laparams=None`` is forwarded instead.
    :param all_texts, detect_vertical, word_margin, char_margin, line_margin,
        boxes_flow: Layout-analysis overrides. Each value that is not ``None``
        is set as the attribute of the same name on the ``LAParams`` object.
    :param output_type: Output format. When it is ``"text"`` and ``outfile``
        ends in ``.htm``, ``.html``, ``.xml`` or ``.tag`` it is replaced by the
        format implied by that suffix.
    :param codec: Output encoding. Forced to ``'utf-8'`` when writing to a
        stdout that reports an encoding.
    :param output_dir: If truthy, an ``ImageWriter`` is created for it.
    :param kwargs: Extra keyword arguments. They are not interpreted here,
        except that ``_py2_no_more_posargs`` is rejected.
    :returns: The stream that was written to: ``sys.stdout`` or the binary
        file opened for ``outfile``. The caller is responsible for closing it.
    :raises DeprecationWarning: If ``_py2_no_more_posargs`` is passed.
    :raises ValueError: If ``files`` is empty or ``None``.

    The remaining parameters are passed through unchanged to
    ``pdfminer.high_level.extract_text_to_fp``.
    """
    # The original test was the chained comparison
    # `'_py2_no_more_posargs' in kwargs is not None`, which only worked by
    # accident; a plain membership test is what was meant.
    if '_py2_no_more_posargs' in kwargs:
        raise DeprecationWarning(
            'The `_py2_no_more_posargs will be removed on January, 2020. At '
            'that moment pdfminer.six will stop supporting Python 2. Please '
            'upgrade to Python 3. For more information see '
            'https://github.com/pdfminer/pdfminer.six/issues/194')

    if not files:
        raise ValueError("Must provide files to work upon!")

    # Build an LAParams object and apply every override that was explicitly
    # given, unless layout analysis parameters were disabled altogether.
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        for param, paramv in (("all_texts", all_texts),
                              ("detect_vertical", detect_vertical),
                              ("word_margin", word_margin),
                              ("char_margin", char_margin),
                              ("line_margin", line_margin),
                              ("boxes_flow", boxes_flow)):
            if paramv is not None:
                setattr(laparams, param, paramv)

    # Not used directly below, but forwarded to the extractor via locals().
    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    # With the default output type, let the output file's extension pick the
    # format.
    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"),
                                  (".html", "html"),
                                  (".xml", "xml"),
                                  (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    try:
        for fname in files:
            with open(fname, "rb") as fp:
                # NOTE(review): every local is forwarded as a keyword argument,
                # exactly as before. The callee is defined outside this file,
                # so the forwarded set is deliberately left unchanged; passing
                # explicit keywords would be cleaner once its signature is
                # confirmed.
                pdfminer.high_level.extract_text_to_fp(fp, **locals())
    except BaseException:
        # The caller never receives the handle on failure, so release a file
        # we opened ourselves before propagating the error.
        if outfp is not sys.stdout:
            outfp.close()
        raise
    return outfp
|
|
|
|
|
2018-08-13 04:07:52 +00:00
|
|
|
|
|
|
|
def maketheparser():
    """Build the ``argparse`` parser for the command line interface.

    The destination names of the options are the keyword arguments that
    ``main`` forwards to ``extract_text`` via ``vars()``, so they must stay in
    sync with that function's signature.

    :returns: A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument("files", type=str, default=None, nargs="+",
                        help="One or more paths to PDF files.")

    parser.add_argument("--debug", "-d", default=False, action="store_true",
                        help="Use debug logging level.")
    parser.add_argument("--disable-caching", "-C", default=False, action="store_true",
                        help="If caching of resources, such as fonts, should be disabled.")

    parse_params = parser.add_argument_group('Parser', description='Used during PDF parsing')
    parse_params.add_argument("--page-numbers", type=int, default=None, nargs="+",
                              help="A space-separated list of page numbers to parse.")
    parse_params.add_argument("--pagenos", "-p", type=str,
                              help="A comma-separated list of page numbers to parse. Included for legacy applications, "
                                   "use --page-numbers for more idiomatic argument entry.")
    parse_params.add_argument("--maxpages", "-m", type=int, default=0,
                              help="The maximum number of pages to parse.")
    parse_params.add_argument("--password", "-P", type=str, default="",
                              help="The password to use for decrypting PDF file.")
    parse_params.add_argument("--rotation", "-R", default=0, type=int,
                              help="The number of degrees to rotate the PDF before other types of processing.")

    la_params = parser.add_argument_group('Layout analysis', description='Used during layout analysis.')
    la_params.add_argument("--no-laparams", "-n", default=False, action="store_true",
                           help="If layout analysis parameters should be ignored.")
    la_params.add_argument("--detect-vertical", "-V", default=False, action="store_true",
                           help="If vertical text should be considered during layout analysis")
    la_params.add_argument("--char-margin", "-M", type=float, default=2.0,
                           help="If two characters are closer together than this margin they are considered to be part "
                                "of the same word. The margin is specified relative to the width of the character.")
    la_params.add_argument("--word-margin", "-W", type=float, default=0.1,
                           help="If two words are closer together than this margin they are considered to be part "
                                "of the same line. A space is added in between for readability. The margin is "
                                "specified relative to the width of the word.")
    la_params.add_argument("--line-margin", "-L", type=float, default=0.5,
                           help="If two lines are close together they are considered to be part of the same "
                                "paragraph. The margin is specified relative to the height of a line.")
    la_params.add_argument("--boxes-flow", "-F", type=float, default=0.5,
                           help="Specifies how much a horizontal and vertical position of a text matters when "
                                "determining the order of lines. The value should be within the range of -1.0 (only "
                                "horizontal position matters) to +1.0 (only vertical position matters).")
    # --all-texts is a store_true flag whose default is already True, so on its
    # own it can never change anything. The default is kept for backward
    # compatibility and --no-all-texts is offered as the way to switch it off.
    la_params.add_argument("--all-texts", "-A", default=True, action="store_true",
                           help="If layout analysis should be performed on text in figures.")
    la_params.add_argument("--no-all-texts", dest="all_texts", default=True, action="store_false",
                           help="If layout analysis should not be performed on text in figures.")

    output_params = parser.add_argument_group('Output', description='Used during output generation.')
    output_params.add_argument("--outfile", "-o", type=str, default="-",
                               help="Path to file where output is written. Or \"-\" (default) to write to stdout.")
    # The underscore spelling stays first so the destination remains
    # `output_type`; the hyphenated alias matches every other option.
    output_params.add_argument("--output_type", "--output-type", "-t", type=str, default="text",
                               help="Type of output to generate {text,html,xml,tag}.")
    output_params.add_argument("--codec", "-c", type=str, default="utf-8",
                               help="Text encoding to use in output file.")
    output_params.add_argument("--output-dir", "-O", default=None,
                               help="The output directory to put extracted images in. If not given, images are not "
                                    "extracted.")
    output_params.add_argument("--layoutmode", "-Y", default="normal", type=str,
                               help="Type of layout to use when generating html {normal,exact,loose}. If normal, "
                                    "each line is positioned separately in the html. If exact, each character is "
                                    "positioned separately in the html. If loose, same result as normal but with an "
                                    "additional newline after each text line. Only used when output_type is html.")
    output_params.add_argument("--scale", "-s", type=float, default=1.0,
                               help="The amount of zoom to use when generating html file. Only used when output_type "
                                    "is html.")
    output_params.add_argument("--strip-control", "-S", default=False, action="store_true",
                               help="Remove control statement from text. Only used when output_type is xml.")
    return parser
|
|
|
|
|
|
|
|
|
2009-05-15 14:34:53 +00:00
|
|
|
# main
|
2018-08-13 04:07:52 +00:00
|
|
|
|
|
|
|
|
2015-05-30 15:14:24 +00:00
|
|
|
def main(args=None):
    """Command line entry point: parse arguments and run the extraction.

    :param args: List of argument strings handed to ``parse_args``; with
        ``None`` argparse reads the process's command line.
    :returns: ``0``, for use as the process exit status.
    """
    parser = maketheparser()
    options = parser.parse_args(args=args)

    # Page numbers are 1-based on the command line and passed on 0-based.
    # The legacy --pagenos option takes precedence when both are given.
    if options.page_numbers:
        options.page_numbers = {x - 1 for x in options.page_numbers}
    if options.pagenos:
        options.page_numbers = {int(x) - 1 for x in options.pagenos.split(",")}

    # On Python 2 the password arrives as bytes; decode it with stdin's encoding.
    if six.PY2 and sys.stdin.encoding:
        options.password = options.password.decode(sys.stdin.encoding)

    # extract_text() applies the extension-based output type override, forces
    # the stdout codec, creates the image writer and opens the output file
    # itself. Repeating those steps here opened the output file a second time
    # and leaked that handle, so they are left to extract_text().
    outfp = extract_text(**vars(options))

    # Close the file we were handed, but leave the process-wide stdout open so
    # that callers invoking main() programmatically can keep printing.
    if outfp is sys.stdout:
        outfp.flush()
    else:
        outfp.close()
    return 0
|
Many changes to make pdf2txt.py work better in Py3, some in that script, others in module!
Sorry, changes should have been more atomic.
*In pdf2txt.py:*
* Re-wrote main function to use argparse instead of optparse.
* Manually tested in Py2/Py3 to get partial consistency.
* Errors abound including Tags mode, but most modes weren't working at all in Py3 anyway.
* Py2 mode *probably* unchanged, cannot find any bugs yet...
* Kept old main function for posterity, for now.
*In utils:*
* Added a few compatibility functions (some string hax required chardet, new dependency):
- make_compat_bytes(in_str)-> (py3->bytes | py2->str)
- make_compat_str(in_str)-> (str)
- compatible_encode_method(bytesorstring, encoding, erraction)-> (str)
*In pdfdevice:*
* To handle different output filetypes in Py3, injected lots of calls to new utils methods,
as well as some six.PYX checks and logic. These changes are largely responsible for
enhanced Py2/Py3 consistency.
*In converter:*
* To handle output filetypes in Py2, injected a few checks and fixes particularly around the
py2 `str.encode` method and its *assumed* usual use-analogies in Py3.
2015-05-17 20:08:57 +00:00
|
|
|
|
2009-05-15 14:34:53 +00:00
|
|
|
|
2015-05-30 15:14:24 +00:00
|
|
|
# Run the command line interface when executed as a script and use main()'s
# return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
|