Spaces:
Runtime error
Runtime error
import argparse | |
import sys | |
from json import dumps | |
from os.path import abspath, basename, dirname, join, realpath | |
from platform import python_version | |
from typing import List, Optional | |
from unicodedata import unidata_version | |
import charset_normalizer.md as md_module | |
from charset_normalizer import from_fp | |
from charset_normalizer.models import CliDetectionResult | |
from charset_normalizer.version import __version__ | |
def query_yes_no(question: str, default: Optional[str] = "yes") -> bool:
    """
    Ask a yes/no question via input() and return the answer.
    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    :param question: text presented to the user; the choice hint is appended to it
    :param default: presumed answer if the user just hits <Enter>. It must be
        "yes" (the default), "no" or None (meaning an answer is required of the user)
    :return: True for "yes", False for "no"
    :raises ValueError: if default is none of "yes", "no" or None
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        # Strip surrounding whitespace so that answers such as "y " or " no"
        # are not rejected because of a stray space.
        choice = input().strip().lower()

        if default is not None and choice == "":
            return valid[default]
        if choice in valid:
            return valid[choice]

        # Unrecognised answer: explain what is expected and ask again.
        sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def _describe_match(path: str, match, is_preferred: bool) -> "CliDetectionResult":
    """
    Build the CLI report entry for a single charset match.
    :param path: absolute path of the analysed file
    :param match: one element of the result returned by from_fp (best guess or alternative)
    :param is_preferred: True for the best guess, False for an alternative
    :return: report entry; its unicode_path is only filled in later, if the file gets normalized
    """
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        # The detected encoding itself is not an "alternative" to itself.
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,
        is_preferred,
    )


def _process_files(args: argparse.Namespace) -> int:
    """
    Validate the parsed options, run the detection on every input file,
    optionally write the normalized copy, then print the report to STDOUT.
    :param args: namespace produced by the parser built in cli_detect
    :return: 0 on success, 1 on inconsistent options, 2 if the normalized file cannot be written
    """
    if args.replace is True and args.normalize is False:
        print("Use --replace in addition of --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition of --replace only.", file=sys.stderr)
        return 1

    # Negated chained comparison on purpose: NaN compares False to everything,
    # so "x < 0 or x > 1" would silently let it through.
    if not 0.0 <= args.threshold <= 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    results = []

    for my_file in args.files:
        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
        best_guess = matches.best()
        file_path = abspath(my_file.name)

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0
                    else "",
                ),
                file=sys.stderr,
            )
            results.append(
                CliDetectionResult(
                    file_path,
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
            my_file.close()
            continue

        # Keep a handle on this file's own entry: the normalized path must be
        # recorded on it, not on the first entry of the whole report.
        preferred = _describe_match(file_path, best_guess, True)
        results.append(preferred)

        if len(matches) > 1 and args.alternatives:
            results.extend(
                _describe_match(file_path, el, False)
                for el in matches
                if el != best_guess
            )

        # The input handle is no longer needed once the matches are built;
        # release it before prompting or writing (possibly over the same file).
        my_file.close()

        if args.normalize is False:
            continue

        if best_guess.encoding.startswith("utf") is True:
            print(
                '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                    my_file.name
                ),
                file=sys.stderr,
            )
            continue

        source_path = realpath(my_file.name)
        dir_path = dirname(source_path)
        name_parts: List[str] = basename(source_path).split(".")

        if args.replace is False:
            # Write next to the original, with the detected encoding inserted
            # before the last dot-separated part of the file name.
            name_parts.insert(-1, best_guess.encoding)
        elif (
            args.force is False
            and query_yes_no(
                'Are you sure to normalize "{}" by replacing it ?'.format(
                    my_file.name
                ),
                "no",
            )
            is False
        ):
            # The user declined the in-place replacement: leave the file alone.
            continue

        try:
            preferred.unicode_path = join(dir_path, ".".join(name_parts))

            # newline="" disables newline translation so the line endings of
            # the decoded content are written back unchanged on every platform.
            with open(
                preferred.unicode_path, "w", encoding="utf-8", newline=""
            ) as fp:
                fp.write(str(best_guess))
        except IOError as e:
            print(str(e), file=sys.stderr)
            return 2

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in results]
                if len(results) > 1
                else results[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        # One line per input file, listing every encoding reported for it.
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in results
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0


def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser
    :param argv: arguments to parse; when None, argparse falls back to the process arguments
    :return: 0 if everything is fine, anything else equal trouble
        (1 for inconsistent options, 2 if the normalized file cannot be written)
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            # A plain ".py" module file means the pure-Python build is in use.
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    try:
        return _process_files(args)
    finally:
        # argparse.FileType opened every input while parsing; make sure none
        # of them stays open, whatever exit path was taken above.
        for handle in args.files:
            handle.close()
if __name__ == "__main__":
    # Hand cli_detect's status code (0, 1 or 2) to the calling shell instead
    # of discarding it, so scripts can detect failures.
    sys.exit(cli_detect())