Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import json | |
| import os | |
| import string | |
| from src.logger import get_logger | |
# Number of problems in the warmup dataset; warmup problem IDs occupy the range
# [DATASET_SIZE, DATASET_SIZE + WARMUP_DATASET_SIZE) -- see is_submission_file_valid.
WARMUP_DATASET_SIZE = 100
# Number of problems in the main dataset (IDs 0 .. DATASET_SIZE - 1).
DATASET_SIZE = 120
# Inclusive bounds on the length of a string accepted by is_valid.
MIN_INPUT_LENGTH = 2
MAX_INPUT_LENGTH = 20
# Bounds on the total submission file size, in bytes.
MIN_SUBMISSION_SIZE = 1
MAX_SUBMISSION_SIZE = 1024 * 1024 * 120  # 120 MB.
# Cap on the number of bytes read for a single submission line.
MAX_SINGLE_SUBMISSION_SIZE = 1024 * 1024  # 1MB.
MAX_SUBMISSION_LINES = DATASET_SIZE + 1  # Allow empty line.
logger = get_logger(__name__)
def is_valid(
    s: str,
    min_length: int = MIN_INPUT_LENGTH,
    max_length: int = MAX_INPUT_LENGTH,
) -> bool:
    """
    @brief Checks whether the given string is valid.
    @param s The string to validate.
    @param min_length Smallest accepted length (inclusive).
    @param max_length Largest accepted length (inclusive).
    @return True iff every character is in [a-zA-Z0-9], a space, '.' or '-', and the
            length is between min_length and max_length (both inclusive).
    """
    if not (min_length <= len(s) <= max_length):
        return False
    # Very important: we delimit using underscores, so '_' must NOT be in the
    # accepted alphabet of sanitised strings.
    accepted = set(string.ascii_letters) | set(string.digits) | {" ", ".", "-"}
    return all(ch in accepted for ch in s)
def is_submission_file_valid(
    submission_path: str,
    is_warmup_dataset: bool,
) -> bool:
    """
    @brief Checks whether the given submission file is valid.
    @param submission_path The path to the submission file.
    @param is_warmup_dataset Whether we are working on the regular or the warmup dataset.
    @return True iff the file is within the size constraints, a JSONL, every line is a JSON
            object with exactly the keys "problem_id" and "solution", the solution is a
            string, and every problem ID is a unique integer within the dataset's bounds.
    """
    if not os.path.exists(submission_path):
        logger.warning(f"Could not find submission file {submission_path=}")
        return False
    submission_size = os.stat(submission_path).st_size
    if submission_size < MIN_SUBMISSION_SIZE or submission_size > MAX_SUBMISSION_SIZE:
        # BUG FIX: the old f-string interpolated {MIN_SUBMISSION_SIZE, MAX_SUBMISSION_SIZE}
        # as a *set literal* inside the brackets, garbling the logged bounds.
        logger.warning(
            f"Submission size was {submission_size}, "
            f"exceeding [{MIN_SUBMISSION_SIZE}, {MAX_SUBMISSION_SIZE}]"
        )
        return False
    with open(submission_path, "r") as f:
        # Not using readlines() to avoid consuming a large buffer at once. The readline()
        # size cap also truncates over-long lines, which then fail the shape check below.
        n_lines = 0
        seen_ids = set()
        while len(line := f.readline(MAX_SINGLE_SUBMISSION_SIZE)) > 0:
            n_lines += 1
            if n_lines > MAX_SUBMISSION_LINES:
                logger.warning(f"Got submission with more than {MAX_SUBMISSION_LINES} lines")
                return False
            if not (line.startswith("{") and (line.endswith("}") or line.endswith("}\n"))):
                logger.warning("Submission has line that does not appear to be a JSONL")
                return False
            # ROBUSTNESS FIX: a line can pass the cheap brace check above and still be
            # malformed JSON; reject it instead of letting json.loads raise.
            try:
                d = json.loads(line)
            except json.JSONDecodeError:
                logger.warning("Submission has line that is not valid JSON")
                return False
            if set(d.keys()) != {"problem_id", "solution"}:
                logger.warning("Found unexpected keys")
                return False
            # BUG FIX: the original tested type(d["solution"] is str) -- the type of a
            # boolean, which is always truthy -- so the solution type was never checked.
            # Exact type checks (not isinstance) are kept so e.g. JSON booleans, which
            # decode to bool (a subclass of int), are still rejected as problem IDs.
            if not (
                (type(d["problem_id"]) is str or type(d["problem_id"]) is int)
                and type(d["solution"]) is str
            ):
                logger.warning("Found unexpected types")
                return False
            try:
                problem_id = int(d["problem_id"])
            except Exception:
                logger.warning("Could not convert problem ID to int")
                return False
            # Warmup IDs live in [DATASET_SIZE, DATASET_SIZE + WARMUP_DATASET_SIZE);
            # regular IDs live in [0, DATASET_SIZE).
            if is_warmup_dataset:
                lo, hi = DATASET_SIZE, DATASET_SIZE + WARMUP_DATASET_SIZE
            else:
                lo, hi = 0, DATASET_SIZE
            if problem_id < lo or problem_id >= hi:
                logger.warning(f"Problem ID {problem_id} is beyond allowed bounds")
                return False
            if problem_id in seen_ids:
                logger.warning(f"Got duplicate submission -- ID {problem_id} appears twice")
                return False  # Duplicate submission.
            seen_ids.add(problem_id)
    return True