# Spaces: Running on Zero
import logging | |
import re | |
from http import HTTPStatus | |
from typing import TypeVar | |
import regex | |
from datasets import Dataset, DatasetDict, Version, load_dataset | |
from datasets.exceptions import DatasetNotFoundError | |
from huggingface_hub.errors import HfHubHTTPError | |
from tenacity import ( | |
before_sleep_log, | |
retry, | |
retry_if_exception, | |
stop_after_attempt, | |
wait_fixed, | |
) | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type variable bounded by `Dataset | DatasetDict`, so generic signatures can
# accept either type and hand back the same one.
# pylint: disable-next=invalid-name
TDataset = TypeVar(
    "TDataset",
    bound=Dataset | DatasetDict,
)
def load_dataset_retrying(
    path: str,
    revision: str | Version | None = None,
) -> DatasetDict:
    """Load a dataset by forwarding to `datasets.load_dataset`.

    Args:
        path: Dataset identifier or path, passed through unchanged.
        revision: Optional revision to load, passed through as the
            `revision` keyword argument.

    Returns:
        Whatever `load_dataset` returns for these arguments (annotated as a
        `DatasetDict`).

    NOTE(review): despite the name, this definition carries no retry
    decorator, so a failing `load_dataset` call propagates immediately.
    Confirm whether a retry policy is meant to be applied here.
    """
    loaded = load_dataset(path, revision=revision)
    return loaded
# Compiled pattern matching any single character that is NOT in the allowed
# set below (ASCII letters and digits, whitespace, ASCII punctuation and a
# list of non-ASCII symbols). `contains_invalid` applies it when `chars=True`.
# SEE: https://www.compart.com/en/unicode/block/U+2070 for subscript letters
# NOTE(review): the non-ASCII part of the class looks mis-encoded -- e.g.
# `Β°ΒΉΒ²Β³` is what `°¹²³` turns into when UTF-8 bytes are decoded with a
# Greek single-byte code page. As rendered, `α-Ο` is a reversed range
# (U+03B1 > U+039F), which `re.compile` rejects ("bad character range").
# Restore the literal from a correctly encoded copy; do not hand-edit it.
# NOTE(review): the trailing `| |` on the code lines looks like table residue
# from a web page capture and is not valid Python -- confirm and remove.
invalid_chars_regex = re.compile( | |
r"[^A-Za-z0-9Ξ-Ωα-Οββββββββββββββ°ΒΉΒ²Β³β΄β΅βΆβ·βΈβΉββββββ ββββΓ\s!\"#$%&Β±β»'Β΄Κ»ββΚΌββ()*+βΊ,\-ββββββββ€./:;Β«<β€=β‘βββ₯>βΊΒ»β?@[\\\]^_`{|}~βββββββ‘ββ§ββ¨βββΊββΆβ¦]" # noqa: RUF001
) | |
# Compiled pattern (third-party `regex` module): one character class holding
# a `\p{...}` Unicode property item for every name listed below, so it matches
# any single character that has at least one of those properties.
# `contains_invalid` applies it when `languages=True`.
# The names live in a set, so the order of the items inside the class is not
# fixed; order inside a character class does not change what it matches.
invalid_languages_regex = regex.compile(
    "["
    + "".join(
        rf"\p{{{property_name}}}"
        for property_name in {
            # SEE: https://jrgraphix.net/r/Unicode/
            "Arabic",
            "Armenian",
            "Bengali",
            "Braille_Patterns",
            "Cyrillic",
            "Devanagari",
            "Ethiopic",
            "Georgian",
            "Gujarati",
            "Gurmukhi",
            "Han",
            "Hangul",
            "Hebrew",
            "Hiragana",
            "Kannada",
            "Katakana",
            "Khmer",
            "Latin_Extended_A",
            "Latin_Extended_Additional",
            "Latin_Extended_B",
            "Malayalam",
            "Myanmar",
            "Syriac",
            "Tamil",
            "Telugu",
            "Thaana",
            "Thai",
            "Tifinagh",
        }
    )
    + "]"
)
def contains_invalid(
    text: str, chars: bool = False, languages: bool = False, threshold: int = 1
) -> tuple[bool, list[str]]:
    """Report whether `text` has enough disallowed characters to be flagged.

    Two independent checks can be switched on. The character check runs
    first; the language check is only consulted when the character check is
    disabled or stays below the threshold. Counts are never combined across
    the two checks.

    Args:
        text: The string to inspect.
        chars: When true, look for matches of `invalid_chars_regex`.
        languages: When true, look for matches of `invalid_languages_regex`.
        threshold: Minimum number of matches (duplicates counted) that a
            single check must find before the text is flagged.

    Returns:
        `(True, matches)` for the first enabled check that reaches the
        threshold, where `matches` is the sorted list of every matched
        character (duplicates kept); otherwise `(False, [])`.
    """
    # `and` short-circuits, so each pattern is only run when its flag is set.
    if chars and len(hits := invalid_chars_regex.findall(text)) >= threshold:
        return True, sorted(hits)
    if languages and len(hits := invalid_languages_regex.findall(text)) >= threshold:
        return True, sorted(hits)
    return False, []