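# Scraped page header (Hugging Face file viewer), kept for provenance:
#   uploader: jonahkall | commit message: "Upload 51 files" | commit: 4c346eb (verified)
#   file size: 3.51 kB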
import logging
import re
from http import HTTPStatus
from typing import TypeVar
import regex
from datasets import Dataset, DatasetDict, Version, load_dataset
from datasets.exceptions import DatasetNotFoundError
from huggingface_hub.errors import HfHubHTTPError
from tenacity import (
before_sleep_log,
retry,
retry_if_exception,
stop_after_attempt,
wait_fixed,
)
# Module-level logger, named after this module; it is handed to tenacity's
# `before_sleep_log` below so that every retry is reported.
logger = logging.getLogger(__name__)
# Type variable restricted to `Dataset` or `DatasetDict` (or subclasses of either).
# NOTE(review): not referenced in the visible part of this file -- presumably used by
# code elsewhere; confirm before removing.
# pylint: disable-next=invalid-name
TDataset = TypeVar("TDataset", bound=Dataset | DatasetDict)
def _is_retryable_load_error(exc: BaseException) -> bool:
    """Decide whether a failed `load_dataset` call is worth attempting again.

    Args:
        exc: The exception raised by the wrapped call.

    Returns:
        True for `DatasetNotFoundError`, and for `HfHubHTTPError` whose response
        status is 502 (Bad Gateway) or 504 (Gateway Time-out); False otherwise,
        including an `HfHubHTTPError` that carries no response object.
    """
    # On 4/14/2025 James kept seeing on the g5 server cluster:
    # > datasets.exceptions.DatasetNotFoundError:
    # > Dataset 'org/repo' doesn't exist on the Hub or cannot be accessed.
    if isinstance(exc, DatasetNotFoundError):
        return True
    if not isinstance(exc, HfHubHTTPError):
        return False
    # On 2/11/2025 James kept seeing on the g3 server cluster:
    # > huggingface_hub.errors.HfHubHTTPError: 504 Server Error: Gateway Time-out for
    # > url: https://huggingface.co/api/datasets/org/repo/paths-info/abc123
    # And on 3/14 James saw this on the g3 server cluster:
    # > huggingface_hub.errors.HfHubHTTPError: 502 Server Error: Bad Gateway for
    # > url: https://huggingface.co/api/datasets/org/repo/paths-info/abc123
    response = getattr(exc, "response", None)
    # Guard against a missing response: dereferencing it blindly would raise
    # AttributeError from inside the retry predicate and hide the real error.
    # Compare with `is not None` rather than truthiness, because HTTP response
    # objects may evaluate as falsy for error statuses (requests.Response does).
    return response is not None and response.status_code in {
        HTTPStatus.BAD_GATEWAY.value,
        HTTPStatus.GATEWAY_TIMEOUT.value,
    }


@retry(
    retry=retry_if_exception(_is_retryable_load_error),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    stop=stop_after_attempt(5),
    wait=wait_fixed(5),
)
def load_dataset_retrying(
    path: str,
    revision: str | Version | None = None,
) -> DatasetDict:
    """Load a dataset with `datasets.load_dataset`, retrying on flaky Hub errors.

    The call is attempted at most 5 times with a fixed 5 second wait between
    attempts, and a WARNING is logged before each wait. Only the failures accepted
    by `_is_retryable_load_error` are retried; any other exception propagates
    immediately.

    Args:
        path: Dataset path or Hub repository id, forwarded to `load_dataset`.
        revision: Optional revision forwarded to `load_dataset`.

    Returns:
        Whatever `load_dataset(path, revision=revision)` returns.

    Raises:
        tenacity.RetryError: If every attempt failed with a retryable error
            (tenacity's default, since `reraise` is not enabled here).
    """
    return load_dataset(path, revision=revision)
# SEE: https://www.compart.com/en/unicode/block/U+2070 for subscript letters
# Matches any single character that is NOT in the allow-list below (the class is
# negated), so `findall` returns the disallowed characters found in a text.
#
# The non-ASCII members are written as \uXXXX escapes (interpreted by `re` itself)
# so the class does not depend on how this file's bytes get decoded. The previous
# literal form had been mis-decoded into mojibake whose reversed ranges made
# `re.compile` fail with "bad character range" at import time.
#
# NOTE(review): recovery from the mis-decoded text was lossy -- please confirm against
# the upstream file:
#   * one entry between U+2212 and "." could not be recovered and is omitted;
#   * U+201D, U+2010, U+2011, U+2012, U+2261, U+2264, U+27A1 and U+21D2 were chosen
#     from context where the damaged bytes allowed more than one reading.
invalid_chars_regex = re.compile(
    r"[^"
    r"A-Za-z0-9"
    r"\u0391-\u03A9\u03B1-\u03C9"  # Greek capital alpha..omega, small alpha..omega
    r"\u2090-\u209C"  # Latin subscript small letters a, e, o, x, schwa, h, k, l, m, n, p, s, t
    r"\u2070\u00B9\u00B2\u00B3\u2074-\u2079"  # superscript digits 0-9
    r"\u2080-\u2089"  # subscript digits 0-9
    r"\u00D7"  # multiplication sign
    r"\s!\"#$%&"
    r"\u00B1\u207B"  # plus-minus sign, superscript minus
    r"'"
    r"\u00B4\u02BB\u2018\u2019\u02BC\u201C\u201D"  # acute accent, modifier/curly quotes
    r"()*+"
    r"\u207A"  # superscript plus
    r",\-"
    r"\u2014\u2013\u2010\u2011\u2012\u2015\u2212"  # em/en dash, hyphens, figure dash, bar, minus
    r"./:;"
    r"\u00AB<\u2264=\u2261\u2248\u2246\u2265>\u203A\u00BB"  # guillemets and comparison signs
    r"\u21CC"  # equilibrium (rightwards-over-leftwards harpoons)
    r"?@[\\\]^_`{|}~"
    r"\u2190\u21D0\u2192"  # leftwards, leftwards double, rightwards arrows
    r"\u2794\u279E\u279B\u27A1\u279F\u27A7\u27AD"  # dingbat rightwards arrows
    r"\u21E8\u21D2\u21DB\u27FA\u21D4\u27F6"  # white/double/triple/long arrows
    r"\u2026"  # horizontal ellipsis
    r"]"
)
# Single character class made of one `\p{...}` Unicode property per name listed
# below, so it matches any one character belonging to any of them.
# The names live in a set, so their order inside the class is arbitrary; order
# does not affect what a character class matches.
invalid_languages_regex = regex.compile(
    "["
    + "".join(
        rf"\p{{{property_name}}}"
        for property_name in {
            # SEE: https://jrgraphix.net/r/Unicode/
            "Arabic",
            "Armenian",
            "Bengali",
            "Braille_Patterns",
            "Cyrillic",
            "Devanagari",
            "Ethiopic",
            "Georgian",
            "Gujarati",
            "Gurmukhi",
            "Han",
            "Hangul",
            "Hebrew",
            "Hiragana",
            "Kannada",
            "Katakana",
            "Khmer",
            "Latin_Extended_A",
            "Latin_Extended_Additional",
            "Latin_Extended_B",
            "Malayalam",
            "Myanmar",
            "Syriac",
            "Tamil",
            "Telugu",
            "Thaana",
            "Thai",
            "Tifinagh",
        }
    )
    + "]"
)
def contains_invalid(
    text: str, chars: bool = False, languages: bool = False, threshold: int = 1
) -> tuple[bool, list[str]]:
    """Report whether `text` has enough disallowed characters to be rejected.

    The character check runs first, then the language check; the first enabled
    check whose match count reaches `threshold` decides the result.

    Args:
        text: The text to inspect.
        chars: Enable the check against `invalid_chars_regex`.
        languages: Enable the check against `invalid_languages_regex`.
        threshold: Minimum number of matches (duplicates counted) for a check
            to flag the text.

    Returns:
        `(True, matches)` with the sorted matches of the check that fired, or
        `(False, [])` if no enabled check reached the threshold.
    """
    checks = (
        (chars, invalid_chars_regex),
        (languages, invalid_languages_regex),
    )
    for enabled, pattern in checks:
        if not enabled:
            continue
        hits = pattern.findall(text)
        if len(hits) >= threshold:
            return True, sorted(hits)
    return False, []