Spaces:

RDTvlokip
/

AG-BPE

Running

App Files Files Community

RDTvlokip committed on Jul 16

Commit

ea3f62e

verified ·

1 Parent(s): 0ee325f

Create app.py

Browse files

Files changed (1) hide show

app.py +243 -0

app.py ADDED Viewed

	@@ -0,0 +1,243 @@

+# -*- coding: utf-8 -*-
+"""
+AG-BPE Standalone Usage Script & Web Visualizer
+================================================
+This script demonstrates how to load and use a pre-trained AG-BPE tokenizer
+and provides a real-time web interface using Gradio to visualize its behavior.
+It defines a self-contained AGBPETokenizer class and then launches a web app
+that allows users to type text and see the tokenization and corresponding IDs
+update live.
+This entire script is designed to be run as a single file in a Hugging Face Space.
+"""
+import json
+import regex as re
+from pathlib import Path
+from typing import List, Dict, Tuple
+import unicodedata
+import gradio as gr
# --- TextCleaner Class ---
# This class is included to ensure that the input text is pre-processed
# in exactly the same way as during the tokenizer's training.
class TextCleaner:
    """Text cleaner that strips invisible, abnormal, and disruptive characters.

    The cleaning pipeline is: NFKC normalization, folding of typographic
    quotes to ASCII, removal of ``UNWANTED_CHARS`` and of ASCII control
    characters (except newline, carriage return and tab), collapsing of every
    whitespace run to a single space, and trimming of both ends.
    """

    # Characters deleted outright (zero-width marks, BOM, private-use code
    # points, soft hyphen, variation selector, ...).
    UNWANTED_CHARS = {
        '\ufffd', '\u200b', '\u200c', '\u200d', '\u2060', '\u2061', '\u2063',
        '\u00a0', '\u202f', '\u2007', '\u2028', '\u2029', '\ufeff', '\ue000',
        '\uf8ff', '\ue001', '\xad', '\u180e', '\u200e', '\uFE0F',
    }

    # Fixed part of the translation table: ASCII control characters below 32
    # are deleted, except \t (9), \n (10) and \r (13) which survive until the
    # whitespace-collapsing step.
    _BASE_TABLE = dict.fromkeys(set(range(32)) - {9, 10, 13})
    # Typographic quotes are folded to their ASCII equivalents.
    _BASE_TABLE.update({
        ord('’'): "'", ord('‘'): "'",
        ord('“'): '"', ord('”'): '"',
    })

    @classmethod
    def clean_text(cls, text: str) -> str:
        """Return *text* normalized, stripped of unwanted characters, and whitespace-collapsed.

        Args:
            text: The raw input string.

        Returns:
            The cleaned string, with no leading or trailing whitespace.
        """
        text = unicodedata.normalize("NFKC", text)
        # All per-character substitutions and deletions are independent of one
        # another, so they are applied in a single pass instead of one
        # full-string scan per character. UNWANTED_CHARS is read at call time
        # so that a subclass overriding it is still honored; the base table is
        # applied last so quote folding takes precedence, as it ran first in
        # the sequential form.
        table = dict.fromkeys(map(ord, cls.UNWANTED_CHARS))
        table.update(cls._BASE_TABLE)
        text = text.translate(table)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
# --- Standalone Tokenizer Class ---
class AGBPETokenizer:
    """
    A self-contained tokenizer that loads and uses a pre-trained AG-BPE model
    from a JSON file containing the vocabulary and merge rules.
    """

    def __init__(self, vocab: Dict[str, int], merges: Dict[str, int], special_tokens: Dict[str, int]):
        """Initializes the tokenizer from loaded vocabulary and merge data.

        Args:
            vocab: Mapping from token string to token ID. Must contain '<unk>'.
            merges: Mapping from a whitespace-separated token pair to its rank
                (a lower rank is applied first).
            special_tokens: Mapping from special-token string to token ID.

        Raises:
            ValueError: If '<unk>' is absent from ``vocab``.
        """
        self.vocab = vocab
        # NOTE(review): str.split() discards whitespace, so a merge whose left
        # or right part itself contains a space cannot be represented by this
        # key format -- confirm against the trainer's export format.
        self.merges = {tuple(k.split()): v for k, v in merges.items()}
        self.special_tokens_map = special_tokens
        self.id_to_token: Dict[int, str] = {i: s for s, i in self.vocab.items()}
        # Pre-tokenization pattern: contractions, letter runs, digit runs,
        # punctuation runs (each optionally preceded by one space), whitespace.
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.unk_token_id = self.vocab.get('<unk>')
        if self.unk_token_id is None:
            raise ValueError("The '<unk>' token is missing from the vocabulary.")
        self.text_cleaner = TextCleaner()

    @classmethod
    def from_file(cls, filepath: str) -> 'AGBPETokenizer':
        """Class method to conveniently load a tokenizer from a JSON file path.

        Args:
            filepath: Path to a JSON file with the top-level keys 'vocab',
                'merges' and 'special_tokens'.

        Returns:
            A ready-to-use tokenizer instance.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
            ValueError: If one of the required keys is missing.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Tokenizer file not found: '{filepath}'")
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        required_keys = ['vocab', 'merges', 'special_tokens']
        if not all(key in data for key in required_keys):
            raise ValueError("The JSON file is malformed. Missing one of: vocab, merges, special_tokens.")
        return cls(data['vocab'], data['merges'], data['special_tokens'])

    def _apply_bpe(self, word_chars: List[str]) -> List[str]:
        """Applies the BPE merge rules to a list of characters, with a crucial validation step.

        At each step the lowest-ranked adjacent pair is merged, but only pairs
        whose concatenation is itself a vocabulary entry are eligible; a merge
        that would produce an out-of-vocabulary token is skipped in favor of
        the next-best pair.

        Args:
            word_chars: The symbols of one pre-tokenized chunk.

        Returns:
            The list of symbols after no eligible merge remains.
        """
        if not self.merges:
            return word_chars
        merges = self.merges
        vocab = self.vocab
        while len(word_chars) > 1:
            # Only pairs that are known merges AND produce an in-vocabulary
            # token may be chosen. Filtering up front avoids copying the whole
            # merge table just to discard rejected candidates.
            candidates = [
                pair
                for pair in zip(word_chars, word_chars[1:])
                if pair in merges and pair[0] + pair[1] in vocab
            ]
            if not candidates:
                break
            # min() keeps the first occurrence on rank ties.
            best_pair = min(candidates, key=merges.__getitem__)
            merged_token = best_pair[0] + best_pair[1]
            new_word_chars = []
            i = 0
            last = len(word_chars) - 1
            while i <= last:
                # Merge left to right; a symbol consumed by a merge cannot
                # take part in the next one.
                if i < last and (word_chars[i], word_chars[i + 1]) == best_pair:
                    new_word_chars.append(merged_token)
                    i += 2
                else:
                    new_word_chars.append(word_chars[i])
                    i += 1
            word_chars = new_word_chars
        return word_chars

    def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
        """Encodes a string of text into a list of token IDs.

        Args:
            text: The raw text; it is cleaned before tokenization.
            add_special_tokens: When True, wrap the output in the '<bos>' and
                '<eos>' IDs if those are defined in the special-token map.

        Returns:
            The token IDs; symbols absent from the vocabulary map to the
            '<unk>' ID.
        """
        cleaned_text = self.text_cleaner.clean_text(text)
        token_ids = []
        if add_special_tokens and (bos_id := self.special_tokens_map.get('<bos>')) is not None:
            token_ids.append(bos_id)
        for chunk in self.pat.findall(cleaned_text):
            tokens = self._apply_bpe(list(chunk))
            token_ids.extend(self.vocab.get(token, self.unk_token_id) for token in tokens)
        if add_special_tokens and (eos_id := self.special_tokens_map.get('<eos>')) is not None:
            token_ids.append(eos_id)
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a list of token IDs back into a string of text.

        Every ID listed in the special-token map is skipped, and IDs unknown
        to the vocabulary contribute an empty string.
        """
        special_ids_to_skip = set(self.special_tokens_map.values())
        tokens = [self.id_to_token.get(token_id, '') for token_id in token_ids if token_id not in special_ids_to_skip]
        return "".join(tokens)
# --- Gradio Web Application ---
# 1. Path to the tokenizer file.
#    Make sure this file is present in your Hugging Face Space.
TOKENIZER_FILE = "ag_bpe_tokenizer.json"
TOKENIZER_LOADED = False
ERROR_MESSAGE = ""
tokenizer = None
# 2. Try to load the tokenizer when the application starts.
try:
    # Create a placeholder file when the real one is absent (easy local testing).
    if not Path(TOKENIZER_FILE).exists():
        print(f"⚠️  Attention : Le fichier '{TOKENIZER_FILE}' est introuvable.")
        print("Création d'un fichier tokenizer factice pour le test local.")
        # Every merge key is a "left right" PAIR whose concatenation is in the
        # vocabulary, and every base character is in the vocabulary too.
        # Otherwise no merge can ever fire and almost all input maps to <unk>.
        dummy_data = {
            "vocab": {
                "<unk>": 0, "<bos>": 1, "<eos>": 2,
                "H": 3, "e": 4, "l": 5, "o": 6, "W": 7, "r": 8, "d": 9, " ": 10,
                "He": 11, "Hel": 12, "lo": 13, "Wo": 14, "Wor": 15, "ld": 16,
            },
            "merges": {"H e": 1, "He l": 2, "l o": 3, "W o": 4, "Wo r": 5, "l d": 6},
            "special_tokens": {"<unk>": 0, "<bos>": 1, "<eos>": 2}
        }
        with open(TOKENIZER_FILE, 'w', encoding='utf-8') as f:
            json.dump(dummy_data, f, indent=2)
        print("Fichier factice créé. L'application utilisera ce fichier.")
    print(f"🧠 Chargement du tokenizer depuis '{TOKENIZER_FILE}'...")
    tokenizer = AGBPETokenizer.from_file(TOKENIZER_FILE)
    TOKENIZER_LOADED = True
    print(f"✅ Tokenizer chargé avec succès. Taille du vocabulaire : {len(tokenizer.vocab)}")
# OSError covers FileNotFoundError as well as a failed write of the placeholder
# file (e.g. a read-only filesystem), so the UI can report the problem instead
# of the application crashing at import time.
except (OSError, ValueError, KeyError) as e:
    ERROR_MESSAGE = str(e)
    print(f"❌ ERREUR lors du chargement du tokenizer : {ERROR_MESSAGE}")
# 3. Main callback invoked by Gradio.
def visualize_tokenization(text: str) -> List[Tuple[str, str]]:
    """
    Tokenize *text* and return (token, id) tuples for gr.HighlightedText.

    A single placeholder tuple is returned when the tokenizer failed to load
    or when the input is empty.
    """
    if not (TOKENIZER_LOADED and tokenizer):
        return [("ERREUR LORS DU CHARGEMENT DU TOKENIZER", ERROR_MESSAGE)]
    if not text:
        return [("Veuillez entrer du texte...", "")]
    # <bos>/<eos> are left out so that the demo only shows tokens of the input.
    ids = tokenizer.encode(text, add_special_tokens=False)
    lookup = tokenizer.id_to_token
    # HighlightedText expects (token_string, label) pairs; the label is the
    # token ID rendered as a string.
    return [(lookup.get(tid, f"<unk:{tid}>"), str(tid)) for tid in ids]
# 4. Build the Gradio interface.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), css="footer {display: none !important}") as demo:
    gr.Markdown(
        """
        # 👁️ Visualiseur de Tokenizer en Temps Réel
        Entrez du texte dans le champ ci-dessous pour observer la segmentation (tokenization) en direct.
        Chaque segment de texte coloré est un "token", et son ID numérique est affiché juste en dessous.
        """
    )
    with gr.Column():
        input_textbox = gr.Textbox(
            label="Entrez votre texte ici",
            placeholder="Écrivez quelque chose...",
            lines=7,
            show_label=False,
        )
        output_highlight = gr.HighlightedText(
            label="Tokens et IDs",
            show_label=False,
            # NOTE(review): on HighlightedText, interactive=True makes the
            # spans editable rather than merely selectable -- confirm intent.
            interactive=True,
            # Adjacent tokens sharing the same ID must stay separate spans;
            # merging them would misreport how the text was tokenized.
            combine_consecutive=False,
            show_legend=True,
            # NOTE(review): labels are numeric ID strings, so the "ID" key
            # never matches any of them -- confirm whether it is still wanted.
            color_map={"ID": "lightblue"}
        )
    # Re-tokenize on every user edit of the textbox.
    input_textbox.input(
        fn=visualize_tokenization,
        inputs=[input_textbox],
        outputs=[output_highlight]
    )
    # Examples to guide the user. Filling the textbox programmatically does not
    # fire the `input` event above, so the examples are wired to the callback
    # directly and run when clicked.
    gr.Examples(
        examples=[
            "L'intelligence artificielle est fascinante.",
            "Test avec    des espaces multiples et des ’apostrophes’ typographiques.",
            "Le code `if (x==10)` et les emojis 👍🚀 sont gérés.",
            "Hello world! This is a test of the AG-BPE tokenizer.",
            "안녕하세요"
        ],
        inputs=input_textbox,
        outputs=output_highlight,
        fn=visualize_tokenization,
        cache_examples=False,
        run_on_click=True,
    )
# 5. Launch the application (entry point for Hugging Face Spaces).
if __name__ == "__main__":
    demo.launch()