Update app.py
app.py CHANGED
@@ -1,632 +1,248 @@
--- app.py (previous version)

# -*- coding: utf-8 -*-
"""
AG-BPE
...
"""

import json
import regex as re
from pathlib import Path
from typing import List, Dict, Tuple
import unicodedata
import gradio as gr
import html
import math
import time
import sys
from collections import Counter, defaultdict
from functools import lru_cache
import numpy as np
from dataclasses import dataclass, asdict
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Enhanced Metrics Dataclass ---
@dataclass
class TokenizerMetrics:
    """Comprehensive metrics for tokenizer evaluation"""
    tokenizer_name: str = "AG-BPE"
    vocab_size: int = 0
    vocab_size_kb: float = 0.0
    compression: float = 0.0
    effectiveness_per_kb: float = 0.0
    avg_len: float = 0.0
    oov_rate: float = 0.0
    enc_speed_ms: float = 0.0
    dec_speed_ms: float = 0.0
    throughput_chars_s: float = 0.0
    entropy: float = 0.0
    fertility: float = 0.0
    robustness_score: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert metrics to dictionary with rounded values"""
        return {
            "Tokenizer": self.tokenizer_name,
            "Vocab Size": f"{self.vocab_size:,}",
            "Vocab Size (KB)": f"{self.vocab_size_kb:.2f}",
            "Compression": f"{self.compression:.3f}",
            "Effectiveness/KB": f"{self.effectiveness_per_kb:.3f}",
            "Avg Len": f"{self.avg_len:.2f}",
            "OOV Rate (%)": f"{self.oov_rate:.2f}",
            "Enc Speed (ms)": f"{self.enc_speed_ms:.3f}",
            "Dec Speed (ms)": f"{self.dec_speed_ms:.3f}",
            "Throughput (chars/s)": f"{self.throughput_chars_s:,.0f}",
            "Entropy": f"{self.entropy:.3f}",
            "Fertility": f"{self.fertility:.3f}",
            "Robustness Score": f"{self.robustness_score:.2f}"
        }

# ---
class TextCleaner:
    """
    ...
    """
    UNWANTED_CHARS = {
        '\ufffd', '\u200b', '\u200c', '\u200d', '\u2060', '\u2061', '\u2063',
        '\u00a0', '\u202f', '\u2007', '\u2028', '\u2029', '\ufeff', '\ue000',
        '\uf8ff', '\ue001', '\xad', '\u180e', '\u200e', '\uFE0F',
    }

    _cache = {}
    _max_cache_size = 1000

    @classmethod
    @lru_cache(maxsize=512)
    def clean_text(cls, text: str) -> str:
        """Cleans ..."""
        try:
            ...
            cleaned = cleaned.replace('“', '"').replace('”', '"')

            # Remove unwanted chars
            for char in cls.UNWANTED_CHARS:
                cleaned = cleaned.replace(char, '')

            # Filter control chars
            cleaned = ''.join(c for c in cleaned if ord(c) >= 32 or c in '\n\r\t')

            # Collapse whitespace
            cleaned = re.sub(r'\s+', ' ', cleaned)
            cleaned = cleaned.strip()

            # Update cache
            if len(cls._cache) < cls._max_cache_size:
                cls._cache[text] = cleaned

            return cleaned
        except Exception as e:
            logger.warning(f"Text cleaning failed: {e}")
            return text

# --- Enhanced Tokenizer ---
class AGBPETokenizer:
    """
    ...
    """
    def __init__(self, vocab: Dict[str, int], merges: Dict[str, int], special_tokens: Dict[str, int]):
        """
        ...
        """
        try:
            ...
            if self. ...
            ...
            self.text_cleaner = TextCleaner()

            # Performance caching
            self._encode_cache = {}
            self._max_cache_entries = 500

            # Calculate vocab size in KB
            self.vocab_size_kb = sys.getsizeof(json.dumps(self.vocab)) / 1024

            # Stats tracking
            self.total_tokens_encoded = 0
            self.total_oov_tokens = 0

        except Exception as e:
            logger.error(f"Tokenizer initialization failed: {e}")
            raise

    @classmethod
    def from_file(cls, filepath: str) -> 'AGBPETokenizer':
        """
        ...
        """
        try:
            ...
            missing_keys = [k for k in required_keys if k not in data]
            if missing_keys:
                raise ValueError(f"Missing required keys: {missing_keys}")

            logger.info(f"Successfully loaded tokenizer from {filepath}")
            return cls(data['vocab'], data['merges'], data['special_tokens'])

        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in tokenizer file: {e}")
            raise ValueError(f"Failed to parse JSON: {e}")
        except Exception as e:
            logger.error(f"Failed to load tokenizer: {e}")
            raise

    @lru_cache(maxsize=256)
    def _find_best_vocab_match(self, text_chunk: str) -> List[int]:
        """
        ...
        """
        ids = []
        i = 0
        oov_count = 0

        while i < len(text_chunk):
            found_match = False
            for j in range(min(len(text_chunk), i + 50), i, -1):  # Cap max token length
                substring = text_chunk[i:j]
                if substring in self.vocab:
                    ids.append(self.vocab[substring])
                    i = j
                    found_match = True
                    break

            if not found_match:
                ids.append(self.unk_token_id)
                oov_count += 1
                i += 1

        # Track OOV stats
        if oov_count > 0:
            self.total_oov_tokens += oov_count

        return ids

    def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
        """
        ...
        """
        try:
            ...
            if add_special_tokens and (bos_id := self.special_tokens_map.get('<bos>')) is not None:
                token_ids.append(bos_id)

            # Tokenize chunks
            for chunk in self.pat.findall(cleaned_text):
                chunk_ids = self._find_best_vocab_match(chunk)
                token_ids.extend(chunk_ids)
                self.total_tokens_encoded += len(chunk_ids)

            # Add EOS token
            if add_special_tokens and (eos_id := self.special_tokens_map.get('<eos>')) is not None:
                token_ids.append(eos_id)

            # Update cache
            if len(self._encode_cache) < self._max_cache_entries:
                self._encode_cache[cache_key] = token_ids

            ...
        except Exception as e:
            logger.error(f"Encoding failed: {e}")
            return [self.unk_token_id]

    def decode(self, token_ids: List[int]) -> str:
        """
        ...
        """
        try:
            ...
            for token_id in token_ids:
                if token_id not in special_ids:
                    token = self.id_to_token.get(token_id, f"<UNK_{token_id}>")
                    tokens.append(token)

            return "".join(tokens)

        except Exception as e:
            logger.error(f"Decoding failed: {e}")
            return ""

    def calculate_metrics(self, text: str) -> TokenizerMetrics:
        """Calculate comprehensive tokenizer metrics"""
        try:
            metrics = TokenizerMetrics(tokenizer_name="AG-BPE v4")

            # Basic vocab metrics
            metrics.vocab_size = len(self.vocab)
            metrics.vocab_size_kb = self.vocab_size_kb

            if not text:
                return metrics

            # Timing metrics
            start_time = time.perf_counter()
            encoded = self.encode(text, add_special_tokens=False)
            enc_time = (time.perf_counter() - start_time) * 1000

            start_time = time.perf_counter()
            decoded = self.decode(encoded)
            dec_time = (time.perf_counter() - start_time) * 1000

            metrics.enc_speed_ms = enc_time
            metrics.dec_speed_ms = dec_time

            # Throughput
            if enc_time > 0:
                metrics.throughput_chars_s = (len(text) / enc_time) * 1000

            # Token statistics
            tokens = [self.id_to_token.get(i, "") for i in encoded]
            if tokens:
                token_lengths = [len(t) for t in tokens]
                metrics.avg_len = np.mean(token_lengths)

            # Compression ratio
            original_bytes = len(text.encode('utf-8'))
            token_bytes = len(encoded) * 2  # Assuming 2 bytes per token ID
            metrics.compression = original_bytes / max(token_bytes, 1)

            # Effectiveness per KB
            metrics.effectiveness_per_kb = metrics.compression / max(metrics.vocab_size_kb, 0.001)

            # OOV Rate
            oov_count = sum(1 for tid in encoded if tid == self.unk_token_id)
            metrics.oov_rate = (oov_count / len(encoded)) * 100 if encoded else 0

            # Entropy (token distribution)
            token_counts = Counter(encoded)
            total = sum(token_counts.values())
            probs = [count/total for count in token_counts.values()]
            metrics.entropy = -sum(p * math.log2(p) for p in probs if p > 0)

            # Fertility (avg tokens per word)
            words = text.split()
            metrics.fertility = len(encoded) / max(len(words), 1)

            # Robustness Score (composite metric)
            metrics.robustness_score = min(100, (
                (100 - metrics.oov_rate) * 0.4 +  # Low OOV is good
                min(metrics.compression * 10, 40) +  # Good compression
                min(metrics.effectiveness_per_kb * 10, 20)  # Efficiency
            ))

            return metrics

        except Exception as e:
            logger.error(f"Metrics calculation failed: {e}")
            return TokenizerMetrics()

# ---
TOKENIZER_FILE = "ag_bpe_tokenizer_v4.json"

# Initialize tokenizer
try:
    if not Path(TOKENIZER_FILE).exists():
        ...
        demo_vocab = {
            ...
            "g": 20, "w": 21, "y": 22, "b": 23, "v": 24, "k": 25, "x": 26,
            "j": 27, "q": 28, "z": 29, "th": 30, "he": 31, "in": 32,
            "er": 33, "an": 34, " the": 35, "ing": 36, "ed": 37, "and": 38,
            "to": 39, "of": 40, "is": 41, "it": 42, "for": 43, "as": 44,
            "with": 45, "was": 46, "that": 47, "be": 48, "on": 49,
            "Hello": 50, " world": 51, "AI": 52, "test": 53, "code": 54
        }

        demo_data = {
            "vocab": demo_vocab,
            "merges": {"t h": 30, "h e": 31, "i n": 32},
            "special_tokens": {"<unk>": 0, "<bos>": 1, "<eos>": 2}
        }

        with open(TOKENIZER_FILE, 'w', encoding='utf-8') as f:
            json.dump(...)

    tokenizer = AGBPETokenizer.from_file(TOKENIZER_FILE)
    ...
except Exception as e:
    logger.error(f"❌ Failed to initialize tokenizer: {e}")

def ...
    ...
        return (
            "<div style='color: red; font-weight: bold;'>⚠️ Tokenizer not loaded!</div>",
            "<div style='color: red;'>No metrics available</div>",
            {}
        )

    if not text:
        return (
            "<div style='color: #888; padding: 20px;'>✍️ Enter text to see tokenization...</div>",
            "<div style='color: #888;'>Waiting for input...</div>",
            {}
        )

    try:
        # Get tokens
        encoded_ids = tokenizer.encode(text, add_special_tokens=False)
        tokens = [tokenizer.id_to_token.get(i, f"<UNK_{i}>") for i in encoded_ids]

        # Calculate metrics
        metrics = tokenizer.calculate_metrics(text)
        metrics_dict = metrics.to_dict()

        # Generate visualization HTML
        html_tokens = generate_token_html(tokens, encoded_ids)

        # Generate stats HTML
        html_stats = generate_stats_html(metrics_dict)

        return html_tokens, html_stats, metrics_dict

    except Exception as e:
        logger.error(f"Processing failed: {e}")
        return (
            f"<div style='color: red;'>❌ Error: {str(e)}</div>",
            "<div style='color: red;'>Error calculating metrics</div>",
            {}
        )

def generate_token_html(tokens: List[str], token_ids: List[int]) -> str:
    """Generate beautiful token visualization"""
    gradients = [
        "linear-gradient(135deg, #667eea 0%, #764ba2 100%)",
        "linear-gradient(135deg, #f093fb 0%, #f5576c 100%)",
        "linear-gradient(135deg, #4facfe 0%, #00f2fe 100%)",
        "linear-gradient(135deg, #43e97b 0%, #38f9d7 100%)",
        "linear-gradient(135deg, #fa709a 0%, #fee140 100%)",
        "linear-gradient(135deg, #30cfd0 0%, #330867 100%)",
        "linear-gradient(135deg, #a8edea 0%, #fed6e3 100%)"
    ]

    html = """
    <div style='
        display: flex;
        flex-wrap: wrap;
        gap: 8px;
        padding: 20px;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        border-radius: 12px;
        box-shadow: 0 10px 30px rgba(0,0,0,0.1);
    '>
    """

    for i, (token, tid) in enumerate(zip(tokens, token_ids)):
        gradient = gradients[i % len(gradients)]
        safe_token = html.escape(token)

        # Special styling for UNK tokens
        is_unk = tid == tokenizer.unk_token_id if tokenizer else False
        border_style = "3px solid #ff4444" if is_unk else "1px solid rgba(255,255,255,0.3)"

        html += f"""
        <div style='
            display: flex;
            flex-direction: column;
            align-items: center;
            padding: 12px 16px;
            background: {gradient};
            border-radius: 10px;
            border: {border_style};
            box-shadow: 0 4px 15px rgba(0,0,0,0.1);
            transition: all 0.3s ease;
            cursor: pointer;
        ' onmouseover='this.style.transform="scale(1.05)"' onmouseout='this.style.transform="scale(1)"'>
            <span style='
                color: white;
                font-size: 16px;
                font-weight: 600;
                text-shadow: 0 1px 3px rgba(0,0,0,0.2);
                white-space: pre-wrap;
                max-width: 150px;
                overflow: hidden;
                text-overflow: ellipsis;
            '>{safe_token}</span>
            <span style='
                color: white;
                font-size: 12px;
                font-weight: bold;
                margin-top: 6px;
                background: rgba(0,0,0,0.2);
                padding: 2px 8px;
                border-radius: 12px;
            '>#{tid}</span>
        </div>
        """

    html += "</div>"
    return html

def generate_stats_html(metrics):
    """Generate beautiful metrics display"""
    html = """
    <div style='
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
        gap: 15px;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 12px;
        box-shadow: 0 10px 30px rgba(0,0,0,0.2);
    '>
    """

    icons = {
        "Vocab Size": "📚", "Compression": "🗜️", "Avg Len": "📏",
        "OOV Rate (%)": "❓", "Enc Speed (ms)": "⚡", "Dec Speed (ms)": "⏱️",
        "Throughput (chars/s)": "🚀", "Entropy": "🌡️", "Fertility": "🌱",
        "Robustness Score": "💪", "Effectiveness/KB": "📊", "Vocab Size (KB)": "💾"
    }

    for key, value in metrics.items():
        if key == "Tokenizer":
            continue
        icon = icons.get(key, "📈")

        html += f"""
        <div style='
            background: rgba(255,255,255,0.95);
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            transition: all 0.3s ease;
        ' onmouseover='this.style.transform="translateY(-2px)"' onmouseout='this.style.transform="translateY(0)"'>
            <div style='
                display: flex;
                align-items: center;
                margin-bottom: 8px;
            '>
                <span style='font-size: 24px; margin-right: 8px;'>{icon}</span>
                <span style='
                    color: #4a5568;
                    font-size: 12px;
                    font-weight: 600;
                    text-transform: uppercase;
                '>{key}</span>
            </div>
            <div style='
                color: #1a202c;
                font-size: 20px;
                font-weight: bold;
            '>{value}</div>
        </div>
        """

    html += "</div>"
    return html

# Create Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="blue",
        font=gr.themes.GoogleFont("Inter")
    ),
    css="""
    .gradio-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    }
    footer {display: none !important;}
    .gr-button {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border: none;
        color: white;
    }
    .gr-button:hover {
        transform: scale(1.05);
        box-shadow: 0 5px 20px rgba(0,0,0,0.3);
    }
    """
) as demo:
    gr.Markdown(
        """
        # ...
        ...
        <div style='
            background: rgba(255,255,255,0.9);
            padding: 15px;
            border-radius: 8px;
            margin: 10px 0;
        '>
        <b>✨ Features:</b> Longest-match tokenization • Real-time metrics • Performance analysis • Beautiful visualization
        </div>
        """
    )

    with gr. ...
        input_text = gr.Textbox(
            ...
            max_lines=10,
            autofocus=True
        )

        gr.Examples(
            examples=[
                "The quick brown fox jumps over the lazy dog.",
                "Artificial Intelligence is revolutionizing technology! 🚀",
                "def fibonacci(n):\n return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
                "Les mathématiques sont le langage de l'univers. 🌌",
                "東京は日本の首都です。人口は約1400万人です。",
                "Blockchain technology enables decentralized systems.",
            ],
            inputs=input_text,
            label="💡 Quick Examples"
        )

        with gr.Row():
            output_viz = gr.HTML(
                label="🎨 Token Visualization",
                value="<div style='padding: 40px; text-align: center; color: #888;'>Waiting for input...</div>"
            )

        with gr.Row():
            output_stats = gr.HTML(
                label="📊 Metrics Dashboard",
                value="<div style='padding: 40px; text-align: center; color: #888;'>Metrics will appear here...</div>"
            )

        ...
            outputs=[output_viz, output_stats],
            queue=True
        )

        gr. ...
        ...
        )

if __name__ == "__main__":
    demo. ...
    demo.launch(share=False, debug=True)
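One removed block worth spelling out is the composite Robustness Score inside calculate_metrics. A worked example with made-up inputs (2% OOV rate, compression 3.2, effectiveness/KB 1.5), stepping through the same formula:

# Editorial sketch, hypothetical inputs run through the removed formula, term by term.
oov_rate, compression, eff_per_kb = 2.0, 3.2, 1.5
score = min(100, (100 - oov_rate) * 0.4        # 39.2
                 + min(compression * 10, 40)   # 32.0
                 + min(eff_per_kb * 10, 20))   # 15.0
print(round(score, 1))  # 86.2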
+++ app.py (new version)

# -*- coding: utf-8 -*-
"""
AG-BPE Standalone Usage Script & Web Visualizer
================================================

This script demonstrates how to load and use a pre-trained AG-BPE tokenizer
and provides a real-time web interface using Gradio to visualize its behavior.

This version has been modified to use a "longest-match" strategy directly on the
vocabulary, ignoring the BPE merge rules.
"""
import json
import regex as re
from pathlib import Path
from typing import List, Dict, Tuple
import unicodedata
import gradio as gr
import html
import math

# --- TextCleaner Class (Unchanged) ---
class TextCleaner:
    """A text cleaner for AI datasets, designed to remove invisible, abnormal, and disruptive characters."""
    UNWANTED_CHARS = {
        '\ufffd', '\u200b', '\u200c', '\u200d', '\u2060', '\u2061', '\u2063',
        '\u00a0', '\u202f', '\u2007', '\u2028', '\u2029', '\ufeff', '\ue000',
        '\uf8ff', '\ue001', '\xad', '\u180e', '\u200e', '\uFE0F',
    }

    @classmethod
    def clean_text(cls, text: str) -> str:
        """Cleans a given string by normalizing it, removing unwanted characters, and collapsing whitespace."""
        text = unicodedata.normalize("NFKC", text)
        text = text.replace('’', "'").replace('‘', "'")
        text = text.replace('“', '"').replace('”', '"')
        for char in cls.UNWANTED_CHARS:
            text = text.replace(char, '')
        text = ''.join(c for c in text if ord(c) >= 32 or c in '\n\r\t')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
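A minimal sketch (editorial, not part of the committed file) of what clean_text does to a deliberately messy string; the result follows from the NFKC normalization, quote straightening, unwanted-character removal, and whitespace collapsing above:

# The sample string is invented for illustration.
sample = "Smart \u201cquotes\u201d,\u200b  a zero-width space,   and   extra spaces."
print(TextCleaner.clean_text(sample))
# -> Smart "quotes", a zero-width space, and extra spaces.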

# --- Standalone Tokenizer Class (Logic Changed) ---
class AGBPETokenizer:
    """
    A self-contained tokenizer that loads a pre-trained model from a JSON file.
    MODIFIED: This version uses a greedy longest-match algorithm on the vocabulary,
    ignoring any BPE merge rules.
    """
    def __init__(self, vocab: Dict[str, int], merges: Dict[str, int], special_tokens: Dict[str, int]):
        """Initializes the tokenizer from loaded vocabulary and merge data."""
        self.vocab = vocab
        # self.merges is no longer used, but kept for file loading compatibility
        self.special_tokens_map = special_tokens
        self.id_to_token: Dict[int, str] = {i: s for s, i in self.vocab.items()}

        self.pat = re.compile(r'\s*\S+')

        self.unk_token_id = self.vocab.get('<unk>')
        if self.unk_token_id is None:
            # Fallback for vocabularies without <unk>
            if self.vocab:
                self.unk_token_id = next(iter(self.vocab.values()))
                print(f"Warning: '<unk>' token not found. Using first token as fallback (ID: {self.unk_token_id}).")
            else:
                raise ValueError("The vocabulary is empty and '<unk>' token is missing.")

        self.text_cleaner = TextCleaner()

    @classmethod
    def from_file(cls, filepath: str) -> 'AGBPETokenizer':
        """Class method to conveniently load a tokenizer from a JSON file path."""
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Tokenizer file not found: '{filepath}'")
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        required_keys = ['vocab', 'merges', 'special_tokens']
        if not all(key in data for key in required_keys):
            raise ValueError("The JSON file is malformed. Missing one of: vocab, merges, special_tokens.")
        return cls(data['vocab'], data['merges'], data['special_tokens'])

    def _find_best_vocab_match(self, text_chunk: str) -> List[int]:
        """
        Tokenizes a chunk of text by greedily finding the longest possible
        substring that exists in the vocabulary.
        """
        ids = []
        i = 0
        while i < len(text_chunk):
            found_match = False
            # Search for the longest possible match from current position
            for j in range(len(text_chunk), i, -1):
                substring = text_chunk[i:j]
                if substring in self.vocab:
                    ids.append(self.vocab[substring])
                    i = j  # Move pointer to the end of the match
                    found_match = True
                    break  # Exit the inner loop to continue from the new position
            if not found_match:
                # If no match was found (not even a single character),
                # use the unknown token and advance by one character.
                ids.append(self.unk_token_id)
                i += 1
        return ids

    def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
        """Encodes a string of text into a list of token IDs."""
        cleaned_text = self.text_cleaner.clean_text(text)
        token_ids = []

        if add_special_tokens and (bos_id := self.special_tokens_map.get('<bos>')) is not None:
            token_ids.append(bos_id)

        # Pre-tokenize the text into chunks (words and their preceding spaces)
        for chunk in self.pat.findall(cleaned_text):
            # Apply the new longest-match algorithm on each chunk
            chunk_ids = self._find_best_vocab_match(chunk)
            token_ids.extend(chunk_ids)

        if add_special_tokens and (eos_id := self.special_tokens_map.get('<eos>')) is not None:
            token_ids.append(eos_id)

        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a list of token IDs back into a string of text."""
        special_ids_to_skip = set(self.special_tokens_map.values())
        tokens = [self.id_to_token.get(token_id, '') for token_id in token_ids if token_id not in special_ids_to_skip]
        return "".join(tokens)
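The greedy loop in _find_best_vocab_match always takes the longest vocabulary entry starting at the current position, which keeps the code simple but does not guarantee the shortest token sequence. A small illustrative sketch with a toy vocabulary (invented for this note, not shipped with the model):

# Editorial sketch, not part of the commit: toy ids and strings are made up.
toy = AGBPETokenizer(
    vocab={"<unk>": 0, "un": 1, "related": 2, "unrelat": 3, "e": 4, "d": 5},
    merges={},
    special_tokens={"<unk>": 0},
)
print(toy._find_best_vocab_match("unrelated"))
# -> [3, 4, 5]: the scan takes the longest matching prefix at each step ("unrelat",
#    then "e", then "d"), even though the split ["un", "related"] would need only two tokens.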

# --- Gradio Web Application (Unchanged) ---

TOKENIZER_FILE = "ag_bpe_tokenizer_v4.json"
TOKENIZER_LOADED = False
ERROR_MESSAGE = ""
tokenizer = None

try:
    if not Path(TOKENIZER_FILE).exists():
        print(f"⚠️ Warning: Tokenizer file '{TOKENIZER_FILE}' not found.")
        print("Creating a dummy tokenizer file for local testing.")
        dummy_data = {
            "vocab": {"<unk>": 0, "<bos>": 1, "<eos>": 2, " comm": 3, "ent": 4, "?": 5, "Hello": 8, " world": 9, '"comm"': 10, " comment": 11},
            "merges": {" c o m m": 1, "e n t": 2, " comment": 3},
            "special_tokens": {"<unk>": 0, "<bos>": 1, "<eos>": 2}
        }
        with open(TOKENIZER_FILE, 'w', encoding='utf-8') as f:
            json.dump(dummy_data, f, indent=2)
        print("Dummy file created. The app will use this file.")

    print(f"🧠 Loading tokenizer from '{TOKENIZER_FILE}'...")
    tokenizer = AGBPETokenizer.from_file(TOKENIZER_FILE)
    TOKENIZER_LOADED = True
    print(f"✅ Tokenizer loaded successfully. Vocabulary size: {len(tokenizer.vocab)}")

except (FileNotFoundError, ValueError, KeyError) as e:
    ERROR_MESSAGE = str(e)
    print(f"❌ ERROR loading tokenizer: {ERROR_MESSAGE}")
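Once loading has succeeded, the encode/decode round trip can be exercised without the web UI. A minimal sketch (editorial), assuming the same ag_bpe_tokenizer_v4.json, or the dummy file created above, is on disk:

# Illustrative usage only; the exact ids depend on the vocabulary shipped in the JSON file.
tok = AGBPETokenizer.from_file(TOKENIZER_FILE)
ids = tok.encode("Hello world", add_special_tokens=True)
print(ids)              # token ids, wrapped in <bos>/<eos> when those are defined
print(tok.decode(ids))  # special ids are skipped, so this should print "Hello world" back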

def visualize_tokenization(text: str) -> Tuple[str, float, float, float]:
    """
    Takes input text, tokenizes it, calculates stats, and returns
    a styled HTML string and the statistics for display.
    """
    if not TOKENIZER_LOADED or not tokenizer:
        error_html = f"<p style='color: red; font-weight: bold;'>TOKENIZER LOADING ERROR: {ERROR_MESSAGE}</p>"
        return error_html, 0.0, 0.0, 0.0

    if not text:
        return "<p style='color: #888;'>Please enter some text to see the visualization...</p>", 0.0, 0.0, 0.0

    encoded_ids = tokenizer.encode(text, add_special_tokens=False)
    tokens = [tokenizer.id_to_token.get(i, f"<unk:{i}>") for i in encoded_ids]

    # --- Calculate Statistics ---
    avg_len, std_dev, ratio = 0.0, 0.0, 0.0
    if tokens:
        token_lengths = [len(t) for t in tokens]
        avg_len = sum(token_lengths) / len(token_lengths)
        if len(token_lengths) > 1:
            variance = sum([(x - avg_len) ** 2 for x in token_lengths]) / (len(token_lengths) - 1)
            std_dev = math.sqrt(variance)
    if text:
        ratio = len(tokens) / len(text)

    # --- Generate HTML ---
    colors = ["#dbeafe", "#dcfce7", "#fee2e2", "#fef3c7", "#f3e8ff", "#d1fae5", "#e0f2fe"]
    html_output = "<div style='display: flex; flex-wrap: wrap; align-items: flex-start; font-family: sans-serif;'>"

    for i, token_id in enumerate(encoded_ids):
        safe_token_string = html.escape(tokens[i])
        color = colors[i % len(colors)]
        html_output += f"""
        <div style="display: inline-flex; flex-direction: column; align-items: center; margin: 4px; padding: 8px 10px; border-radius: 8px; background-color: {color}; border: 1px solid rgba(0,0,0,0.1); box-shadow: 0 1px 3px rgba(0,0,0,0.05); text-align: center;">
            <span style="font-size: 1.1em; font-weight: 500; color: #111827; white-space: pre-wrap;">{safe_token_string}</span>
            <span style="font-size: 0.9em; font-weight: 700; color: #1e3a8a; margin-top: 5px; background-color: rgba(255,255,255,0.6); padding: 2px 6px; border-radius: 5px;">{token_id}</span>
        </div>"""
    html_output += "</div>"

    return html_output, round(avg_len, 2), round(std_dev, 2), round(ratio, 3)
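A worked example (editorial) of the three statistics returned above, for a hypothetical split of "Hello world" into ["Hello", " world"]: the sample standard deviation uses the n-1 denominator, exactly as the variance line does, and the ratio is tokens per character:

# Hypothetical two-token split, stepping through the same formulas as visualize_tokenization.
lengths = [len(t) for t in ["Hello", " world"]]                   # [5, 6]
avg = sum(lengths) / len(lengths)                                 # 5.5
var = sum((x - avg) ** 2 for x in lengths) / (len(lengths) - 1)   # 0.5
print(round(avg, 2), round(math.sqrt(var), 2), round(len(lengths) / len("Hello world"), 3))
# -> 5.5 0.71 0.182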

with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky"), css="footer {display: none !important}") as demo:
    gr.Markdown(
        """
        # 👁️ Real-time Tokenizer Visualizer
        Enter text in the field below to see the tokenization happen live.
        Each colored card is a "token", with its corresponding numerical ID shown below it.
        """
    )

    with gr.Column():
        input_textbox = gr.Textbox(
            label="Enter your text here",
            placeholder="Type something...",
            lines=5,
            show_label=False,
        )

    with gr.Row():
        avg_len_box = gr.Textbox(label="Avg. Token Len", interactive=False)
        std_dev_box = gr.Textbox(label="Std. Dev Len", interactive=False)
        ratio_box = gr.Textbox(label="Tokens/Chars Ratio", interactive=False)

    output_html = gr.HTML(label="Tokens and IDs")

    input_textbox.input(
        fn=visualize_tokenization,
        inputs=[input_textbox],
        outputs=[output_html, avg_len_box, std_dev_box, ratio_box]
    )

    gr.Examples(
        examples=[
            "Artificial intelligence is fascinating.",
            'Test with "quotes" and spaces.',
            "Code like `if (x==10)` and emojis 👍🚀 are handled.",
            "Hello world! This is a test of the AG-BPE tokenizer.",
            "안녕하세요",
            "Salut comment ça va ?"
        ],
        inputs=input_textbox
    )

if __name__ == "__main__":
    demo.launch()
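The removed dashboard also reported the Shannon entropy of the encoded ids (see calculate_metrics in the previous version above). If that number is wanted alongside the new length statistics, a standalone helper along those lines could look like this sketch (illustrative only, not part of the commit):

from collections import Counter
from typing import List
import math

def token_entropy(ids: List[int]) -> float:
    """Shannon entropy (bits) of the token-id distribution, same formula as the old calculate_metrics."""
    counts = Counter(ids)
    total = sum(counts.values())
    return -sum((c / total) * math.log2(c / total) for c in counts.values()) if total else 0.0

# Example: four ids with one repeat -> probabilities {0.5, 0.25, 0.25} -> 1.5 bits.
print(token_entropy([2, 1, 2, 3]))  # 1.5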