Upload 4 files
Files changed:
- README.md (+7 -6)
- app.py (+110 -7)
- odia_bpe_tokenizer.json (+0 -0)
- odia_tokenizer.py (+185 -0)
README.md CHANGED

```diff
@@ -1,13 +1,14 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Odia Tokenizer 5k
+emoji: ⚡
+colorFrom: green
+colorTo: red
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.10.0
 app_file: app.py
 pinned: false
-
+license: mit
+short_description: Tokenizer specific to odia language with 5000 tokens
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py CHANGED

@@ -1,7 +1,110 @@

The original 7-line placeholder app (beginning with `import gradio as gr`) is removed and replaced with:

```python
import gradio as gr
import json
from odia_tokenizer import OdiaBPETokenizer
import random
import colorsys


def generate_distinct_colors(n):
    """Generate n visually distinct colors"""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.7
        value = 0.9
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors


def load_tokenizer():
    try:
        return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
    except:
        # If no saved tokenizer found, create a new one
        return OdiaBPETokenizer(vocab_size=5000)


def tokenize_text(text):
    tokenizer = load_tokenizer()

    # Get token IDs and their corresponding text
    token_ids = tokenizer.encode(text)
    tokens = []
    current_pos = 0

    # Process text to get token spans
    words = [list(text)]
    for pair, merged in tokenizer.merges.items():
        words = tokenizer._merge_vocab(words, pair)

    # Extract final tokens
    final_tokens = []
    for word in words:
        final_tokens.extend(word)

    # Generate colors for tokens
    colors = generate_distinct_colors(len(tokenizer.vocab))
    color_map = {
        token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
    }

    # Create highlighted HTML
    html_parts = []
    token_list = []

    for i, token in enumerate(final_tokens):
        token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
        color = color_map[token_id]
        html_parts.append(f'<span style="background-color: {color}">{token}</span>')
        token_list.append(f"{token} ({token_id})")

    highlighted_text = "".join(html_parts)

    # Calculate compression ratio
    compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0

    return (
        len(token_ids),       # Token count
        compression_ratio,    # Compression ratio
        highlighted_text,     # Highlighted text
        "\n".join(token_list),  # Token list
    )


custom_css = """
.token-highlight {
    border-radius: 3px;
    margin: 0 1px;
}
.container {
    max-width: 1200px;
    margin: 0 auto;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Odia BPE Tokenizer")

    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text", placeholder="Enter Odia text here...", lines=10
            )

        with gr.Column(scale=1):
            token_count = gr.Number(label="Token Count")
            compression_ratio = gr.Number(label="Compression Ratio")
            highlighted_output = gr.HTML(label="Tokenized Text")
            token_list = gr.Textbox(label="Token List", lines=10)

    input_text.change(
        fn=tokenize_text,
        inputs=[input_text],
        outputs=[token_count, compression_ratio, highlighted_output, token_list],
    )

demo.launch()
```
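For reference, the highlighting path in `tokenize_text` simply replays the learned merges over the raw character list and then looks each surviving token up in the vocabulary. A minimal sketch of that same flow outside Gradio (it assumes a trained `odia_bpe_tokenizer.json` sits in the working directory; the input string is only illustrative):

```python
# Replays the app's highlighting path without Gradio: apply every learned
# merge to the character list, then map each surviving token to its id
# (falling back to <UNK>). Assumes odia_bpe_tokenizer.json exists locally;
# the sample text is illustrative.
from odia_tokenizer import OdiaBPETokenizer

tokenizer = OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
text = "ଓଡ଼ିଆ ଭାଷା"  # illustrative Odia input

words = [list(text)]
for pair in tokenizer.merges:                  # merges apply in insertion order
    words = tokenizer._merge_vocab(words, pair)

final_tokens = [tok for word in words for tok in word]
for tok in final_tokens:
    print(repr(tok), tokenizer.vocab.get(tok, tokenizer.special_tokens["<UNK>"]))
```

Note that `encode` first splits the input with the Odia word regex and merges within each piece, while the highlighting path above merges the whole text as one sequence, so the token count the app reports can differ slightly from the number of highlighted spans.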
odia_bpe_tokenizer.json ADDED

The diff for this file is too large to render. See raw diff.
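Although the raw diff is not rendered here, the layout of the file follows what `OdiaBPETokenizer.save()` in `odia_tokenizer.py` (below) writes. A sketch of that shape, with made-up tokens and ids:

```python
import json

# Illustrative shape of odia_bpe_tokenizer.json, mirroring OdiaBPETokenizer.save();
# the concrete tokens and ids below are invented, not taken from the real file.
example = {
    "vocab": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3, "କ": 17, "ରେ": 2310},
    "merges": {"ର|େ": "ରେ", "କ|ର": "କର"},  # "first|second" -> merged token
    "vocab_size": 5000,
    "special_tokens": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3},
}
print(json.dumps(example, ensure_ascii=False, indent=2))
```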
odia_tokenizer.py ADDED

@@ -0,0 +1,185 @@

```python
import re
import json
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set

class OdiaBPETokenizer:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = {}
        self.special_tokens = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<BOS>': 2,
            '<EOS>': 3
        }

        # Initialize basic Odia character vocabulary
        self.base_vocab = set()
        # Add basic Odia characters (vowels, consonants, marks)
        self._initialize_base_vocab()

    def _initialize_base_vocab(self):
        """Initialize vocabulary with basic Odia characters"""
        # Vowels
        self.base_vocab.update([chr(c) for c in [0x0B05, 0x0B06, 0x0B07, 0x0B08, 0x0B09, 0x0B0A, 0x0B0B, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B14]])
        # Consonants
        self.base_vocab.update([chr(c) for c in [0x0B15, 0x0B16, 0x0B17, 0x0B18, 0x0B19, 0x0B1A, 0x0B1B, 0x0B1C, 0x0B1D, 0x0B1E, 0x0B1F, 0x0B20, 0x0B21, 0x0B22, 0x0B23, 0x0B24, 0x0B25, 0x0B26, 0x0B27, 0x0B28, 0x0B2A, 0x0B2B, 0x0B2C, 0x0B2D, 0x0B2E, 0x0B2F, 0x0B30, 0x0B32, 0x0B33, 0x0B35, 0x0B36, 0x0B37, 0x0B38, 0x0B39, 0x0B3C]])
        # Vowel marks
        self.base_vocab.update([chr(c) for c in [0x0B3E, 0x0B3F, 0x0B40, 0x0B41, 0x0B42, 0x0B43, 0x0B44, 0x0B47, 0x0B48, 0x0B4B, 0x0B4C, 0x0B4D, 0x0B55, 0x0B56, 0x0B57]])
        # Additional letters and vocalic signs
        self.base_vocab.update([chr(c) for c in [0x0B5C, 0x0B5D, 0x0B5F, 0x0B60, 0x0B61, 0x0B62, 0x0B63, 0x0B71]])
        # Odia digits
        self.base_vocab.update([chr(c) for c in [0x0B66, 0x0B67, 0x0B68, 0x0B69, 0x0B6A, 0x0B6B, 0x0B6C, 0x0B6D, 0x0B6E, 0x0B6F]])
        # Signs
        self.base_vocab.update([chr(c) for c in [0x0B70, 0x0B01, 0x0B02, 0x0B03, 0x0964]])
        # Other marks
        self.base_vocab.update([
            'ଂ', 'ଃ', 'ଁ', '୍',  # Anusvara, Visarga, Candrabindu, Halanta
            ' ', '\n', '\t'  # Whitespace characters
        ])

    def _get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count frequencies of adjacent symbol pairs across all words"""
        pairs = defaultdict(int)
        for word in words:
            for i in range(len(word) - 1):
                pairs[tuple(word[i:i + 2])] += 1
        return pairs

    def _merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Merge all occurrences of the given pair in every word"""
        first, second = pair
        new_words = []

        for word in words:
            i = 0
            new_word = []
            while i < len(word):
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)

        return new_words

    def train(self, texts: List[str], min_freq: int = 2) -> None:
        """Train BPE model on texts"""

        # Regular expression for extracting Odia words
        odia_word_pattern = re.compile(r""" ?[\u0B00-\u0B7F]+| ?[^\s]+|\s+(?!\S)|\s+""")

        # Split texts into characters
        words = []
        for text in texts:
            # Extract words based on the Odia pattern
            extracted_words = odia_word_pattern.findall(text)
            for word in extracted_words:
                chars = list(word)
                # Filter valid Odia characters
                valid_chars = [c for c in chars if c in self.base_vocab or c.isspace()]
                if valid_chars:
                    words.append(valid_chars)

        vocab = self.base_vocab.copy()
        num_merges = self.vocab_size - len(self.special_tokens) - len(vocab)
        print("num_merges : ", num_merges)
        # Perform BPE merges
        for i in range(num_merges):
            pairs = self._get_stats(words)
            if not pairs:
                break

            # Find most frequent pair
            best_pair = max(pairs.items(), key=lambda x: x[1])
            if best_pair[1] < min_freq:
                break

            pair = best_pair[0]
            new_token = ''.join(pair)
            vocab.add(new_token)
            #print("merging ..", pair)
            print(len(vocab))
            # Record the merge operation
            self.merges[pair] = new_token

            # Merge the pair in all words
            words = self._merge_vocab(words, pair)

        # Build final vocabulary
        self.vocab = {**self.special_tokens}
        idx = len(self.special_tokens)
        for token in sorted(vocab):
            self.vocab[token] = idx
            idx += 1

        self.inverse_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text: str) -> List[int]:
        """Encode text using learned BPE merges"""

        odia_word_pattern = re.compile(r""" ?[\u0B00-\u0B7F]+| ?[^\s]+|\s+(?!\S)|\s+""")
        extracted_words = odia_word_pattern.findall(text)

        words = [list(word) for word in extracted_words]
        #words = [list(text)]

        # Apply merges in order
        for pair, merged in self.merges.items():
            words = self._merge_vocab(words, pair)

        # Convert to token IDs
        result = []
        for word in words:
            for token in word:
                if token in self.vocab:
                    result.append(self.vocab[token])
                else:
                    result.append(self.special_tokens['<UNK>'])

        return result

    def decode(self, ids: List[int]) -> str:
        """Decode token IDs back to text"""
        return ''.join(self.inverse_vocab.get(id, '<UNK>') for id in ids)

    def calculate_compression_ratio(self, text: str) -> float:
        """Calculate compression ratio"""
        encoded = self.encode(text)
        return len(text) / len(encoded)

    def save(self, path: str) -> None:
        """Save tokenizer state"""
        # Convert tuple keys to strings for JSON serialization
        serializable_merges = {f"{first}|{second}": merged
                               for (first, second), merged in self.merges.items()}

        data = {
            'vocab': self.vocab,
            'merges': serializable_merges,
            'vocab_size': self.vocab_size,
            'special_tokens': self.special_tokens
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    @classmethod
    def load(cls, path: str) -> 'OdiaBPETokenizer':
        """Load tokenizer from file"""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.vocab = data['vocab']

        # Convert string keys back to tuples
        tokenizer.merges = {tuple(k.split('|')): v
                            for k, v in data['merges'].items()}

        tokenizer.special_tokens = data['special_tokens']
        tokenizer.inverse_vocab = {v: k for k, v in tokenizer.vocab.items()}
        return tokenizer
```
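A minimal end-to-end sketch of how a tokenizer file like the one in this commit could be produced and reused; the two corpus lines are illustrative stand-ins for a real Odia text collection:

```python
# Train a small OdiaBPETokenizer, save it, and round-trip a sentence.
# The corpus below is illustrative; a real run would use a large Odia corpus.
from odia_tokenizer import OdiaBPETokenizer

corpus = [
    "ଓଡ଼ିଆ ଭାଷା ଏକ ଶାସ୍ତ୍ରୀୟ ଭାଷା",
    "ମୁଁ ଓଡ଼ିଆ ଭାଷା ପଢ଼େ",
]

tokenizer = OdiaBPETokenizer(vocab_size=5000)
tokenizer.train(corpus, min_freq=2)        # learns merges over the base Odia characters
tokenizer.save("odia_bpe_tokenizer.json")  # the file name the Space loads

reloaded = OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
ids = reloaded.encode(corpus[0])
print(ids)
print(reloaded.decode(ids))
print(reloaded.calculate_compression_ratio(corpus[0]))
```

In the Space, `load_tokenizer()` in `app.py` reads this same file at request time and falls back to a fresh `OdiaBPETokenizer(vocab_size=5000)` if it cannot be loaded.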