sudhakar272 committed
Commit e2eb42e · verified · 1 Parent(s): cbe8a4e

Upload 4 files

Files changed (4):
  1. README.md (+7 -6)
  2. app.py (+110 -7)
  3. odia_bpe_tokenizer.json (+0 -0)
  4. odia_tokenizer.py (+185 -0)
README.md CHANGED
@@ -1,13 +1,14 @@
 ---
-title: Odia5KTokenizer
-emoji: 🌍
-colorFrom: blue
-colorTo: gray
+title: Odia Tokenizer 5k
+emoji:
+colorFrom: green
+colorTo: red
 sdk: gradio
-sdk_version: 5.12.0
+sdk_version: 5.10.0
 app_file: app.py
 pinned: false
-short_description: Tokenizer for Odia language
+license: mit
+short_description: Tokenizer specific to odia language with 5000 tokens
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,7 +1,110 @@
-import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import gradio as gr
+import json
+from odia_tokenizer import OdiaBPETokenizer
+import random
+import colorsys
+
+
+def generate_distinct_colors(n):
+    """Generate n visually distinct colors"""
+    colors = []
+    for i in range(n):
+        hue = i / n
+        saturation = 0.7
+        value = 0.9
+        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
+        hex_color = "#{:02x}{:02x}{:02x}".format(
+            int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
+        )
+        colors.append(hex_color)
+    return colors
+
+
+def load_tokenizer():
+    try:
+        return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
+    except:
+        # If no saved tokenizer found, create a new one
+        return OdiaBPETokenizer(vocab_size=5000)
+
+
+def tokenize_text(text):
+    tokenizer = load_tokenizer()
+
+    # Get token IDs and their corresponding text
+    token_ids = tokenizer.encode(text)
+    tokens = []
+    current_pos = 0
+
+    # Process text to get token spans
+    words = [list(text)]
+    for pair, merged in tokenizer.merges.items():
+        words = tokenizer._merge_vocab(words, pair)
+
+    # Extract final tokens
+    final_tokens = []
+    for word in words:
+        final_tokens.extend(word)
+
+    # Generate colors for tokens
+    colors = generate_distinct_colors(len(tokenizer.vocab))
+    color_map = {
+        token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
+    }
+
+    # Create highlighted HTML
+    html_parts = []
+    token_list = []
+
+    for i, token in enumerate(final_tokens):
+        token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
+        color = color_map[token_id]
+        html_parts.append(f'<span style="background-color: {color}">{token}</span>')
+        token_list.append(f"{token} ({token_id})")
+
+    highlighted_text = "".join(html_parts)
+
+    # Calculate compression ratio
+    compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0
+
+    return (
+        len(token_ids),  # Token count
+        compression_ratio,  # Compression ratio
+        highlighted_text,  # Highlighted text
+        "\n".join(token_list),  # Token list
+    )
+
+
+custom_css = """
+.token-highlight {
+    border-radius: 3px;
+    margin: 0 1px;
+}
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+}
+"""
+
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("# Odia BPE Tokenizer")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_text = gr.Textbox(
+                label="Input Text", placeholder="Enter Odia text here...", lines=10
+            )
+
+        with gr.Column(scale=1):
+            token_count = gr.Number(label="Token Count")
+            compression_ratio = gr.Number(label="Compression Ratio")
+            highlighted_output = gr.HTML(label="Tokenized Text")
+            token_list = gr.Textbox(label="Token List", lines=10)
+
+    input_text.change(
+        fn=tokenize_text,
+        inputs=[input_text],
+        outputs=[token_count, compression_ratio, highlighted_output, token_list],
+    )
+
+demo.launch()
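
The Space wires input_text.change to tokenize_text, which loads odia_bpe_tokenizer.json and encodes whatever is typed. The same load-and-encode path can be exercised without Gradio; a minimal sketch, assuming the JSON file from this commit is in the working directory (the sample string is purely illustrative):

# Smoke test for the tokenizer the Space loads (run next to odia_bpe_tokenizer.json).
from odia_tokenizer import OdiaBPETokenizer

tokenizer = OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
sample = "ଓଡ଼ିଆ ଭାଷା"  # illustrative Odia text
ids = tokenizer.encode(sample)

print("Token IDs:", ids)
print("Decoded:", tokenizer.decode(ids))
print("Compression ratio:", round(tokenizer.calculate_compression_ratio(sample), 2))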
odia_bpe_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
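
Although the diff is not rendered, the file's layout follows OdiaBPETokenizer.save() in odia_tokenizer.py below: a single JSON object with vocab, merges, vocab_size, and special_tokens keys. An abridged sketch of that shape, with placeholder values rather than the committed contents:

# Shape of odia_bpe_tokenizer.json as written by OdiaBPETokenizer.save();
# the entries below are illustrative placeholders, not the actual file contents.
{
    "vocab": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3},  # token -> id, ~5000 entries in the real file
    "merges": {"କ|ର": "କର"},  # "first|second" -> merged token, stored in merge order
    "vocab_size": 5000,
    "special_tokens": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3}
}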
 
odia_tokenizer.py ADDED
@@ -0,0 +1,185 @@
+import re
+import json
+from collections import defaultdict, Counter
+from typing import List, Dict, Tuple, Set
+
+class OdiaBPETokenizer:
+    def __init__(self, vocab_size: int = 5000):
+        self.vocab_size = vocab_size
+        self.vocab = {}
+        self.merges = {}
+        self.special_tokens = {
+            '<PAD>': 0,
+            '<UNK>': 1,
+            '<BOS>': 2,
+            '<EOS>': 3
+        }
+
+        # Initialize basic Odia character vocabulary
+        self.base_vocab = set()
+        # Add basic Odia characters (vowels, consonants, marks)
+        self._initialize_base_vocab()
+
+    def _initialize_base_vocab(self):
+        """Initialize vocabulary with basic Odia characters"""
+        # Vowels
+        self.base_vocab.update([chr(c) for c in [0x0B05, 0x0B06, 0x0B07, 0x0B08, 0x0B09, 0x0B0A, 0x0B0B, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B14]])
+        # Consonants
+        self.base_vocab.update([chr(c) for c in [0x0B15, 0x0B16, 0x0B17, 0x0B18, 0x0B19, 0x0B1A, 0x0B1B, 0x0B1C, 0x0B1D, 0x0B1E, 0x0B1F, 0x0B20, 0x0B21, 0x0B22, 0x0B23, 0x0B24, 0x0B25, 0x0B26, 0x0B27, 0x0B28, 0x0B2A, 0x0B2B, 0x0B2C, 0x0B2D, 0x0B2E, 0x0B2F, 0x0B30, 0x0B32, 0x0B33, 0x0B35, 0x0B36, 0x0B37, 0x0B38, 0x0B39, 0x0B3C]])
+        # Vowel marks
+        self.base_vocab.update([chr(c) for c in [0x0B3E, 0x0B3F, 0x0B40, 0x0B41, 0x0B42, 0x0B43, 0x0B44, 0x0B47, 0x0B48, 0x0B4B, 0x0B4C, 0x0B4D, 0x0B55, 0x0B56, 0x0B57]])
+        # Other etc chars
+        self.base_vocab.update([chr(c) for c in [0x0B5C, 0x0B5D, 0x0B5F, 0x0B60, 0x0B61, 0x0B62, 0x0B63, 0x0B71]])
+        # numbers
+        self.base_vocab.update([chr(c) for c in [0x0B66, 0x0B67, 0x0B68, 0x0B69, 0x0B6A, 0x0B6B, 0x0B6C, 0x0B6D, 0x0B6E, 0x0B6F]])
+        # Signs
+        self.base_vocab.update([chr(c) for c in [0x0B70, 0x0B01, 0x0B02, 0x0B03, 0x0964]])
+        # Other marks
+        self.base_vocab.update([
+            'ଂ', 'ଃ', 'ଁ', '୍', # Anusvara, Visarga, Candrabindu, Halanta
+            ' ', '\n', '\t' # Whitespace characters
+        ])
+
+    def _get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
+        """Count frequency of adjacent pairs in the vocabulary"""
+        pairs = defaultdict(int)
+        for word in words:
+            for i in range(len(word) - 1):
+                pairs[tuple(word[i:i + 2])] += 1
+        return pairs
+
+    def _merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
+        """Merge all occurrences of the most frequent pair"""
+        first, second = pair
+        new_words = []
+
+        for word in words:
+            i = 0
+            new_word = []
+            while i < len(word):
+                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_words.append(new_word)
+
+        return new_words
+
+    def train(self, texts: List[str], min_freq: int = 2) -> None:
+        """Train BPE model on texts"""
+
+        # Regular expression for extracting Odia words
+        odia_word_pattern = re.compile(r""" ?[\u0B00-\u0B7F]+| ?[^\s]+|\s+(?!\S)|\s+""")
+
+        # Split texts into characters
+        words = []
+        for text in texts:
+            # Extract words based on the Odia pattern
+            extracted_words = odia_word_pattern.findall(text)
+            for word in extracted_words:
+                chars = list(word)
+                # Filter valid Odia characters
+                valid_chars = [c for c in chars if c in self.base_vocab or c.isspace()]
+                if valid_chars:
+                    words.append(valid_chars)
+
+        vocab = self.base_vocab.copy()
+        num_merges = self.vocab_size - len(self.special_tokens) - len(vocab)
+        print("num_merges : ", num_merges)
+        # Perform BPE merges
+        for i in range(num_merges):
+            pairs = self._get_stats(words)
+            if not pairs:
+                break
+
+            # Find most frequent pair
+            best_pair = max(pairs.items(), key=lambda x: x[1])
+            if best_pair[1] < min_freq:
+                break
+
+            pair = best_pair[0]
+            new_token = ''.join(pair)
+            vocab.add(new_token)
+            #print("merging ..", pair)
+            print(len(vocab))
+            # Record the merge operation
+            self.merges[pair] = new_token
+
+            # Merge the pair in all words
+            words = self._merge_vocab(words, pair)
+
+        # Build final vocabulary
+        self.vocab = {**self.special_tokens}
+        idx = len(self.special_tokens)
+        for token in sorted(vocab):
+            self.vocab[token] = idx
+            idx += 1
+
+        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
+
+    def encode(self, text: str) -> List[int]:
+        """Encode text using learned BPE merges"""
+
+        odia_word_pattern = re.compile(r""" ?[\u0B00-\u0B7F]+| ?[^\s]+|\s+(?!\S)|\s+""")
+        extracted_words = odia_word_pattern.findall(text)
+
+        words = [list(word) for word in extracted_words]
+        #words = [list(text)]
+
+        # Apply merges in order
+        for pair, merged in self.merges.items():
+            words = self._merge_vocab(words, pair)
+
+        # Convert to token IDs
+        result = []
+        for word in words:
+            for token in word:
+                if token in self.vocab:
+                    result.append(self.vocab[token])
+                else:
+                    result.append(self.special_tokens['<UNK>'])
+
+        return result
+
+    def decode(self, ids: List[int]) -> str:
+        """Decode token IDs back to text"""
+        return ''.join(self.inverse_vocab.get(id, '<UNK>') for id in ids)
+
+    def calculate_compression_ratio(self, text: str) -> float:
+        """Calculate compression ratio"""
+        encoded = self.encode(text)
+        return len(text) / len(encoded)
+
+    def save(self, path: str) -> None:
+        """Save tokenizer state"""
+        # Convert tuple keys to strings for JSON serialization
+        serializable_merges = {f"{first}|{second}": merged
+                               for (first, second), merged in self.merges.items()}
+
+        data = {
+            'vocab': self.vocab,
+            'merges': serializable_merges,
+            'vocab_size': self.vocab_size,
+            'special_tokens': self.special_tokens
+        }
+        with open(path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+    @classmethod
+    def load(cls, path: str) -> 'OdiaBPETokenizer':
+        """Load tokenizer from file"""
+        with open(path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        tokenizer = cls(vocab_size=data['vocab_size'])
+        tokenizer.vocab = data['vocab']
+
+        # Convert string keys back to tuples
+        tokenizer.merges = {tuple(k.split('|')): v
+                            for k, v in data['merges'].items()}
+
+        tokenizer.special_tokens = data['special_tokens']
+        tokenizer.inverse_vocab = {v: k for k, v in tokenizer.vocab.items()}
+        return tokenizer
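
For completeness, a minimal sketch of how this class would be used to produce the odia_bpe_tokenizer.json checked in above, assuming a plain-text Odia corpus at a hypothetical path odia_corpus.txt:

# Training sketch (hypothetical corpus path; one document per line assumed).
from odia_tokenizer import OdiaBPETokenizer

with open("odia_corpus.txt", encoding="utf-8") as f:
    texts = f.read().splitlines()

tokenizer = OdiaBPETokenizer(vocab_size=5000)
tokenizer.train(texts, min_freq=2)          # learns BPE merges over the Odia base vocabulary
tokenizer.save("odia_bpe_tokenizer.json")   # filename app.py's load_tokenizer expects

ids = tokenizer.encode(texts[0])
print(len(ids), "tokens; compression:", round(tokenizer.calculate_compression_ratio(texts[0]), 2))
print(tokenizer.decode(ids))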