Upload 4 files
Files changed:
- README.md (+7 -6)
- app.py (+110 -7)
- odia_bpe_tokenizer.json (+0 -0)
- odia_tokenizer.py (+185 -0)
README.md CHANGED

```diff
@@ -1,13 +1,14 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Odia Tokenizer 5k
+emoji: ⚡
+colorFrom: green
+colorTo: red
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.10.0
 app_file: app.py
 pinned: false
-
+license: mit
+short_description: Tokenizer specific to odia language with 5000 tokens
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py CHANGED

@@ -1,7 +1,110 @@

The original 7-line placeholder app (beginning with `import gradio as gr`) is removed and replaced with:

```python
import gradio as gr
import json
from odia_tokenizer import OdiaBPETokenizer
import random
import colorsys


def generate_distinct_colors(n):
    """Generate n visually distinct colors"""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.7
        value = 0.9
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors


def load_tokenizer():
    try:
        return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
    except:
        # If no saved tokenizer found, create a new one
        return OdiaBPETokenizer(vocab_size=5000)


def tokenize_text(text):
    tokenizer = load_tokenizer()

    # Get token IDs and their corresponding text
    token_ids = tokenizer.encode(text)
    tokens = []
    current_pos = 0

    # Process text to get token spans
    words = [list(text)]
    for pair, merged in tokenizer.merges.items():
        words = tokenizer._merge_vocab(words, pair)

    # Extract final tokens
    final_tokens = []
    for word in words:
        final_tokens.extend(word)

    # Generate colors for tokens
    colors = generate_distinct_colors(len(tokenizer.vocab))
    color_map = {
        token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
    }

    # Create highlighted HTML
    html_parts = []
    token_list = []

    for i, token in enumerate(final_tokens):
        token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
        color = color_map[token_id]
        html_parts.append(f'<span style="background-color: {color}">{token}</span>')
        token_list.append(f"{token} ({token_id})")

    highlighted_text = "".join(html_parts)

    # Calculate compression ratio
    compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0

    return (
        len(token_ids),       # Token count
        compression_ratio,    # Compression ratio
        highlighted_text,     # Highlighted text
        "\n".join(token_list),  # Token list
    )


custom_css = """
.token-highlight {
    border-radius: 3px;
    margin: 0 1px;
}
.container {
    max-width: 1200px;
    margin: 0 auto;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Odia BPE Tokenizer")

    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text", placeholder="Enter Odia text here...", lines=10
            )

        with gr.Column(scale=1):
            token_count = gr.Number(label="Token Count")
            compression_ratio = gr.Number(label="Compression Ratio")
            highlighted_output = gr.HTML(label="Tokenized Text")
            token_list = gr.Textbox(label="Token List", lines=10)

    input_text.change(
        fn=tokenize_text,
        inputs=[input_text],
        outputs=[token_count, compression_ratio, highlighted_output, token_list],
    )

demo.launch()
```
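For reference, the highlighting path in `tokenize_text` simply replays the learned merges over the raw character list and then looks each surviving token up in the vocabulary. A minimal sketch of that same flow outside Gradio (it assumes a trained `odia_bpe_tokenizer.json` sits in the working directory; the input string is only illustrative):

```python
# Replays the app's highlighting path without Gradio: apply every learned
# merge to the character list, then map each surviving token to its id
# (falling back to <UNK>). Assumes odia_bpe_tokenizer.json exists locally;
# the sample text is illustrative.
from odia_tokenizer import OdiaBPETokenizer

tokenizer = OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
text = "ଓଡ଼ିଆ ଭାଷା"  # illustrative Odia input

words = [list(text)]
for pair in tokenizer.merges:                  # merges apply in insertion order
    words = tokenizer._merge_vocab(words, pair)

final_tokens = [tok for word in words for tok in word]
for tok in final_tokens:
    print(repr(tok), tokenizer.vocab.get(tok, tokenizer.special_tokens["<UNK>"]))
```

Note that `encode` first splits the input with the Odia word regex and merges within each piece, while the highlighting path above merges the whole text as one sequence, so the token count the app reports can differ slightly from the number of highlighted spans.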
odia_bpe_tokenizer.json ADDED

The diff for this file is too large to render. See raw diff.
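Although the raw diff is not rendered here, the layout of the file follows what `OdiaBPETokenizer.save()` in `odia_tokenizer.py` (below) writes. A sketch of that shape, with made-up tokens and ids:

```python
import json

# Illustrative shape of odia_bpe_tokenizer.json, mirroring OdiaBPETokenizer.save();
# the concrete tokens and ids below are invented, not taken from the real file.
example = {
    "vocab": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3, "କ": 17, "ରେ": 2310},
    "merges": {"ର|େ": "ରେ", "କ|ର": "କର"},  # "first|second" -> merged token
    "vocab_size": 5000,
    "special_tokens": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3},
}
print(json.dumps(example, ensure_ascii=False, indent=2))
```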
odia_tokenizer.py ADDED

@@ -0,0 +1,185 @@

```python
import re
import json
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set

class OdiaBPETokenizer:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = {}
        self.special_tokens = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<BOS>': 2,
            '<EOS>': 3
        }

        # Initialize basic Odia character vocabulary
        self.base_vocab = set()
        # Add basic Odia characters (vowels, consonants, marks)
        self._initialize_base_vocab()

    def _initialize_base_vocab(self):
        """Initialize vocabulary with basic Odia characters"""
        # Vowels
        self.base_vocab.update([chr(c) for c in [0x0B05, 0x0B06, 0x0B07, 0x0B08, 0x0B09, 0x0B0A, 0x0B0B, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B14]])
        # Consonants
        self.base_vocab.update([chr(c) for c in [0x0B15, 0x0B16, 0x0B17, 0x0B18, 0x0B19, 0x0B1A, 0x0B1B, 0x0B1C, 0x0B1D, 0x0B1E, 0x0B1F, 0x0B20, 0x0B21, 0x0B22, 0x0B23, 0x0B24, 0x0B25, 0x0B26, 0x0B27, 0x0B28, 0x0B2A, 0x0B2B, 0x0B2C, 0x0B2D, 0x0B2E, 0x0B2F, 0x0B30, 0x0B32, 0x0B33, 0x0B35, 0x0B36, 0x0B37, 0x0B38, 0x0B39, 0x0B3C]])
        # Vowel marks
        self.base_vocab.update([chr(c) for c in [0x0B3E, 0x0B3F, 0x0B40, 0x0B41, 0x0B42, 0x0B43, 0x0B44, 0x0B47, 0x0B48, 0x0B4B, 0x0B4C, 0x0B4D, 0x0B55, 0x0B56, 0x0B57]])
        # Additional letters and vocalic signs
        self.base_vocab.update([chr(c) for c in [0x0B5C, 0x0B5D, 0x0B5F, 0x0B60, 0x0B61, 0x0B62, 0x0B63, 0x0B71]])
        # Odia digits
        self.base_vocab.update([chr(c) for c in [0x0B66, 0x0B67, 0x0B68, 0x0B69, 0x0B6A, 0x0B6B, 0x0B6C, 0x0B6D, 0x0B6E, 0x0B6F]])
        # Signs
        self.base_vocab.update([chr(c) for c in [0x0B70, 0x0B01, 0x0B02, 0x0B03, 0x0964]])
        # Other marks
        self.base_vocab.update([
            'ଂ', 'ଃ', 'ଁ', '୍',  # Anusvara, Visarga, Candrabindu, Halanta
            ' ', '\n', '\t'  # Whitespace characters
        ])

    def _get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count frequencies of adjacent symbol pairs across all words"""
        pairs = defaultdict(int)
        for word in words:
            for i in range(len(word) - 1):
                pairs[tuple(word[i:i + 2])] += 1
        return pairs

    def _merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Merge all occurrences of the given pair in every word"""
        first, second = pair
        new_words = []

        for word in words:
            i = 0
            new_word = []
            while i < len(word):
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)

        return new_words

    def train(self, texts: List[str], min_freq: int = 2) -> None:
        """Train BPE model on texts"""

        # Regular expression for extracting Odia words
        odia_word_pattern = re.compile(r""" ?[\u0B00-\u0B7F]+| ?[^\s]+|\s+(?!\S)|\s+""")

        # Split texts into characters
        words = []
        for text in texts:
            # Extract words based on the Odia pattern
            extracted_words = odia_word_pattern.findall(text)
            for word in extracted_words:
                chars = list(word)
                # Filter valid Odia characters
                valid_chars = [c for c in chars if c in self.base_vocab or c.isspace()]
                if valid_chars:
                    words.append(valid_chars)

        vocab = self.base_vocab.copy()
        num_merges = self.vocab_size - len(self.special_tokens) - len(vocab)
        print("num_merges : ", num_merges)
        # Perform BPE merges
        for i in range(num_merges):
            pairs = self._get_stats(words)
            if not pairs:
                break

            # Find most frequent pair
            best_pair = max(pairs.items(), key=lambda x: x[1])
            if best_pair[1] < min_freq:
                break

            pair = best_pair[0]
            new_token = ''.join(pair)
            vocab.add(new_token)
            #print("merging ..", pair)
            print(len(vocab))
            # Record the merge operation
            self.merges[pair] = new_token

            # Merge the pair in all words
            words = self._merge_vocab(words, pair)

        # Build final vocabulary
        self.vocab = {**self.special_tokens}
        idx = len(self.special_tokens)
        for token in sorted(vocab):
            self.vocab[token] = idx
            idx += 1

        self.inverse_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text: str) -> List[int]:
        """Encode text using learned BPE merges"""

        odia_word_pattern = re.compile(r""" ?[\u0B00-\u0B7F]+| ?[^\s]+|\s+(?!\S)|\s+""")
        extracted_words = odia_word_pattern.findall(text)

        words = [list(word) for word in extracted_words]
        #words = [list(text)]

        # Apply merges in order
        for pair, merged in self.merges.items():
            words = self._merge_vocab(words, pair)

        # Convert to token IDs
        result = []
        for word in words:
            for token in word:
                if token in self.vocab:
                    result.append(self.vocab[token])
                else:
                    result.append(self.special_tokens['<UNK>'])

        return result

    def decode(self, ids: List[int]) -> str:
        """Decode token IDs back to text"""
        return ''.join(self.inverse_vocab.get(id, '<UNK>') for id in ids)

    def calculate_compression_ratio(self, text: str) -> float:
        """Calculate compression ratio"""
        encoded = self.encode(text)
        return len(text) / len(encoded)

    def save(self, path: str) -> None:
        """Save tokenizer state"""
        # Convert tuple keys to strings for JSON serialization
        serializable_merges = {f"{first}|{second}": merged
                               for (first, second), merged in self.merges.items()}

        data = {
            'vocab': self.vocab,
            'merges': serializable_merges,
            'vocab_size': self.vocab_size,
            'special_tokens': self.special_tokens
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    @classmethod
    def load(cls, path: str) -> 'OdiaBPETokenizer':
        """Load tokenizer from file"""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.vocab = data['vocab']

        # Convert string keys back to tuples
        tokenizer.merges = {tuple(k.split('|')): v
                            for k, v in data['merges'].items()}

        tokenizer.special_tokens = data['special_tokens']
        tokenizer.inverse_vocab = {v: k for k, v in tokenizer.vocab.items()}
        return tokenizer
```
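A minimal end-to-end sketch of how a tokenizer file like the one in this commit could be produced and reused; the two corpus lines are illustrative stand-ins for a real Odia text collection:

```python
# Train a small OdiaBPETokenizer, save it, and round-trip a sentence.
# The corpus below is illustrative; a real run would use a large Odia corpus.
from odia_tokenizer import OdiaBPETokenizer

corpus = [
    "ଓଡ଼ିଆ ଭାଷା ଏକ ଶାସ୍ତ୍ରୀୟ ଭାଷା",
    "ମୁଁ ଓଡ଼ିଆ ଭାଷା ପଢ଼େ",
]

tokenizer = OdiaBPETokenizer(vocab_size=5000)
tokenizer.train(corpus, min_freq=2)        # learns merges over the base Odia characters
tokenizer.save("odia_bpe_tokenizer.json")  # the file name the Space loads

reloaded = OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
ids = reloaded.encode(corpus[0])
print(ids)
print(reloaded.decode(ids))
print(reloaded.calculate_compression_ratio(corpus[0]))
```

In the Space, `load_tokenizer()` in `app.py` reads this same file at request time and falls back to a fresh `OdiaBPETokenizer(vocab_size=5000)` if it cannot be loaded.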