Upload app.py
app.py
CHANGED
@@ -1,117 +1,110 @@
 import gradio as gr
 import json
 from odia_tokenizer import OdiaBPETokenizer
 import random
 import colorsys
 
 
 def generate_distinct_colors(n):
     """Generate n visually distinct colors"""
     colors = []
     for i in range(n):
         hue = i / n
         saturation = 0.7
         value = 0.9
         rgb = colorsys.hsv_to_rgb(hue, saturation, value)
         hex_color = "#{:02x}{:02x}{:02x}".format(
             int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
         )
         colors.append(hex_color)
     return colors
 
 
 def load_tokenizer():
     try:
         return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
     except:
         # If no saved tokenizer found, create a new one
         return OdiaBPETokenizer(vocab_size=5000)
 
 
 def tokenize_text(text):
     tokenizer = load_tokenizer()
 
     # Get token IDs and their corresponding text
     token_ids = tokenizer.encode(text)
     tokens = []
     current_pos = 0
 
     # Process text to get token spans
     words = [list(text)]
     for pair, merged in tokenizer.merges.items():
         words = tokenizer._merge_vocab(words, pair)
 
     # Extract final tokens
     final_tokens = []
     for word in words:
         final_tokens.extend(word)
 
     # Generate colors for tokens
     colors = generate_distinct_colors(len(tokenizer.vocab))
     color_map = {
         token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
     }
 
     # Create highlighted HTML
     html_parts = []
     token_list = []
 
     for i, token in enumerate(final_tokens):
         token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
         color = color_map[token_id]
         html_parts.append(f'<span style="background-color: {color}">{token}</span>')
         token_list.append(f"{token} ({token_id})")
 
     highlighted_text = "".join(html_parts)
 
     # Calculate compression ratio
     compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0
 
     return (
         len(token_ids),  # Token count
         compression_ratio,  # Compression ratio
         highlighted_text,  # Highlighted text
         "\n".join(token_list),  # Token list
     )
 
 
 custom_css = """
 .token-highlight {
     border-radius: 3px;
     margin: 0 1px;
 }
 .container {
     max-width: 1200px;
     margin: 0 auto;
 }
 """
 
 with gr.Blocks(css=custom_css) as demo:
-    gr.Markdown("#
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            input_text = gr.Textbox(
-                label="Input Text", placeholder="
-            )
-
-        with gr.Column(scale=1):
-            token_count = gr.Number(label="
-            compression_ratio = gr.Number(label="Compression Ratio")
-            highlighted_output = gr.HTML(label="Tokenized Text")
-            token_list = gr.Textbox(label="
-
-
-
-    [
-    [
-
-
-
-        fn=tokenize_text,
-        inputs=[input_text],
-        outputs=[token_count, compression_ratio, highlighted_output, token_list],
-        title="BPE Tokenizer for Odia Language",
-        description="TOkenize any sentence in Odia and also show the compression ratio.",
-        examples=examples
-    ).launch(share=True)
+    gr.Markdown("# BPE Tokenizer for Odia Language")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_text = gr.Textbox(
+                label="Input Text", placeholder="ତୁମେ ମଧ୍ୟାହ୍ନ ଭୋଜନ କରିସାରିଲ କି?", lines=10
+            )
+
+        with gr.Column(scale=1):
+            token_count = gr.Number(label="No of Tokens")
+            compression_ratio = gr.Number(label="Compression Ratio")
+            highlighted_output = gr.HTML(label="Tokenized Text Output")
+            token_list = gr.Textbox(label="List of Tokens", lines=10)
+
+    input_text.change(
+        fn=tokenize_text,
+        inputs=[input_text],
+        outputs=[token_count, compression_ratio, highlighted_output, token_list],
+    )
+
+demo.launch()
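
The substantive change in this commit is the UI tail: the old file mixed gr.Interface-style keyword arguments (title, description, examples) into a gr.Blocks script, while the new file wires the textbox to the callback with an explicit .change() event listener and launches the Blocks app directly. A minimal standalone sketch of that event-wiring pattern, with a toy shout callback standing in for tokenize_text (not from the repo):

import gradio as gr

def shout(text):
    # Toy callback standing in for tokenize_text
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    # .change() re-runs the callback on every edit, which is how the
    # new app.py drives live tokenization as the user types
    inp.change(fn=shout, inputs=[inp], outputs=[out])

demo.launch()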
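
app.py imports OdiaBPETokenizer from odia_tokenizer, which is not included in this commit. For orientation, here is a hypothetical sketch of the interface the app relies on (load, encode, merges, _merge_vocab, vocab, special_tokens), under the assumption that merges maps (left, right) pairs to merged strings in creation order and vocab maps token strings to ids; the serialization details in load are guesses, not the repo's actual format:

import json

class OdiaBPETokenizer:
    """Hypothetical stand-in for the class app.py imports; the real
    odia_tokenizer.py is not part of this commit."""

    def __init__(self, vocab_size=5000):
        self.vocab_size = vocab_size
        self.special_tokens = {"<UNK>": 0}
        self.vocab = {"<UNK>": 0}   # token string -> token id
        self.merges = {}            # (left, right) -> merged string, in creation order

    @classmethod
    def load(cls, path):
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(vocab_size=data.get("vocab_size", 5000))
        tok.vocab = data["vocab"]
        # Assumption: JSON cannot store tuple keys, so merges are taken to be
        # serialized as "left right" strings and rebuilt into tuples here
        tok.merges = {tuple(k.split(" ")): v for k, v in data["merges"].items()}
        return tok

    def _merge_vocab(self, words, pair):
        # Replace every adjacent (left, right) occurrence with the merged token
        left, right = pair
        merged = self.merges[pair]
        out_words = []
        for word in words:
            out, i = [], 0
            while i < len(word):
                if i + 1 < len(word) and word[i] == left and word[i + 1] == right:
                    out.append(merged)
                    i += 2
                else:
                    out.append(word[i])
                    i += 1
            out_words.append(out)
        return out_words

    def encode(self, text):
        # Start from characters and replay merges in insertion order
        words = [list(text)]
        for pair in self.merges:
            words = self._merge_vocab(words, pair)
        unk = self.special_tokens["<UNK>"]
        return [self.vocab.get(t, unk) for w in words for t in w]

Note how encode here and the span-recovery loop in tokenize_text replay the same merges in the same order; that is what keeps the highlighted token spans aligned with the encoded ids.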