sudhakar272 committed on
Commit
97426a9
·
verified ·
1 Parent(s): 9161c6c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -117
app.py CHANGED
@@ -1,117 +1,110 @@
1
- import gradio as gr
2
- import json
3
- from odia_tokenizer import OdiaBPETokenizer
4
- import random
5
- import colorsys
6
-
7
-
8
- def generate_distinct_colors(n):
9
- """Generate n visually distinct colors"""
10
- colors = []
11
- for i in range(n):
12
- hue = i / n
13
- saturation = 0.7
14
- value = 0.9
15
- rgb = colorsys.hsv_to_rgb(hue, saturation, value)
16
- hex_color = "#{:02x}{:02x}{:02x}".format(
17
- int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
18
- )
19
- colors.append(hex_color)
20
- return colors
21
-
22
-
23
- def load_tokenizer():
24
- try:
25
- return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
26
- except:
27
- # If no saved tokenizer found, create a new one
28
- return OdiaBPETokenizer(vocab_size=5000)
29
-
30
-
31
- def tokenize_text(text):
32
- tokenizer = load_tokenizer()
33
-
34
- # Get token IDs and their corresponding text
35
- token_ids = tokenizer.encode(text)
36
- tokens = []
37
- current_pos = 0
38
-
39
- # Process text to get token spans
40
- words = [list(text)]
41
- for pair, merged in tokenizer.merges.items():
42
- words = tokenizer._merge_vocab(words, pair)
43
-
44
- # Extract final tokens
45
- final_tokens = []
46
- for word in words:
47
- final_tokens.extend(word)
48
-
49
- # Generate colors for tokens
50
- colors = generate_distinct_colors(len(tokenizer.vocab))
51
- color_map = {
52
- token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
53
- }
54
-
55
- # Create highlighted HTML
56
- html_parts = []
57
- token_list = []
58
-
59
- for i, token in enumerate(final_tokens):
60
- token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
61
- color = color_map[token_id]
62
- html_parts.append(f'<span style="background-color: {color}">{token}</span>')
63
- token_list.append(f"{token} ({token_id})")
64
-
65
- highlighted_text = "".join(html_parts)
66
-
67
- # Calculate compression ratio
68
- compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0
69
-
70
- return (
71
- len(token_ids), # Token count
72
- compression_ratio, # Compression ratio
73
- highlighted_text, # Highlighted text
74
- "\n".join(token_list), # Token list
75
- )
76
-
77
-
78
- custom_css = """
79
- .token-highlight {
80
- border-radius: 3px;
81
- margin: 0 1px;
82
- }
83
- .container {
84
- max-width: 1200px;
85
- margin: 0 auto;
86
- }
87
- """
88
-
89
- with gr.Blocks(css=custom_css) as demo:
90
- gr.Markdown("# Odia BPE Tokenizer")
91
-
92
- with gr.Row():
93
- with gr.Column(scale=1):
94
- input_text = gr.Textbox(
95
- label="Input Text", placeholder="Enter Odia text here...", lines=10
96
- )
97
-
98
- with gr.Column(scale=1):
99
- token_count = gr.Number(label="Token Count")
100
- compression_ratio = gr.Number(label="Compression Ratio")
101
- highlighted_output = gr.HTML(label="Tokenized Text")
102
- token_list = gr.Textbox(label="Token List", lines=10)
103
- examples = [
104
- ["କଣ ହେଲା?"], # Example 1
105
- ["ତୁମେ ମଧ୍ୟାହ୍ନ ଭୋଜନ କରିସାରିଲ କି?"], # Example 2
106
- ["ଘରକୁ ଆସ"], # Example 3
107
- ["ଦୟାକରି ବସିଯାଅ ।"], # Example 4
108
- ["ତୁମେ କେମିତି ଅଛ?"], # Example 5
109
- ]
110
- interface = gr.Interface(
111
- fn=tokenize_text,
112
- inputs=[input_text],
113
- outputs=[token_count, compression_ratio, highlighted_output, token_list],
114
- title="BPE Tokenizer for Odia Language",
115
- description="TOkenize any sentence in Odia and also show the compression ratio.",
116
- examples=examples
117
- ).launch(share=True)
 
1
+ import gradio as gr
2
+ import json
3
+ from odia_tokenizer import OdiaBPETokenizer
4
+ import random
5
+ import colorsys
6
+
7
+
8
def generate_distinct_colors(n):
    """Generate n visually distinct hex color strings.

    Hues are spaced evenly across [0, 1) while saturation and value stay
    fixed, so neighbouring indices map to clearly different colors.
    """

    def hue_to_hex(hue):
        # Fixed saturation/value keep every color readable as a background.
        r, g, b = colorsys.hsv_to_rgb(hue, 0.7, 0.9)
        return "#{:02x}{:02x}{:02x}".format(
            int(r * 255), int(g * 255), int(b * 255)
        )

    return [hue_to_hex(idx / n) for idx in range(n)]
21
+
22
+
23
def load_tokenizer():
    """Load the trained BPE tokenizer from disk, or build a fresh one.

    Returns:
        OdiaBPETokenizer: the saved tokenizer if "odia_bpe_tokenizer.json"
        can be loaded, otherwise a new (untrained) tokenizer with a
        5000-entry vocabulary budget.
    """
    try:
        return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
    except Exception:
        # FIX: was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit. `Exception` keeps the best-effort
        # fallback (no saved/unreadable tokenizer file) without hiding those.
        return OdiaBPETokenizer(vocab_size=5000)
29
+
30
+
31
def tokenize_text(text):
    """Tokenize Odia text and build the visualization outputs for the UI.

    Args:
        text: raw input string typed by the user.

    Returns:
        Tuple of:
            - token count (int),
            - compression ratio, i.e. characters per token (float; 0 for
              empty input),
            - HTML string with one colored <span> per token,
            - newline-joined "token (id)" listing.
    """
    import html  # stdlib; local import leaves the module import block untouched

    tokenizer = load_tokenizer()

    # Token IDs for the whole input (drives count / compression ratio).
    token_ids = tokenizer.encode(text)

    # Re-apply the learned merges to the raw character sequence so we can
    # recover the surface text of each token, in order.
    words = [list(text)]
    for pair, merged in tokenizer.merges.items():
        words = tokenizer._merge_vocab(words, pair)

    # Flatten the merged words back into a linear token sequence.
    final_tokens = []
    for word in words:
        final_tokens.extend(word)

    # One distinct color per vocab entry, keyed by token id.
    colors = generate_distinct_colors(len(tokenizer.vocab))
    color_map = {
        token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
    }

    # Build the highlighted HTML and the textual token list in one pass.
    # (Removed unused `tokens`/`current_pos` locals from the original.)
    html_parts = []
    token_list = []
    for token in final_tokens:
        token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
        color = color_map[token_id]
        # FIX: escape the token text — it is user input rendered into raw
        # HTML (gr.HTML), so unescaped '<'/'&' would inject markup.
        safe_token = html.escape(token)
        html_parts.append(
            f'<span style="background-color: {color}">{safe_token}</span>'
        )
        token_list.append(f"{token} ({token_id})")

    highlighted_text = "".join(html_parts)

    # Characters per token; guard against empty input (no tokens).
    compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0

    return (
        len(token_ids),          # Token count
        compression_ratio,       # Compression ratio
        highlighted_text,        # Highlighted text
        "\n".join(token_list),   # Token list
    )
76
+
77
+
78
+ custom_css = """
79
+ .token-highlight {
80
+ border-radius: 3px;
81
+ margin: 0 1px;
82
+ }
83
+ .container {
84
+ max-width: 1200px;
85
+ margin: 0 auto;
86
+ }
87
+ """
88
+
89
# Gradio UI: input textbox on the left; token count, compression ratio,
# highlighted tokens, and token list on the right. Re-tokenizes live on edit.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# BPE Tokenizer for Odia Language")

    with gr.Row():
        with gr.Column(scale=1):
            # FIX: the placeholder had broken quoting
            # (`placeholder=""ତୁମେ ... କି?"`), which is a syntax error —
            # use a single well-formed string literal.
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="ତୁମେ ମଧ୍ୟାହ୍ନ ଭୋଜନ କରିସାରିଲ କି?",
                lines=10,
            )

        with gr.Column(scale=1):
            token_count = gr.Number(label="No of Tokens")
            compression_ratio = gr.Number(label="Compression Ratio")
            highlighted_output = gr.HTML(label="Tokenized Text Output")
            token_list = gr.Textbox(label="List of Tokens", lines=10)

    # Re-tokenize on every edit of the input box.
    input_text.change(
        fn=tokenize_text,
        inputs=[input_text],
        outputs=[token_count, compression_ratio, highlighted_output, token_list],
    )

demo.launch()