Manu101 committed on
Commit
3229104
·
verified ·
1 Parent(s): 0787d32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -77
app.py CHANGED
@@ -1,77 +1,77 @@
1
- import pathlib
2
- import random
3
-
4
- import gradio as gr
5
- from src import HindiTokenizer, BasicTokenizer
6
-
7
- Basic = BasicTokenizer()
8
- Basic._build_vocab()
9
-
10
- Hindi = HindiTokenizer()
11
- Hindi.load(
12
- model_file_path=pathlib.Path(
13
- "saved_vocabs/batch_1_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model"))
14
-
15
-
16
- def tokenize_and_color(text, tokenizer_choice="HindiTokenizer"):
17
- if tokenizer_choice == "BasicTokenizer":
18
- tokenizer = Basic
19
- else:
20
- tokenizer = Hindi
21
-
22
- tokens = tokenizer.encode(text)
23
-
24
- # colors = [
25
- # "#FF5733", "#33FF57", "#3357FF", "#F333FF",
26
- # "#33FFF3", "#F3FF33", "#FF3380", "#3380FF",
27
- # "#83FF33", "#FF8333"
28
- # ]
29
- colors = [
30
- "#FF5733", "#33FF57", "#3357FF", "#F333FF",
31
- "#33FFF3", "#F3FF33", "#FF3380", "#3380FF",
32
- "#83FF33", "#FF8333", "#7FDBFF", "#0074D9",
33
- "#39CCCC", "#3D9970", "#2ECC40", "#01FF70",
34
- "#FFDC00", "#FF851B", "#FF4136", "#85144b",
35
- "#F012BE", "#B10DC9", "#AAAAAA", "#DDDDDD"
36
- ]
37
-
38
- colored_text = '<div style="word-wrap: break-word; white-space: pre-wrap;">'
39
- token_color_mapping = {}
40
- last_color = ""
41
- for index, token in enumerate(tokens):
42
- token_id = token
43
- if token_id in token_color_mapping:
44
- color = token_color_mapping[token_id]
45
- else:
46
- color = random.choice([c for c in colors if c != last_color])
47
- last_color = color
48
- token_color_mapping[token_id] = color
49
- colored_text += f'<span id="{token_id}" style="color: {color}; margin-right: 20px;">{token}</span>'
50
- colored_text += '</div>'
51
-
52
- return colored_text
53
-
54
-
55
- examples = [
56
- ["आप कैसे हैं??"],
57
- ["यह एक परीक्षण है।"],
58
- ["लोरेम इप्सम एक छद्म-लैटिन पाठ है जिसका उपयोग मुद्रण और टाइपसेटिंग उद्योगों में किया जाता है।"]
59
- ]
60
-
61
- iface = gr.Interface(fn=tokenize_and_color,
62
- title="Hindi Text Tokenizer",
63
- description="Enter text to see the tokenized output with each token colored differently.",
64
- inputs=[
65
- gr.Textbox(lines=2, label="Input Text"),
66
- # gr.Radio(choices=["BasicTokenizer", "HindiTokenizer"], label="Tokenizer Choice",
67
- # value="HindiTokenizer")
68
- ],
69
- outputs=[
70
- gr.HTML(label="Tokenized and Colored Text")
71
- ],
72
- examples=examples,
73
- # theme=gr.themes.Soft()
74
- theme=gr.themes.Base()
75
- )
76
- if __name__ == "__main__":
77
- iface.launch()
 
1
+ import pathlib
2
+ import random
3
+
4
+ import gradio as gr
5
+ from src import HindiTokenizer, BasicTokenizer
6
+
7
# Tokenizer chosen when the caller asks for "BasicTokenizer".
# `_build_vocab()` is invoked right after construction (presumably to
# populate its vocabulary before first use -- confirm in `src`).
Basic = BasicTokenizer()
Basic._build_vocab()

# Location of the saved model file handed to `HindiTokenizer.load`.
_HINDI_MODEL_FILE = pathlib.Path(
    "saved_vocabs/batch_1_Hindi_Tokenizer-test-all_batches-100_000_batchsize-initial_vocab_size_5000.model"
)

# Default tokenizer, loaded from the saved model file above.
Hindi = HindiTokenizer()
Hindi.load(model_file_path=_HINDI_MODEL_FILE)
14
+
15
+
16
# Palette used to color tokens; built once at import time rather than on
# every call.
_TOKEN_COLORS = (
    "#FF5733", "#33FF57", "#3357FF", "#F333FF",
    "#33FFF3", "#F3FF33", "#FF3380", "#3380FF",
    "#83FF33", "#FF8333", "#7FDBFF", "#0074D9",
    "#39CCCC", "#3D9970", "#2ECC40", "#01FF70",
    "#FFDC00", "#FF851B", "#FF4136", "#85144b",
    "#F012BE", "#B10DC9", "#AAAAAA", "#DDDDDD",
)


def tokenize_and_color(text, tokenizer_choice="HindiTokenizer"):
    """Encode *text* and return an HTML fragment with one colored span per token.

    Args:
        text: The string to tokenize. ``None`` is treated as an empty string.
        tokenizer_choice: ``"BasicTokenizer"`` selects the module-level
            ``Basic`` tokenizer; any other value selects ``Hindi``.

    Returns:
        An HTML string: a wrapping ``<div>`` holding one ``<span>`` per item
        returned by ``tokenizer.encode``. Within a single call, equal tokens
        always share a color. Colors are drawn at random, so the assignment
        differs from call to call.
    """
    import html  # local import so the block does not depend on a file-level change

    tokenizer = Basic if tokenizer_choice == "BasicTokenizer" else Hindi
    tokens = tokenizer.encode("" if text is None else text)

    color_by_token = {}
    previous_color = ""
    spans = []
    for token in tokens:
        color = color_by_token.get(token)
        if color is None:
            # A first-seen token never takes the color of the span right
            # before it, so neighbouring new tokens stay distinguishable.
            color = random.choice([c for c in _TOKEN_COLORS if c != previous_color])
            color_by_token[token] = color
        # Track the color of *every* emitted span, not only freshly drawn
        # ones; otherwise a token that follows a repeated token could be
        # given that neighbour's color.
        previous_color = color

        # Escape before interpolating into markup; the value originates from
        # user-supplied text.
        label = html.escape(str(token), quote=True)
        # `data-token-id` instead of `id`: a token can occur many times and
        # HTML ids must be unique within a document.
        spans.append(
            f'<span data-token-id="{label}" '
            f'style="color: {color}; margin-right: 20px;">{label}</span>'
        )

    return (
        '<div style="word-wrap: break-word; white-space: pre-wrap;">'
        + "".join(spans)
        + "</div>"
    )
53
+
54
+
55
# Sample Hindi sentences offered as examples in the UI; each row is a
# one-element list holding the text for the input textbox.
_EXAMPLE_SENTENCES = (
    "आप कैसे हैं??",
    "यह एक परीक्षण है।",
    "लोरेम इप्सम एक छद्म-लैटिन पाठ है जिसका उपयोग मुद्रण और टाइपसेटिंग उद्योगों में किया जाता है।",
)
examples = [[sentence] for sentence in _EXAMPLE_SENTENCES]
60
+
61
# Input components: the text to tokenize and the tokenizer selector.
# They are listed in the same order as the parameters of
# `tokenize_and_color` (text, tokenizer_choice).
_input_components = [
    gr.Textbox(lines=2, label="Input Text"),
    gr.Radio(
        choices=["BasicTokenizer", "HindiTokenizer"],
        label="Tokenizer Choice",
        value="HindiTokenizer",
    ),
]

# Single HTML output that displays the string returned by
# `tokenize_and_color`.
_output_components = [
    gr.HTML(label="Tokenized and Colored Text"),
]

iface = gr.Interface(
    fn=tokenize_and_color,
    inputs=_input_components,
    outputs=_output_components,
    examples=examples,
    theme=gr.themes.Base(),
    title="Hindi Text Tokenizer",
    description="Enter text to see the tokenized output with each token colored differently.",
)

if __name__ == "__main__":
    # Launch the interface only when this file is executed as a script.
    iface.launch()