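"""Gradio demo for visualizing BPE tokenization of Odia text.

Loads a trained OdiaBPETokenizer (falling back to an untrained one), encodes
the input, renders each token with a distinct background color, and reports
the token count and the character-per-token compression ratio.
"""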
import colorsys
import html

import gradio as gr

from odia_tokenizer import OdiaBPETokenizer


def generate_distinct_colors(n):
    """Generate n visually distinct colors"""
    colors = []
    for i in range(n):
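        # Evenly space hues around the HSV color wheel; fixed saturation
        # and value keep every color light enough for readable text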
        hue = i / n
        saturation = 0.7
        value = 0.9
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors


def load_tokenizer():
    try:
        return OdiaBPETokenizer.load("odia_bpe_tokenizer.json")
    except FileNotFoundError:
        # No saved tokenizer found; fall back to an untrained one
        return OdiaBPETokenizer(vocab_size=5000)


def tokenize_text(text):
    tokenizer = load_tokenizer()

    # Encode once to get the token IDs (used for the count and the
    # compression ratio below)
    token_ids = tokenizer.encode(text)

    # Recover the surface text of each token by replaying the learned
    # merges over the raw characters, in the order they were learned
    words = [list(text)]
    for pair in tokenizer.merges:
        words = tokenizer._merge_vocab(words, pair)

    # Extract final tokens
    final_tokens = []
    for word in words:
        final_tokens.extend(word)

    # Generate colors for tokens
    colors = generate_distinct_colors(len(tokenizer.vocab))
    color_map = {
        token_id: color for token_id, color in zip(tokenizer.vocab.values(), colors)
    }

    # Create highlighted HTML
    html_parts = []
    token_list = []

    for token in final_tokens:
        token_id = tokenizer.vocab.get(token, tokenizer.special_tokens["<UNK>"])
        # Fall back to a neutral grey if the ID has no color assigned
        color = color_map.get(token_id, "#cccccc")
        # Escape the token text and apply the .token-highlight class defined
        # in custom_css below
        html_parts.append(
            f'<span class="token-highlight" style="background-color: {color}">'
            f"{html.escape(token)}</span>"
        )
        token_list.append(f"{token} ({token_id})")

    highlighted_text = "".join(html_parts)

    # Calculate compression ratio
    compression_ratio = len(text) / len(token_ids) if len(token_ids) > 0 else 0

    return (
        len(token_ids),  # Token count
        compression_ratio,  # Compression ratio
        highlighted_text,  # Highlighted text
        "\n".join(token_list),  # Token list
    )
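
# Rough sketch of how the four return values line up (token texts and IDs
# here are illustrative placeholders, not output from the real tokenizer):
#
#   count, ratio, html_out, listing = tokenize_text("ଓଡ଼ିଆ ଭାଷା")
#   count    -> number of BPE tokens, e.g. 6
#   ratio    -> len(text) / count, i.e. characters per token
#   html_out -> '<span class="token-highlight" style="background-color: #...">...</span>'
#   listing  -> one "token (token_id)" entry per line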


custom_css = """
.token-highlight {
    border-radius: 3px;
    margin: 0 1px;
}
.container {
    max-width: 1200px;
    margin: 0 auto;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# BPE Tokenizer for Odia Language")

    with gr.Row():
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Odia Text", placeholder="Enter Odia Text Here...", lines=10
            )

        with gr.Column(scale=1):
            token_count = gr.Number(label="Number of Tokens")
            compression_ratio = gr.Number(label="Compression Ratio")
            highlighted_output = gr.HTML(label="Tokenized Text Output")
            token_list = gr.Textbox(label="List of Tokens", lines=10)

    input_text.change(
        fn=tokenize_text,
        inputs=[input_text],
        outputs=[token_count, compression_ratio, highlighted_output, token_list],
    )

if __name__ == "__main__":
    demo.launch()