File size: 1,540 Bytes
09ffe17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import gradio as gr
import json
from read_files import Tokenizer  # Make sure to include this file

def load_tokenizer(path):
    """Load a tokenizer from a JSON file of serialized merges.

    The file must contain a single JSON object. Each key is a string of
    comma-separated integers (for example ``"224,164"``); it is converted to
    a tuple of ints and mapped to the value stored under that key, which is
    passed through unchanged.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A ``Tokenizer`` built from the reconstructed merges mapping.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or a key contains a
            non-integer component.
    """
    # JSON text is UTF-8; pin the encoding so that reading the file does not
    # depend on the locale default of the machine running the app.
    with open(path, 'r', encoding='utf-8') as f:
        serialized_merges = json.load(f)

    # JSON object keys are always strings, so the tuple keys were flattened
    # to "a,b" on save; split them back into integer tuples here.
    merges = {
        tuple(int(part) for part in key.split(',')): value
        for key, value in serialized_merges.items()
    }
    return Tokenizer(merges)

# Load the tokenizer once at import time so that every call to process_text
# reuses the same instance. The path is relative, so 'tokenizer.json' is
# resolved against the current working directory of the running process.
tokenizer = load_tokenizer('tokenizer.json')

def process_text(text):
    """Run ``text`` through the module-level tokenizer and summarise the result.

    The text is encoded, then the encoding is decoded again so the caller can
    see whether the round trip reproduces the input exactly.

    Args:
        text: The string to tokenize.

    Returns:
        A dict with four entries, in this order: ``"Encoded Tokens"`` (the
        encoding rendered with ``str``), ``"Number of Tokens"`` (its length),
        ``"Decoded Text"`` (the decoded string) and ``"Round-trip Success"``
        (whether the decoded string equals ``text``).
    """
    tokens = tokenizer.encode(text)
    restored = tokenizer.decode(tokens)

    summary = {}
    summary["Encoded Tokens"] = str(tokens)
    summary["Number of Tokens"] = len(tokens)
    summary["Decoded Text"] = restored
    summary["Round-trip Success"] = text == restored
    return summary

# Create Gradio interface
#
# gr.Interface accepts its outputs as a single component or a list of
# components and fills them positionally from the values returned by ``fn``.
# A dict keyed by plain strings is not an accepted form for either, so the
# result keys of process_text are paired with their components here and the
# order of this list defines the order of the output widgets.
_OUTPUT_FIELDS = [
    ("Encoded Tokens", gr.Textbox(label="Token IDs")),
    ("Number of Tokens", gr.Number(label="Token Count")),
    ("Decoded Text", gr.Textbox(label="Decoded Text")),
    ("Round-trip Success", gr.Checkbox(label="Successful Round-trip")),
]


def _process_text_for_ui(text):
    """Convert the dict returned by process_text into a positional tuple.

    The values are emitted in the same order as ``_OUTPUT_FIELDS`` so each
    one lands in its matching output component.
    """
    result = process_text(text)
    return tuple(result[key] for key, _ in _OUTPUT_FIELDS)


iface = gr.Interface(
    fn=_process_text_for_ui,
    inputs=gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!"),
    outputs=[component for _, component in _OUTPUT_FIELDS],
    title="Marathi BPE Tokenizer",
    description="Enter Marathi text to see how it's tokenized using byte-pair encoding.",
    examples=[
        ["नमस्कार, जग!"],
        ["ही एक चाचणी आहे."],
    ],
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()