# nragrawal's picture
# Initial commit
# 09ffe17
import gradio as gr
import json
from read_files import Tokenizer # Make sure to include this file
def load_tokenizer(path):
    """Build a Tokenizer from a JSON file of serialized merges.

    The file must contain a JSON object whose keys are comma-separated
    integers (for example "101,32"). Each key is converted to a tuple of
    ints and mapped to its original value; the resulting dict is passed to
    Tokenizer.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The Tokenizer constructed from the rebuilt merges mapping.
    """
    # Pin the encoding so the read does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        serialized_merges = json.load(f)
    # JSON object keys are always strings, so turn "a,b" back into (a, b).
    merges = {
        tuple(int(part) for part in key.split(",")): value
        for key, value in serialized_merges.items()
    }
    return Tokenizer(merges)
# Module-level tokenizer instance, loaded once at import time from the
# tokenizer.json file (path is relative to the current working directory).
tokenizer = load_tokenizer(path="tokenizer.json")
def process_text(text):
    """Encode text with the module-level tokenizer and report the round trip.

    Returns a dict with four entries: the encoded result rendered with str(),
    its length, the text obtained by decoding it again, and whether that
    decoded text equals the input.
    """
    token_ids = tokenizer.encode(text)
    # Decode straight away so the caller can see whether encoding is lossless.
    restored = tokenizer.decode(token_ids)

    report = {}
    report["Encoded Tokens"] = str(token_ids)
    report["Number of Tokens"] = len(token_ids)
    report["Decoded Text"] = restored
    report["Round-trip Success"] = text == restored
    return report
# Create Gradio interface
# gr.Interface accepts its outputs as a component or a list of components and
# matches the values returned by fn to them by position. A dict keyed by
# strings is not accepted, so the (result key, component) pairs are kept in
# one ordered list and the dict returned by process_text is flattened into
# that same order before it is handed to Gradio.
_OUTPUT_FIELDS = [
    ("Encoded Tokens", gr.Textbox(label="Token IDs")),
    ("Number of Tokens", gr.Number(label="Token Count")),
    ("Decoded Text", gr.Textbox(label="Decoded Text")),
    ("Round-trip Success", gr.Checkbox(label="Successful Round-trip")),
]


def _process_text_outputs(text):
    """Run process_text and return its values in output-component order."""
    result = process_text(text)
    return tuple(result[key] for key, _ in _OUTPUT_FIELDS)


iface = gr.Interface(
    fn=_process_text_outputs,
    inputs=gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!"),
    outputs=[component for _, component in _OUTPUT_FIELDS],
    title="Marathi BPE Tokenizer",
    description="Enter Marathi text to see how it's tokenized using byte-pair encoding.",
    examples=[
        ["नमस्कार, जग!"],
        ["ही एक चाचणी आहे."],
    ],
)
# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    iface.launch()