Spaces:
Configuration error
Configuration error
import gradio as gr | |
import json | |
from read_files import Tokenizer # Make sure to include this file | |
def load_tokenizer(path):
    """Load a tokenizer from a JSON file of serialized merges.

    The file must contain a JSON object whose keys are comma-separated
    integers (for example ``"101,32"``). Each key is converted to a tuple of
    ints; the associated values are passed through unchanged.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``Tokenizer`` constructed from the deserialized merges mapping.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or a key contains a part
            that is not an integer.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default,
    # which differs between platforms.
    with open(path, 'r', encoding='utf-8') as f:
        serialized_merges = json.load(f)
    # JSON object keys are always strings, so tuple keys were flattened to
    # "a,b" when saved; rebuild the tuple form here.
    merges = {
        tuple(int(part) for part in key.split(',')): value
        for key, value in serialized_merges.items()
    }
    return Tokenizer(merges)
# Module-level tokenizer instance, built once at import time from the
# 'tokenizer.json' file and reused for every request.
tokenizer = load_tokenizer("tokenizer.json")
def process_text(text):
    """Encode ``text`` with the module-level tokenizer and decode it back.

    Args:
        text: The input string to tokenize.

    Returns:
        A dict with four entries, in this order:
        ``"Encoded Tokens"`` (``str()`` of the encoder output),
        ``"Number of Tokens"`` (``len()`` of the encoder output),
        ``"Decoded Text"`` (the decoder output for those tokens), and
        ``"Round-trip Success"`` (whether the decoded text equals ``text``).
    """
    token_ids = tokenizer.encode(text)
    # Decoding immediately lets the caller see whether encoding was lossless.
    restored = tokenizer.decode(token_ids)

    report = {}
    report["Encoded Tokens"] = str(token_ids)
    report["Number of Tokens"] = len(token_ids)
    report["Decoded Text"] = restored
    report["Round-trip Success"] = text == restored
    return report
# Create Gradio interface.
#
# gr.Interface expects ``outputs`` to be a component (or a list of them) and
# matches the values returned by ``fn`` to those components by position; a
# dict of components is not an accepted form. Each output component is
# therefore paired with the process_text() result key that feeds it, and a
# small adapter flattens the result dict into a tuple in that same order.
_OUTPUT_FIELDS = [
    ("Encoded Tokens", gr.Textbox(label="Token IDs")),
    ("Number of Tokens", gr.Number(label="Token Count")),
    ("Decoded Text", gr.Textbox(label="Decoded Text")),
    ("Round-trip Success", gr.Checkbox(label="Successful Round-trip")),
]


def _process_text_for_ui(text):
    """Run process_text and return its values in output-component order."""
    report = process_text(text)
    return tuple(report[key] for key, _ in _OUTPUT_FIELDS)


iface = gr.Interface(
    fn=_process_text_for_ui,
    inputs=gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!"),
    outputs=[component for _, component in _OUTPUT_FIELDS],
    title="Marathi BPE Tokenizer",
    description="Enter Marathi text to see how it's tokenized using byte-pair encoding.",
    examples=[
        ["नमस्कार, जग!"],
        ["ही एक चाचणी आहे."],
    ],
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()