import os
from functools import partial

import gradio as gr

from bpe_Awadhi import AwadhiBPE
from examples import EXAMPLES

# Initialize the BPE model
bpe = AwadhiBPE(vocab_size=4500)

def initialize_model():
    """Load a previously trained model if one exists; otherwise train and save it."""
    if os.path.exists('Awadhi_bpe.json'):
        bpe.load('Awadhi_bpe.json')
    else:
        # Load the training text and fit the model
        with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        bpe.fit(text)
        bpe.save('Awadhi_bpe.json')

def process_text(input_text: str) -> dict:
    """Tokenize the input text and return tokenization statistics."""
    tokens = bpe.tokenize(input_text)

    # Compression ratio: UTF-8 bytes of the input vs. an estimated token cost,
    # assuming an average of 2 bytes per token; guard against empty input
    original_size = len(input_text.encode('utf-8'))
    tokenized_size = len(tokens) * 2
    compression_ratio = original_size / tokenized_size if tokenized_size else 0.0

    # Show at most the first 100 tokens, with an ellipsis if truncated
    preview = " ".join(tokens[:100]) + ("..." if len(tokens) > 100 else "")

    return {
        "Tokens": preview,
        "Number of Tokens": len(tokens),
        "Original Size (bytes)": original_size,
        "Tokenized Size (bytes)": tokenized_size,
        "Compression Ratio": f"{compression_ratio:.2f}",
        "Vocabulary Size": len(bpe.vocab)
    }

def load_example(text: str) -> tuple:
    """Load example text into the input box and clear previous results."""
    return text, None

def create_interface():
    """Build the Gradio interface."""
    with gr.Blocks(title="Awadhi BPE Tokenizer") as demo:
        gr.Markdown("# Awadhi BPE Tokenizer")
        gr.Markdown("This tool implements Byte Pair Encoding (BPE) for Awadhi text compression.")

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Awadhi Text",
                    placeholder="Enter Awadhi text here...",
                    lines=5
                )
            with gr.Column():
                output = gr.JSON(label="Tokenization Results")

        submit_btn = gr.Button("Tokenize")
        submit_btn.click(
            fn=process_text,
            inputs=input_text,
            outputs=output
        )

        # Example buttons: partial() binds each example's text at definition
        # time, so every button loads its own example rather than the last one
        with gr.Row():
            for label, text in EXAMPLES.items():
                gr.Button(label).click(
                    fn=partial(load_example, text),
                    outputs=[input_text, output]
                )

        gr.Markdown("""
        ### About
        - This tokenizer uses BPE to compress Awadhi text
        - Vocabulary size is limited to 4500 tokens
        - Aims for a compression ratio > 3.2
        """)
    return demo

# Initialize the model and build the interface at import time, so the app
# works both when run directly and when imported by a hosting platform
initialize_model()
demo = create_interface()

if __name__ == "__main__":
    demo.launch()
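
# ---------------------------------------------------------------------------
# For reference: the AwadhiBPE interface this app relies on. bpe_Awadhi is a
# local module not shown here, so this is a minimal sketch inferred from the
# calls above (fit/tokenize/save/load and the vocab attribute); the signatures
# below are assumptions for illustration, not the actual implementation.
#
#     class AwadhiBPE:
#         def __init__(self, vocab_size: int) -> None: ...
#         def fit(self, text: str) -> None: ...            # learn BPE merges from a corpus
#         def tokenize(self, text: str) -> list[str]: ...  # apply learned merges to text
#         def save(self, path: str) -> None: ...           # persist vocab/merges as JSON
#         def load(self, path: str) -> None: ...           # restore vocab/merges from JSON
#         vocab: dict                                      # learned vocabulary; len() is reported above
# ---------------------------------------------------------------------------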