# Awadhi BPE Tokenizer — Gradio demo app.
import json
import os

import gradio as gr

from bpe_Awadhi import AwadhiBPE
from examples import EXAMPLES

# Module-level BPE model shared by the Gradio callbacks below.
# Vocabulary is capped at 4500 tokens (see the "About" section in the UI).
bpe = AwadhiBPE(vocab_size=4500)
def initialize_model() -> None:
    """Load a previously trained BPE model from disk, or train and save one.

    Side effects:
        Mutates the module-level ``bpe`` instance. When no saved model
        exists, reads the training corpus ``sunderkand_awdhi.txt`` and
        writes the trained model to ``Awadhi_bpe.json``.
    """
    if os.path.exists('Awadhi_bpe.json'):
        bpe.load('Awadhi_bpe.json')
    else:
        # No saved model: train from the Awadhi corpus, then persist
        # so subsequent startups skip training.
        with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        bpe.fit(text)
        bpe.save('Awadhi_bpe.json')
def process_text(input_text: str) -> dict:
    """Tokenize *input_text* with the global BPE model and report statistics.

    Args:
        input_text: Raw Awadhi text entered in the UI.

    Returns:
        A dict with a token preview (truncated to 100 tokens), the token
        count, original/tokenized byte sizes, the compression ratio, and
        the model's vocabulary size.
    """
    tokens = bpe.tokenize(input_text)

    original_size = len(input_text.encode('utf-8'))
    # Rough estimate: assume an average of 2 bytes per token.
    tokenized_size = len(tokens) * 2
    # Guard against empty input: zero tokens would otherwise divide by zero.
    compression_ratio = original_size / tokenized_size if tokenized_size else 0.0

    # Cap the preview at 100 tokens so the JSON panel stays readable.
    preview = " ".join(tokens[:100]) + "..." if len(tokens) > 100 else " ".join(tokens)

    return {
        "Tokens": preview,
        "Number of Tokens": len(tokens),
        "Original Size (bytes)": original_size,
        "Tokenized Size (bytes)": tokenized_size,
        "Compression Ratio": f"{compression_ratio:.2f}",
        "Vocabulary Size": len(bpe.vocab),
    }
def load_example(text: str) -> tuple:
    """Return *text* for the input box and ``None`` to clear prior results.

    Args:
        text: Example text to place in the input textbox.

    Returns:
        A ``(text, None)`` pair matching the ``(input_text, output)``
        Gradio components.
    """
    return text, None
def create_interface() -> "gr.Blocks":
    """Build and return the Gradio Blocks UI for the tokenizer demo.

    Layout: an input textbox and a JSON results panel side by side, a
    Tokenize button wired to :func:`process_text`, one button per entry
    in ``EXAMPLES``, and an "About" section.
    """
    with gr.Blocks(title="Awadhi BPE Tokenizer") as demo:
        gr.Markdown("# Awadhi BPE Tokenizer")
        gr.Markdown("This tool implements Byte Pair Encoding (BPE) for Awadhi text compression.")

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Awadhi Text",
                    placeholder="Enter Awadhi text here...",
                    lines=5,
                )
            with gr.Column():
                output = gr.JSON(label="Tokenization Results")

        submit_btn = gr.Button("Tokenize")
        submit_btn.click(
            fn=process_text,
            inputs=input_text,
            outputs=output,
        )

        # Example buttons — one per entry in EXAMPLES.
        with gr.Row():
            for label, text in EXAMPLES.items():
                # Factory closure binds each button's text eagerly; a bare
                # lambda here would late-bind and every button would show
                # the last example.
                def make_example_handler(example_text):
                    def handler():
                        # (example_text, None) fills the input box and
                        # clears any previous tokenization results.
                        return example_text, None
                    return handler

                gr.Button(label).click(
                    fn=make_example_handler(text),
                    outputs=[input_text, output],
                )

        gr.Markdown("""
        ### About
        - This tokenizer uses BPE to compress Awadhi text
        - Vocabulary size is limited to 4500 tokens
        - Aims for a compression ratio > 3.2
        """)

    return demo
# Load (or train) the model, then build the UI at import time so the
# Space host can pick up `demo` even when this module is only imported.
initialize_model()
demo = create_interface()

# Launch the app only when run as a script.
if __name__ == "__main__":
    demo.launch()