awadhi_bpe / app.py
pradeep6kumar2024's picture
Added examples in separate file
99c428d
raw
history blame
3.09 kB
import gradio as gr
from bpe_Awadhi import AwadhiBPE
from examples import EXAMPLES
import json
import os
# Initialize the BPE model.
# Shared module-level tokenizer used by both initialize_model() and process_text();
# vocab_size caps the learned BPE vocabulary at 4500 tokens.
bpe = AwadhiBPE(vocab_size=4500)
def initialize_model():
    """Load a previously trained BPE model from disk, or train and persist one.

    Mutates the module-level `bpe` instance in place. If 'Awadhi_bpe.json'
    exists it is loaded; otherwise the model is fitted on the Awadhi corpus
    file and the result is saved for subsequent runs.
    """
    model_path = 'Awadhi_bpe.json'
    if os.path.exists(model_path):
        bpe.load(model_path)
        return
    # No saved model yet: train on the corpus and persist the result.
    with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as corpus:
        bpe.fit(corpus.read())
    bpe.save(model_path)
def process_text(input_text: str) -> dict:
    """Tokenize `input_text` with the module-level BPE model and report stats.

    Args:
        input_text: Raw Awadhi text to tokenize.

    Returns:
        A dict with a preview of the tokens (truncated after 100), the token
        count, original/tokenized byte sizes, the compression ratio, and the
        model's vocabulary size.
    """
    # Tokenize the text
    tokens = bpe.tokenize(input_text)
    # Calculate compression ratio
    original_size = len(input_text.encode('utf-8'))
    tokenized_size = len(tokens) * 2  # Assuming average 2 bytes per token
    # Guard against empty input: zero tokens would otherwise raise
    # ZeroDivisionError. Report a ratio of 0.0 in that degenerate case.
    compression_ratio = original_size / tokenized_size if tokenized_size else 0.0
    return {
        "Tokens": " ".join(tokens[:100]) + "..." if len(tokens) > 100 else " ".join(tokens),
        "Number of Tokens": len(tokens),
        "Original Size (bytes)": original_size,
        "Tokenized Size (bytes)": tokenized_size,
        "Compression Ratio": f"{compression_ratio:.2f}",
        "Vocabulary Size": len(bpe.vocab)
    }
def load_example(text: str) -> tuple:
    """Return the example text paired with None to reset the results panel."""
    cleared_results = None
    return text, cleared_results
# Create the Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the Awadhi BPE tokenizer."""

    def make_example_handler(example_text):
        # Bind example_text as a parameter so each button keeps its own
        # example instead of sharing the loop variable (late-binding closure).
        def handler():
            # Second value clears the previous tokenization results.
            return example_text, None
        return handler

    with gr.Blocks(title="Awadhi BPE Tokenizer") as app:
        gr.Markdown("# Awadhi BPE Tokenizer")
        gr.Markdown("This tool implements Byte Pair Encoding (BPE) for Awadhi text compression.")

        with gr.Row():
            with gr.Column():
                text_in = gr.Textbox(
                    label="Input Awadhi Text",
                    placeholder="Enter Awadhi text here...",
                    lines=5,
                )
            with gr.Column():
                results_panel = gr.JSON(label="Tokenization Results")

        tokenize_btn = gr.Button("Tokenize")
        tokenize_btn.click(
            fn=process_text,
            inputs=text_in,
            outputs=results_panel,
        )

        # One button per predefined example; clicking fills the textbox
        # and clears any previous results.
        with gr.Row():
            for caption, sample in EXAMPLES.items():
                gr.Button(caption).click(
                    fn=make_example_handler(sample),
                    outputs=[text_in, results_panel],
                )

        gr.Markdown("""
        ### About
        - This tokenizer uses BPE to compress Awadhi text
        - Vocabulary size is limited to 4500 tokens
        - Aims for a compression ratio > 3.2
        """)
    return app
# Initialize model and create interface.
# `demo` must stay at module level so hosting platforms (e.g. Hugging Face
# Spaces) can discover and serve it without running the __main__ guard.
initialize_model()
demo = create_interface()
# Launch the app only when executed directly as a script.
if __name__ == "__main__":
    demo.launch()