# Awadhi BPE Tokenizer — Gradio demo app.
import json
import os

import gradio as gr

from bpe_Awadhi import AwadhiBPE
from examples import EXAMPLES

# Module-level BPE model shared by the Gradio callbacks below.
# Vocabulary is capped at 4500 tokens (see the "About" section in the UI).
bpe = AwadhiBPE(vocab_size=4500)
def initialize_model() -> None:
    """Load a previously trained BPE model from disk, or train and save one.

    Side effects:
        Mutates the module-level ``bpe`` instance. When no saved model
        exists, reads the training corpus ``sunderkand_awdhi.txt`` and
        writes the trained model to ``Awadhi_bpe.json``.
    """
    if os.path.exists('Awadhi_bpe.json'):
        bpe.load('Awadhi_bpe.json')
    else:
        # No saved model: train from the Awadhi corpus, then persist
        # so subsequent startups skip training.
        with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        bpe.fit(text)
        bpe.save('Awadhi_bpe.json')
def process_text(input_text: str) -> dict:
    """Tokenize *input_text* with the global BPE model and report statistics.

    Args:
        input_text: Raw Awadhi text entered in the UI.

    Returns:
        A dict with a token preview (truncated to 100 tokens), the token
        count, original/tokenized byte sizes, the compression ratio, and
        the model's vocabulary size.
    """
    tokens = bpe.tokenize(input_text)

    original_size = len(input_text.encode('utf-8'))
    # Rough estimate: assume an average of 2 bytes per token.
    tokenized_size = len(tokens) * 2
    # Guard against empty input: zero tokens would otherwise divide by zero.
    compression_ratio = original_size / tokenized_size if tokenized_size else 0.0

    # Cap the preview at 100 tokens so the JSON panel stays readable.
    preview = " ".join(tokens[:100]) + "..." if len(tokens) > 100 else " ".join(tokens)

    return {
        "Tokens": preview,
        "Number of Tokens": len(tokens),
        "Original Size (bytes)": original_size,
        "Tokenized Size (bytes)": tokenized_size,
        "Compression Ratio": f"{compression_ratio:.2f}",
        "Vocabulary Size": len(bpe.vocab),
    }
def load_example(text: str) -> tuple:
    """Return *text* for the input box and ``None`` to clear prior results.

    Args:
        text: Example text to place in the input textbox.

    Returns:
        A ``(text, None)`` pair matching the ``(input_text, output)``
        Gradio components.
    """
    return text, None
def create_interface() -> "gr.Blocks":
    """Build and return the Gradio Blocks UI for the tokenizer demo.

    Layout: an input textbox and a JSON results panel side by side, a
    Tokenize button wired to :func:`process_text`, one button per entry
    in ``EXAMPLES``, and an "About" section.
    """
    with gr.Blocks(title="Awadhi BPE Tokenizer") as demo:
        gr.Markdown("# Awadhi BPE Tokenizer")
        gr.Markdown("This tool implements Byte Pair Encoding (BPE) for Awadhi text compression.")

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Awadhi Text",
                    placeholder="Enter Awadhi text here...",
                    lines=5,
                )
            with gr.Column():
                output = gr.JSON(label="Tokenization Results")

        submit_btn = gr.Button("Tokenize")
        submit_btn.click(
            fn=process_text,
            inputs=input_text,
            outputs=output,
        )

        # Example buttons — one per entry in EXAMPLES.
        with gr.Row():
            for label, text in EXAMPLES.items():
                # Factory closure binds each button's text eagerly; a bare
                # lambda here would late-bind and every button would show
                # the last example.
                def make_example_handler(example_text):
                    def handler():
                        # (example_text, None) fills the input box and
                        # clears any previous tokenization results.
                        return example_text, None
                    return handler

                gr.Button(label).click(
                    fn=make_example_handler(text),
                    outputs=[input_text, output],
                )

        gr.Markdown("""
        ### About
        - This tokenizer uses BPE to compress Awadhi text
        - Vocabulary size is limited to 4500 tokens
        - Aims for a compression ratio > 3.2
        """)

    return demo
# Load (or train) the model, then build the UI at import time so the
# Space host can pick up `demo` even when this module is only imported.
initialize_model()
demo = create_interface()

# Launch the app only when run as a script.
if __name__ == "__main__":
    demo.launch()