pradeep6kumar2024 committed
Commit a70b2bd · verified · 1 Parent(s): 76aed2d

Create app.py

Files changed (1)
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
+ import gradio as gr
+ from bpe_Awadhi import AwadhiBPE
+ import json
+ import os
+
+ # Initialize the BPE model
+ bpe = AwadhiBPE(vocab_size=4500)
+
+ # Load the model if it exists, otherwise train it
+ def initialize_model():
+     if os.path.exists('Awadhi_bpe.json'):
+         bpe.load('Awadhi_bpe.json')
+     else:
+         # Load the text and train the model
+         with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
+             text = f.read()
+         bpe.fit(text)
+         bpe.save('Awadhi_bpe.json')
+
+ def process_text(input_text: str) -> dict:
+     """Process input text and return tokenization results"""
+     # Tokenize the text
+     tokens = bpe.tokenize(input_text)
+
+     # Calculate compression ratio
+     original_size = len(input_text.encode('utf-8'))
+     tokenized_size = len(tokens) * 2  # Assuming average 2 bytes per token
+     compression_ratio = original_size / tokenized_size
+
+     return {
+         "Tokens": " ".join(tokens[:100]) + "..." if len(tokens) > 100 else " ".join(tokens),
+         "Number of Tokens": len(tokens),
+         "Original Size (bytes)": original_size,
+         "Tokenized Size (bytes)": tokenized_size,
+         "Compression Ratio": f"{compression_ratio:.2f}",
+         "Vocabulary Size": len(bpe.vocab)
+     }
+
+ # Create the Gradio interface
+ def create_interface():
+     with gr.Blocks(title="Awadhi BPE Tokenizer") as demo:
+         gr.Markdown("# Awadhi BPE Tokenizer")
+         gr.Markdown("This tool implements Byte Pair Encoding (BPE) for Awadhi text compression.")
+
+         with gr.Row():
+             with gr.Column():
+                 input_text = gr.Textbox(
+                     label="Input Awadhi Text",
+                     placeholder="Enter Awadhi text here...",
+                     lines=5
+                 )
+
+             with gr.Column():
+                 output = gr.JSON(label="Tokenization Results")
+
+         submit_btn = gr.Button("Tokenize")
+         submit_btn.click(
+             fn=process_text,
+             inputs=input_text,
+             outputs=output
+         )
+
+         gr.Markdown("""
+         ### About
+         - This tokenizer uses BPE to compress Awadhi text
+         - Vocabulary size is limited to 4500 tokens
+         - Aims for a compression ratio > 3.2
+         """)
+
+     return demo
+
+ # Initialize model and create interface
+ initialize_model()
+ demo = create_interface()
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
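
For reference, the core pipeline that app.py wires into Gradio can be exercised without the UI. The sketch below is a minimal, hypothetical smoke test and is not part of this commit: it assumes the AwadhiBPE interface used above (fit, save, load, tokenize, and a vocab mapping) plus a local copy of sunderkand_awdhi.txt, and it mirrors the compression estimate in process_text, where tokenized size is approximated as 2 bytes per token.

# Hypothetical local smoke test; assumes the AwadhiBPE interface shown in app.py.
from bpe_Awadhi import AwadhiBPE

bpe = AwadhiBPE(vocab_size=4500)

# Train on the corpus and persist the result, as initialize_model() does on first run.
with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
    bpe.fit(f.read())
bpe.save('Awadhi_bpe.json')

sample = "..."  # placeholder: substitute real Awadhi text here
tokens = bpe.tokenize(sample)

# Same estimate process_text() uses: roughly 2 bytes per token.
original_size = len(sample.encode('utf-8'))
tokenized_size = max(len(tokens) * 2, 1)  # guard against division by zero on empty input
print(f"{len(tokens)} tokens, ratio {original_size / tokenized_size:.2f}, vocab {len(bpe.vocab)}")

Run locally, python app.py starts the Gradio server via demo.launch(); the same app.py serves as the entry point when the repository is deployed as a Gradio Space.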