Commit 62c02ab · Initial commit
Parent: 33edf85

Files changed:
- .gitattributes (+1, -3)
- README.md (+38, -6)
- bpe_Awadhi.py (+106, -0)
- config.yaml (+18, -0)
- requirements.txt (+2, -0)
.gitattributes
CHANGED
@@ -23,13 +23,11 @@
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,6 +1,38 @@

# Awadhi BPE Tokenizer

This Space provides a Byte Pair Encoding (BPE) implementation for Awadhi text compression. It features:

- Custom BPE implementation for Awadhi text
- Vocabulary size < 5000 tokens
- Compression ratio > 3.2
- Interactive web interface

## Usage

1. Enter Awadhi text in the input box
2. Click "Tokenize"
3. View tokenization results and statistics

## Implementation Details

- Uses character-level tokenization as the base
- Implements a BPE merging strategy
- Handles UTF-8-encoded Awadhi text
- Provides compression statistics

## Model Details

- Base tokenization: character-level
- Maximum vocabulary size: 4500 tokens
- Training corpus: Sunderkand in Awadhi
- Compression target: > 3.2x

## Technical Requirements

- Python 3.10+
- PyTorch
- Gradio 3.50.2+

## License

This project is licensed under the MIT License.
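The Usage steps above describe the web UI; the same tokenizer can also be driven directly from Python via the AwadhiBPE class added in bpe_Awadhi.py below. A minimal sketch (the training string is a toy example, not the Sunderkand corpus the Space actually uses):

from bpe_Awadhi import AwadhiBPE

# Toy corpus for illustration only; the Space trains on sunderkand_awdhi.txt instead.
bpe = AwadhiBPE(vocab_size=200)
bpe.fit("राम राम सिया राम जय जय राम")

tokens = bpe.tokenize("सिया राम")
print(tokens)            # subword tokens after applying the learned merges
print(len(bpe.vocab))    # learned vocabulary size (characters + merged symbols)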
bpe_Awadhi.py
ADDED
@@ -0,0 +1,106 @@
import torch
from collections import defaultdict, Counter
import re
from typing import Dict, List, Tuple, Set
import json

class AwadhiBPE:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.merges: Dict[Tuple[str, str], str] = {}
        self.vocab: Set[str] = set()

    def get_stats(self, vocab: Dict[str, int]) -> Dict[Tuple[str, str], int]:
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq
        return pairs

    def merge_vocab(self, pair: Tuple[str, str], v_in: Dict[str, int]) -> Dict[str, int]:
        v_out = {}
        bigram = ' '.join(pair)
        replacement = ''.join(pair)
        for word in v_in:
            w_out = word.replace(bigram, replacement)
            v_out[w_out] = v_in[word]
        return v_out

    def fit(self, text: str) -> None:
        # Initial character-level tokenization
        words = text.split()
        word_freqs = Counter(words)

        # Initialize vocabulary with characters
        vocab = {}
        for word, freq in word_freqs.items():
            chars = ' '.join(list(word))
            vocab[chars] = freq
            self.vocab.update(set(word))

        num_merges = min(self.vocab_size - len(self.vocab), len(vocab))

        for i in range(num_merges):
            pairs = self.get_stats(vocab)
            if not pairs:
                break

            best = max(pairs, key=pairs.get)
            vocab = self.merge_vocab(best, vocab)
            self.merges[best] = ''.join(best)
            self.vocab.add(self.merges[best])

    def tokenize(self, text: str) -> List[str]:
        words = text.split()
        tokens = []

        for word in words:
            chars = ' '.join(list(word))
            for pair, merge in self.merges.items():
                chars = chars.replace(' '.join(pair), merge)
            tokens.extend(chars.split())

        return tokens

    def save(self, path: str) -> None:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump({
                'merges': {' '.join(k): v for k, v in self.merges.items()},
                'vocab': list(self.vocab)
            }, f, ensure_ascii=False)

    def load(self, path: str) -> None:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.merges = {tuple(k.split()): v for k, v in data['merges'].items()}
        self.vocab = set(data['vocab'])

# Training and evaluation code
def main():
    # Read the text file
    with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # Create and train BPE
    bpe = AwadhiBPE(vocab_size=4500)  # Using slightly less than 5000 to be safe
    bpe.fit(text)

    # Save the trained model
    bpe.save('Awadhi_bpe.json')

    # Tokenize the text
    tokens = bpe.tokenize(text)

    # Calculate compression ratio
    original_size = len(text.encode('utf-8'))
    tokenized_size = len(tokens) * 2  # Assuming average 2 bytes per token
    compression_ratio = original_size / tokenized_size

    print(f"Original size (bytes): {original_size}")
    print(f"Tokenized size (bytes): {tokenized_size}")
    print(f"Compression ratio: {compression_ratio:.2f}")
    print(f"Vocabulary size: {len(bpe.vocab)}")

if __name__ == "__main__":
    main()
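The main() routine above trains, saves, and measures compression in a single run but never exercises load(). A minimal sketch of reloading the saved tokenizer in a separate process (assuming Awadhi_bpe.json was produced by a previous run; the sample sentence is illustrative only, not necessarily from the corpus):

from bpe_Awadhi import AwadhiBPE

# Restore the merges and vocabulary written earlier by bpe.save('Awadhi_bpe.json').
bpe = AwadhiBPE()
bpe.load('Awadhi_bpe.json')

sample = "राम लखन सिय सहित सुखारी"  # illustrative line
tokens = bpe.tokenize(sample)
print(tokens)
print(f"{len(sample.encode('utf-8'))} bytes -> {len(tokens)} tokens")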
config.yaml
ADDED
@@ -0,0 +1,18 @@
title: Awadhi BPE Tokenizer
emoji: 🇮🇳
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.50.2
app_file: app.py
pinned: false
license: mit
python_version: "3.10"
app_port: 7860
tags:
  - awadhi
  - tokenizer
  - bpe
  - text-compression
datasets:
  - sunderkand_awdhi
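config.yaml points at app.py as the Space entry point, but app.py is not part of this commit. A minimal Gradio sketch consistent with the config and the AwadhiBPE class (the function name, labels, and statistics format are assumptions, not the committed app):

import gradio as gr
from bpe_Awadhi import AwadhiBPE

# Hypothetical entry point; the actual app.py is not included in this commit.
bpe = AwadhiBPE(vocab_size=4500)
with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as f:
    bpe.fit(f.read())

def tokenize_text(text: str):
    tokens = bpe.tokenize(text)
    ratio = len(text.encode('utf-8')) / max(len(tokens) * 2, 1)  # same 2-bytes-per-token estimate as bpe_Awadhi.py
    stats = f"{len(tokens)} tokens, estimated compression ratio {ratio:.2f}"
    return tokens, stats

demo = gr.Interface(
    fn=tokenize_text,
    inputs=gr.Textbox(label="Awadhi text", lines=4),
    outputs=[gr.JSON(label="Tokens"), gr.Textbox(label="Statistics")],
    title="Awadhi BPE Tokenizer",
)

if __name__ == "__main__":
    demo.launch(server_port=7860)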
requirements.txt
ADDED
@@ -0,0 +1,2 @@
gradio
torch