pradeep6kumar2024 commited on
Commit
62c02ab
·
1 Parent(s): 33edf85

Initial commit

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -3
  2. README.md +38 -6
  3. bpe_Awadhi.py +106 -0
  4. config.yaml +18 -0
  5. requirements.txt +2 -0
.gitattributes CHANGED
@@ -23,13 +23,11 @@
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
 
26
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
27
  *.tflite filter=lfs diff=lfs merge=lfs -text
28
  *.tgz filter=lfs diff=lfs merge=lfs -text
29
  *.wasm filter=lfs diff=lfs merge=lfs -text
30
  *.xz filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,38 @@
1
- ---
2
- license: mit
3
- sdk: gradio
4
- colorFrom: indigo
5
- colorTo: gray
6
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Awadhi BPE Tokenizer
2
+
3
+ This Space provides a Byte Pair Encoding (BPE) implementation for Awadhi text compression. It features:
4
+
5
+ - Custom BPE implementation for Awadhi text
6
+ - Vocabulary size < 5000 tokens
7
+ - Compression ratio > 3.2
8
+ - Interactive web interface
9
+
10
+ ## Usage
11
+
12
+ 1. Enter Awadhi text in the input box
13
+ 2. Click "Tokenize"
14
+ 3. View tokenization results and statistics
15
+
16
+ ## Implementation Details
17
+
18
+ - Uses character-level tokenization as base
19
+ - Implements BPE merging strategy
20
+ - Handles UTF-8 encoded Awadhi text
21
+ - Provides compression statistics
22
+
23
+ ## Model Details
24
+
25
+ - Base tokenization: Character-level
26
+ - Maximum vocabulary size: 4500 tokens
27
+ - Training corpus: Sunderkand in Awadhi
28
+ - Compression target: > 3.2x
29
+
30
+ ## Technical Requirements
31
+
32
+ - Python 3.10+
33
+ - PyTorch
34
+ - Gradio 3.50.2+
35
+
36
+ ## License
37
+
38
+ This project is licensed under the MIT License.
bpe_Awadhi.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from collections import defaultdict, Counter
3
+ import re
4
+ from typing import Dict, List, Tuple, Set
5
+ import json
6
+
7
class AwadhiBPE:
    """Byte Pair Encoding tokenizer for whitespace-delimited Awadhi text.

    Training starts from character-level tokens and greedily merges the
    most frequent adjacent symbol pair until the vocabulary budget is
    exhausted or no pairs remain.
    """

    def __init__(self, vocab_size: int = 5000):
        """
        Args:
            vocab_size: Upper bound on the total vocabulary size
                (base characters plus learned merge tokens).
        """
        self.vocab_size = vocab_size
        # Learned merges in training order: (left, right) -> merged token.
        # Insertion order matters: tokenize() replays merges in this order.
        self.merges: Dict[Tuple[str, str], str] = {}
        # All known tokens: single characters plus merged symbols.
        self.vocab: Set[str] = set()

    def get_stats(self, vocab: Dict[str, int]) -> Dict[Tuple[str, str], int]:
        """Count adjacent symbol-pair frequencies across the working corpus.

        Args:
            vocab: Mapping of space-separated symbol strings to word frequency.

        Returns:
            Mapping of (left, right) symbol pairs to total frequency.
        """
        pairs: Dict[Tuple[str, str], int] = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq
        return pairs

    def merge_vocab(self, pair: Tuple[str, str], v_in: Dict[str, int]) -> Dict[str, int]:
        """Apply one merge to every word in the working vocabulary.

        Uses a regex bounded by non-whitespace lookarounds so the pair is
        merged only where both symbols are complete tokens.  A plain
        ``str.replace`` would also match when the pair's text appears inside
        a longer token (e.g. pair ('a', 'bc') matching inside 'xa bc'),
        silently fusing unrelated tokens.

        Args:
            pair: The (left, right) symbol pair to merge.
            v_in: Current mapping of space-separated symbols to frequency.

        Returns:
            New vocabulary mapping with the pair merged everywhere it
            occurs as two whole adjacent tokens.
        """
        pattern = re.compile(
            r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        replacement = ''.join(pair)
        return {pattern.sub(replacement, word): freq for word, freq in v_in.items()}

    def fit(self, text: str) -> None:
        """Learn BPE merges from raw text.

        Args:
            text: Training corpus; split on whitespace into words.
        """
        # Initial character-level tokenization.
        word_freqs = Counter(text.split())

        # Seed the working vocabulary with space-joined character splits.
        vocab: Dict[str, int] = {}
        for word, freq in word_freqs.items():
            vocab[' '.join(word)] = freq
            self.vocab.update(word)

        # Merge budget: remaining vocabulary headroom, also capped by the
        # number of distinct words (original heuristic) to bound training.
        num_merges = min(self.vocab_size - len(self.vocab), len(vocab))

        for _ in range(num_merges):
            pairs = self.get_stats(vocab)
            if not pairs:
                break  # nothing left to merge

            best = max(pairs, key=pairs.get)
            vocab = self.merge_vocab(best, vocab)
            self.merges[best] = ''.join(best)
            self.vocab.add(self.merges[best])

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text by replaying learned merges in training order.

        Uses the same boundary-aware pattern as training so merges never
        cross token boundaries.  Patterns are compiled once, outside the
        per-word loop.

        Returns:
            List of BPE tokens; concatenating the tokens of each word
            reconstructs that word exactly.
        """
        compiled = [
            (re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)'), merged)
            for pair, merged in self.merges.items()
        ]

        tokens: List[str] = []
        for word in text.split():
            chars = ' '.join(word)
            for pattern, merged in compiled:
                chars = pattern.sub(merged, chars)
            tokens.extend(chars.split())
        return tokens

    def save(self, path: str) -> None:
        """Serialize merges and vocabulary as JSON at ``path``.

        Merge keys are stored as 'left right' strings since JSON keys
        cannot be tuples; tokens never contain spaces, so this is lossless.
        """
        with open(path, 'w', encoding='utf-8') as f:
            json.dump({
                'merges': {' '.join(k): v for k, v in self.merges.items()},
                'vocab': list(self.vocab)
            }, f, ensure_ascii=False)

    def load(self, path: str) -> None:
        """Restore merges and vocabulary from a JSON file written by save()."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.merges = {tuple(k.split()): v for k, v in data['merges'].items()}
        self.vocab = set(data['vocab'])
# Training and evaluation code
def main():
    """Train the Awadhi BPE tokenizer on the Sunderkand corpus and report stats."""
    # Read the training corpus.
    with open('sunderkand_awdhi.txt', 'r', encoding='utf-8') as src:
        corpus = src.read()

    # Budget kept slightly under 5000 so the final vocabulary stays in bounds.
    bpe = AwadhiBPE(vocab_size=4500)
    bpe.fit(corpus)

    # Persist the trained tokenizer, then measure how well it compresses
    # the very corpus it was trained on.
    bpe.save('Awadhi_bpe.json')
    tokens = bpe.tokenize(corpus)

    original_size = len(corpus.encode('utf-8'))
    # Rough estimate: each token is assumed to cost 2 bytes on average.
    tokenized_size = 2 * len(tokens)
    compression_ratio = original_size / tokenized_size

    for line in (
        f"Original size (bytes): {original_size}",
        f"Tokenized size (bytes): {tokenized_size}",
        f"Compression ratio: {compression_ratio:.2f}",
        f"Vocabulary size: {len(bpe.vocab)}",
    ):
        print(line)

if __name__ == "__main__":
    main()
config.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: Awadhi BPE Tokenizer
2
+ emoji: 🇮🇳
3
+ colorFrom: blue
4
+ colorTo: red
5
+ sdk: gradio
6
+ sdk_version: 3.50.2
7
+ app_file: app.py
8
+ pinned: false
9
+ license: mit
10
+ python_version: "3.10"
11
+ app_port: 7860
12
+ tags:
13
+ - awadhi
14
+ - tokenizer
15
+ - bpe
16
+ - text-compression
17
+ datasets:
18
+ - sunderkand_awdhi
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ torch