# HindiBPE: backend/train_bpe.py
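# Module overview (added description, inferred from the code below): this script loads a
# line-per-sentence Hindi corpus, trains the project's BPETokenizer with a dynamic
# vocabulary manager and feedback loop, saves the result to bpe_model.json, and prints a
# few sample tokenizations. The app.* modules are project-local; their APIs are assumed
# from how they are called here.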
import asyncio
import json
from typing import Optional

from tqdm import tqdm

from app.bpe_tokenizer import BPETokenizer
from app.dynamic_vocabulary_manager import DynamicVocabularyManager
from app.token_frequency_tracker import TokenFrequencyTracker
from app.feedback_loop import FeedbackLoop

def load_sample_data(file_path: str, max_sentences: Optional[int] = None) -> str:
    """Load sentences from a file, one per line, with a progress bar."""
    sentences = []
    total_chars = 0
    print(f"Loading data from {file_path}...")

    # First count total lines so tqdm can show an accurate progress bar
    with open(file_path, "r", encoding="utf-8") as f:
        total_lines = sum(1 for _ in f)

    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, total=total_lines, desc="Loading sentences"):
            line = line.strip()
            if line:
                sentences.append(line)
                total_chars += len(line)
                if max_sentences and len(sentences) >= max_sentences:
                    break

    print(f"\nLoaded {len(sentences)} sentences")
    print(f"Total characters: {total_chars}")
    if sentences:
        print(f"Average sentence length: {total_chars / len(sentences):.2f} characters")
    return "\n".join(sentences)

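# Note on the expected corpus format (inferred from load_sample_data, not stated in the
# original script): the corpus file is read as plain UTF-8 text with one sentence per
# line; blank lines are skipped and reading stops once max_sentences non-empty lines
# have been collected.
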
async def train_and_save_bpe(text: str, vocab_size: int = 5000):
    """Train a BPE tokenizer on the given text and save the model to bpe_model.json."""
    tokenizer = BPETokenizer(vocab_size=vocab_size)
    # Note: frequency_tracker is created here but not referenced again in this script.
    frequency_tracker = TokenFrequencyTracker()
    vocab_manager = DynamicVocabularyManager(initial_vocabulary=tokenizer.vocab)
    feedback_loop = FeedbackLoop(target_compression_ratio=1.5)

    print("\nInitializing vocabulary...")
    tokenizer.initialize_vocab()

    print("\nStarting BPE training...")
    await tokenizer.learn_bpe(text, manager=vocab_manager)

    # Integrate the feedback loop: evaluate compression and adjust the tokenizer
    feedback_loop.evaluate_and_adjust(tokenizer)

    # Save the final model
    model_data = {
        "vocab": list(tokenizer.vocab),
        "merges": {" ".join(k): v for k, v in tokenizer.merges.items()},
        "merge_history": tokenizer.merge_history,
        "base_vocab_stats": tokenizer.base_vocab_stats,
        "training_stats": {
            "total_merges": len(tokenizer.merge_history),
            "final_vocab_size": len(tokenizer.vocab),
            "learned_vocab_size": len(tokenizer.learned_vocab),
        },
    }
    with open("bpe_model.json", "w", encoding="utf-8") as f:
        json.dump(model_data, f, ensure_ascii=False, indent=2)

    return tokenizer

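# Illustrative sketch (not part of the original training flow): reload and summarise a
# previously saved bpe_model.json. Only the JSON structure written above is assumed;
# how BPETokenizer itself restores state from such a file is not shown here.
def inspect_saved_model(path: str = "bpe_model.json") -> None:
    """Print summary statistics from a saved BPE model file."""
    with open(path, "r", encoding="utf-8") as f:
        model_data = json.load(f)
    stats = model_data["training_stats"]
    print(f"Final vocab size: {stats['final_vocab_size']}")
    print(f"Learned vocab size: {stats['learned_vocab_size']}")
    print(f"Total merges: {stats['total_merges']}")
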
async def main():
    # Load the larger dataset
    text = load_sample_data("data/hindi_wiki_corpus.txt", max_sentences=10000)

    # Train BPE with a larger vocabulary
    tokenizer = await train_and_save_bpe(text, vocab_size=10000)

    # Show sample tokenizations
    print("\nSample Tokenization:")
    sample_texts = ["नमस्ते भारत", "मैं हिंदी सीख रहा हूं", "यह एक बहुत अच्छा दिन है"]
    for sample_text in sample_texts:
        result = tokenizer.tokenize_with_details(sample_text)
        print(f"\nInput text: {sample_text}")
        print(f"Original tokens: {result['original_tokens']}")
        print(f"BPE tokens: {result['bpe_tokens']}")
        print(f"Compression ratio: {result['stats']['compression_ratio']}:1")


if __name__ == "__main__":
    asyncio.run(main())
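
# Usage note (assumptions: data/hindi_wiki_corpus.txt exists relative to the working
# directory and the app package is importable from there):
#   python train_bpe.py
# The trained model is written to bpe_model.json in the current working directory.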