HindiBPE / backend /app /main_tokenization_process.py
Vibi007's picture
updated
1914537
import re
from app.token_frequency_tracker import TokenFrequencyTracker
from app.dynamic_vocabulary_manager import DynamicVocabularyManager
from app.adaptive_bpe import AdaptiveBPE
from app.feedback_loop import FeedbackLoop
# Runs of characters from the Devanagari Unicode block (U+0900-U+097F).
_DEVANAGARI_RUN = re.compile(r"[\u0900-\u097F]+")

# Runs of three or more decimal digits that are NOT in the Devanagari block.
# In Python 3, ``\d`` matches every Unicode decimal digit, including the
# Devanagari digits U+0966-U+096F.  Those are already captured by
# _DEVANAGARI_RUN, so without the lookahead a sequence such as "१२३" would be
# returned twice (once as a word, once as a number).
_LONG_NUMBER = re.compile(r"(?:(?![\u0900-\u097F])\d){3,}")


def preprocess_text(text):
    """Extract Devanagari runs and long digit sequences from *text*.

    Args:
        text: The raw input string.

    Returns:
        A list containing every maximal run of Devanagari-block characters
        (in order of appearance), followed by every run of three or more
        non-Devanagari decimal digits (in order of appearance).  Everything
        else in *text* is discarded.  The list is empty when nothing matches.
    """
    words = _DEVANAGARI_RUN.findall(text)
    numbers = _LONG_NUMBER.findall(text)
    # Words first, then numbers: callers receive the two groups concatenated,
    # not interleaved in source order.
    return words + numbers
def calculate_compression_ratio(original_text, tokens):
    """Return the length of *original_text* relative to the joined tokens.

    The denominator is the character length of the tokens joined with single
    spaces.

    Args:
        original_text: The text the tokens were extracted from.
        tokens: A sequence of token strings.

    Returns:
        ``len(original_text) / len(" ".join(tokens))`` as a float, or ``0.0``
        when the joined tokens are empty (no tokens, or only empty tokens),
        where the ratio is undefined.
    """
    joined_length = len(" ".join(tokens))
    if joined_length == 0:
        # Avoid ZeroDivisionError for texts that produced no usable tokens.
        return 0.0
    return len(original_text) / joined_length
def main_tokenization_process(
    texts,
    initial_vocabulary,
    initial_merges,
    target_compression_ratio,
    frequency_threshold,
):
    """Run the tokenization pipeline over *texts*.

    For each text the function extracts tokens with ``preprocess_text``,
    feeds them to the frequency tracker, passes the tracker's current
    frequencies to the vocabulary manager and to the BPE component, and then
    reports the text's compression ratio to the feedback loop.

    Args:
        texts: Iterable of input strings to process.
        initial_vocabulary: Passed to ``DynamicVocabularyManager``.
        initial_merges: Passed to ``AdaptiveBPE``.
        target_compression_ratio: Passed to ``FeedbackLoop``.
        frequency_threshold: Passed to
            ``DynamicVocabularyManager.update_vocabulary`` on every text.

    Returns:
        A ``(vocabulary, merges)`` tuple taken from
        ``vocabulary_manager.get_vocabulary()`` and ``bpe.get_merges()`` after
        all texts have been processed.
    """
    frequency_tracker = TokenFrequencyTracker()
    vocabulary_manager = DynamicVocabularyManager(initial_vocabulary)
    bpe = AdaptiveBPE(initial_merges)
    feedback_loop = FeedbackLoop(target_compression_ratio)

    for text in texts:
        # Preprocess text to extract relevant tokens
        tokens = preprocess_text(text)

        frequency_tracker.update_frequencies(tokens)
        token_frequencies = frequency_tracker.get_frequencies()
        vocabulary_manager.update_vocabulary(token_frequencies, frequency_threshold)
        bpe.perform_merges(token_frequencies)

        if not tokens:
            # Nothing was extracted from this text, so its compression ratio
            # is undefined (the denominator would be zero).  Skip the
            # feedback step instead of aborting the whole run.
            continue

        current_compression_ratio = calculate_compression_ratio(text, tokens)
        # The result is currently unused; the call is kept because
        # evaluate_performance may update the feedback loop's internal state.
        # NOTE(review): confirm whether a truthy result should trigger an
        # adjustment here.
        feedback_loop.evaluate_performance(current_compression_ratio)

    return vocabulary_manager.get_vocabulary(), bpe.get_merges()