import re

from app.token_frequency_tracker import TokenFrequencyTracker
from app.dynamic_vocabulary_manager import DynamicVocabularyManager
from app.adaptive_bpe import AdaptiveBPE
from app.feedback_loop import FeedbackLoop


def preprocess_text(text):
    # Define regex patterns to match Devanagari character runs and digit sequences
    devanagari_pattern = re.compile(r"[\u0900-\u097F]+")
    number_pattern = re.compile(r"\d{3,}")

    # Find all Devanagari words and sequences of three or more digits
    words = devanagari_pattern.findall(text)
    numbers = number_pattern.findall(text)

    # Combine Devanagari words and number sequences into a single token list
    return words + numbers


def calculate_compression_ratio(original_text, tokens):
    # Ratio of the original character length to the length of the tokenized form;
    # guard against an empty token list to avoid division by zero
    tokenized_length = len(" ".join(tokens))
    if tokenized_length == 0:
        return 0.0
    return len(original_text) / tokenized_length


def main_tokenization_process(
    texts,
    initial_vocabulary,
    initial_merges,
    target_compression_ratio,
    frequency_threshold,
):
    frequency_tracker = TokenFrequencyTracker()
    vocabulary_manager = DynamicVocabularyManager(initial_vocabulary)
    bpe = AdaptiveBPE(initial_merges)
    feedback_loop = FeedbackLoop(target_compression_ratio)

    for text in texts:
        # Preprocess text to extract relevant tokens
        tokens = preprocess_text(text)

        # Track token frequencies across the corpus
        frequency_tracker.update_frequencies(tokens)
        token_frequencies = frequency_tracker.get_frequencies()

        # Grow or prune the vocabulary based on the observed frequencies
        vocabulary_manager.update_vocabulary(token_frequencies, frequency_threshold)

        # Apply adaptive BPE merges driven by the updated frequencies
        bpe.perform_merges(token_frequencies)

        # Evaluate compression performance against the target ratio
        current_compression_ratio = calculate_compression_ratio(text, tokens)
        feedback_loop.evaluate_performance(current_compression_ratio)

    return vocabulary_manager.get_vocabulary(), bpe.get_merges()
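

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the pipeline itself: the sample
    # texts, seed vocabulary, merge rules, target ratio, and frequency
    # threshold below are assumed placeholder values, shown only to
    # demonstrate how main_tokenization_process is wired together.
    sample_texts = [
        "नमस्ते दुनिया 12345",
        "भाषा मॉडल के लिए टोकनाइज़ेशन 2024",
    ]
    seed_vocabulary = {"नमस्ते", "दुनिया"}  # hypothetical starting vocabulary
    seed_merges = []  # hypothetical: start with no learned merge rules

    vocabulary, merges = main_tokenization_process(
        sample_texts,
        seed_vocabulary,
        seed_merges,
        target_compression_ratio=2.0,
        frequency_threshold=2,
    )
    print("Final vocabulary:", vocabulary)
    print("Learned merges:", merges)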