Spaces:
Building
Building
import re | |
from app.token_frequency_tracker import TokenFrequencyTracker | |
from app.dynamic_vocabulary_manager import DynamicVocabularyManager | |
from app.adaptive_bpe import AdaptiveBPE | |
from app.feedback_loop import FeedbackLoop | |
# Compiled once at import time; both patterns are reused for every input text.
_DEVANAGARI_RUN = re.compile(r"[\u0900-\u097F]+")
# ASCII digits only. A bare ``\d`` also matches Devanagari digits
# (U+0966-U+096F), which the Devanagari pattern above already captures, so
# those digit runs would otherwise be returned twice.
_NUMBER_RUN = re.compile(r"[0-9]{3,}")


def preprocess_text(text):
    """Extract Devanagari runs and long ASCII numbers from *text*.

    Args:
        text: The raw input string.

    Returns:
        A list of strings: every maximal run of characters from the
        Devanagari block (U+0900-U+097F) in order of appearance, followed
        by every run of three or more ASCII digits in order of appearance.
        Words and numbers are NOT interleaved in their original positions;
        all words come first. Everything else in *text* is dropped.
    """
    words = _DEVANAGARI_RUN.findall(text)
    numbers = _NUMBER_RUN.findall(text)
    return words + numbers
def calculate_compression_ratio(original_text, tokens):
    """Return the ratio of the original text length to the tokenized length.

    The tokenized length is the length of the tokens joined by single
    spaces, so separators count towards it.

    Args:
        original_text: The text the tokens were extracted from.
        tokens: An iterable of token strings.

    Returns:
        ``len(original_text) / len(" ".join(tokens))`` as a float, or
        ``0.0`` when the joined tokens are empty (no tokens, or only empty
        tokens). The ratio is undefined in that case; 0.0 is returned as a
        sentinel instead of raising ``ZeroDivisionError``.
    """
    tokenized_length = len(" ".join(tokens))
    if tokenized_length == 0:
        # Nothing was extracted from the text; avoid dividing by zero.
        return 0.0
    return len(original_text) / tokenized_length
def main_tokenization_process(
    texts,
    initial_vocabulary,
    initial_merges,
    target_compression_ratio,
    frequency_threshold,
):
    """Run the adaptive tokenization loop over *texts*.

    For each text, the extracted tokens are fed to the frequency tracker,
    the accumulated frequencies are passed to the vocabulary manager and
    the BPE component, and the text's compression ratio is handed to the
    feedback loop.

    Args:
        texts: Iterable of input strings, processed in order.
        initial_vocabulary: Passed to ``DynamicVocabularyManager``.
        initial_merges: Passed to ``AdaptiveBPE``.
        target_compression_ratio: Passed to ``FeedbackLoop``.
        frequency_threshold: Passed to
            ``DynamicVocabularyManager.update_vocabulary`` on every
            iteration.

    Returns:
        A 2-tuple ``(vocabulary, merges)`` taken from
        ``vocabulary_manager.get_vocabulary()`` and ``bpe.get_merges()``
        after all texts have been processed.
    """
    frequency_tracker = TokenFrequencyTracker()
    vocabulary_manager = DynamicVocabularyManager(initial_vocabulary)
    bpe = AdaptiveBPE(initial_merges)
    feedback_loop = FeedbackLoop(target_compression_ratio)

    for text in texts:
        # Preprocess text to extract relevant tokens.
        tokens = preprocess_text(text)
        if not tokens:
            # Nothing usable in this text. Skipping it also avoids the
            # division by zero that the compression-ratio computation
            # would otherwise hit, which used to abort the whole batch.
            continue

        frequency_tracker.update_frequencies(tokens)
        token_frequencies = frequency_tracker.get_frequencies()

        vocabulary_manager.update_vocabulary(token_frequencies, frequency_threshold)
        bpe.perform_merges(token_frequencies)

        current_compression_ratio = calculate_compression_ratio(text, tokens)
        # NOTE(review): the return value is currently ignored (the original
        # branch on it was an empty ``pass``). The call is kept in case
        # evaluate_performance has side effects -- confirm and either act
        # on the result or drop this comment.
        feedback_loop.evaluate_performance(current_compression_ratio)

    return vocabulary_manager.get_vocabulary(), bpe.get_merges()