# src/utils.py
import json
import re
from collections import Counter


def tokenize(text):
    """
    Simple tokenizer: lowercases the text and extracts word tokens,
    effectively splitting on whitespace and punctuation.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens


def build_vocab(tokenized_texts, min_freq=2):
    """
    Builds a vocabulary dictionary from tokenized texts.
    Tokens appearing fewer than `min_freq` times are excluded.
    """
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
    # Reserve indices 0 and 1 for the conventional padding and unknown tokens.
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab


def save_vocab(vocab, filepath='vocab.json'):
    """
    Saves the vocabulary dictionary to a JSON file.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)


def load_vocab(filepath='vocab.json'):
    """
    Loads the vocabulary dictionary from a JSON file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
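

# A minimal usage sketch, run only when this module is executed directly.
# The sample corpus below is hypothetical and simply illustrates the
# tokenize -> build_vocab -> save_vocab -> load_vocab round trip.
if __name__ == '__main__':
    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "The dog sleeps while the fox runs.",
    ]
    tokenized = [tokenize(text) for text in sample_texts]
    vocab = build_vocab(tokenized, min_freq=2)
    save_vocab(vocab, 'vocab.json')
    restored = load_vocab('vocab.json')
    # Only tokens seen at least `min_freq` times survive, plus <pad> and <unk>.
    print(restored)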