# src/utils.py
import json
import re
from collections import Counter


def tokenize(text):
    """
    Simple tokenizer: lowercases the text and extracts word tokens,
    effectively splitting on whitespace and punctuation.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens


def build_vocab(tokenized_texts, min_freq=2):
    """
    Builds a vocabulary dictionary from tokenized texts.
    Tokens appearing fewer than `min_freq` times are excluded.
    """
    counter = Counter()
    for tokens in tokenized_texts:
        counter.update(tokens)
    # Reserve indices 0 and 1 for the conventional padding and unknown tokens.
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab


def save_vocab(vocab, filepath='vocab.json'):
    """
    Saves the vocabulary dictionary to a JSON file.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)


def load_vocab(filepath='vocab.json'):
    """
    Loads the vocabulary dictionary from a JSON file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
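

# A minimal usage sketch, run only when this module is executed directly.
# The sample corpus below is hypothetical and simply illustrates the
# tokenize -> build_vocab -> save_vocab -> load_vocab round trip.
if __name__ == '__main__':
    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "The dog sleeps while the fox runs.",
    ]
    tokenized = [tokenize(text) for text in sample_texts]
    vocab = build_vocab(tokenized, min_freq=2)
    save_vocab(vocab, 'vocab.json')
    restored = load_vocab('vocab.json')
    # Only tokens seen at least `min_freq` times survive, plus <pad> and <unk>.
    print(restored)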