File size: 1,149 Bytes
65224b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# src/utils.py
import re
from collections import Counter
import json

def tokenize(text):
    """Split *text* into lowercase word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore); punctuation and whitespace are discarded.
    """
    return re.findall(r'\b\w+\b', text.lower())

def build_vocab(tokenized_texts, min_freq=2):
    """Build a token -> index vocabulary from an iterable of token lists.

    Indices 0 and 1 are reserved for '<PAD>' and '<UNK>'. Tokens seen
    fewer than *min_freq* times across all texts are excluded; the rest
    are numbered in first-seen order.
    """
    # Counter preserves first-seen insertion order, so index assignment
    # below matches the order tokens first appear in the corpus.
    freqs = Counter(token for tokens in tokenized_texts for token in tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, count in freqs.items():
        if count >= min_freq:
            vocab[token] = len(vocab)
    return vocab

def save_vocab(vocab, filepath='vocab.json'):
    """Write *vocab* to *filepath* as pretty-printed UTF-8 JSON."""
    with open(filepath, 'w', encoding='utf-8') as out:
        json.dump(vocab, out, ensure_ascii=False, indent=4)

def load_vocab(filepath='vocab.json'):
    """Read a vocabulary dictionary back from the JSON file at *filepath*."""
    with open(filepath, 'r', encoding='utf-8') as src:
        return json.load(src)