Spaces:
Sleeping
Sleeping
File size: 1,149 Bytes
65224b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# src/utils.py
import re
from collections import Counter
import json
def tokenize(text):
    """Lowercase *text* and return its word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore); punctuation and whitespace are discarded.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())
def build_vocab(tokenized_texts, min_freq=2):
    """Build a token-to-index vocabulary from an iterable of token lists.

    Indices 0 and 1 are reserved for the '<PAD>' and '<UNK>' specials;
    remaining tokens are numbered in first-seen order. Tokens occurring
    fewer than *min_freq* times across all texts are dropped.
    """
    freqs = Counter()
    for doc in tokenized_texts:
        freqs.update(doc)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, count in freqs.items():
        if count < min_freq:
            continue  # too rare — callers map such tokens to '<UNK>'
        # Index equals the current vocab size, preserving insertion order.
        vocab[token] = len(vocab)
    return vocab
def save_vocab(vocab, filepath='vocab.json'):
    """Serialize *vocab* to *filepath* as pretty-printed UTF-8 JSON.

    Non-ASCII tokens are written verbatim (ensure_ascii=False) with a
    4-space indent, matching the original on-disk format exactly.
    """
    payload = json.dumps(vocab, ensure_ascii=False, indent=4)
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(payload)
def load_vocab(filepath='vocab.json'):
    """Read the JSON vocabulary stored at *filepath* and return it as a dict."""
    with open(filepath, encoding='utf-8') as handle:
        raw = handle.read()
    return json.loads(raw)
|