# After saving processed_data.json
import json
import os

from utils import build_vocab, load_data, save_vocab, tokenize

def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize processed_data, then build and save a vocabulary.

    Entries may be single strings or lists of strings; anything else
    is skipped.
    """
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            # A single text: tokenize it directly.
            tokenized_texts.append(tokenize(entry))
        elif isinstance(entry, list):
            # A nested list of texts: tokenize each string item.
            for item in entry:
                if isinstance(item, str):
                    tokenized_texts.append(tokenize(item))
    # Build the token vocabulary and persist it for later training runs.
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab

if __name__ == "__main__":
    data = load_data()
    tokenized_texts, vocab = prepare_training_data(data)
    # Ensure the output directory exists before writing.
    os.makedirs('data/processed', exist_ok=True)
    # Save tokenized data alongside the other processed artifacts.
    with open('data/processed/tokenized_data.json', 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
    print("Data processing complete. Tokenized data saved to data/processed/tokenized_data.json")