File size: 3,221 Bytes
65976bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
from random import randrange
from datetime import datetime
from dotenv import (
    load_dotenv,
    find_dotenv,
    dotenv_values,
    set_key
)

from datasets import load_dataset

from transformers import AutoTokenizer

# --------------------------------------------------- #
# ----------------- Load env vars ------------------  #
# --------------------------------------------------- #
print("Loading env variables.........................")

_ = load_dotenv(find_dotenv())

# Pipeline configuration; any of these may be None if the .env is incomplete.
CHATBOT_NAME = os.environ.get('CHATBOT_NAME')
ENV_FILE = os.environ.get("ENV_FILE")
MODEL_NAME = os.environ.get("MODEL_NAME")

# ----------------------------------------------- #
# ----------------- Load data ------------------  #
# ----------------------------------------------- #
print("Loading data.........................")
DATA_PATH = 'data/raw_data.json'
if DATA_PATH.endswith((".json", ".jsonl")):
    dataset = load_dataset("json", data_files=DATA_PATH)
else:
    # Fail fast: previously `dataset` was silently left undefined here and
    # the script later died with an unrelated-looking NameError.
    raise ValueError(
        f"Unsupported data file format: '{DATA_PATH}' (expected .json or .jsonl)"
    )

# ----------------------------------------------------- #
# ----------------- Preprocess data ------------------  #
# ----------------------------------------------------- #
print("Cleaning data.........................")

# TODO: build this prompt via langchain's system/human prompt templates
# instead of the hand-rolled message list below.

# TODO: for conversational data, extend the prompt into a full chain of
# alternating user/assistant turns.

# Tokenizer supplies the chat template used to format each dataset row.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# No pad token is defined by default; reuse EOS and pad on the right —
# NOTE(review): right padding is the common setup for causal-LM
# fine-tuning, confirm it matches the training code's expectations.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

def create_text_row(data):
    """Render one dataset row into the model's chat-template format.

    Builds a two-turn conversation (user prompt followed by the assistant
    answer) and stores the serialized template string under data["text"].

    Args:
        data: a dataset row dict with at least 'user' and 'response' keys.
              (An 'input' key may also be present but is currently unused.)

    Returns:
        The same row dict, with a new "text" field added (in-place update).
    """
    chat_prompt = [
        {"role": "user", "content": data['user']},
        {"role": "assistant", "content": data['response']},
    ]

    # add_generation_prompt must be False for training text: the assistant
    # reply is already part of the conversation, so appending a generation
    # prompt would add a spurious empty assistant turn to every example.
    data["text"] = tokenizer.apply_chat_template(
        chat_prompt,
        tokenize=False,
        add_generation_prompt=False
    )
    return data

# Shuffle deterministically, then render every row into chat format.
train_dataset = dataset['train'].shuffle(seed=2023).map(create_text_row)


# ----------------------------------------------------------- #
# ----------------- Saving processed data ------------------  #
# ----------------------------------------------------------- #
print("Saving processed data.........................")
# Timestamp the output file so successive runs never overwrite each other.
now = datetime.now().strftime('%Y-%m-%d-%Hh%Mm')

data_cleaned_path = f"data/cleaned/{now}_data_cleaned.jsonl"

# to_json does not create parent directories; ensure the folder exists so
# the script works on a fresh checkout.
os.makedirs(os.path.dirname(data_cleaned_path), exist_ok=True)
train_dataset.to_json(data_cleaned_path)

print(f"\tProcessed data saved to '{data_cleaned_path}'")


# ---------------------------------------------------------- #
# ----------------- Setting new env vars ------------------  #
# ---------------------------------------------------------- #
# Persist the cleaned-data path so downstream steps can read it from .env.
# (Previously a dotenv_values() result was computed and discarded, and a
# bare `existing_env_vals` expression was a no-op — both removed.)
if ENV_FILE:
    set_key(ENV_FILE, "CLEANED_DATA_PATH", data_cleaned_path)
else:
    # set_key would raise on a None path; warn instead of crashing at the
    # very end of an otherwise successful run.
    print("WARNING: ENV_FILE is not set; skipping CLEANED_DATA_PATH update")

print("Data processing completed")