# Data preprocessing script: loads raw JSON chat data, renders each row
# through the model's chat template, saves the cleaned dataset as JSONL,
# and records the output path in the project's .env file.
import os
from random import randrange
from datetime import datetime

from dotenv import (
    load_dotenv,
    find_dotenv,
    dotenv_values,
    set_key,
)
from datasets import load_dataset
from transformers import AutoTokenizer

# --------------------------------------------------- #
# ----------------- Load env vars ------------------- #
# --------------------------------------------------- #
print("Loading env variables.........................")
# Populate os.environ from the nearest .env file (no-op if none is found).
_ = load_dotenv(find_dotenv())
# All three may be None if the .env file is missing a key -- downstream
# calls (AutoTokenizer, set_key) will fail loudly in that case.
CHATBOT_NAME = os.environ.get("CHATBOT_NAME")
ENV_FILE = os.environ.get("ENV_FILE")
MODEL_NAME = os.environ.get("MODEL_NAME")
# ----------------------------------------------- #
# ----------------- Load data ------------------- #
# ----------------------------------------------- #
print("Loading data.........................")
DATA_PATH = 'data/raw_data.json'
if DATA_PATH.endswith((".json", ".jsonl")):
    # Hugging Face `datasets` JSON loader; yields a DatasetDict with a
    # default "train" split.
    dataset = load_dataset("json", data_files=DATA_PATH)
else:
    # Fail fast: without this, `dataset` stays unbound and the script
    # crashes later with a confusing NameError.
    raise ValueError(
        f"Unsupported data file format: '{DATA_PATH}' (expected .json or .jsonl)"
    )
# ----------------------------------------------------- #
# ----------------- Preprocess data ------------------- #
# ----------------------------------------------------- #
print("Cleaning data.........................")
# ----------------- TO DO ------------------ #
# For the prompt, implement the same using   #
# langchain.prompt SystemPrompt, HumanPrompt #
# ------------------------------------------ #
# ----------------- TO DO ------------------ #
# For the conversational data, adjust the    #
# prompt to have a chain of user-ai query.   #
# ------------------------------------------ #
# The tokenizer supplies the model-specific chat template applied in
# create_text_row below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    use_fast=True,
)
# Many causal-LM tokenizers ship without a pad token; reuse EOS and pad on
# the right -- the common setup for causal-LM fine-tuning.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
def create_text_row(data):
    """Render one dataset row into the model's chat format.

    Builds a two-turn (user -> assistant) conversation from the row and
    stores the templated string under the ``"text"`` key, ready for
    supervised fine-tuning.

    Args:
        data: A dataset row (dict-like) with ``'user'`` and ``'response'``
            keys. (The previous revision also read ``'input'`` but never
            used it -- and shadowed the ``input`` builtin doing so.)

    Returns:
        The same row, augmented with a ``"text"`` field.
    """
    chat_prompt = [
        {"role": "user", "content": data["user"]},
        {"role": "assistant", "content": data["response"]},
    ]
    # NOTE(review): add_generation_prompt=True appends the assistant header
    # after the completed turns; for training data this is usually False --
    # confirm against the model's chat template.
    data["text"] = tokenizer.apply_chat_template(
        chat_prompt,
        tokenize=False,
        add_generation_prompt=True
    )
    return data
# Shuffle deterministically (fixed seed for reproducibility), then add the
# templated "text" column row by row.
train_dataset = dataset['train'].shuffle(seed=2023).map(create_text_row)
# ----------------------------------------------------------- #
# ----------------- Saving processed data ------------------- #
# ----------------------------------------------------------- #
print("Saving processed data.........................")
# Timestamped filename so successive runs never overwrite each other.
now = datetime.now().strftime('%Y-%m-%d-%Hh%Mm')
data_cleaned_path = f"data/cleaned/{now}_data_cleaned.jsonl"
# Ensure the output directory exists -- to_json does not create it, and a
# fresh checkout would otherwise fail here on the first run.
os.makedirs(os.path.dirname(data_cleaned_path), exist_ok=True)
train_dataset.to_json(data_cleaned_path)
print(f"\tProcessed data saved to '{data_cleaned_path}'")
# ---------------------------------------------------------- #
# ----------------- Setting new env vars ------------------- #
# ---------------------------------------------------------- #
# Persist the output path to .env so downstream steps (training) can read
# CLEANED_DATA_PATH without re-running preprocessing. (A dead
# `dotenv_values(ENV_FILE)` read and a bare no-op expression were removed.)
set_key(ENV_FILE, "CLEANED_DATA_PATH", data_cleaned_path)
print("Data processing completed")