# xtrade_bot/archived/data_prep.py
# (Uploaded to the Hugging Face Hub via huggingface_hub; commit 65976bc, verified.)
import os
from random import randrange
from datetime import datetime
from dotenv import (
load_dotenv,
find_dotenv,
dotenv_values,
set_key
)
from datasets import load_dataset
from transformers import AutoTokenizer
# --------------------------------------------------- #
# ----------------- Load env vars ------------------ #
# --------------------------------------------------- #
print("Loading env variables.........................")
# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())
CHATBOT_NAME = os.environ.get('CHATBOT_NAME')
ENV_FILE = os.environ.get("ENV_FILE")
MODEL_NAME = os.environ.get("MODEL_NAME")
# ----------------------------------------------- #
# ----------------- Load data ------------------ #
# ----------------------------------------------- #
print("Loading data.........................")
DATA_PATH = 'data/raw_data.json'
# Fail fast on an unsupported extension: the original silently skipped the
# load, leaving `dataset` undefined and raising a confusing NameError later.
if DATA_PATH.endswith((".json", ".jsonl")):
    dataset = load_dataset("json", data_files=DATA_PATH)
else:
    raise ValueError(
        f"Unsupported data file format: '{DATA_PATH}' (expected .json or .jsonl)"
    )
# ----------------------------------------------------- #
# ----------------- Preprocess data ------------------ #
# ----------------------------------------------------- #
print("Cleaning data.........................")
# ----------------- TO DO ------------------ #
# For the prompt, implement the same using   #
# langchain prompt templates (system prompt  #
# and human prompt).                         #
# ------------------------------------------ #
# ----------------- TO DO ------------------ #
# For the conversational data, adjust the #
# prompt to have a chain of user-ai query. #
# ------------------------------------------ #
# This function produces the correctly formatted text for each row in the dataset.
# Tokenizer for the configured base model; the fast (Rust-backed)
# implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# No dedicated pad token is configured here, so EOS is reused for padding,
# with padding applied on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
def create_text_row(data):
    """Render one dataset row into chat-template text for fine-tuning.

    Builds a single user -> assistant exchange from the row's 'user' and
    'response' fields and stores the rendered string under ``data['text']``.
    The row's 'input' field, if present, is intentionally not read here
    (the original bound it to an unused local that shadowed the builtin
    ``input`` — removed).

    Args:
        data: A dataset row (dict-like) with 'user' and 'response' keys.

    Returns:
        The same row, with a 'text' key added.
    """
    chat_prompt = [
        {"role": "user", "content": data['user']},
        {"role": "assistant", "content": data['response']},
    ]
    # add_generation_prompt=False: the assistant's reply is already the last
    # message, so no trailing generation cue belongs in the training text.
    # (The original passed True, which appends an empty assistant prompt
    # after the completed reply.)
    data["text"] = tokenizer.apply_chat_template(
        chat_prompt,
        tokenize=False,
        add_generation_prompt=False,
    )
    return data
# Shuffle deterministically, then render every row's chat-template text.
train_dataset = dataset['train'].shuffle(seed=2023).map(create_text_row)
# ----------------------------------------------------------- #
# ----------------- Saving processed data ------------------ #
# ----------------------------------------------------------- #
print("Saving processed data.........................")
# Timestamped filename so repeated runs never overwrite earlier outputs.
now = datetime.now().strftime('%Y-%m-%d-%Hh%Mm')
data_cleaned_path = f"data/cleaned/{now}_data_cleaned.jsonl"
# to_json() does not create parent directories; make sure they exist.
os.makedirs(os.path.dirname(data_cleaned_path), exist_ok=True)
train_dataset.to_json(data_cleaned_path)
print(f"\tProcessed data saved to '{data_cleaned_path}'")
# ---------------------------------------------------------- #
# ----------------- Setting new env vars ------------------ #
# ---------------------------------------------------------- #
# Persist the cleaned-data path in the shared .env file so downstream
# scripts can pick it up. (The original also read dotenv_values() into an
# unused variable and left a bare no-op expression — notebook residue,
# removed.)
set_key(ENV_FILE, "CLEANED_DATA_PATH", data_cleaned_path)
print("Data processing completed")