# This file is a WIP: an attempt to locally recreate
# https://colab.research.google.com/drive/1-h3rPUzV-j9VzD9Rg7ZLGKEp-jMNFaje?usp=sharing
# Known issue: the script was not able to load previously generated training data
# from disk. Note that each dataset file holds a single JSON object despite its
# .jsonl extension.
import uuid
import json
import asyncio
from pathlib import Path

import tqdm
import wandb
from dotenv import load_dotenv
from torch.utils.data import DataLoader
from huggingface_hub import login
from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 1000
CHUNK_OVERLAP = CHUNK_SIZE // 2
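# A 50% overlap is presumably chosen so that text straddling a chunk boundary
# appears intact in at least one chunk.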
QA_PROMPT = """\
Given the following context, you must generate questions based only on the provided context.
You are to generate {n_questions} questions which should be provided in the following format:
1. QUESTION #1
2. QUESTION #2
...
Context:
{context}
"""
fine_tuning_data_dir = Path("data/finetuning")
fine_tuning_data_dir.mkdir(parents=True, exist_ok=True)

async def create_questions(documents, n_questions, question_generation_chain):
    questions = {}
    relevant_docs = {}
    for document in tqdm.tqdm(documents):
        context = document.page_content
        # Ask the LLM for n_questions numbered questions about this chunk
        response = await question_generation_chain.ainvoke(
            {"context": context, "n_questions": n_questions}
        )
        # Take one question per non-empty line, stripping the "N. " numbering
        # the prompt asks for
        lines = [line for line in response.content.split("\n") if line.strip()]
        for line in lines[:n_questions]:
            question_id = str(uuid.uuid4())
            while question_id in questions:
                question_id = str(uuid.uuid4())
            questions[question_id] = line.split(".", 1)[-1].strip()
            # Map each question back to the chunk it was generated from
            relevant_docs[question_id] = [document.metadata["id"]]
    return questions, relevant_docs

async def main():
    path = "data/scraped/clean"
    # The scraped files appear to be markdown saved with a .txt extension,
    # hence the markdown loader paired with a *.txt glob
    text_loader = DirectoryLoader(path, glob="*.txt", loader_cls=UnstructuredMarkdownLoader)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
    )
    training_documents = text_splitter.split_documents(text_loader.load())
    # Assign a unique id to each chunk so questions can reference their source
    id_set = set()
    for document in training_documents:
        doc_id = str(uuid.uuid4())
        while doc_id in id_set:
            doc_id = str(uuid.uuid4())
        id_set.add(doc_id)
        document.metadata["id"] = doc_id
    TRAINING_DOC_LENGTH = len(training_documents)
    BREAK1 = TRAINING_DOC_LENGTH - 24
    BREAK2 = TRAINING_DOC_LENGTH - 12
    training_split_documents = training_documents[:BREAK1]
    eval_split_documents = training_documents[BREAK1:BREAK2]
    test_split_documents = training_documents[BREAK2:]
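    # Everything before the last 24 chunks trains the model; the next 12 are
    # held out for evaluation and the final 12 for testing.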
    qa_chat_model = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
    )
    qa_prompt_template = ChatPromptTemplate.from_template(QA_PROMPT)
    question_generation_chain = qa_prompt_template | qa_chat_model
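    # The | operator (LCEL) composes the prompt and the chat model into one
    # runnable, so a single ainvoke() formats the prompt and calls the LLM.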
    # Try to load the training data from disk, otherwise generate new data
    try:
        with open(fine_tuning_data_dir / "training_dataset.jsonl") as f:
            training_dataset = json.load(f)
        training_questions = training_dataset["questions"]
        training_relevant_contexts = training_dataset["relevant_contexts"]
        training_corpus = training_dataset["corpus"]
    except (FileNotFoundError, json.JSONDecodeError):
        training_questions, training_relevant_contexts = await create_questions(
            training_split_documents, 2, question_generation_chain
        )
        # Build the corpus from the training split so every id referenced in
        # training_relevant_contexts resolves to a chunk
        training_corpus = {
            item.metadata["id"]: item.page_content for item in training_split_documents
        }
        training_dataset = {
            "questions": training_questions,
            "relevant_contexts": training_relevant_contexts,
            "corpus": training_corpus,
        }
        with open(fine_tuning_data_dir / "training_dataset.jsonl", "w") as f:
            json.dump(training_dataset, f)
    # Try to load the eval data from disk, otherwise generate new data
    try:
        with open(fine_tuning_data_dir / "eval_dataset.jsonl") as f:
            eval_dataset = json.load(f)
        eval_questions = eval_dataset["questions"]
        eval_relevant_contexts = eval_dataset["relevant_contexts"]
        eval_corpus = eval_dataset["corpus"]
    except (FileNotFoundError, json.JSONDecodeError):
        eval_questions, eval_relevant_contexts = await create_questions(
            eval_split_documents, 2, question_generation_chain
        )
        eval_corpus = {
            item.metadata["id"]: item.page_content for item in eval_split_documents
        }
        eval_dataset = {
            "questions": eval_questions,
            "relevant_contexts": eval_relevant_contexts,
            "corpus": eval_corpus,
        }
        with open(fine_tuning_data_dir / "eval_dataset.jsonl", "w") as f:
            json.dump(eval_dataset, f)
    # Try to load the test data from disk, otherwise generate new data
    try:
        with open(fine_tuning_data_dir / "test_dataset.jsonl") as f:
            test_dataset = json.load(f)
        test_questions = test_dataset["questions"]
        test_relevant_contexts = test_dataset["relevant_contexts"]
        test_corpus = test_dataset["corpus"]
    except (FileNotFoundError, json.JSONDecodeError):
        test_questions, test_relevant_contexts = await create_questions(
            test_split_documents, 2, question_generation_chain
        )
        test_corpus = {
            item.metadata["id"]: item.page_content for item in test_split_documents
        }
        test_dataset = {
            "questions": test_questions,
            "relevant_contexts": test_relevant_contexts,
            "corpus": test_corpus,
        }
        with open(fine_tuning_data_dir / "test_dataset.jsonl", "w") as f:
            json.dump(test_dataset, f)
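    # ---- Fine-tune the embedding model on the generated question/chunk pairs ----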
    BATCH_SIZE = 10
    MODEL_ID = "Snowflake/snowflake-arctic-embed-l"
    model = SentenceTransformer(MODEL_ID)
    # Disable Weights & Biases logging for this run
    wandb.init(mode="disabled")
    corpus = training_dataset["corpus"]
    queries = training_dataset["questions"]
    relevant_docs = training_dataset["relevant_contexts"]
    # Pair each question with its source chunk as a positive training example
    examples = []
    for query_id, query in queries.items():
        doc_id = relevant_docs[query_id][0]
        text = corpus[doc_id]
        examples.append(InputExample(texts=[query, text]))
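    # MultipleNegativesRankingLoss treats every other chunk in a batch as a
    # negative for a given question, so explicit negative mining is unnecessary.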
    # Shuffle so the in-batch negatives vary from epoch to epoch
    loader = DataLoader(examples, batch_size=BATCH_SIZE, shuffle=True)
    matryoshka_dimensions = [768, 512, 256, 128, 64]
    inner_train_loss = MultipleNegativesRankingLoss(model)
    train_loss = MatryoshkaLoss(
        model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
    )
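    # MatryoshkaLoss applies the inner loss at each listed dimension so the
    # embeddings remain useful when truncated to shorter prefixes.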
    # Evaluate retrieval on the held-out eval split rather than the training data
    evaluator = InformationRetrievalEvaluator(
        eval_dataset["questions"], eval_dataset["corpus"], eval_dataset["relevant_contexts"]
    )
    EPOCHS = 10
    # Warm the learning rate up over the first 10% of total training steps
    warmup_steps = int(len(loader) * EPOCHS * 0.1)
    model.fit(
        train_objectives=[(loader, train_loss)],
        epochs=EPOCHS,
        warmup_steps=warmup_steps,
        output_path="AIE5-MidTerm-finetuned-embeddings",
        show_progress_bar=True,
        evaluator=evaluator,
        evaluation_steps=50,
    )
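    # The evaluator runs every 50 training steps and reports retrieval metrics
    # such as accuracy@k, MRR, and NDCG.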
    # login() works from a local script, unlike notebook_login(); it prompts
    # for a Hugging Face token if none is cached
    login()
    hf_username = "thomfoolery"
    model.push_to_hub(f"{hf_username}/AIE5-MidTerm-finetuned-embeddings")

if __name__ == "__main__":
    load_dotenv()  # ChatOpenAI expects OPENAI_API_KEY, loaded here from .env
    asyncio.run(main())