# scripts/evaluate_finetune_rag.py
import os
from dotenv import load_dotenv
import pandas as pd
from tqdm import tqdm
from typing import List
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from operator import itemgetter
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas import evaluate, EvaluationDataset
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity
)
# Load environment variables
load_dotenv()
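# Fail-fast guard (a minimal sketch): assumes the OpenAI key is supplied
# via .env or the shell environment.
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError("OPENAI_API_KEY is not set; add it to your .env file.")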
# Initialize URLs and load documents
urls = [
    "https://www.timeout.com/london/things-to-do-in-london-this-weekend",
    "https://www.timeout.com/london/london-events-in-march"
]
loader = WebBaseLoader(urls)
docs = loader.load()
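# Sanity check: WebBaseLoader may hand back empty page content if a fetch
# fails, so verify something was actually loaded before chunking.
if not docs or not any(doc.page_content.strip() for doc in docs):
    raise RuntimeError(f"No page content loaded from {urls}")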
# Text splitting
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50,
    length_function=len
)
split_documents = text_splitter.split_documents(docs)
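print(f"Split {len(docs)} pages into {len(split_documents)} chunks")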
# Initialize embedding models
openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
base_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
finetuned_embeddings = HuggingFaceEmbeddings(model_name="ric9176/cjo-ft-v0")
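# "ric9176/cjo-ft-v0" is assumed to be a fine-tuned checkpoint of
# snowflake-arctic-embed-l; any sentence-transformers-compatible model
# name can be swapped in here.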
def create_rag_chain(documents: List[Document], embeddings, k: int = 6):
    """Create a RAG chain with the specified embedding model."""
    # Create vector store and retriever
    vectorstore = FAISS.from_documents(documents, embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    # Create RAG prompt
    rag_prompt = ChatPromptTemplate.from_template("""
    Given a provided context and a question, you must answer the question.
    If you do not know the answer, you must state that you do not know.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """)
    # Create LLM
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    # Create RAG chain: fetch context for the question, answer with the LLM,
    # and pass the retrieved documents through for later inspection.
    rag_chain = (
        RunnableParallel(
            context=itemgetter("question") | retriever,
            question=itemgetter("question"),
        )
        | {"response": rag_prompt | llm | StrOutputParser(), "context": itemgetter("context")}
    )
    return rag_chain
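# Example of exercising a chain on its own (the question is hypothetical):
#   chain = create_rag_chain(split_documents, openai_embeddings)
#   out = chain.invoke({"question": "What's on in London this weekend?"})
#   print(out["response"])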
def evaluate_embeddings(documents):
    """Evaluate RAG chains built on different embedding models using RAGAS."""
    results = {}
    # Create RAG chains for each embedding model
    chains = {
        "OpenAI": create_rag_chain(documents, openai_embeddings),
        "Base Arctic": create_rag_chain(documents, base_embeddings),
        "Fine-tuned Arctic": create_rag_chain(documents, finetuned_embeddings)
    }
    # Generate a single test dataset using RAGAS, so every model is scored
    # against the same questions
    generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
    generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
    dataset = generator.generate_with_langchain_docs(documents, testset_size=10)
    # One evaluator LLM, reused across models
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
    # Evaluate each model
    for model_name, chain in chains.items():
        print(f"\nEvaluating {model_name}...")
        # Process questions through the RAG pipeline
        for test_row in tqdm(dataset):
            response = chain.invoke({"question": test_row.eval_sample.user_input})
            test_row.eval_sample.response = response["response"]
            test_row.eval_sample.retrieved_contexts = [
                context.page_content for context in response["context"]
            ]
        # Convert to an evaluation dataset (snapshots this model's answers)
        evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())
        # Run RAGAS evaluation
        result = evaluate(
            dataset=evaluation_dataset,
            metrics=[
                LLMContextRecall(),
                Faithfulness(),
                FactualCorrectness(),
                ResponseRelevancy(),
                ContextEntityRecall(),
                NoiseSensitivity()
            ],
            llm=evaluator_llm
        )
        results[model_name] = result
    return results
# Run evaluation
print("Starting evaluation of embedding models...")
results = evaluate_embeddings(split_documents)
# Save results
print("\nSaving results...")
os.makedirs("docs", exist_ok=True)
# Save detailed results for each model
for model_name, result in results.items():
    df = result.to_pandas()
    filename = f"docs/evaluation_{model_name.lower().replace(' ', '_')}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved results for {model_name} to {filename}")
# Create comparison table
comparison = pd.DataFrame()
for model_name, result in results.items():
    # Aggregate the per-sample metric scores into one mean per metric
    comparison[model_name] = result.to_pandas().mean(numeric_only=True)
# Save comparison
comparison.to_csv("docs/embedding_comparison.csv")
print("\nSaved comparison to docs/embedding_comparison.csv")
# Print comparison
print("\nEmbedding Models Comparison:")
print(comparison)
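# Optionally also write the comparison as a markdown table; DataFrame.to_markdown()
# needs the optional `tabulate` dependency, so skip quietly if it is absent.
try:
    with open("docs/embedding_comparison.md", "w") as f:
        f.write(comparison.to_markdown())
    print("Saved markdown comparison to docs/embedding_comparison.md")
except ImportError:
    pass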