"""AIE5-MidTerm evaluation.py: generate a RAGAS test set and evaluate the RAG graph, optionally with fine-tuned embeddings."""
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from rag_graph import RagGraph
# Load environment variables
load_dotenv()
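# Toggle between base and fine-tuned embeddings: the flag picks the output folder below
# and is passed to RagGraph, which presumably selects the matching embedding model.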
USE_FINE_TUNED_EMBEDDINGS = True
# Create the evaluation output folder (fine-tuned runs write to a separate subfolder)
eval_data_filepath = Path("data/evaluation/finetuned") if USE_FINE_TUNED_EMBEDDINGS else Path("data/evaluation")
eval_data_filepath.mkdir(parents=True, exist_ok=True)
# NLTK resources required by DirectoryLoader's default (unstructured-based) text parsing
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# Initialize the LLMs and embeddings used for test set generation and LLM-as-judge evaluation
eval_gen_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_gen_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
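# Note: OpenAIEmbeddings() uses the langchain-openai default embedding model here;
# pass an explicit model name (e.g. OpenAIEmbeddings(model="text-embedding-3-small")) to pin it.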
def generate_test_data():
    """Generate test data for evaluation"""
    # Load documents
    loader = DirectoryLoader("data/scraped/clean", glob="*.txt")
    docs = loader.load()

    # Generate test set data
    generator = TestsetGenerator(
        llm=eval_gen_llm,
        embedding_model=eval_gen_embeddings
    )
    print("Generating test set data...")
    dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

    # Save test set data to file
    dataset.to_pandas().to_json(eval_data_filepath / "testset_data.json", orient="records", indent=4)

    # Upload test set data to RAGAS
    dataset.upload()
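# Hypothetical helper (an assumption, not part of the original pipeline): preview a few
# generated test questions before spending tokens on a full evaluation run. Assumes the
# saved test set uses the standard RAGAS columns "user_input" and "reference".
def preview_test_data(limit=5):
    """Print the first few generated question/reference pairs."""
    rows = json.loads((eval_data_filepath / "testset_data.json").read_text())
    for row in rows[:limit]:
        print(f"Q: {row.get('user_input')}")
        print(f"A: {row.get('reference')}\n")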
def evaluate_rag():
    """Generate RAG responses for the test set and evaluate them with RAGAS"""
    # Create Qdrant client and RAG graph
    qdrant_client = QdrantClient(path='data/vectors')
    rag_graph = RagGraph(qdrant_client, use_finetuned_embeddings=USE_FINE_TUNED_EMBEDDINGS)

    # Load the test set from testset_data.json
    dataset_df = pd.read_json(eval_data_filepath / "testset_data.json")
    dataset = EvaluationDataset.from_pandas(dataset_df)
    # Run the RAG graph for each test question and record its response and retrieved contexts
    print("Generating responses...")
    for test_row in dataset:
        user_input = test_row.user_input
        print(f"Generating response for: {user_input}")
        response = rag_graph.run(user_input)
        test_row.response = response["response"]
        test_row.retrieved_contexts = [context.page_content for context in response["context"]]
    # Rebuild the evaluation dataset so it includes the responses and retrieved contexts added above
    evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

    # Save evaluation data to file
    evaluation_dataset.to_pandas().to_json(eval_data_filepath / "evaluation_data.json", orient="records", indent=4)
    # Evaluate the responses
    print("Evaluating responses...")
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
        llm=eval_judge_llm,
        run_config=RunConfig(timeout=360)
    )
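    # Optional: print the aggregate metric scores (the RAGAS result object prints a per-metric summary)
    print(result)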
    # Write evaluation results to file
    print("Writing evaluation results to file...")
    (eval_data_filepath / "evaluation_results.json").write_text(result.to_pandas().to_json(orient="records", indent=4))
def calculate_average_evaluation_results():
    """Compute the average evaluation results from the evaluation_results.json file"""
    evaluation_results = json.loads((eval_data_filepath / "evaluation_results.json").read_text())
    fields = ["context_recall", "faithfulness", "factual_correctness", "answer_relevancy", "context_entity_recall", "noise_sensitivity_relevant"]

    # Keep only the metric fields from each result row, then convert to a DataFrame
    evaluation_results_df = pd.DataFrame([{field: row[field] for field in fields} for row in evaluation_results])

    # Calculate the average of each metric
    average_results = evaluation_results_df.mean()

    # Save the averages to file as an object keyed by metric name
    (eval_data_filepath / "evaluation_results_average.json").write_text(average_results.to_json(orient="index", indent=4))
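# Hypothetical helper (an assumption, not part of the original pipeline): compare the averaged
# metrics of the base and fine-tuned runs side by side. Assumes both
# data/evaluation/evaluation_results_average.json and
# data/evaluation/finetuned/evaluation_results_average.json already exist.
def compare_average_results():
    """Print base vs. fine-tuned average metrics as a single DataFrame."""
    base = json.loads(Path("data/evaluation/evaluation_results_average.json").read_text())
    finetuned = json.loads(Path("data/evaluation/finetuned/evaluation_results_average.json").read_text())
    comparison_df = pd.DataFrame({"base": base, "finetuned": finetuned})
    print(comparison_df)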
# Run the evaluation pipeline from the command line
def main():
    """Evaluate the RAG graph."""
    # generate_test_data()  # uncomment to regenerate the test set first
    evaluate_rag()
    calculate_average_evaluation_results()


if __name__ == "__main__":
    main()