import json

import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader

from rag_graph import RagGraph

# Load environment variables
load_dotenv()

USE_FINE_TUNED_EMBEDDINGS = True

# Create data folders
eval_data_filepath = Path("data/evaluation/finetuned") if USE_FINE_TUNED_EMBEDDINGS else Path("data/evaluation")
eval_data_filepath.mkdir(parents=True, exist_ok=True)

# NLTK data required by DirectoryLoader's unstructured parsers
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Initialize the LLMs and embeddings used to generate and judge the test set
eval_gen_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_gen_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


def generate_test_data():
    """Generate test data for evaluation."""
    # Load documents
    loader = DirectoryLoader("data/scraped/clean", glob="*.txt")
    docs = loader.load()

    # Generate the test set
    generator = TestsetGenerator(llm=eval_gen_llm, embedding_model=eval_gen_embeddings)
    print("Generating test set data...")
    dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

    # Save test set data to file
    dataset.to_pandas().to_json(eval_data_filepath / "testset_data.json", orient="records", indent=4)

    # Upload test set data to RAGAS
    dataset.upload()


def evaluate_rag():
    """Generate RAG responses and evaluate them."""
    # Create Qdrant client and RAG graph
    qdrant_client = QdrantClient(path="data/vectors")
    rag_graph = RagGraph(qdrant_client, use_finetuned_embeddings=USE_FINE_TUNED_EMBEDDINGS)

    # Load the test set from testset_data.json
    dataset_df = pd.read_json(eval_data_filepath / "testset_data.json")
    dataset = EvaluationDataset.from_pandas(dataset_df)

    # Generate a response and retrieved contexts for each test question
    print("Generating responses...")
    for test_row in dataset:
        user_input = test_row.user_input
        print(f"Generating response for: {user_input}")
        response = rag_graph.run(user_input)
        test_row.response = response["response"]
        test_row.retrieved_contexts = [context.page_content for context in response["context"]]

    evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

    # Save evaluation data to file
    evaluation_dataset.to_pandas().to_json(eval_data_filepath / "evaluation_data.json", orient="records", indent=4)

    # Evaluate the responses
    print("Evaluating responses...")
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[
            LLMContextRecall(),
            Faithfulness(),
            FactualCorrectness(),
            ResponseRelevancy(),
            ContextEntityRecall(),
            NoiseSensitivity(),
        ],
        llm=eval_judge_llm,
        run_config=RunConfig(timeout=360),
    )

    # Write evaluation results to file
    print("Writing evaluation results to file...")
    (eval_data_filepath / "evaluation_results.json").write_text(
        result.to_pandas().to_json(orient="records", indent=4)
    )


def calculate_average_evaluation_results():
    """Calculate the average of each metric in evaluation_results.json."""
    evaluation_results = json.loads((eval_data_filepath / "evaluation_results.json").read_text())
    fields = [
        "context_recall",
        "faithfulness",
        "factual_correctness",
        "answer_relevancy",
        "context_entity_recall",
        "noise_sensitivity_relevant",
    ]

    # Keep only the metric columns we want, then convert to a DataFrame
    evaluation_results_df = pd.DataFrame([{field: row[field] for field in fields} for row in evaluation_results])

    # Calculate the average of each metric
    average_results = evaluation_results_df.mean()

    # Save average results to file as an object with the metric names as keys
    (eval_data_filepath / "evaluation_results_average.json").write_text(
        average_results.to_json(orient="index", indent=4)
    )


# Run the evaluation pipeline from the CLI
def main():
    """Evaluate the RAG graph."""
    # generate_test_data()
    evaluate_rag()
    calculate_average_evaluation_results()


if __name__ == "__main__":
    main()