import json
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader

from rag_graph import RagGraph

# Load environment variables
load_dotenv()
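
# Toggle between the base and fine-tuned embedding model in RagGraph; results are written
# to a separate folder for each so the two evaluation runs do not overwrite each other.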
USE_FINE_TUNED_EMBEDDINGS = True

# Create data folders
eval_data_filepath = Path("data/evaluation/finetuned") if USE_FINE_TUNED_EMBEDDINGS else Path("data/evaluation")
eval_data_filepath.mkdir(parents=True, exist_ok=True)

# NLTK data required by DirectoryLoader's dependencies
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Initialize the evaluation LLMs and embeddings
eval_gen_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_gen_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
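
# eval_gen_llm and eval_gen_embeddings drive synthetic test set generation below,
# while eval_judge_llm scores the generated responses during evaluation.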


def generate_test_data():
    """Generate test data for evaluation"""
    # Load documents
    loader = DirectoryLoader("data/scraped/clean", glob="*.txt")
    docs = loader.load()

    # Generate test set data
    generator = TestsetGenerator(
        llm=eval_gen_llm,
        embedding_model=eval_gen_embeddings
    )
    print("Generating test set data...")
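    # testset_size controls how many synthetic question/answer samples are generated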
    dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

    # Save test set data to file
    dataset.to_pandas().to_json(eval_data_filepath / "testset_data.json", orient="records", indent=4)

    # Upload test set data to RAGAS
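    # (upload() pushes the test set to the Ragas app dashboard and typically requires
    # a Ragas app token to be configured in the environment)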
    dataset.upload()


def evaluate_rag():
    """Generate RAG responses for the test set and evaluate them with Ragas metrics."""
    # Create Qdrant client and RAG graph
    qdrant_client = QdrantClient(path='data/vectors')
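    # path= runs Qdrant in local, file-based mode against the on-disk vector store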
    rag_graph = RagGraph(qdrant_client, use_finetuned_embeddings=USE_FINE_TUNED_EMBEDDINGS)

    # Load the test set from testset_data.json
    dataset_df = pd.read_json(eval_data_filepath / "testset_data.json")
    dataset = EvaluationDataset.from_pandas(dataset_df)

    # Generate a RAG response for each test question
    print("Generating responses...")
    for test_row in dataset:
        user_input = test_row.user_input
        print(f"Generating response for: {user_input}")
        response = rag_graph.run(user_input)
        test_row.response = response["response"]
        test_row.retrieved_contexts = [context.page_content for context in response["context"]]
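
    # Rebuild the dataset so the responses and retrieved contexts set above are captured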
    evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

    # Save evaluation data to file
    evaluation_dataset.to_pandas().to_json(eval_data_filepath / "evaluation_data.json", orient="records", indent=4)

    # Evaluate the responses
    print("Evaluating responses...")
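    # The longer run timeout gives the LLM-judged metrics room to finish on slower samples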
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[
            LLMContextRecall(),
            Faithfulness(),
            FactualCorrectness(),
            ResponseRelevancy(),
            ContextEntityRecall(),
            NoiseSensitivity(),
        ],
        llm=eval_judge_llm,
        run_config=RunConfig(timeout=360)
    )

    # Write evaluation results to file
    print("Writing evaluation results to file...")
    (eval_data_filepath / "evaluation_results.json").write_text(result.to_pandas().to_json(orient="records", indent=4))


def calculate_average_evaluation_results():
    """Get the average evaluation results from the evaluation_results.json file"""
    evaluation_results = json.loads((eval_data_filepath / "evaluation_results.json").read_text())
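
    # Per-metric score columns produced by evaluate() for the metrics used in evaluate_rag()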
    fields = ["context_recall", "faithfulness", "factual_correctness", "answer_relevancy", "context_entity_recall", "noise_sensitivity_relevant"]

    # Keep only the metric fields from each result row, then convert to a DataFrame
    evaluation_results_df = pd.DataFrame([{field: row[field] for field in fields} for row in evaluation_results])

    # Calculate the average of each metric
    average_results = evaluation_results_df.mean()

    # Save the averages to file as a single object keyed by metric name
    (eval_data_filepath / "evaluation_results_average.json").write_text(average_results.to_json(orient="index", indent=4))


# Run the evaluation pipeline from the CLI
def main():
    """Evaluate the RAG graph."""
    # generate_test_data()  # Uncomment to (re)generate the test set first
    evaluate_rag()
    calculate_average_evaluation_results()


if __name__ == "__main__":
    main()