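"""Evaluate the RAG pipeline with Ragas.

Generates a synthetic test set from the scraped documents, runs the RAG graph
over each test question, scores the responses with several Ragas metrics, and
writes per-question and averaged results to JSON files under data/evaluation/.
"""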
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

from qdrant_client import QdrantClient

from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader

from rag_graph import RagGraph

# Load environment variables
load_dotenv()

USE_FINE_TUNED_EMBEDDINGS = True

# Evaluation output folder (fine-tuned embedding runs get their own subfolder)
eval_data_filepath = Path("data/evaluation/finetuned") if USE_FINE_TUNED_EMBEDDINGS else Path("data/evaluation")
eval_data_filepath.mkdir(parents=True, exist_ok=True)

# Download NLTK data required by DirectoryLoader's unstructured text parsing
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# LLM and embeddings for test set generation, plus a judge LLM for scoring
eval_gen_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
eval_gen_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

def generate_test_data():
  """Generate test data for evaluation"""
  # Load documents
  loader = DirectoryLoader("data/scraped/clean", glob="*.txt")
  docs = loader.load()

  # Generate test set data
  generator = TestsetGenerator(
    llm=eval_gen_llm, 
    embedding_model=eval_gen_embeddings
  )

  print("Generating test set data...")
  dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

  # Save test set data to file
  dataset.to_pandas().to_json(eval_data_filepath / "testset_data.json", orient="records", indent=4)

  # Upload test set data to RAGAS
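  # (upload() pushes the test set to the Ragas app dashboard; this typically
  # requires a RAGAS_APP_TOKEN in the environment)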
  dataset.upload()

def evaluate_rag():
  """Generate RAG responses for evaluation"""
  # Create Qdrant client and RAG graph
  qdrant_client = QdrantClient(path='data/vectors')
  rag_graph = RagGraph(qdrant_client, use_finetuned_embeddings=USE_FINE_TUNED_EMBEDDINGS)
  
  # Load data set from testset_data.json
  dataset_df = pd.read_json(eval_data_filepath / "testset_data.json")
  dataset = EvaluationDataset.from_pandas(dataset_df)

  # Run the RAG graph for each test question
  print("Generating responses...")
  for test_row in dataset:
    user_input = test_row.user_input
    print(f"Generating response for: {user_input}")
    response = rag_graph.run(user_input)
    test_row.response = response["response"]
    test_row.retrieved_contexts = [context.page_content for context in response["context"]]
  
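  # Rebuild the EvaluationDataset from the mutated rows so the generated
  # responses and retrieved contexts are captured in the evaluation data.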
  evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

  # Save evaluation data to file
  evaluation_dataset.to_pandas().to_json(eval_data_filepath / "evaluation_data.json", orient="records", indent=4)

  # Evaluate the responses
  print("Evaluating responses...")
  result = evaluate(
      dataset=evaluation_dataset,
      metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
      llm=eval_judge_llm,
      run_config=RunConfig(timeout=360)
  )

  # Write evaluation results to file
  print("Writing evaluation results to file...")
  (eval_data_filepath / "evaluation_results.json").write_text(result.to_pandas().to_json(orient="records", indent=4))

def calculate_average_evaluation_results():
  """Get the average evaluation results from the evaluation_results.json file"""
  evaluation_results = json.loads((eval_data_filepath / "evaluation_results.json").read_text())
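  # Metric score columns written by evaluate(); exact names can vary slightly
  # between Ragas versions.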
  fields = ["context_recall", "faithfulness", "factual_correctness", "answer_relevancy", "context_entity_recall", "noise_sensitivity_relevant"]
  # Keep only the metric columns of interest and build a DataFrame
  evaluation_results_df = pd.DataFrame(evaluation_results)[fields]
  # calculate the average of each field
  average_results = evaluation_results_df.mean()
  # save average results to file as an object with the fields as keys
  (eval_data_filepath / "evaluation_results_average.json").write_text(average_results.to_json(orient="index", indent=4))

# Entry point: run the evaluation pipeline
def main():
  """Run the RAG evaluation and compute average metric scores."""
  # generate_test_data()  # Uncomment to (re)generate the synthetic test set
  evaluate_rag()
  calculate_average_evaluation_results()


if __name__ == "__main__":
  main()