adding rest of ragas metrics
app.py CHANGED
@@ -12,7 +12,7 @@ from qdrant_client import QdrantClient
 from langchain_openai import OpenAIEmbeddings
 import os
 from ragas import evaluate
-from ragas.metrics import answer_relevancy
+from ragas.metrics import answer_relevancy, faithfulness, context_precision, context_recall
 from langchain_core.documents import Document
 import json
 import numpy as np
@@ -26,12 +26,12 @@ load_dotenv()
 # Load OpenAI Model
 llm = ChatOpenAI(model="gpt-4o-mini")
 qd_api_key = os.getenv("QDRANT_CLOUD_API_KEY")
-EVALUATION_MODE = os.getenv("EVALUATION_MODE", "false").lower() == "
+EVALUATION_MODE = os.getenv("EVALUATION_MODE", "false").lower() == "true"


 embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

-#
+# Initialize Qdrant Client
 qd_client = QdrantClient(
     "https://40c458f2-24a9-4153-b15b-0addf6a6bbcf.us-east-1-0.aws.cloud.qdrant.io:6333",
     api_key=qd_api_key
@@ -65,31 +65,74 @@ def search(query_vector, top_k=1) -> list:

     return return_hits

+def evaluate_ragas_metrics(question: str, model_answer: str, retrieved_docs: list):
+    """Evaluate faithfulness, context precision, and context recall using RAGAS."""
+
+    # Extract document content from metadata
+    ragas_docs = [
+        Document(page_content=hit["metadata"].get("content", ""))
+        for hit in retrieved_docs if "content" in hit["metadata"] and hit["metadata"]["content"]
+    ]
+
+    if not ragas_docs:
+        print("⚠️ No relevant documents to evaluate.")
+        return {"faithfulness": 0, "context_precision": 0, "context_recall": 0}
+
+    # Construct required input
+    queries = [question]
+    generated_answers = [model_answer]
+    contexts = [[doc.page_content for doc in ragas_docs]]
+
+    # Run evaluation
+    scores = evaluate(
+        queries=queries,
+        contexts=contexts,
+        generated_answers=generated_answers,
+        metrics=[faithfulness, context_precision, context_recall]
+    )
+
+    print("🔍 Debug: RAGAS Metrics Output ->", scores)
+
+    # Extract individual scores
+    faithfulness_score = scores.iloc[0]["faithfulness"]
+    context_precision_score = scores.iloc[0]["context_precision"]
+    context_recall_score = scores.iloc[0]["context_recall"]
+
+    print(f"🔍 Faithfulness Score: {faithfulness_score}")
+    print(f"🔍 Context Precision Score: {context_precision_score}")
+    print(f"🔍 Context Recall Score: {context_recall_score}")
+
+    return {
+        "faithfulness": faithfulness_score,
+        "context_precision": context_precision_score,
+        "context_recall": context_recall_score
+    }
+
 def evaluate_retrieved_docs(question: str, retrieved_docs: list):
     """Evaluate the retrieved documents using RAGAS metrics."""

-    #
+    # Extract document content from metadata
     ragas_docs = [
         Document(page_content=hit["metadata"].get("content", ""))
         for hit in retrieved_docs
         if "content" in hit["metadata"] and hit["metadata"]["content"]
     ]

-    #
+    # Debugging Output
     print("🔍 Debug: RAGAS Docs Format:", ragas_docs)

     if not ragas_docs:
         print("⚠️ No relevant documents to evaluate.")
         return 0  # Return low score if no documents found

-    #
+    # Construct required input
     queries = [question]
     contexts = [[doc.page_content for doc in ragas_docs]]

     print("✅ Debug: Queries ->", queries)
     print("✅ Debug: Contexts ->", contexts)

-    #
+    # Run evaluation
     scores = evaluate(
         queries=queries,
         contexts=contexts,
@@ -191,21 +234,24 @@ def research_node(state) -> dict:
     query_vector = embedding_model.embed_query(question)

     # Query Qdrant with the vector
-    relevant_docs = search(query_vector=query_vector, top_k=1)
+    relevant_docs = search(query_vector=query_vector, top_k=1)

-
-    # Evaluate retrieved documents using RAGAS
-    relevance_score = evaluate_retrieved_docs(question, relevant_docs)
-    print(f"🔍 [Evaluation Mode] RAGAS Score: {relevance_score}")
+    model_answer = "No answer generated yet"

-    if relevant_docs[0]['score'] >
+    if relevant_docs[0]['score'] > hit_score:  # Threshold for good retrieval quality this will be the cosine similarity score
         # Found relevant document → Summarize it
         document_name = relevant_docs[0]["metadata"].get("document_name", "No source available.")
         document_text = get_document_by_name(document_name)
-
         messages = summary_prompt.format_messages(document=document_text)
         response = llm.invoke(messages)

+        if EVALUATION_MODE:
+            # Evaluate retrieved documents using RAGAS
+            relevance_score = evaluate_retrieved_docs(question, relevant_docs)
+            print(f"🔍 [Evaluation Mode] RAGAS Score: {relevance_score}")
+            ragas_scores = evaluate_ragas_metrics(question, model_answer, relevant_docs)
+            print(f"🔍 [evaluate_ragas_metrics] RAGAS Scores: {ragas_scores}")
+
         return {**state, "messages": state["messages"] + [HumanMessage(content=response.content)], "_next": "post_processing"}

     else:
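For comparison: this commit calls ragas' evaluate with queries/contexts/generated_answers keyword lists, while the ragas 0.1.x releases document a Dataset-based entry point. The sketch below is an assumption for illustration only, not code from app.py; the run_ragas_eval helper, the single-row Dataset, and its column names are hypothetical and should be checked against the ragas version pinned in this project (it also assumes OPENAI_API_KEY is set, since ragas defaults to OpenAI models).

# Hypothetical sketch (not from app.py): Dataset-based ragas evaluation, 0.1.x-style API.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_precision, context_recall

def run_ragas_eval(question: str, answer: str, contexts: list, reference: str) -> dict:
    # Build a one-row dataset; column names follow the ragas 0.1.x convention
    # and may need renaming (e.g. ground_truths) in other versions.
    data = Dataset.from_dict({
        "question": [question],       # user query
        "answer": [answer],           # model output, e.g. response.content
        "contexts": [contexts],       # retrieved passages for this query
        "ground_truth": [reference],  # reference answer; context_recall needs one
    })
    result = evaluate(data, metrics=[answer_relevancy, faithfulness, context_precision, context_recall])
    scores = result.to_pandas()
    return scores[["answer_relevancy", "faithfulness", "context_precision", "context_recall"]].iloc[0].to_dict()

Note also that in the research_node hunk, model_answer still holds the placeholder string when evaluate_ragas_metrics runs; passing response.content once llm.invoke has returned would let faithfulness score the actual generated summary.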