adding pandas to dump golden testing results
app.py
CHANGED
@@ -17,6 +17,7 @@ from langchain_core.documents import Document
 import json
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd



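For context on the hunks below: scikit-learn's cosine_similarity operates on 2-D arrays and returns a matrix of pairwise scores, which is why compare_text_similarity indexes the result with [0][0]. A minimal sketch, with made-up placeholder embeddings standing in for whatever embedding model app.py actually uses:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Placeholder embeddings; in app.py these would come from the embedding model
emb1 = np.array([[0.1, 0.3, 0.5]])  # shape (1, d)
emb2 = np.array([[0.2, 0.1, 0.6]])  # shape (1, d)

score = cosine_similarity(emb1, emb2)[0][0]  # (1, 1) matrix -> single float
print(f"similarity: {score:.2f}")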
@@ -224,14 +225,12 @@ def compare_text_similarity(text1, text2):
     return cosine_similarity(emb1, emb2)[0][0]  # Return similarity score
 
 def evaluate_against_golden_set(question, model_answer):
-    """Compare model-generated answers against the golden dataset."""
-
-    # Locate the Golden Dataset
+    """Compare model-generated answers against the golden dataset and display results in a DataFrame."""
+
     script_dir = os.path.dirname(os.path.abspath(__file__))
     data_path = os.path.join(script_dir, "data")
     full_path = os.path.join(data_path, "testingset.json")
-
-    # Check if file exists
+
     if not os.path.exists(full_path):
         print(f"❌ Error: Golden dataset not found at {full_path}")
         return None
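evaluate_against_golden_set expects data/testingset.json (relative to app.py) to be a JSON array of objects carrying "question" and "expected_answer" fields. A sketch of a helper that writes such a file; the entries here are generic placeholders, not part of the repository's golden set:

import json
import os

# Illustrative placeholder entries only; the real golden set lives in data/testingset.json
golden_entries = [
    {"question": "Example question 1?", "expected_answer": "Example answer 1."},
    {"question": "Example question 2?", "expected_answer": "Example answer 2."},
]

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
os.makedirs(data_dir, exist_ok=True)
with open(os.path.join(data_dir, "testingset.json"), "w", encoding="utf-8") as f:
    json.dump(golden_entries, f, indent=2, ensure_ascii=False)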
@@ -240,26 +239,33 @@ def evaluate_against_golden_set(question, model_answer):
     with open(full_path, "r", encoding="utf-8") as f:
         golden_data = json.load(f)
 
-    #
-
-    for entry in golden_data:
-        if entry.get("question", "").strip() == question.strip():
-            expected_answer = entry.get("expected_answer", "").strip()
-            break
+    # Store results in a list for Pandas DataFrame
+    results = []
 
-
-
-
+    for entry in golden_data:
+        expected_answer = entry.get("expected_answer", "").strip()
+        q = entry.get("question", "").strip()
 
-
-    try:
+        # Compute similarity score
         similarity_score = compare_text_similarity(model_answer, expected_answer)
-        print(f"📊 [Evaluation] Model vs. Expected Score: {similarity_score:.2f}")
-        return similarity_score
 
-
-
-
+        # Append to results list
+        results.append({
+            "Question": q,
+            "Expected Answer": expected_answer,
+            "Model Answer": model_answer,
+            "Similarity Score": round(similarity_score, 2)  # Round to 2 decimal places
+        })
+
+    # Convert to DataFrame
+    df = pd.DataFrame(results)
+
+    # Print DataFrame
+    print("\n📊 **Evaluation Results**")
+    print(df.to_string(index=False))  # Pretty-print without row index
+
+    return df
+
 
 
 # **Post-Processing Node: Formats response using `ot_formatted_prompt`**
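Since evaluate_against_golden_set now returns the DataFrame rather than a single score, the results can also be dumped to disk for later inspection. A minimal sketch assuming the same four columns built above; the output filenames are illustrative, not part of this commit:

import pandas as pd

# Example row shaped like the DataFrame built in evaluate_against_golden_set
df = pd.DataFrame([
    {"Question": "Example question?", "Expected Answer": "Example answer.",
     "Model Answer": "Model's answer.", "Similarity Score": 0.91},
])

# Dump for later inspection; to_csv/to_json are standard pandas writers
df.to_csv("golden_eval_results.csv", index=False)
df.to_json("golden_eval_results.json", orient="records", indent=2)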
@@ -269,7 +275,7 @@ def post_processing_node(state) -> dict:
     # Evaluate the model against the golden dataset
     if EVALUATION_MODE:
         question = state["messages"][0].content
-        evaluate_against_golden_set(question, response_text)
+        pdf = evaluate_against_golden_set(question, response_text)
 
     messages = format_prompt.format_messages(context=response_text)
     response = llm.invoke(messages)
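Note that evaluate_against_golden_set returns None when the dataset file is missing, so the pdf value bound in post_processing_node is worth guarding before use. A minimal, self-contained sketch of such a guard; handle_eval_result is a hypothetical helper, not part of app.py:

import pandas as pd

def handle_eval_result(pdf):
    # pdf is the DataFrame returned by evaluate_against_golden_set,
    # or None when data/testingset.json is missing
    if pdf is None or pdf.empty:
        print("No evaluation results to report")
        return
    mean_score = pdf["Similarity Score"].mean()
    print(f"Mean similarity across golden set: {mean_score:.2f}")

# Illustrative call with a stand-in DataFrame
handle_eval_result(pd.DataFrame([{"Similarity Score": 0.91}]))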