shivXy committed
Commit 59ee619 · 1 Parent(s): 13c47e2

adding pandas to dump golden testing

Files changed (1)
  1. app.py +28 -22
app.py CHANGED
@@ -17,6 +17,7 @@ from langchain_core.documents import Document
 import json
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd
 
 
 
@@ -224,14 +225,12 @@ def compare_text_similarity(text1, text2):
     return cosine_similarity(emb1, emb2)[0][0]  # Return similarity score
 
 def evaluate_against_golden_set(question, model_answer):
-    """Compare model-generated answers against the golden dataset."""
-
-    # Locate the Golden Dataset
+    """Compare model-generated answers against the golden dataset and display results in a DataFrame."""
+
     script_dir = os.path.dirname(os.path.abspath(__file__))
     data_path = os.path.join(script_dir, "data")
     full_path = os.path.join(data_path, "testingset.json")
-
-    # Check if file exists
+
     if not os.path.exists(full_path):
         print(f"❌ Error: Golden dataset not found at {full_path}")
         return None
@@ -240,26 +239,33 @@ def evaluate_against_golden_set(question, model_answer):
     with open(full_path, "r", encoding="utf-8") as f:
         golden_data = json.load(f)
 
-    # Find Matching Question in the Golden Dataset
-    expected_answer = None
-    for entry in golden_data:
-        if entry.get("question", "").strip() == question.strip():
-            expected_answer = entry.get("expected_answer", "").strip()
-            break
+    # Store results in a list for Pandas DataFrame
+    results = []
 
-    if not expected_answer:
-        print(f"⚠️ Question not found in the Golden Data Set: {question}")
-        return None
+    for entry in golden_data:
+        expected_answer = entry.get("expected_answer", "").strip()
+        q = entry.get("question", "").strip()
 
-    # Compare Model Answer vs Expected Answer
-    try:
+        # Compute similarity score
         similarity_score = compare_text_similarity(model_answer, expected_answer)
-        print(f"📊 [Evaluation] Model vs. Expected Score: {similarity_score:.2f}")
-        return similarity_score
 
-    except Exception as e:
-        print(f"❌ Error in similarity evaluation: {e}")
-        return None
+        # Append to results list
+        results.append({
+            "Question": q,
+            "Expected Answer": expected_answer,
+            "Model Answer": model_answer,
+            "Similarity Score": round(similarity_score, 2)  # Round to 2 decimal places
+        })
+
+    # Convert to DataFrame
+    df = pd.DataFrame(results)
+
+    # Print DataFrame
+    print("\n📊 **Evaluation Results**")
+    print(df.to_string(index=False))  # Pretty-print without row index
+
+    return df
+
 
 
 # **Post-Processing Node: Formats response using `ot_formatted_prompt`**
@@ -269,7 +275,7 @@ def post_processing_node(state) -> dict:
     # Evaluate the model against the golden dataset
     if EVALUATION_MODE:
         question = state["messages"][0].content
-        evaluate_against_golden_set(question, response_text)
+        pdf = evaluate_against_golden_set(question, response_text)
 
     messages = format_prompt.format_messages(context=response_text)
     response = llm.invoke(messages)
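
For context, a minimal, self-contained sketch of the pattern this commit introduces: collect one row per golden entry, convert the rows to a pandas DataFrame, and pretty-print it without the row index. The toy golden entries, the sample model answer, and the fake_similarity stand-in are illustrative assumptions, not code from app.py.

import pandas as pd

# Toy golden entries (hypothetical; the real data lives in data/testingset.json)
golden_data = [
    {"question": "What is tracing?", "expected_answer": "Tracing follows a request across services."},
    {"question": "What is a span?", "expected_answer": "A span is one unit of work within a trace."},
]
model_answer = "Tracing records how a request moves through services."  # hypothetical model output

def fake_similarity(a: str, b: str) -> float:
    """Stand-in for compare_text_similarity(): crude token-overlap ratio in [0, 1]."""
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / max(len(ta | tb), 1)

# Same shape as the commit: accumulate rows, then build the DataFrame once
results = []
for entry in golden_data:
    expected = entry.get("expected_answer", "").strip()
    results.append({
        "Question": entry.get("question", "").strip(),
        "Expected Answer": expected,
        "Model Answer": model_answer,
        "Similarity Score": round(fake_similarity(model_answer, expected), 2),
    })

df = pd.DataFrame(results)
print(df.to_string(index=False))  # pretty-print without the row index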