adding pandas to dump golden testing results
app.py
CHANGED
@@ -17,6 +17,7 @@ from langchain_core.documents import Document
 import json
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd



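For context on the hunks below: scikit-learn's cosine_similarity operates on 2-D arrays and returns a matrix of pairwise scores, which is why compare_text_similarity indexes the result with [0][0]. A minimal sketch, with made-up placeholder embeddings standing in for whatever embedding model app.py actually uses:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Placeholder embeddings; in app.py these would come from the embedding model
emb1 = np.array([[0.1, 0.3, 0.5]])  # shape (1, d)
emb2 = np.array([[0.2, 0.1, 0.6]])  # shape (1, d)

score = cosine_similarity(emb1, emb2)[0][0]  # (1, 1) matrix -> single float
print(f"similarity: {score:.2f}")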
@@ -224,14 +225,12 @@ def compare_text_similarity(text1, text2):
     return cosine_similarity(emb1, emb2)[0][0]  # Return similarity score
 
 def evaluate_against_golden_set(question, model_answer):
-    """Compare model-generated answers against the golden dataset."""
-
-    # Locate the Golden Dataset
+    """Compare model-generated answers against the golden dataset and display results in a DataFrame."""
+
     script_dir = os.path.dirname(os.path.abspath(__file__))
     data_path = os.path.join(script_dir, "data")
     full_path = os.path.join(data_path, "testingset.json")
-
-    # Check if file exists
+
     if not os.path.exists(full_path):
         print(f"❌ Error: Golden dataset not found at {full_path}")
         return None
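evaluate_against_golden_set expects data/testingset.json (relative to app.py) to be a JSON array of objects carrying "question" and "expected_answer" fields. A sketch of a helper that writes such a file; the entries here are generic placeholders, not part of the repository's golden set:

import json
import os

# Illustrative placeholder entries only; the real golden set lives in data/testingset.json
golden_entries = [
    {"question": "Example question 1?", "expected_answer": "Example answer 1."},
    {"question": "Example question 2?", "expected_answer": "Example answer 2."},
]

data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
os.makedirs(data_dir, exist_ok=True)
with open(os.path.join(data_dir, "testingset.json"), "w", encoding="utf-8") as f:
    json.dump(golden_entries, f, indent=2, ensure_ascii=False)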
@@ -240,26 +239,33 @@ def evaluate_against_golden_set(question, model_answer):
     with open(full_path, "r", encoding="utf-8") as f:
         golden_data = json.load(f)
 
-    #
-
-    for entry in golden_data:
-        if entry.get("question", "").strip() == question.strip():
-            expected_answer = entry.get("expected_answer", "").strip()
-            break
+    # Store results in a list for Pandas DataFrame
+    results = []
 
-
-
-
+    for entry in golden_data:
+        expected_answer = entry.get("expected_answer", "").strip()
+        q = entry.get("question", "").strip()
 
-
-    try:
+        # Compute similarity score
         similarity_score = compare_text_similarity(model_answer, expected_answer)
-        print(f"📊 [Evaluation] Model vs. Expected Score: {similarity_score:.2f}")
-        return similarity_score
 
-
-
-
+        # Append to results list
+        results.append({
+            "Question": q,
+            "Expected Answer": expected_answer,
+            "Model Answer": model_answer,
+            "Similarity Score": round(similarity_score, 2)  # Round to 2 decimal places
+        })
+
+    # Convert to DataFrame
+    df = pd.DataFrame(results)
+
+    # Print DataFrame
+    print("\n📊 **Evaluation Results**")
+    print(df.to_string(index=False))  # Pretty-print without row index
+
+    return df
+
 
 
 # **Post-Processing Node: Formats response using `ot_formatted_prompt`**
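Since evaluate_against_golden_set now returns the DataFrame rather than a single score, the results can also be dumped to disk for later inspection. A minimal sketch assuming the same four columns built above; the output filenames are illustrative, not part of this commit:

import pandas as pd

# Example row shaped like the DataFrame built in evaluate_against_golden_set
df = pd.DataFrame([
    {"Question": "Example question?", "Expected Answer": "Example answer.",
     "Model Answer": "Model's answer.", "Similarity Score": 0.91},
])

# Dump for later inspection; to_csv/to_json are standard pandas writers
df.to_csv("golden_eval_results.csv", index=False)
df.to_json("golden_eval_results.json", orient="records", indent=2)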
@@ -269,7 +275,7 @@ def post_processing_node(state) -> dict:
     # Evaluate the model against the golden dataset
     if EVALUATION_MODE:
         question = state["messages"][0].content
-        evaluate_against_golden_set(question, response_text)
+        pdf = evaluate_against_golden_set(question, response_text)
 
     messages = format_prompt.format_messages(context=response_text)
     response = llm.invoke(messages)
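Note that evaluate_against_golden_set returns None when the dataset file is missing, so the pdf value bound in post_processing_node is worth guarding before use. A minimal, self-contained sketch of such a guard; handle_eval_result is a hypothetical helper, not part of app.py:

import pandas as pd

def handle_eval_result(pdf):
    # pdf is the DataFrame returned by evaluate_against_golden_set,
    # or None when data/testingset.json is missing
    if pdf is None or pdf.empty:
        print("No evaluation results to report")
        return
    mean_score = pdf["Similarity Score"].mean()
    print(f"Mean similarity across golden set: {mean_score:.2f}")

# Illustrative call with a stand-in DataFrame
handle_eval_result(pd.DataFrame([{"Similarity Score": 0.91}]))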