Update llmeval.py
llmeval.py +56 -1
llmeval.py
CHANGED
@@ -98,7 +98,7 @@ Think step by step
         de.Update(data=data)


-    def
+    def Observation_LLM_Evaluator(self,promptversion):
         SYSTEM='''
         Task:
         Evaluate the biological quality of a prompt–research data–response triplet from an Observations Generator Agent on a 0–1 continuous scale.
@@ -153,4 +153,59 @@ Reasoning: The response introduces observations unrelated to epithelial tumors o
         "biological_context_alignment":evaluation_response
         }
         de.Update(data=data)
+
+
+
+
+    def Anomaly_LLM_Evaluator(self,promptversion):
+        SYSTEM='''
+        Task:
+        Evaluate the biological quality of a prompt–observations–response triplet from an Anomaly Detector Agent on a 0–1 continuous scale.
+
+        Goal:
+        Assess:
+        Whether the Prompt clearly defines the biological context and intent.
+
+        Whether the Observations are biologically plausible and internally consistent.
+
+        Whether the Response correctly identifies biologically relevant inconsistencies between the Paradigm and Observations.
+
+        Scoring Guide (0–1 continuous scale):
+
+        Score 1.0 if:
+
+        The prompt is clear and biologically grounded.
+
+        The response lists true, biologically meaningful anomalies based on the observations.
+
+        All major contradictions or gaps are captured.
+
+        Lower scores if:
+
+        The prompt is vague.
+
+        The response misses key anomalies, adds irrelevant ones, or shows poor biological reasoning.
+
+        Your output must begin with Score: and contain only two fields: Score: and Reasoning: No extra commentary, no markdown, no explanations before or after.
+        Output:
+        Score: 0.2
+        Reasoning: Your reasoning.
+        '''
+
+        data_to_evaluate=dbe.GetData(promptversion)
+        messages=[
+            {"role":"system","content":SYSTEM},
+            {"role":"user","content":f"""
+            Prompt :{data_to_evaluate["prompt"]}
+            Observations :{ data_to_evaluate["context"]}
+            Agent's Response :{data_to_evaluate["response"]}
+            """}
+        ]
+        evaluation_response=self.___engine_core(messages=messages)
+        data={
+            "prompt":promptversion,
+            "biological_context_alignment":evaluation_response
+        }
+        de.Update(data=data)
+
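Both evaluators pin the model to a two-field plain-text reply ("Score:" followed by "Reasoning:"). A minimal sketch of how such a reply could be parsed downstream, assuming the model honors that contract; parse_evaluation and its regexes are illustrative helpers, not part of this commit:

import re

def parse_evaluation(raw: str) -> dict:
    """Split a 'Score: ... Reasoning: ...' reply into a float score and free text.

    Illustrative only: assumes the reply follows the format required by the
    SYSTEM prompts above (score in [0, 1], reasoning text after the score).
    """
    score_match = re.search(r"Score:\s*([01](?:\.\d+)?)", raw)
    reasoning_match = re.search(r"Reasoning:\s*(.*)", raw, flags=re.DOTALL)
    if score_match is None:
        raise ValueError(f"Reply does not contain a parsable score: {raw[:80]!r}")
    return {
        "score": float(score_match.group(1)),
        "reasoning": reasoning_match.group(1).strip() if reasoning_match else "",
    }

# Example:
# parse_evaluation("Score: 0.2\nReasoning: The response misses two key anomalies.")
# -> {"score": 0.2, "reasoning": "The response misses two key anomalies."}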