Junaidb committed
Commit 6d43d2f · verified · Parent: 912d9b0

Update llmeval.py

Files changed (1)
  1. llmeval.py +56 -1
llmeval.py CHANGED
@@ -98,7 +98,7 @@ Think step by step
         de.Update(data=data)
 
 
-    def ObservationEvaluator(self,promptversion):
+    def Observation_LLM_Evaluator(self,promptversion):
         SYSTEM='''
         Task:
         Evaluate the biological quality of a prompt–research data–response triplet from an Observations Generator Agent on a 0–1 continuous scale.
@@ -153,4 +153,59 @@ Reasoning: The response introduces observations unrelated to epithelial tumors o
         "biological_context_alignment":evaluation_response
         }
         de.Update(data=data)
+
+
+
+
+    def Anomaly_LLM_Evaluator(self,promptversion):
+        SYSTEM='''
+        Task:
+        Evaluate the biological quality of a prompt–observations–response triplet from an Anomaly Detector Agent on a 0–1 continuous scale.
+
+        Goal:
+        Assess:
+        Whether the Prompt clearly defines the biological context and intent.
+
+        Whether the Observations are biologically plausible and internally consistent.
+
+        Whether the Response correctly identifies biologically relevant inconsistencies between the Paradigm and Observations.
+
+        Scoring Guide (0–1 continuous scale):
+
+        Score 1.0 if:
+
+        The prompt is clear and biologically grounded.
+
+        The response lists true, biologically meaningful anomalies based on the observations.
+
+        All major contradictions or gaps are captured.
+
+        Lower scores if:
+
+        The prompt is vague.
+
+        The response misses key anomalies, adds irrelevant ones, or shows poor biological reasoning.
+
+        Your output must begin with Score: and contain only two fields: Score: and Reasoning: No extra commentary, no markdown, no explanations before or after.
+        Output:
+        Score: 0.2
+        Reasoning: Your reasoning.
+        '''
+
+        data_to_evaluate=dbe.GetData(promptversion)
+        messages=[
+            {"role":"system","content":SYSTEM},
+            {"role":"user","content":f"""
+            Prompt :{data_to_evaluate["prompt"]}
+            Observations :{ data_to_evaluate["context"]}
+            Agent's Response :{data_to_evaluate["response"]}
+            """}
+        ]
+        evaluation_response=self.___engine_core(messages=messages)
+        data={
+            "prompt":promptversion,
+            "biological_context_alignment":evaluation_response
+        }
+        de.Update(data=data)
+
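
The new evaluator methods constrain the judge model to reply with exactly two fields, Score: and Reasoning:. Below is a minimal, self-contained sketch of how such a reply could be split into a numeric score and a reasoning string downstream; the parse_evaluation helper is illustrative and not part of this commit, which stores the raw reply via de.Update().

import re

def parse_evaluation(raw: str) -> tuple[float, str]:
    # Split a reply of the form "Score: 0.2\nReasoning: ..." into its two fields.
    # Illustrative helper only (assumed name); the commit itself stores the raw reply as-is.
    score_match = re.search(r"Score:\s*([01](?:\.\d+)?)", raw)
    reasoning_match = re.search(r"Reasoning:\s*(.+)", raw, flags=re.DOTALL)
    if score_match is None:
        raise ValueError("evaluator reply did not contain a 'Score:' field")
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
    return float(score_match.group(1)), reasoning

# Example using the exact format shown in the SYSTEM prompt:
score, reasoning = parse_evaluation("Score: 0.2\nReasoning: Your reasoning.")
print(score, reasoning)  # 0.2 Your reasoning.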