Update llmeval.py
Browse files- llmeval.py +25 -17
llmeval.py
CHANGED
@@ -42,16 +42,16 @@ class LLM_as_Evaluator():
|
|
42 |
|
43 |
SYSTEM='''
|
44 |
Task:
|
45 |
-
Evaluate the biological quality of a prompt, research data, paradigm list, and
|
46 |
|
47 |
Goal:
|
48 |
Assess:
|
49 |
|
50 |
Whether the Prompt is clear, biologically specific, and aligned with the Research Data and the Paradigm List.
|
51 |
|
52 |
-
Whether the
|
53 |
|
54 |
-
Whether the
|
55 |
|
56 |
Scoring Guide (0–1 continuous scale):
|
57 |
|
@@ -59,15 +59,15 @@ Score 1.0 if:
|
|
59 |
|
60 |
The Prompt is clear, biologically detailed, and well-aligned to the Research Data and Paradigm List.
|
61 |
|
62 |
-
The
|
63 |
|
64 |
Lower scores if:
|
65 |
|
66 |
The prompt is vague or misaligned with the research context.
|
67 |
|
68 |
-
The
|
69 |
|
70 |
-
The
|
71 |
|
72 |
|
73 |
Your output must begin with "Score:" and contain only two fields: "Score:" and "Reasoning:". No extra commentary, no markdown, no explanations before or after.
|
@@ -97,7 +97,7 @@ Think step by step
|
|
97 |
def Observation_LLM_Evaluator(self,promptversion):
|
98 |
SYSTEM='''
|
99 |
Task:
|
100 |
-
Evaluate the biological quality of a prompt , research data
|
101 |
|
102 |
Goal:
|
103 |
Assess:
|
@@ -122,6 +122,8 @@ The response includes irrelevant, biologically implausible, contradictory, or tr
|
|
122 |
|
123 |
Your output must begin with "Score:" and contain only two fields: "Score:" and "Reasoning:". No extra commentary, no markdown, no explanations before or after.
|
124 |
|
|
|
|
|
125 |
'''
|
126 |
data_to_evaluate=dbe.GetData(promptversion)
|
127 |
messages =[
|
@@ -146,36 +148,42 @@ Your output must begin with Score: and contain only two fields: Score: and Reaso
|
|
146 |
def Anomaly_LLM_Evaluator(self,promptversion):
|
147 |
SYSTEM='''
|
148 |
Task:
|
149 |
-
Evaluate the biological quality of a prompt
|
150 |
|
151 |
Goal:
|
152 |
Assess:
|
|
|
153 |
Whether the Prompt clearly defines the biological context and intent.
|
154 |
|
155 |
Whether the Observations are biologically plausible and internally consistent.
|
156 |
|
157 |
-
Whether the
|
|
|
|
|
158 |
|
159 |
Scoring Guide (0–1 continuous scale):
|
160 |
|
161 |
Score 1.0 if:
|
162 |
|
163 |
-
The
|
164 |
|
165 |
-
The
|
166 |
|
167 |
-
|
|
|
|
|
168 |
|
169 |
Lower scores if:
|
170 |
|
171 |
-
The
|
|
|
|
|
172 |
|
173 |
-
The
|
174 |
|
175 |
Your output must begin with "Score:" and contain only two fields: "Score:" and "Reasoning:". No extra commentary, no markdown, no explanations before or after.
|
176 |
-
|
177 |
-
|
178 |
-
Reasoning: Your reasoning.
|
179 |
'''
|
180 |
|
181 |
data_to_evaluate=dbe.GetData(promptversion)
|
|
|
42 |
|
43 |
SYSTEM='''
|
44 |
Task:
|
45 |
+
Evaluate the biological quality of a prompt, research data, paradigm list, and response on a 0–1 continuous scale.
|
46 |
|
47 |
Goal:
|
48 |
Assess:
|
49 |
|
50 |
Whether the Prompt is clear, biologically specific, and aligned with the Research Data and the Paradigm List.
|
51 |
|
52 |
+
Whether the response is biologically relevant, mechanistically coherent, and experimentally actionable based on the Research Data.
|
53 |
|
54 |
+
Whether the response is correctly chosen from the Paradigm List in light of the Research Data.
|
55 |
|
56 |
Scoring Guide (0–1 continuous scale):
|
57 |
|
|
|
59 |
|
60 |
The Prompt is clear, biologically detailed, and well-aligned to the Research Data and Paradigm List.
|
61 |
|
62 |
+
The response correctly reflects a biologically valid interpretation of the Research Data and is appropriately drawn from the Paradigm List.
|
63 |
|
64 |
Lower scores if:
|
65 |
|
66 |
The prompt is vague or misaligned with the research context.
|
67 |
|
68 |
+
The response is biologically irrelevant, mechanistically incoherent, or mismatched with the Research Data.
|
69 |
|
70 |
+
The paradigm is not the most plausible or supported choice from the Paradigm List.
|
71 |
|
72 |
|
73 |
Your output must begin with "Score:" and contain only two fields: "Score:" and "Reasoning:". No extra commentary, no markdown, no explanations before or after.
|
|
|
97 |
def Observation_LLM_Evaluator(self,promptversion):
|
98 |
SYSTEM='''
|
99 |
Task:
|
100 |
+
Evaluate the biological quality of a prompt, research data, and response from an Observations Generator Agent on a 0–1 continuous scale.
|
101 |
|
102 |
Goal:
|
103 |
Assess:
|
|
|
122 |
|
123 |
Your output must begin with "Score:" and contain only two fields: "Score:" and "Reasoning:". No extra commentary, no markdown, no explanations before or after.
|
124 |
|
125 |
+
Think step by step
|
126 |
+
|
127 |
'''
|
128 |
data_to_evaluate=dbe.GetData(promptversion)
|
129 |
messages =[
|
|
|
148 |
def Anomaly_LLM_Evaluator(self,promptversion):
|
149 |
SYSTEM='''
|
150 |
Task:
|
151 |
+
Evaluate the biological quality of a prompt, observations, paradigms, and response from an Anomaly Detector Agent on a 0–1 continuous scale.
|
152 |
|
153 |
Goal:
|
154 |
Assess:
|
155 |
+
|
156 |
Whether the Prompt clearly defines the biological context and intent.
|
157 |
|
158 |
Whether the Observations are biologically plausible and internally consistent.
|
159 |
|
160 |
+
Whether the Paradigms are plausible biological frameworks given the context.
|
161 |
+
|
162 |
+
Whether the Response correctly identifies biologically relevant inconsistencies or contradictions between the Paradigms and the Observations.
|
163 |
|
164 |
Scoring Guide (0–1 continuous scale):
|
165 |
|
166 |
Score 1.0 if:
|
167 |
|
168 |
+
The Prompt is clear, biologically grounded, and well-scoped.
|
169 |
|
170 |
+
The Observations are plausible and logically consistent.
|
171 |
|
172 |
+
The Response accurately identifies true anomalies—i.e., meaningful contradictions or gaps—between the Paradigms and the Observations.
|
173 |
+
|
174 |
+
All major conflicts are captured.
|
175 |
|
176 |
Lower scores if:
|
177 |
|
178 |
+
The Prompt is vague or misaligned with the context.
|
179 |
+
|
180 |
+
Observations are biologically implausible or incoherent.
|
181 |
|
182 |
+
The Response overlooks key inconsistencies, includes irrelevant anomalies, or shows poor biological reasoning.
|
183 |
|
184 |
Your output must begin with "Score:" and contain only two fields: "Score:" and "Reasoning:". No extra commentary, no markdown, no explanations before or after.
|
185 |
+
|
186 |
+
Think step by step
|
|
|
187 |
'''
|
188 |
|
189 |
data_to_evaluate=dbe.GetData(promptversion)
|