Update llmeval.py
Browse files- llmeval.py +59 -1
llmeval.py
CHANGED
@@ -95,4 +95,62 @@ Think step by step
|
|
95 |
"prompt":data_to_evaluate["prompt"],
|
96 |
"biological_context_alignment":evaluation_response
|
97 |
}
|
98 |
-
de.Update(data=data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
"prompt":data_to_evaluate["prompt"],
|
96 |
"biological_context_alignment":evaluation_response
|
97 |
}
|
98 |
+
de.Update(data=data)
|
99 |
+
|
100 |
+
|
101 |
+
def ObservationEvaluator(self,promptversion):
    """Score an Observations Generator Agent record and store the verdict.

    Looks up the record for ``promptversion`` through ``dbe.GetData``,
    asks the evaluation engine to grade it against the rubric in
    ``SYSTEM``, and hands the prompt plus the raw engine reply to
    ``de.Update``. Nothing is returned in the code visible here.

    Args:
        promptversion: Identifier passed unchanged to ``dbe.GetData``.
            The fetched record is indexed with the keys ``"prompt"``,
            ``"context"`` and ``"response"``.
    """
    # Rubric sent as the system message. It pins the reply format to
    # "Score:" followed by "Reasoning:" and includes one worked example.
    SYSTEM='''
Task:
Evaluate the biological quality of a prompt–research data–response triplet from an Observations Generator Agent on a 0–1 continuous scale.

Goal:
Assess:

Whether the Prompt clearly defines the research context and specifies the scope of valid observations.

Whether the Response includes observations that are biologically plausible, factually grounded, and consistent with the Research Data.

Scoring Guide (0–1 continuous scale):

Score 1.0 if:

Prompt is clear, biologically specific, and well-aligned to the data context.

Response consists of multiple observations that are each biologically valid, non-redundant, and directly grounded in the data.

Lower scores if:

The prompt is vague or overly generic.

The response includes irrelevant, biologically implausible, contradictory, or trivial observations.

EXAMPLE:
Input:
Prompt: Generate diverse biological observations derived from the functional consequences of TP53 R175H mutations in epithelial tumors.
Research Data: TP53 R175H mutants lose sequence-specific DNA binding, form dominant-negative complexes with wild-type TP53, and lead to unchecked cell proliferation.
Agent's Response: TP53 R175H mutations increase glucose uptake in muscle cells and promote heart tissue regeneration.

Your output must begin with Score: and contain only two fields: Score: and Reasoning: No extra commentary, no markdown, no explanations before or after.

Output:
Score: 0.2
Reasoning: The response introduces observations unrelated to epithelial tumors or TP53's DNA binding function. The mention of muscle and heart tissue is off-context, and the observations are biologically implausible in this setting.

'''
    data_to_evaluate=dbe.GetData(promptversion)
    messages =[

        {"role":"system","content":SYSTEM},
        {"role":"user","content":f"""
Prompt :{data_to_evaluate["prompt"]}
Research Data :{data_to_evaluate["context"]}
Agent's Response : {data_to_evaluate["response"]}
"""}
    ]
    # NOTE(review): three leading underscores mean Python name-mangles this
    # attribute when the method sits inside a class; presumably the helper
    # is defined in the same class - confirm.
    evaluation_response=self.___engine_core(messages=messages)
    # NOTE(review): this key is identical to the one written by the
    # preceding evaluator in this file. Confirm that sharing the
    # "biological_context_alignment" field is intended and not a
    # copy-paste leftover.
    data={
        "prompt":data_to_evaluate["prompt"],
        "biological_context_alignment":evaluation_response
    }
    de.Update(data=data)
|
156 |
+
|