Update llmeval.py
llmeval.py  (+37 −72)  CHANGED
@@ -180,16 +180,17 @@ class LLM_as_Evaluator():
     def ___engine_core(self,messages):
 
         completion = client.chat.completions.create(
-            model="
+            model="llama-3.1-8b-instant",
             messages=messages,
             temperature=0.0,
-            max_completion_tokens=
+            max_completion_tokens=5000,
             #top_p=1,
             stream=False,
             stop=None,
         )
         actual_message=completion.choices[0].message.content
-        return re.sub(r"<think>.*?</think>", "", actual_message, flags=re.DOTALL).strip()
+        #return re.sub(r"<think>.*?</think>", "", actual_message, flags=re.DOTALL).strip()
+        return actual_message
 
 
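The hunk above fills in `llama-3.1-8b-instant` with a 5000-token completion cap and returns the raw message instead of stripping `<think>` blocks (presumably unnecessary for a non-reasoning model). For context, here is a minimal sketch of the kind of client setup this code assumes; the diff does not show the repo's actual setup, and the Groq client and `strip_think_tags` helper below are assumptions, not code from the repository:

```python
import os
import re

# Assumption: `client` is an OpenAI-compatible chat-completions client.
# Groq is guessed only because "llama-3.1-8b-instant" is a Groq-hosted model.
from groq import Groq

client = Groq(api_key=os.environ["GROQ_API_KEY"])

def strip_think_tags(text: str) -> str:
    # The old ___engine_core stripped <think>...</think> blocks before returning;
    # the new version returns the raw message, so this is kept as an opt-in helper.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
```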
@@ -199,80 +200,44 @@ class LLM_as_Evaluator():
 
         data_to_evaluate=de.GetData(promptversion)
 
+        (
+            SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT,
+            SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT,
+            SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY,
+            SYSTEM_PROMPT_FOR_TRIAD_COHERENCE
+        ) = PROMPT_UPDATER("observation agent")
+
+        prompt_map = {
+            "biological_context_alignment": SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT,
+            "contextual_relevance_alignment": SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT,
+            "response_specificity": SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY,
+            "unit_coherence": SYSTEM_PROMPT_FOR_TRIAD_COHERENCE
+        }
 
         evaluation_responses=[]
         for metric in metrics:
-            match metric:
-                case "biological_context_alignment":
-                    messages =[
-                        {"role":"system","content":SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT},
-                        {"role":"user","content":f"""
-                        Prompt :{data_to_evaluate["prompt"]}
-                        Context :{data_to_evaluate["context"]}
-                        Agent's Response : {data_to_evaluate["response"]}
-                        """}
-                    ]
-                    evaluation_response=self.___engine_core(messages=messages)
-                    evaluation_responses.append({"biological_context_alignment":evaluation_response})
-
-                case "contextual_relevance_alignment":
-                    messages =[
-                        {"role":"system","content":SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT},
-                        {"role":"user","content":f"""
-                        Prompt :{data_to_evaluate["prompt"]}
-                        Context :{data_to_evaluate["context"]}
-                        Agent's Response : {data_to_evaluate["response"]}
-                        """}
-                    ]
-                    evaluation_response=self.___engine_core(messages=messages)
-                    evaluation_responses.append({"contextual_relevance_alignment":evaluation_response})
-
-                case "response_specificity":
-                    messages =[
-                        {"role":"system","content":SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY},
-                        {"role":"user","content":f"""
-                        Prompt :{data_to_evaluate["prompt"]}
-                        Context :{data_to_evaluate["context"]}
-                        Agent's Response : {data_to_evaluate["response"]}
-                        """}
-                    ]
-                    evaluation_response=self.___engine_core(messages=messages)
-                    evaluation_responses.append({"response_specificity":evaluation_response})
-
-                case "unit_coherence":
-                    messages =[
-                        {"role":"system","content":SYSTEM_PROMPT_FOR_TRIAD_COHERENCE},
-                        {"role":"user","content":f"""
-                        Prompt :{data_to_evaluate["prompt"]}
-                        Context :{data_to_evaluate["context"]}
-                        Agent's Response : {data_to_evaluate["response"]}
-                        """}
-                    ]
-                    evaluation_response=self.___engine_core(messages=messages)
-                    evaluation_responses.append({"unit_coherence":evaluation_response})
+            system_prompt = prompt_map[metric]
+
+            messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"""
+                Prompt: {data_to_evaluate["prompt"]}
+                Context: {data_to_evaluate["context"]}
+                Agent's Response: {data_to_evaluate["response"]}
+                """}
+            ]
+
+            evaluation_response = self.___engine_core(messages=messages)
+            evaluation_responses.append({metric: evaluation_response})
 
         data={
-            "unit_coherence":"",
-            "response_specificity":""
+            "promptversion":promptversion,
+            "biological_context_alignment":"",
+            "contextual_relevance_alignment":"",
+            "unit_coherence":"",
+            "response_specificity":""
         }
 
         for resp in evaluation_responses:
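To make the refactor concrete, here is a small, self-contained sketch of the same map-driven loop with a stubbed engine so it runs without any API key. `fake_engine_core`, the stub prompts, and the sample data are hypothetical stand-ins, not code from the repository:

```python
def fake_engine_core(messages):
    # Stand-in for ___engine_core: echo which system prompt drove the evaluation.
    return f"evaluated with: {messages[0]['content']}"

data_to_evaluate = {"prompt": "P", "context": "C", "response": "R"}
metrics = ["biological_context_alignment", "contextual_relevance_alignment",
           "response_specificity", "unit_coherence"]
prompt_map = {m: f"SYSTEM PROMPT FOR {m}" for m in metrics}  # stub prompts

evaluation_responses = []
for metric in metrics:
    messages = [
        {"role": "system", "content": prompt_map[metric]},
        {"role": "user", "content": f"""
        Prompt: {data_to_evaluate["prompt"]}
        Context: {data_to_evaluate["context"]}
        Agent's Response: {data_to_evaluate["response"]}
        """},
    ]
    evaluation_responses.append({metric: fake_engine_core(messages)})

print(evaluation_responses)
```

Keying each appended dict by `metric` (rather than a hard-coded string per `case`) is what lets one loop body replace the four duplicated `match`/`case` branches, and it keeps the later `for resp in evaluation_responses:` merge uniform across metrics.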
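The diff ends partway through the merge loop. A plausible shape for the remainder, shown only for illustration (this is a guess, not the file's actual code), folds each single-key response dict into `data`:

```python
# Hypothetical sketch of the merge step the diff stops at.
data = {
    "promptversion": "v3",
    "biological_context_alignment": "",
    "contextual_relevance_alignment": "",
    "unit_coherence": "",
    "response_specificity": "",
}
evaluation_responses = [
    {"biological_context_alignment": "example evaluator output"},
    {"unit_coherence": "example evaluator output"},
]
for resp in evaluation_responses:
    for key, value in resp.items():
        if key in data:  # ignore keys the data schema does not expect
            data[key] = value
print(data)
```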