from groq import Groq
import os
import re
import json
from databaseengine import DatabaseEngine

# Read the API key from the environment (e.g. GROQ_API_KEY) rather than hardcoding a credential in source.
AK = os.getenv("GROQ_API_KEY")
client = Groq(api_key=AK)
de = DatabaseEngine()
def PROMPT_UPDATER(agenttype):
    SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT = f'''
Task:
Evaluate the biological quality of a Prompt, Context, and Response from an {agenttype} Agent on a 0–10 continuous scale.
Goal:
Assess:
Whether Prompt precisely defines a biologically specific research objective, explicitly frames the agent's role, and delineates valid output types or constraints and is well aligned to the context.
Whether Context is highly relevant, internally consistent, sufficiently rich in biological context, and presented in a way that supports fine-grained inference or analysis.
Whether Response consists of output that is biologically valid, mechanistically sound, non-redundant, free from trivialities, contradictions, or generic phrasing and directly grounded in the context.
Scoring Guide (0–10 continuous scale):
Score 10 if all of the following are true:
Prompt precisely defines a biologically specific research objective, explicitly frames the agent's role, and delineates valid output types or constraints and is well aligned to the context.
Context is highly relevant, internally consistent, sufficiently rich in biological context, and presented in a way that supports fine-grained inference or analysis.
Response consists of output that is biologically valid, mechanistically sound, non-redundant, free from trivialities, contradictions, or generic phrasing and directly grounded in the context.
Lower scores if:
Prompt does not clearly define a biologically specific objective, fails to frame the agent’s role or valid outputs, and is misaligned with the context.
Context is irrelevant, inconsistent, lacking biological detail, or presented in a way that hinders meaningful analysis.
Response includes output that is biologically invalid, mechanistically flawed, redundant, trivial, contradictory, or generic, and not clearly grounded in the context.
Your output must begin with:
Score:
and contain only two fields:
Score: and Reasoning:
No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
    SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT = f'''
Task:
Evaluate how well the {agenttype} Response addresses the specific Prompt by leveraging the provided Context on a 0–10 continuous scale.
Goal:
Assess:
Whether the Prompt is precisely tailored to the Context, clearly sets expectations, and aligns with the scope of valid outputs.
Whether the Context is highly relevant, biologically rich, and sufficient to enable effective fulfillment of the Prompt.
Whether the Response directly and comprehensively utilizes the Context to fulfill the Prompt’s objective, without deviating or introducing irrelevant information.
Scoring Guide (0–10 scale):
Score 10 if all of the following are true:
Prompt is precisely tailored to the Context, setting clear, biologically specific expectations and constraints for the agent.
Context is sufficient, relevant, and complete, directly supporting the generation of appropriate output.
Response directly addresses the Prompt, utilizing the Context to comprehensively satisfy the Prompt’s expectations with no deviation or irrelevant information.
Low scores if:
Prompt is not tailored to the Context, lacks clear, biologically specific expectations, and fails to set appropriate constraints for the agent
Context is insufficient, irrelevant, or incomplete, failing to support the generation of appropriate output.
Response does not directly address the Prompt, fails to utilize the Context effectively, and includes deviations or irrelevant information that do not satisfy the Prompt’s expectations.
Your output must begin with:
Score:
and contain only two fields:
Score: and Reasoning:
No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
    SYSTEM_PROMPT_FOR_TRIAD_COHERENCE = f'''
Task:
Evaluate the logical and semantic coherence of the Prompt, Context, and Response of {agenttype} as a unified set on a 0–10 continuous scale.
Goal:
Assess:
Whether the Prompt is logically consistent with the provided Context, setting a clear, biologically grounded framework for the Response.
Whether the Response logically and semantically follows from both the Prompt and provided Context, without contradictions or unsupported claims.
Whether there are gaps, contradictions, or misalignments among the Prompt, Context and the Response that affect the overall coherence.
Scoring Guide (0–10 scale):
Score 10 if all are true:
The Prompt is logically coherent with the Context, clearly framing the research objectives.
The Response seamlessly builds on the Prompt and the Context, maintaining consistency without contradiction or ambiguity.
All elements form a logically unified and semantically sound narrative, with no gaps or contradictions between them.
Low scores if:
The Prompt is not logically coherent with the Context, failing to clearly frame the research objectives.
The Response does not seamlessly build on the Prompt and the Context, introducing contradictions or ambiguity.
The elements do not form a logically unified or semantically sound narrative, containing gaps or contradictions between them.
Your output must begin with:
Score:
and contain only two fields:
Score: and Reasoning:
No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
    SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY = f'''
Task:
Evaluate how focused, detailed, and context-aware the {agenttype} Response is with respect to the Prompt and Context on a 0–10 continuous scale.
Goal:
Assess:
Whether the Response is highly specific and precisely targeted to the Prompt, addressing the research objectives without deviation.
Whether the Response includes sufficient, detailed insights directly drawn from the Context, ensuring relevance and biological accuracy.
Whether the Response avoids vagueness, overly generic statements, and provides only relevant, factually grounded content.
Scoring Guide (0–10 scale):
Score 10 if all are true:
The Response is exceptionally specific to the Prompt, addressing every aspect with precision and detail.
The Response draws clear, biologically grounded, and highly detailed insights from the Context, ensuring all claims are backed by relevant data.
No generic, irrelevant, or off-topic content is present, and every statement is purposeful and directly tied to the research objectives.
Low scores if:
The Response is not specific to the Prompt, failing to address important aspects with precision or detail.
The Response does not draw clear, biologically grounded, or detailed insights from the Context, and many claims are not supported by relevant data.
The Response contains generic, irrelevant, or off-topic content, and many statements are not purposeful or aligned with the research objectives.
Your output must begin with:
Score:
and contain only two fields:
Score: and Reasoning:
No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
    return SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT, SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT, SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY, SYSTEM_PROMPT_FOR_TRIAD_COHERENCE
class LLM_as_Evaluator():
    def __init__(self):
        pass

    def ___engine_core(self, messages):
        # Single chat-completion call to the Groq-hosted model; returns the raw message text.
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=messages,
            temperature=0.0,
            max_completion_tokens=5000,
            #top_p=1,
            stream=False,
            stop=None,
        )
        actual_message = completion.choices[0].message.content
        #return re.sub(r"<think>.*?</think>", "", actual_message, flags=re.DOTALL).strip()
        return actual_message
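
    # Illustrative sketch (not part of the original pipeline): the system prompts above
    # instruct the model to reply with exactly two fields, "Score:" and "Reasoning:".
    # A helper along these lines could extract them from the raw evaluator output; the
    # method name and return shape are assumptions added for illustration only.
    def _parse_evaluation(self, evaluation_text):
        score_match = re.search(r"Score:\s*([0-9]+(?:\.[0-9]+)?)", evaluation_text)
        reasoning_match = re.search(r"Reasoning:\s*(.*)", evaluation_text, flags=re.DOTALL)
        return {
            "score": float(score_match.group(1)) if score_match else None,
            "reasoning": reasoning_match.group(1).strip() if reasoning_match else "",
        }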
    def LLM_Evaluator(self, promptversion):
        # The agent type is the prefix of promptversion, i.e. the text before the first ":".
        promptversion_splitted = promptversion.split(":")
        agent_type = promptversion_splitted[0]
        metrics = ["biological_context_alignment", "contextual_relevance_alignment", "response_specificity", "unit_coherence"]
        data_to_evaluate = de.GetData(promptversion)
        (
            SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT,
            SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT,
            SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY,
            SYSTEM_PROMPT_FOR_TRIAD_COHERENCE
        ) = PROMPT_UPDATER(agent_type)
        prompt_map = {
            "biological_context_alignment": SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT,
            "contextual_relevance_alignment": SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT,
            "response_specificity": SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY,
            "unit_coherence": SYSTEM_PROMPT_FOR_TRIAD_COHERENCE
        }
        #evaluation_responses=[]
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": "",
            "contextual_relevance_alignment": "",
            "unit_coherence": "",
            "response_specificity": ""
        }
        # Score each metric with its own system prompt and store the raw evaluator output.
        for metric in metrics:
            system_prompt = prompt_map[metric]
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"""
Prompt: {data_to_evaluate["prompt"]}
Context: {data_to_evaluate["context"]}
Agent's Response: {data_to_evaluate["response"]}
"""}
            ]
            evaluation_response = self.___engine_core(messages=messages)
            data[metric] = evaluation_response
        de.Update(data=data)
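
# Minimal usage sketch, assuming the database holds a record keyed by a promptversion of the
# form "<AgentType>:<version>" with "prompt", "context" and "response" fields; the value
# below is hypothetical.
if __name__ == "__main__":
    evaluator = LLM_as_Evaluator()
    evaluator.LLM_Evaluator("AnnotationAgent:v1")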