|
from groq import Groq |
|
import os

import re
|
import json |
|
from databaseengine import DatabaseEngine |
|
|
|
AK = os.environ.get("GROQ_API_KEY")  # load the Groq API key from the environment rather than hard-coding it
|
client = Groq(api_key=AK) |
|
de=DatabaseEngine() |
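
# Flow: PROMPT_UPDATER builds four LLM-as-judge system prompts for a given agent type;
# LLM_as_Evaluator pulls a stored Prompt/Context/Response triad from DatabaseEngine,
# scores it against each system prompt via the Groq chat API, and writes the results back.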
|
|
|
|
|
def PROMPT_UPDATER(agenttype): |
|
SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT=f''' |
|
Task: |
|
Evaluate the biological quality of a Prompt, Context, and Response from an {agenttype} Agent on a 0–10 continuous scale. |
|
|
|
Goal: |
|
Assess: |
|
|
|
Whether Prompt precisely defines a biologically specific research objective, explicitly frames the agent's role, and delineates valid output types or constraints and is well aligned to the context. |
|
|
|
Whether Context is highly relevant, internally consistent, sufficiently rich in biological context, and presented in a way that supports fine-grained inference or analysis. |
|
|
|
Whether Response consists of output that is biologically valid, mechanistically sound, non-redundant, free from trivialities, contradictions, or generic phrasing and directly grounded in the context. |
|
|
|
Scoring Guide (0–10 continuous scale):
|
|
|
Score 10 if all of the following are true: |
|
|
|
Prompt precisely defines a biologically specific research objective, explicitly frames the agent's role, and delineates valid output types or constraints and is well aligned to the context. |
|
|
|
Context is highly relevant, internally consistent, sufficiently rich in biological context, and presented in a way that supports fine-grained inference or analysis. |
|
|
|
Response consists of output that is biologically valid, mechanistically sound, non-redundant, free from trivialities, contradictions, or generic phrasing and directly grounded in the context. |
|
|
|
|
|
Lower scores if: |
|
|
|
Prompt does not clearly define a biologically specific objective, fails to frame the agent’s role or valid outputs, and is misaligned with the context. |
|
|
|
Context is irrelevant, inconsistent, lacking biological detail, or presented in a way that hinders meaningful analysis. |
|
|
|
Response includes output that is biologically invalid, mechanistically flawed, redundant, trivial, contradictory, or generic, and not clearly grounded in the context. |
|
|
|
|
|
Your output must begin with: |
|
Score: |
|
and contain only two fields: |
|
Score: and Reasoning: |
|
No extra commentary, no markdown, no explanations before or after. |
|
Think step by step |
|
''' |
|
|
|
SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT=f''' |
|
Task: |
|
Evaluate how well the {agenttype} Response addresses the specific Prompt by leveraging the provided Context on a 0–10 continuous scale. |
|
|
|
Goal: |
|
Assess: |
|
|
|
Whether the Prompt is precisely tailored to the Context, clearly sets expectations, and aligns with the scope of valid outputs. |
|
|
|
Whether the Context is highly relevant, biologically rich, and sufficient to enable effective fulfillment of the Prompt. |
|
|
|
Whether the Response directly and comprehensively utilizes the Context to fulfill the Prompt’s objective, without deviating or introducing irrelevant information. |
|
|
|
Scoring Guide (0–10 scale): |
|
|
|
Score 10 if all of the following are true: |
|
|
|
Prompt is precisely tailored to the Context, setting clear, biologically specific expectations and constraints for the agent. |
|
|
|
Context is sufficient, relevant, and complete, directly supporting the generation of appropriate output. |
|
|
|
Response directly addresses the Prompt, utilizing the Context to comprehensively satisfy the Prompt’s expectations with no deviation or irrelevant information. |
|
|
|
Low scores if:
|
|
|
Prompt is not tailored to the Context, lacks clear, biologically specific expectations, and fails to set appropriate constraints for the agent.
|
|
|
Context is insufficient, irrelevant, or incomplete, failing to support the generation of appropriate output. |
|
|
|
Response does not directly address the Prompt, fails to utilize the Context effectively, and includes deviations or irrelevant information that do not satisfy the Prompt’s expectations. |
|
|
|
Your output must begin with: |
|
Score: |
|
and contain only two fields: |
|
Score: and Reasoning: |
|
No extra commentary, no markdown, no explanations before or after. |
|
Think step by step |
|
|
|
''' |
|
|
|
|
|
SYSTEM_PROMPT_FOR_TRIAD_COHERENCE=f''' |
|
Task: |
|
Evaluate the logical and semantic coherence of the Prompt, Context, and Response of {agenttype} as a unified set on a 0–10 continuous scale. |
|
|
|
Goal: |
|
Assess: |
|
|
|
Whether the Prompt is logically consistent with the provided Context, setting a clear, biologically grounded framework for the Response. |
|
|
|
Whether the Response logically and semantically follows from both the Prompt and provided Context, without contradictions or unsupported claims. |
|
|
|
Whether there are gaps, contradictions, or misalignments among the Prompt, Context and the Response that affect the overall coherence. |
|
|
|
Scoring Guide (0–10 scale): |
|
|
|
Score 10 if all are true: |
|
|
|
The Prompt is logically coherent with the Context, clearly framing the research objectives. |
|
|
|
The Response seamlessly builds on the Prompt and the Context, maintaining consistency without contradiction or ambiguity. |
|
|
|
All elements form a logically unified and semantically sound narrative, with no gaps or contradictions between them. |
|
|
|
Low scores if: |
|
The Prompt is not logically coherent with the Context, failing to clearly frame the research objectives. |
|
The Response does not seamlessly build on the Prompt and the Context, introducing contradictions or ambiguity. |
|
The elements do not form a logically unified or semantically sound narrative, containing gaps or contradictions between them. |
|
|
|
Your output must begin with: |
|
Score: |
|
and contain only two fields: |
|
Score: and Reasoning: |
|
No extra commentary, no markdown, no explanations before or after. |
|
Think step by step |
|
|
|
''' |
|
|
|
|
|
SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY=f''' |
|
Task: |
|
Evaluate how focused, detailed, and context-aware the {agenttype} Response is with respect to the Prompt and Context on a 0–10 continuous scale. |
|
|
|
Goal: |
|
Assess: |
|
|
|
Whether the Response is highly specific and precisely targeted to the Prompt, addressing the research objectives without deviation. |
|
|
|
Whether the Response includes sufficient, detailed insights directly drawn from the Context, ensuring relevance and biological accuracy. |
|
|
|
Whether the Response avoids vagueness, overly generic statements, and provides only relevant, factually grounded content. |
|
|
|
Scoring Guide (0–10 scale): |
|
|
|
Score 10 if all are true: |
|
|
|
The Response is exceptionally specific to the Prompt, addressing every aspect with precision and detail. |
|
|
|
The Response draws clear, biologically grounded, and highly detailed insights from the Context, ensuring all claims are backed by relevant data. |
|
|
|
No generic, irrelevant, or off-topic content is present, and every statement is purposeful and directly tied to the research objectives. |
|
|
|
Low scores if:
|
|
|
The Response is not specific to the Prompt, failing to address important aspects with precision or detail. |
|
The Response does not draw clear, biologically grounded, or detailed insights from the Context, and many claims are not supported by relevant data.
|
|
|
The Response contains generic, irrelevant, or off-topic content, and many statements are not purposeful or aligned with the research objectives.
|
|
|
Your output must begin with: |
|
Score: |
|
and contain only two fields: |
|
Score: and Reasoning: |
|
No extra commentary, no markdown, no explanations before or after. |
|
Think step by step |
|
|
|
''' |
|
|
|
return SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT,SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT,SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY,SYSTEM_PROMPT_FOR_TRIAD_COHERENCE |
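

# Helper sketch (not wired into the evaluator): the system prompts above require the judge
# to reply with "Score:" followed by "Reasoning:". Assuming that format, a small parser like
# the one below can pull out the numeric score using the re module imported above; the
# function name and return convention are assumptions, not part of the original pipeline.
def parse_score(evaluation_text):
    """Extract the leading numeric score from a 'Score: ... Reasoning: ...' reply."""
    match = re.search(r"Score:\s*(\d+(?:\.\d+)?)", evaluation_text)
    return float(match.group(1)) if match else None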
|
|
|
|
|
class LLM_as_Evaluator(): |
|
|
|
def __init__(self): |
|
pass |
|
|
|
|
|
def ___engine_core(self,messages): |
|
|
|
completion = client.chat.completions.create( |
|
model="llama-3.1-8b-instant", |
|
messages=messages, |
|
temperature=0.0, |
|
max_completion_tokens=5000, |
|
|
|
stream=False, |
|
stop=None, |
|
) |
|
actual_message=completion.choices[0].message.content |
|
|
|
return actual_message |
|
|
|
|
|
|
|
|
|
def LLM_Evaluator(self,promptversion): |
|
|
|
promptversion_splitted=promptversion.split(":") |
|
agent_type=promptversion_splitted[0] |
|
|
|
metrics=["biological_context_alignment","contextual_relevance_alignment","response_specificity","unit_coherence"] |
|
|
|
data_to_evaluate=de.GetData(promptversion) |
|
|
|
( |
|
SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT, |
|
SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT, |
|
SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY, |
|
SYSTEM_PROMPT_FOR_TRIAD_COHERENCE |
|
) = PROMPT_UPDATER(agent_type) |
|
|
|
prompt_map = { |
|
"biological_context_alignment": SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT, |
|
"contextual_relevance_alignment": SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT, |
|
"response_specificity": SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY, |
|
"unit_coherence": SYSTEM_PROMPT_FOR_TRIAD_COHERENCE |
|
} |
|
|
|
|
|
data={ |
|
"promptversion":promptversion, |
|
"biological_context_alignment":"", |
|
"contextual_relevance_alignment":"", |
|
"unit_coherence":"", |
|
"response_specificity":"" |
|
} |
|
|
|
for metric in metrics: |
|
system_prompt = prompt_map[metric] |
|
|
|
messages = [ |
|
{"role": "system", "content": system_prompt}, |
|
{"role": "user", "content": f""" |
|
Prompt: {data_to_evaluate["prompt"]} |
|
Context: {data_to_evaluate["context"]} |
|
Agent's Response: {data_to_evaluate["response"]} |
|
"""} |
|
] |
|
|
|
evaluation_response = self.___engine_core(messages=messages) |
|
data[metric]=evaluation_response |
|
|
|
|
|
de.Update(data=data) |
|
|
|
|
|
|
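
# Example usage (a minimal sketch; the "<agent_type>:<version>" prompt-version format and the
# DatabaseEngine record layout with "prompt", "context", and "response" keys are assumptions
# inferred from how LLM_Evaluator parses and consumes them above):
if __name__ == "__main__":
    evaluator = LLM_as_Evaluator()
    evaluator.LLM_Evaluator("hypothesis_generator:v1")  # hypothetical prompt version string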