from groq import Groq
import os
import re
import json

from databaseengine import DatabaseEngine

# Read the Groq API key from the environment instead of hard-coding a secret in
# the source. (GROQ_API_KEY is an assumed variable name; export it before running.)
AK = os.environ["GROQ_API_KEY"]

client = Groq(api_key=AK)
de = DatabaseEngine()


class LLM_as_Evaluator:

    def __init__(self):
        pass

    def _engine_core(self, messages):
        # Send the evaluation messages to the Groq chat completions endpoint
        # and return the model's reply text.
        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=messages,
            temperature=0.0,
            max_completion_tokens=5000,
            top_p=1,
            stream=False,
            stop=None,
        )
        actual_message = completion.choices[0].message.content
        return actual_message

    def Paradigm_LLM_Evaluator(self, promptversion):

        SYSTEM = '''
        Task:
        Evaluate the biological quality of a prompt, research data, paradigm list, and response on a 0–1 continuous scale.

        Goal:
        Assess:

        Whether the Prompt is clear, biologically specific, and aligned with the Research Data and the Paradigm List.

        Whether the response is biologically relevant, mechanistically coherent, and experimentally actionable based on the Research Data.

        Whether the response is correctly chosen from the Paradigm List in light of the Research Data.

        Scoring Guide (0–1 continuous scale):

        Score 1.0 if:

        The Prompt is clear, biologically detailed, and well-aligned to the Research Data and Paradigm List.

        The response correctly reflects a biologically valid interpretation of the Research Data and is appropriately drawn from the Paradigm List.

        Lower scores if:

        The prompt is vague or misaligned with the research context.

        The response is biologically irrelevant, mechanistically incoherent, or mismatched with the Research Data.

        The paradigm is not the most plausible or supported choice from the Paradigm List.

        Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.

        Think step by step.
        '''

        data_to_evaluate = de.GetData(promptversion)
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"""
            Prompt: {data_to_evaluate["prompt"]}
            Research Data: {data_to_evaluate["context"]}
            Agent's Response: {data_to_evaluate["response"]}
            """}
        ]

        evaluation_response = self._engine_core(messages=messages)
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": evaluation_response
        }
        de.Update(data=data)

    def Observation_LLM_Evaluator(self, promptversion):

        SYSTEM = '''
        Task:
        Evaluate the biological quality of a prompt, research data, and response from an Observations Generator Agent on a 0–1 continuous scale.

        Goal:
        Assess:

        Whether the Prompt clearly defines the research context and specifies the scope of valid observations.

        Whether the Response includes observations that are biologically plausible, factually grounded, and consistent with the Research Data.

        Scoring Guide (0–1 continuous scale):

        Score 1.0 if:

        Prompt is clear, biologically specific, and well-aligned to the data context.

        Response consists of multiple observations that are each biologically valid, non-redundant, and directly grounded in the data.

        Lower scores if:

        The prompt is vague or overly generic.

        The response includes irrelevant, biologically implausible, contradictory, or trivial observations.

        Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.

        Think step by step.
        '''

        data_to_evaluate = de.GetData(promptversion)
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"""
            Prompt: {data_to_evaluate["prompt"]}
            Research Data: {data_to_evaluate["context"]}
            Agent's Response: {data_to_evaluate["response"]}
            """}
        ]

        evaluation_response = self._engine_core(messages=messages)
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": evaluation_response
        }
        de.Update(data=data)

    def Anomaly_LLM_Evaluator(self, promptversion):

        SYSTEM = '''
        Task:
        Evaluate the biological quality of a prompt, observations, paradigms, and response from an Anomaly Detector Agent on a 0–1 continuous scale.

        Goal:
        Assess:

        Whether the Prompt clearly defines the biological context and intent.

        Whether the Observations are biologically plausible and internally consistent.

        Whether the Paradigms are plausible biological frameworks given the context.

        Whether the Response correctly identifies biologically relevant inconsistencies or contradictions between the Paradigms and the Observations.

        Scoring Guide (0–1 continuous scale):

        Score 1.0 if:

        The Prompt is clear, biologically grounded, and well-scoped.

        The Observations are plausible and logically consistent.

        The Response accurately identifies true anomalies (meaningful contradictions or gaps) between the Paradigms and the Observations.

        All major conflicts are captured.

        Lower scores if:

        The Prompt is vague or misaligned with the context.

        Observations are biologically implausible or incoherent.

        The Response overlooks key inconsistencies, includes irrelevant anomalies, or shows poor biological reasoning.

        Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.

        Think step by step.
        '''

        data_to_evaluate = de.GetData(promptversion)
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"""
            Prompt: {data_to_evaluate["prompt"]}
            Observations: {data_to_evaluate["context"]}
            Agent's Response: {data_to_evaluate["response"]}
            """}
        ]

        evaluation_response = self._engine_core(messages=messages)
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": evaluation_response
        }
        de.Update(data=data)
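

# Illustrative helper, not part of the original module. The system prompts above
# require each reply to contain exactly two fields, "Score:" and "Reasoning:";
# this is a minimal sketch of parsing such a reply with the re module imported above.
def parse_evaluation(evaluation_response):
    """Return (score, reasoning) extracted from an evaluator reply, if present."""
    score_match = re.search(r"Score:\s*([01](?:\.\d+)?)", evaluation_response)
    reasoning_match = re.search(r"Reasoning:\s*(.*)", evaluation_response, re.DOTALL)
    score = float(score_match.group(1)) if score_match else None
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
    return score, reasoning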
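

# Usage sketch (illustrative). It assumes DatabaseEngine.GetData(promptversion)
# returns a dict with "prompt", "context", and "response" keys and that
# DatabaseEngine.Update(data=...) persists the evaluation, matching how both are
# called above; "v1" is a hypothetical prompt-version identifier.
if __name__ == "__main__":
    evaluator = LLM_as_Evaluator()
    evaluator.Paradigm_LLM_Evaluator("v1")
    evaluator.Observation_LLM_Evaluator("v1")
    evaluator.Anomaly_LLM_Evaluator("v1")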