# guard/llmeval.py
from groq import Groq
import os
import re
import json

from databaseengine import DatabaseEngine

# Do not hard-code API keys in source: read the Groq key from the environment instead.
AK = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=AK)
de = DatabaseEngine()

class LLM_as_Evaluator:
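    """LLM-as-a-judge evaluators for agent outputs.

    Each evaluator pulls a stored prompt/context/response record for a given
    prompt version, asks the LLM for a 0-1 quality score with reasoning, and
    writes the result back through DatabaseEngine.
    """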
    def __init__(self):
        pass

    def _engine_core(self, messages):
        # Send the evaluation messages to the LLM judge and return the raw completion text.
        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=messages,
            temperature=0.0,
            max_completion_tokens=5000,
            top_p=1,
            stream=False,
            stop=None,
        )
        actual_message = completion.choices[0].message.content
        return actual_message
        # Optional JSON clean-up of the model output (currently disabled):
        # cleaned_json = re.sub(r"```(?:json)?\s*(.*?)\s*```", r"\1", actual_message, flags=re.DOTALL).strip()
        # is_json_like = cleaned_json.startswith("{") and cleaned_json.endswith("}")
        # if is_json_like:
        #     return cleaned_json
        # else:
        #     return "FATAL"

    def Paradigm_LLM_Evaluator(self, promptversion):
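        """Score how well the prompt and chosen paradigm align with the stored research data, then persist the result."""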
        SYSTEM = '''
Task:
Evaluate the biological quality of a prompt, research data, paradigm list, and response on a 0–1 continuous scale.
Goal:
Assess:
Whether the Prompt is clear, biologically specific, and aligned with the Research Data and the Paradigm List.
Whether the response is biologically relevant, mechanistically coherent, and experimentally actionable based on the Research Data.
Whether the response is correctly chosen from the Paradigm List in light of the Research Data.
Scoring Guide (0–1 continuous scale):
Score 1.0 if:
The Prompt is clear, biologically detailed, and well-aligned to the Research Data and Paradigm List.
The response correctly reflects a biologically valid interpretation of the Research Data and is appropriately drawn from the Paradigm List.
Lower scores if:
The prompt is vague or misaligned with the research context.
The response is biologically irrelevant, mechanistically incoherent, or mismatched with the Research Data.
The paradigm is not the most plausible or supported choice from the Paradigm List.
Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
        data_to_evaluate = de.GetData(promptversion)
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"""
Prompt: {data_to_evaluate["prompt"]}
Research Data: {data_to_evaluate["context"]}
Agent's Response: {data_to_evaluate["response"]}
"""},
        ]
        evaluation_response = self._engine_core(messages=messages)
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": evaluation_response,
        }
        de.Update(data=data)

    def Observation_LLM_Evaluator(self, promptversion):
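        """Score the Observations Generator Agent's output against the stored prompt and research data, then persist the result."""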
        SYSTEM = '''
Task:
Evaluate the biological quality of a prompt, research data, and response from an Observations Generator Agent on a 0–1 continuous scale.
Goal:
Assess:
Whether the Prompt clearly defines the research context and specifies the scope of valid observations.
Whether the Response includes observations that are biologically plausible, factually grounded, and consistent with the Research Data.
Scoring Guide (0–1 continuous scale):
Score 1.0 if:
Prompt is clear, biologically specific, and well-aligned to the data context.
Response consists of multiple observations that are each biologically valid, non-redundant, and directly grounded in the data.
Lower scores if:
The prompt is vague or overly generic.
The response includes irrelevant, biologically implausible, contradictory, or trivial observations.
Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
        data_to_evaluate = de.GetData(promptversion)
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"""
Prompt: {data_to_evaluate["prompt"]}
Research Data: {data_to_evaluate["context"]}
Agent's Response: {data_to_evaluate["response"]}
"""},
        ]
        evaluation_response = self._engine_core(messages=messages)
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": evaluation_response,
        }
        de.Update(data=data)

    def Anomaly_LLM_Evaluator(self, promptversion):
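        """Score the Anomaly Detector Agent's output against the stored prompt, observations, and paradigms, then persist the result."""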
        SYSTEM = '''
Task:
Evaluate the biological quality of a prompt, observations, paradigms, and response from an Anomaly Detector Agent on a 0–1 continuous scale.
Goal:
Assess:
Whether the Prompt clearly defines the biological context and intent.
Whether the Observations are biologically plausible and internally consistent.
Whether the Paradigms are plausible biological frameworks given the context.
Whether the Response correctly identifies biologically relevant inconsistencies or contradictions between the Paradigms and the Observations.
Scoring Guide (0–1 continuous scale):
Score 1.0 if:
The Prompt is clear, biologically grounded, and well-scoped.
The Observations are plausible and logically consistent.
The Response accurately identifies true anomalies—i.e., meaningful contradictions or gaps—between the Paradigms and the Observations.
All major conflicts are captured.
Lower scores if:
The Prompt is vague or misaligned with the context.
Observations are biologically implausible or incoherent.
The Response overlooks key inconsistencies, includes irrelevant anomalies, or shows poor biological reasoning.
Your output must begin with Score: and contain only two fields: Score: and Reasoning:. No extra commentary, no markdown, no explanations before or after.
Think step by step
'''
        data_to_evaluate = de.GetData(promptversion)
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": f"""
Prompt: {data_to_evaluate["prompt"]}
Observations: {data_to_evaluate["context"]}
Agent's Response: {data_to_evaluate["response"]}
"""},
        ]
        evaluation_response = self._engine_core(messages=messages)
        data = {
            "promptversion": promptversion,
            "biological_context_alignment": evaluation_response,
        }
        de.Update(data=data)
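

# Minimal usage sketch. Assumption: "PV_EXAMPLE" is a placeholder prompt-version key;
# substitute an identifier that actually exists in the database, and ensure the
# GROQ_API_KEY environment variable is set before running.
if __name__ == "__main__":
    evaluator = LLM_as_Evaluator()
    evaluator.Paradigm_LLM_Evaluator(promptversion="PV_EXAMPLE")
    evaluator.Observation_LLM_Evaluator(promptversion="PV_EXAMPLE")
    evaluator.Anomaly_LLM_Evaluator(promptversion="PV_EXAMPLE")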