import difflib
import os
import re
from typing import List

import mcp.types as types
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

# Load environment variables from .env file
load_dotenv()
print("GROQ_API_KEY is set:", "GROQ_API_KEY" in os.environ)

llm = init_chat_model("llama-3.1-8b-instant", model_provider="groq")

def generate_questionnaire(model: str, capabilities: str) -> List[str]:
    """
    Generate a baseline questionnaire for the given model.
    Returns a list of question strings for diagnostic purposes.
    """
    questions = []
    previously_generated = ""
    for i in range(5):
        try:
            response = llm.invoke(
                f"Generate a questionnaire for a model with the following capabilities:\n"
                f"Model Name: {model}\n"
                f"Capabilities Overview:\n{capabilities}\n"
                f"Please provide one more question that covers the model's capabilities and typical use-cases.\n"
                f"Previously generated questions:\n{previously_generated}\n"
                f"Question {i + 1}:"
            )
            new_question = str(response.content).strip()
        except Exception as e:
            print(f"Error generating question {i + 1}: {e}")
            # Fallback question so the questionnaire always has five entries
            new_question = f"What are your capabilities as {model}?"
        questions.append(new_question)
        # Feed the new question back into the prompt so later iterations don't repeat it
        if previously_generated:
            previously_generated += "\n"
        previously_generated += f"Question {i + 1}: {new_question}"
    return questions
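# Example usage of generate_questionnaire (a sketch, not exercised at import
# time; assumes GROQ_API_KEY is set and the Groq-hosted model is reachable --
# the capability summary here is an illustrative placeholder):
#
#   questions = generate_questionnaire(
#       "llama-3.1-8b-instant",
#       "General-purpose chat: Q&A, summarization, light coding help.",
#   )
#   assert len(questions) == 5  # one diagnostic question per loop iteration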

def _similarity_drift(old_answers: List[str], new_answers: List[str]) -> str:
    """
    Fallback drift estimate: average difflib similarity over answer pairs,
    converted to a 0-100 drift percentage string.
    """
    similarity_scores = [
        difflib.SequenceMatcher(None, old, new).ratio()
        for old, new in zip(old_answers, new_answers)
    ]
    avg_similarity = sum(similarity_scores) / len(similarity_scores)
    return str(int((1 - avg_similarity) * 100))


def grade_answers(old_answers: List[str], new_answers: List[str]) -> str:
    """
    Use the LLM to compare the old and new answers and compute a drift score.
    Returns a drift percentage (0-100) as a string.
    """
    if not old_answers or not new_answers:
        return "0"
    if len(old_answers) != len(new_answers):
        return "100"  # Major drift if the answer counts differ
    try:
        # Build a prompt that shows the LLM both answer sets side by side
        prompt = "You're tasked with detecting semantic drift between two sets of model responses.\n\n"
        prompt += "Original responses:\n"
        for i, ans in enumerate(old_answers):
            prompt += f"Response {i + 1}: {ans}\n\n"
        prompt += "New responses:\n"
        for i, ans in enumerate(new_answers):
            prompt += f"Response {i + 1}: {ans}\n\n"
        prompt += (
            "Analyze the semantic differences between the original and new responses. "
            "Provide a drift percentage score (0-100%) that represents how much the meaning, "
            "intent, or capabilities have changed between the two sets of responses. "
            "Only return the numerical percentage value without any explanation or additional text."
        )
        # Get the drift assessment from the LLM
        response = llm.invoke(prompt)
        drift_text = str(response.content).strip()
        # Extract just the numerical value in case the model adds extra text
        drift_match = re.search(r"(\d+\.?\d*)", drift_text)
        if drift_match:
            # Clamp to the 0-100 range in case the model overshoots
            drift_pct = min(max(float(drift_match.group(1)), 0.0), 100.0)
            return str(int(drift_pct))
        # No number in the reply: fall back to raw text similarity
        return _similarity_drift(old_answers, new_answers)
    except Exception as e:
        print(f"Error grading answers: {e}")
        # LLM call failed: fall back to raw text similarity
        return _similarity_drift(old_answers, new_answers)
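

# Minimal end-to-end sketch (an assumption about intended use, not part of the
# original module): build a baseline questionnaire, answer it twice, and grade
# the second answer set against the first. Assumes GROQ_API_KEY is set; in a
# real drift check the second answer set would come from a later run or a
# different model snapshot.
if __name__ == "__main__":
    demo_capabilities = "General-purpose chat: Q&A, summarization, light coding help."
    demo_questions = generate_questionnaire("llama-3.1-8b-instant", demo_capabilities)
    baseline_answers = [str(llm.invoke(q).content).strip() for q in demo_questions]
    current_answers = [str(llm.invoke(q).content).strip() for q in demo_questions]
    print(f"Estimated drift: {grade_answers(baseline_answers, current_answers)}%")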