import difflib
import os
import re
from typing import List

import mcp.types as types
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

# Load environment variables from .env file
load_dotenv()
print("GROQ_API_KEY is set:", "GROQ_API_KEY" in os.environ)

llm = init_chat_model("llama-3.1-8b-instant", model_provider="groq")

def generate_questionnaire(model: str, capabilities: str) -> List[str]:
    """
    Generate a baseline questionnaire for the given model.
    Returns a list of question strings for diagnostic purposes.
    """
    questions = []
    previously_generated = ""
    for i in range(5):
        try:
            response = llm.invoke(
                f"Generate a questionnaire for a model with the following capabilities:\n"
                f"Model Name: {model}\n"
                f"Capabilities Overview:\n{capabilities}\n"
                f"Please provide one more question that covers the model's capabilities and typical use-cases.\n"
                f"Previously generated questions:\n{previously_generated}\n"
                f"Question {i + 1}:"
            )
            new_question = str(response.content).strip()
        except Exception as e:
            print(f"Error generating question {i + 1}: {e}")
            # Fallback question so the questionnaire always has five entries
            new_question = f"What are your capabilities as {model}?"
        questions.append(new_question)
        # Feed the new question back into the prompt so later iterations don't repeat it
        if previously_generated:
            previously_generated += "\n"
        previously_generated += f"Question {i + 1}: {new_question}"
    return questions
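# Example usage of generate_questionnaire (a sketch, not exercised at import
# time; assumes GROQ_API_KEY is set and the Groq-hosted model is reachable --
# the capability summary here is an illustrative placeholder):
#
#   questions = generate_questionnaire(
#       "llama-3.1-8b-instant",
#       "General-purpose chat: Q&A, summarization, light coding help.",
#   )
#   assert len(questions) == 5  # one diagnostic question per loop iteration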

def _similarity_drift(old_answers: List[str], new_answers: List[str]) -> str:
    """
    Fallback drift estimate: average difflib similarity over answer pairs,
    converted to a 0-100 drift percentage string.
    """
    similarity_scores = [
        difflib.SequenceMatcher(None, old, new).ratio()
        for old, new in zip(old_answers, new_answers)
    ]
    avg_similarity = sum(similarity_scores) / len(similarity_scores)
    return str(int((1 - avg_similarity) * 100))


def grade_answers(old_answers: List[str], new_answers: List[str]) -> str:
    """
    Use the LLM to compare the old and new answers and compute a drift score.
    Returns a drift percentage (0-100) as a string.
    """
    if not old_answers or not new_answers:
        return "0"
    if len(old_answers) != len(new_answers):
        return "100"  # Major drift if the answer counts differ
    try:
        # Build a prompt that shows the LLM both answer sets side by side
        prompt = "You're tasked with detecting semantic drift between two sets of model responses.\n\n"
        prompt += "Original responses:\n"
        for i, ans in enumerate(old_answers):
            prompt += f"Response {i + 1}: {ans}\n\n"
        prompt += "New responses:\n"
        for i, ans in enumerate(new_answers):
            prompt += f"Response {i + 1}: {ans}\n\n"
        prompt += (
            "Analyze the semantic differences between the original and new responses. "
            "Provide a drift percentage score (0-100%) that represents how much the meaning, "
            "intent, or capabilities have changed between the two sets of responses. "
            "Only return the numerical percentage value without any explanation or additional text."
        )
        # Get the drift assessment from the LLM
        response = llm.invoke(prompt)
        drift_text = str(response.content).strip()
        # Extract just the numerical value in case the model adds extra text
        drift_match = re.search(r"(\d+\.?\d*)", drift_text)
        if drift_match:
            # Clamp to the 0-100 range in case the model overshoots
            drift_pct = min(max(float(drift_match.group(1)), 0.0), 100.0)
            return str(int(drift_pct))
        # No number in the reply: fall back to raw text similarity
        return _similarity_drift(old_answers, new_answers)
    except Exception as e:
        print(f"Error grading answers: {e}")
        # LLM call failed: fall back to raw text similarity
        return _similarity_drift(old_answers, new_answers)
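

# Minimal end-to-end sketch (an assumption about intended use, not part of the
# original module): build a baseline questionnaire, answer it twice, and grade
# the second answer set against the first. Assumes GROQ_API_KEY is set; in a
# real drift check the second answer set would come from a later run or a
# different model snapshot.
if __name__ == "__main__":
    demo_capabilities = "General-purpose chat: Q&A, summarization, light coding help."
    demo_questions = generate_questionnaire("llama-3.1-8b-instant", demo_capabilities)
    baseline_answers = [str(llm.invoke(q).content).strip() for q in demo_questions]
    current_answers = [str(llm.invoke(q).content).strip() for q in demo_questions]
    print(f"Estimated drift: {grade_answers(baseline_answers, current_answers)}%")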