from multiprocessing import Pool
from typing import List
import numpy as np
import torch
from pyscripts.utils.dialog_eval.vert import (
get_auto_bleu2_geometric,
get_self_bleu2_geometric,
run_f,
)
from scipy.stats import gmean
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer


def perplexity(LLM_Output: str, model_id: str = "gpt2") -> str:
    """
    Compute the perplexity of the given text using a specified model from the
    `evaluate` library (default: GPT-2).
    Args:
        LLM_Output (str):
The text (string) for which perplexity is to be computed.
model_id (str, optional):
The identifier of the model to use for computing
perplexity. Defaults to "gpt2".
Returns:
str:
A formatted string showing the perplexity of the
provided text(s), for example:
"Perplexity: 45.23\n"
Raises:
ImportError:
If the `evaluate` library is not installed or cannot be imported.
Example:
>>> text = "Hello world, this is a test."
>>> result = perplexity(text, model_id="gpt2")
>>> print(result)
"Perplexity: 27.34\n"
"""
    try:
        import evaluate
    except ImportError:
        print("Error: the `evaluate` library is not properly installed.")
        raise
    # Load the perplexity metric and score the single prediction string.
    perplexity_metric = evaluate.load("perplexity", module_type="metric")
    results = perplexity_metric.compute(model_id=model_id, predictions=[LLM_Output])
    return f"Perplexity: {results['mean_perplexity']:.2f}\n"


def vert(LLM_response_arr: List[str]) -> str:
"""
    Calculate Self-BLEU2-geometric and Auto-BLEU2-geometric diversity metrics
    for a list of LLM responses and report their geometric mean (VERT).
    Args:
        LLM_response_arr (List[str]):
            A list of responses (strings) generated by the language
            model acting as a text dialog response generator.
Returns:
str:
A formatted string that includes each computed metric and the final
VERT value, for example:
"Self-BLEU2-geometric: 42.13
Auto-BLEU2-geometric: 38.94
VERT: 40.5
"
Example:
>>> # Suppose we have the following LLM responses:
>>> responses = ["Hello world", "Foo bar", "Lorem ipsum dolor sit amet"]
>>> result = vert(responses)
>>> print(result)
"Self-BLEU2-geometric: 42.13
Auto-BLEU2-geometric: 38.94
VERT: 40.5
"
"""
terms = [x.strip().split() for x in LLM_response_arr]
tasks = [
("Self-BLEU2-geometric", get_self_bleu2_geometric),
("Auto-BLEU2-geometric", get_auto_bleu2_geometric),
]
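    # Compute each BLEU-based diversity metric in parallel worker processes.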
n_processes = min(16, len(tasks))
with Pool(n_processes) as pool:
metrics = pool.map(run_f, [(t[1], terms) for t in tasks])
metric_arr = []
str1 = ""
for (metric_name, _), metric in zip(tasks, metrics):
metric, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric))
metric, sem = [round(100 * x, 2) for x in [metric, sem]]
metric_arr.append(metric)
str1 += f"{metric_name}: {metric}\n"
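    # VERT is the geometric mean of the BLEU-based diversity metrics.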
str1 += f"VERT: {round(gmean(metric_arr), 2)}\n"
return str1


def bert_score(
    total_response_arr: List[str], bert_model_name: str = "bert-base-uncased"
) -> str:
"""
    Compute a cosine similarity score between the concatenated context
    (all but the last element) and the final response (last element)
    using a BERT-based model. This serves as a simplified measure of how
    closely the response aligns semantically with the preceding context.
    Args:
        total_response_arr (List[str]):
            A list of strings. The last element is treated as the response,
            while all preceding elements form the context.
bert_model_name (str, optional):
The name or path of the BERT model to use (from the Hugging Face Model Hub).
Defaults to "bert-base-uncased".
Returns:
str:
A string containing the cosine similarity
(as a percentage) followed by a newline.
For example:
"Cosine Similarity: 85.67\n"
Example:
>>> total_responses = [
... "User: Hi, how are you?",
... "Assistant: I'm good! How can I help you today?",
... "User: Can you tell me a joke?",
... "Assistant: Sure! Here's one: Why did the chicken join a band?"
... ]
>>> result = bert_score(total_responses, bert_model_name="bert-base-uncased")
>>> print(result)
"Cosine Similarity: 75.89\n"
"""

    def cosine_similarity_context_response(context, response, model, tokenizer):
# Tokenize and encode both context and response
context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
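        # Move the tokenized inputs to the GPU (assumes a CUDA device is available).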
for k in context_inputs:
context_inputs[k] = context_inputs[k].cuda()
for k in response_inputs:
response_inputs[k] = response_inputs[k].cuda()
# Get embeddings from the model
with torch.no_grad():
context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)
# Compute cosine similarity
similarity = cosine_similarity(
context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
)
return similarity[0][0]
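
    # Load the BERT encoder and tokenizer; the model is moved to the GPU (CUDA assumed).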
bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
similarity = cosine_similarity_context_response(
" ".join(total_response_arr[:-1]),
total_response_arr[-1],
bert_model,
bert_tokenizer,
)
return f"Cosine Similarity: {similarity*100:.2f}" + "\n"


def DialoGPT_perplexity(
    user_utterance: str,
    response: str,
    dialog_model_name: str = "microsoft/DialoGPT-medium",
) -> str:
"""
    Compute the perplexity of a response given a user utterance using a
    pre-trained DialoGPT model. The function loads DialoGPT (medium by default)
    from the Hugging Face Model Hub, then calculates the perplexity of the
    (context + response) sequence.
    Args:
        user_utterance (str):
            The user utterance preceding the model's response.
        response (str):
            The generated response whose perplexity is to be evaluated.
        dialog_model_name (str, optional):
            The name or path of the DialoGPT model to use (from the Hugging Face
            Model Hub). Defaults to "microsoft/DialoGPT-medium".
Returns:
str:
A formatted string containing the DialoGPT perplexity score. For example:
"DialoGPT Perplexity: 25.67\n"
Example:
>>> user_text = "Hi, how are you today?"
>>> system_response = "I'm good, thank you! How can I help you?"
>>> result = DialoGPT_perplexity(user_text, system_response)
>>> print(result)
"DialoGPT Perplexity: 31.45\n"
"""

    def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
"""
Evaluate the appropriateness of a response based on the
given context using DialoGPT.
Args:
context (str): The dialogue context (previous conversation).
response (str): The generated response to evaluate.
model: Pre-trained DialoGPT model.
tokenizer: Corresponding tokenizer for the DialoGPT model.
Returns:
float: Perplexity score of the response given the context.
"""
model.eval()
# Combine context and response as input
input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
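        # Move the tokenized inputs to the GPU (assumes a CUDA device is available).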
inputs["input_ids"] = inputs["input_ids"].cuda()
inputs["attention_mask"] = inputs["attention_mask"].cuda()
# Compute model outputs and loss
with torch.no_grad():
outputs = model(**inputs, labels=inputs["input_ids"].cuda())
loss = outputs.loss
# Calculate perplexity
perplexity = torch.exp(loss)
return perplexity.cpu().item()
# Load DialoGPT model and tokenizer
model_name = dialog_model_name
model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)
perplexity = evaluate_response_with_dialoGPT(
user_utterance, response, model, tokenizer
)
return f"DialoGPT Perplexity: {perplexity:.2f}" + "\n"