JiaenLiu committed
Commit 8abf414 · 1 Parent(s): bd1cc4e

LLM and other scores fix

Former-commit-id: bc9daf82c6ef50234b836a57f656d646e5eb5ca6
evaluation/scores/LLM_eval.py
CHANGED
@@ -11,11 +11,12 @@ from langchain.chat_models import ChatOpenAI
 
 # Load the evaluator
 
-def init_evaluator():
+def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", model="gpt-4-0613"):
+    llm = ChatOpenAI(temperature=0, model=model)
 
-    [removed line: content not recoverable from the page]
+    lang_str = f"You are an expert {source_lang} to {target_lang} translator specialized in {domain}."
 
-    fstring = """
+    fstring = """
     You are grading the following question:
     {input}
     Here is the real answer:
@@ -24,26 +25,45 @@ def init_evaluator():
     {output}
     based on the following criteria:
     {criteria}
-    Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
-    Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
+    Give one grades, accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low accuracy) and 100 is the highest (very high accuracy)?
+    Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
     numerically incorrect this also includes values that have the $ in front
     Please give the completeness score first followed by the accuracy score.
-    For example: Completeness: 70. Accuracy: 40. Explanation here
+    For example: Accuracy: 40. Explanation here
     Do not differ from the format ever
     """
-    prompt = PromptTemplate.from_template(fstring)
+    prompt = PromptTemplate.from_template(lang_str+fstring, template_format="f-string")
 
+
+    # Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
+    # Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
+    # numerically incorrect this also includes values that have the $ in front
+    # Please give the completeness score first followed by the accuracy score.
+    # For example: Completeness: 70. Accuracy: 40. Explanation here
+    # Do not differ from the format ever
     return load_evaluator("labeled_criteria", llm=llm, prompt=prompt, criteria="correctness")
 
+# prase the output of the evaluation
+# example :
+# 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+def parse_eval_result(eval_result):
+    # score = eval_result.score
+    value = eval_result["value"]
+    value = value.split("Accuracy: ")[1].split(".")
+    # combine the rest of the string into the whole explanation
+    explanation = ".".join(value[1:])
+    return int(value[0]), explanation
+
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
         prediction=prediction,
         input=input,
         reference=reference,
     )
-    return eval_result
+    return parse_eval_result(eval_result)
 
 if __name__ == "__main__":
     evaluator = init_evaluator()
+    # For no input english sentence, just put "" in the input
     eval_result = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
     print(eval_result)
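Note on the committed parse_eval_result: it assumes the model reply always contains the literal prefix "Accuracy: <number>.", so the split("Accuracy: ")[1] indexing raises an IndexError if the reply deviates from the requested format. Below is a minimal, hypothetical sketch of a more defensive parse; the name parse_eval_result_safe and the fallback behaviour are assumptions, not part of the commit, and it expects the same {"value": "Accuracy: 80. ..."} dict shape shown in the example comment above.

import re

def parse_eval_result_safe(eval_result):
    # Hypothetical helper, not in the commit: tolerate replies that deviate
    # slightly from the requested "Accuracy: NN. Explanation" format.
    value = eval_result.get("value", "")
    match = re.search(r"Accuracy:\s*(\d+)\s*\.?\s*(.*)", value, flags=re.DOTALL)
    if match is None:
        # No parsable score: return None and the raw text instead of raising.
        return None, value
    return int(match.group(1)), match.group(2).strip()

Swapping such a helper into evaluate_prediction would keep the (score, explanation) return shape that multi_scores.get_scores relies on below.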
evaluation/scores/multi_scores.py
CHANGED
@@ -1,16 +1,26 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-[removed line: content not recoverable from the page]
+import LLM_eval
 
 class multi_scores:
-    def __init__(self) -> None:
+    def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
         self.comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
-        self.bleu_model = BLEU()
-        self.LLM_model = LLM_eval.init_evaluator()
+        self.bleu_model = BLEU(tokenize="zh")
+        self.LLM_model = LLM_eval.init_evaluator(source_lang=source_lang, target_lang=target_lang, domain=domain)
 
-    [removed line: content not recoverable from the page]
+    # The function to get the scores
+    # src: orginal sentence
+    # mt: machine translation
+    # ref: reference translation
+    def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
-        bleu_score = self.bleu_model.corpus_score(mt, ref).score
-        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
-        return {'
+        bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
+        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':LLM_score[0], 'llm_explanation':LLM_score[1]}
+
+if __name__ == "__main__":
+    src = "this is an test sentences"
+    mt = "这是一个测试句子。"
+    ref = "这不是一个测试语句。"
+    print(multi_scores().get_scores(src, mt, ref))
 
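For reference, a hedged sketch of how the class above is called and what get_scores returns, based only on the diff; the values are placeholders, not real outputs, and it assumes LLM_eval.py is importable from the same directory and an OpenAI API key is configured for ChatOpenAI.

# Sketch, assuming the setup described above.
scorer = multi_scores(source_lang="English", target_lang="Chinese", domain="starcraft 2")
result = scorer.get_scores(
    src="this is an test sentences",
    mt="这是一个测试句子。",
    ref="这不是一个测试语句。",
)
# result is a dict shaped like:
# {'bleu_score': <float>, 'comet_score': <float>,
#  'llm_score': <int between 0 and 100>, 'llm_explanation': <str>}
print(result)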