JiaenLiu committed
Commit 8abf414 · 1 Parent(s): bd1cc4e

LLM and other scores fix


Former-commit-id: bc9daf82c6ef50234b836a57f656d646e5eb5ca6

evaluation/scores/LLM_eval.py CHANGED
@@ -11,11 +11,12 @@ from langchain.chat_models import ChatOpenAI
 
 # Load the evaluator
 
-def init_evaluator():
-
-    llm = ChatOpenAI(temperature=0, model="gpt-4-0613")
-
-    fstring = """You are an expert English to Chinese translator specialized in Startcraft2.
+def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", model="gpt-4-0613"):
+    llm = ChatOpenAI(temperature=0, model=model)
+
+    lang_str = f"You are an expert {source_lang} to {target_lang} translator specialized in {domain}."
+
+    fstring = """
     You are grading the following question:
     {input}
     Here is the real answer:
@@ -24,26 +25,45 @@ def init_evaluator():
     {output}
     based on the following criteria:
     {criteria}
-    Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
-    Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
+    Give one grade, accuracy, and rate it on a scale of 0 to 100, where 0 is the lowest (very low accuracy) and 100 is the highest (very high accuracy).
+    Give an explanation for it, and if the answer is partially correct that is acceptable. However, punish the score for answers that are
     numerically incorrect this also includes values that have the $ in front
     Please give the completeness score first followed by the accuracy score.
-    For example: Completeness: 70. Accuracy: 40. Explanation here
+    For example: Accuracy: 40. Explanation here
     Do not differ from the format ever
     """
-    prompt = PromptTemplate.from_template(fstring)
+    prompt = PromptTemplate.from_template(lang_str + fstring, template_format="f-string")
 
+
+    # Give two grades, one for completeness and another for accuracy, and rate them on a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completeness/accuracy).
+    # Do not base the two scores off each other; give them the scores independently. Give explanations for every single one and if the answer is partially correct that is acceptable. However punish the scores for answers that are
+    # numerically incorrect this also includes values that have the $ in front
+    # Please give the completeness score first followed by the accuracy score.
+    # For example: Completeness: 70. Accuracy: 40. Explanation here
+    # Do not differ from the format ever
     return load_evaluator("labeled_criteria", llm=llm, prompt=prompt, criteria="correctness")
 
+# Parse the output of the evaluation.
+# Example:
+# 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+def parse_eval_result(eval_result):
+    # score = eval_result.score
+    value = eval_result["value"]
+    value = value.split("Accuracy: ")[1].split(".")
+    # Combine the rest of the string into the whole explanation.
+    explanation = ".".join(value[1:])
+    return int(value[0]), explanation
+
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
         prediction=prediction,
         input=input,
         reference=reference,
     )
-    return eval_result
+    return parse_eval_result(eval_result)
 
 if __name__ == "__main__":
     evaluator = init_evaluator()
+    # For no input English sentence, just put "" in the input.
     eval_result = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
     print(eval_result)
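A note on the new parser: parse_eval_result relies on split("Accuracy: ") followed by split("."), so any reply that deviates from the exact "Accuracy: <score>." format raises an IndexError, and a non-integer score raises a ValueError. A more tolerant variant could look like the sketch below; the name parse_eval_result_safe and the (None, raw_text) fallback are illustrative assumptions, not part of this commit.

import re

def parse_eval_result_safe(eval_result):
    # Sketch only: accept outputs shaped like "Accuracy: 80. explanation ...".
    text = eval_result["value"] if isinstance(eval_result, dict) else str(eval_result)
    match = re.search(r"Accuracy:\s*(\d{1,3})\.?\s*(.*)", text, flags=re.DOTALL)
    if match is None:
        # Pattern missing: return no score plus the raw text for inspection.
        return None, text
    return int(match.group(1)), match.group(2).strip()

With the example output quoted in the comment above, this returns (80, 'The predicted answer is partially correct. ...').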
evaluation/scores/multi_scores.py CHANGED
@@ -1,16 +1,26 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-from . import LLM_eval
+import LLM_eval
 
 class multi_scores:
-    def __init__(self) -> None:
+    def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
         self.comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
-        self.bleu_model = BLEU()
-        self.LLM_model = LLM_eval.init_evaluator()
+        self.bleu_model = BLEU(tokenize="zh")
+        self.LLM_model = LLM_eval.init_evaluator(source_lang=source_lang, target_lang=target_lang, domain=domain)
 
-    def get(self, src:str, mt:str, ref:str) -> dict:
+    # The function to get the scores.
+    # src: original sentence
+    # mt: machine translation
+    # ref: reference translation
+    def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
-        bleu_score = self.bleu_model.corpus_score(mt, ref).score
-        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model).score
-        return {'bleu':bleu_score, 'comet':comet_score, 'llm':LLM_score}
+        bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
+        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':LLM_score[0], 'llm_explanation':LLM_score[1]}
+
+if __name__ == "__main__":
+    src = "this is an test sentences"
+    mt = "这是一个测试句子。"
+    ref = "这不是一个测试语句。"
+    print(multi_scores().get_scores(src, mt, ref))
 
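One caveat on the new get_scores: sacrebleu's corpus_score takes a list of hypotheses and a list of reference streams, where each stream is itself a list holding one reference per hypothesis, so a single sentence pair is normally passed with an extra level of nesting. A minimal sketch of that call, independent of the class above:

from sacrebleu.metrics import BLEU

# Minimal sketch: one hypothesis scored against one reference with the Chinese tokenizer.
bleu = BLEU(tokenize="zh")
hypotheses = ["这是一个测试句子。"]
references = [["这不是一个测试语句。"]]  # list of reference streams, hence the double brackets
print(bleu.corpus_score(hypotheses, references).score)

Because sacrebleu interprets the outer list as the set of reference streams, a flat [ref] is not equivalent to [[ref]], which is worth keeping in mind when calling corpus_score on a single sentence as this commit does.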