JiaenLiu committed
Commit 8abf414 · 1 Parent(s): bd1cc4e

LLM and other scores fix


Former-commit-id: bc9daf82c6ef50234b836a57f656d646e5eb5ca6

evaluation/scores/LLM_eval.py CHANGED
@@ -11,11 +11,12 @@ from langchain.chat_models import ChatOpenAI
 
 # Load the evaluator
 
-def init_evaluator():
-
-    llm = ChatOpenAI(temperature=0, model="gpt-4-0613")
-
-    fstring = """You are an expert English to Chinese translator specialized in Startcraft2.
+def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", model="gpt-4-0613"):
+    llm = ChatOpenAI(temperature=0, model=model)
+
+    lang_str = f"You are an expert {source_lang} to {target_lang} translator specialized in {domain}."
+
+    fstring = """
     You are grading the following question:
     {input}
     Here is the real answer:
@@ -24,26 +25,45 @@ def init_evaluator():
     {output}
     based on the following criteria:
     {criteria}
-    Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
-    Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
+    Give one grade, accuracy, and rate it on a scale of 0 to 100, where 0 is the lowest (very low accuracy) and 100 is the highest (very high accuracy).
+    Give an explanation for it, and if the answer is partially correct that is acceptable. However, punish the score for answers that are
     numerically incorrect this also includes values that have the $ in front
     Please give the completeness score first followed by the accuracy score.
-    For example: Completeness: 70. Accuracy: 40. Explanation here
+    For example: Accuracy: 40. Explanation here
     Do not differ from the format ever
     """
-    prompt = PromptTemplate.from_template(fstring)
+    prompt = PromptTemplate.from_template(lang_str + fstring, template_format="f-string")
 
+
+    # Give two grades, one for completeness and another for accuracy, and rate them on a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completeness/accuracy).
+    # Do not base the two scores off each other; give them the scores independently. Give explanations for every single one and if the answer is partially correct that is acceptable. However punish the scores for answers that are
+    # numerically incorrect this also includes values that have the $ in front
+    # Please give the completeness score first followed by the accuracy score.
+    # For example: Completeness: 70. Accuracy: 40. Explanation here
+    # Do not differ from the format ever
     return load_evaluator("labeled_criteria", llm=llm, prompt=prompt, criteria="correctness")
 
+# Parse the output of the evaluation.
+# Example:
+# 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+def parse_eval_result(eval_result):
+    # score = eval_result.score
+    value = eval_result["value"]
+    value = value.split("Accuracy: ")[1].split(".")
+    # Combine the rest of the string into the whole explanation.
+    explanation = ".".join(value[1:])
+    return int(value[0]), explanation
+
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
         prediction=prediction,
         input=input,
         reference=reference,
     )
-    return eval_result
+    return parse_eval_result(eval_result)
 
 if __name__ == "__main__":
     evaluator = init_evaluator()
+    # For no input English sentence, just put "" in the input.
     eval_result = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
     print(eval_result)
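A note on the new parser: parse_eval_result relies on split("Accuracy: ") followed by split("."), so any reply that deviates from the exact "Accuracy: <score>." format raises an IndexError, and a non-integer score raises a ValueError. A more tolerant variant could look like the sketch below; the name parse_eval_result_safe and the (None, raw_text) fallback are illustrative assumptions, not part of this commit.

import re

def parse_eval_result_safe(eval_result):
    # Sketch only: accept outputs shaped like "Accuracy: 80. explanation ...".
    text = eval_result["value"] if isinstance(eval_result, dict) else str(eval_result)
    match = re.search(r"Accuracy:\s*(\d{1,3})\.?\s*(.*)", text, flags=re.DOTALL)
    if match is None:
        # Pattern missing: return no score plus the raw text for inspection.
        return None, text
    return int(match.group(1)), match.group(2).strip()

With the example output quoted in the comment above, this returns (80, 'The predicted answer is partially correct. ...').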
evaluation/scores/multi_scores.py CHANGED
@@ -1,16 +1,26 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-from . import LLM_eval
+import LLM_eval
 
 class multi_scores:
-    def __init__(self) -> None:
+    def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
         self.comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
-        self.bleu_model = BLEU()
-        self.LLM_model = LLM_eval.init_evaluator()
+        self.bleu_model = BLEU(tokenize="zh")
+        self.LLM_model = LLM_eval.init_evaluator(source_lang=source_lang, target_lang=target_lang, domain=domain)
 
-    def get(self, src:str, mt:str, ref:str) -> dict:
+    # The function to get the scores.
+    # src: original sentence
+    # mt: machine translation
+    # ref: reference translation
+    def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
-        bleu_score = self.bleu_model.corpus_score(mt, ref).score
-        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model).score
-        return {'bleu':bleu_score, 'comet':comet_score, 'llm':LLM_score}
+        bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
+        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':LLM_score[0], 'llm_explanation':LLM_score[1]}
+
+if __name__ == "__main__":
+    src = "this is an test sentences"
+    mt = "这是一个测试句子。"
+    ref = "这不是一个测试语句。"
+    print(multi_scores().get_scores(src, mt, ref))
 
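One caveat on the new get_scores: sacrebleu's corpus_score takes a list of hypotheses and a list of reference streams, where each stream is itself a list holding one reference per hypothesis, so a single sentence pair is normally passed with an extra level of nesting. A minimal sketch of that call, independent of the class above:

from sacrebleu.metrics import BLEU

# Minimal sketch: one hypothesis scored against one reference with the Chinese tokenizer.
bleu = BLEU(tokenize="zh")
hypotheses = ["这是一个测试句子。"]
references = [["这不是一个测试语句。"]]  # list of reference streams, hence the double brackets
print(bleu.corpus_score(hypotheses, references).score)

Because sacrebleu interprets the outer list as the set of reference streams, a flat [ref] is not equivalent to [[ref]], which is worth keeping in mind when calling corpus_score on a single sentence as this commit does.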