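"""Gradio leaderboard for Bittensor LMEH few-shot evaluation results.

Aggregates per-validator results for HellaSwag, ARC-Challenge, MMLU
("hendrycks") and TruthfulQA from JSON dumps under _results/few-shot/.
"""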
import pandas as pd
import json
import os
import numpy as np
import re
import gradio as gr

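# Tasks to aggregate ("hendrycks" is the MMLU task family) and the validators
# whose result dumps are read from _results/few-shot/<validator>/.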
tasks = ["hellaswag", "arc_challenge", "hendrycks", "truthfulqa_mc"]
validators = ["opentensor_foundation"]

def clean_result(result, task):
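    """Attach a comparable `cleaned_result` to a raw result record.

    MMLU ("hendrycks") responses are letters A-D mapped to "1"-"4",
    TruthfulQA responses are lists of True/False judgments mapped to 1/0,
    and all other tasks are expected to answer with a leading digit.
    Unparseable responses become "N/A".
    """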
    if ("hendrycks" in task):
        has_letter = (result["result"] != "") and (len(result["result"]) <= 2) and result["result"][0].isupper()
        has_dotted_letter = (result["result"] != "") and re.match(r'[A-Z]\.', result["result"][:2])
        if has_letter or has_dotted_letter:
            letter_to_index = {"A": "1", "B": "2", "C": "3", "D": "4"}
            result["cleaned_result"] = letter_to_index.get(result["result"][0], "N/A")
        else:
            result["cleaned_result"] = "N/A"
    
    elif (task == "truthfulqa_mc"):
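        # Each TruthfulQA response is a list of per-option free-text
        # judgments; keep only whether each one says True or False.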
        cleaned_result = []
        for r in result['result']:
            if 'False' in r:
                cleaned_result.append(0)
            elif 'True' in r:
                cleaned_result.append(1)
            else:
                cleaned_result.append("N/A")
        result["cleaned_result"] = cleaned_result
    else:
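        # Remaining tasks (hellaswag, arc_challenge) answer with a
        # 1-indexed choice digit; keep the leading digit only.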
        if (result["result"] != "") and (result["result"][0].isnumeric()):
            result["cleaned_result"] = result["result"][0]
        else:
            result["cleaned_result"] = "N/A"
    return result

def mc2(doc):
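    """TruthfulQA MC2: normalized probability mass on the true options.

    This appears to follow the MC2 metric from lm-evaluation-harness, except
    that `cleaned_result` here holds the binarized 0/1 judgments produced by
    clean_result rather than raw log-likelihoods.
    """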
    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
    lls = doc["cleaned_result"]
    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))
    return sum(p_true)

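# Accumulators: one leaderboard row per validator, one breakdown row per
# (validator, task) pair, plus every parsed result for the N/A analysis.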
final_total_results = []
final_split_results = []
results_cumulative = []
for validator in validators:
    results_dir_file_list = os.listdir(f"_results/few-shot/{validator}")
    number_of_nas, number_of_results, inference_total = 0, 0, 0
    split_results = []  # this validator's per-task rows, in `tasks` order
    for task in tasks:
        task_results_files = [result_file for result_file in results_dir_file_list if task in result_file]
        
        results = []
        for task_results_file in task_results_files:
            results_file_dir = f"_results/few-shot/{validator}/{task_results_file}"
            with open(results_file_dir) as f:
                results += json.load(f)
        
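        # Normalize raw responses; records without a "result" key are
        # left untouched.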
        results = [clean_result(result, task) if "result" in result else result for result in results]
        results_cumulative += results

        # Running totals across all tasks for this validator
        number_of_nas += len([1 for result in results if ('cleaned_result' in result) and ('N/A' in result['cleaned_result'])])
        inference_total += np.array([result['inference_time'] for result in results if 'inference_time' in result]).sum()
        number_of_results += len([1 for result in results if 'cleaned_result' in result])

        # Individual task results
        result_coverage = round((sum(['result' in result for result in results]) / len(results)) * 100, 2)
        na_coverage = round((len([1 for result in results if ('cleaned_result' in result) and ('N/A' in result['cleaned_result'])]) / len(results)) * 100, 2)
        inference_avg = round(np.array([result['inference_time'] for result in results if 'inference_time' in result]).mean(), 2)
        
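        # TruthfulQA is scored with MC2; every other task is accuracy over
        # the answered (non-"N/A") questions, with gold labels 0-indexed.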
        if task == "truthfulqa_mc":
            metric = round(np.array([mc2(result) for result in results
                                     if ("cleaned_result" in result) and ("N/A" not in result["cleaned_result"])]).mean() * 100, 2)
        else:
            answered = [result for result in results
                        if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A")]
            correct = [result for result in answered
                       if int(result["cleaned_result"]) == (int(result["gold"]) + 1)]
            metric = round((len(correct) / len(answered)) * 100, 2)
        
        split_results.append({
            "task": task,
            "coverage_%": result_coverage,
            "na_%": na_coverage,
            "inference_avg": inference_avg,
            "metric": metric
        })

    final_split_results += split_results
    print(final_split_results)

    final_total_results.append({
        "Validator": validator.replace("_", " ").capitalize(),
        "N/A %": round((number_of_nas / number_of_results) * 100, 2),
        "Avg Inference (s)": round((inference_total / number_of_results), 2),
        "Average ⬆️": 0,
        "ARC (25-shot) ⬆️": split_results[tasks.index("arc_challenge")]["metric"],
        "HellaSwag (10-shot) ⬆️": split_results[tasks.index("hellaswag")]["metric"],
        "MMLU (5-shot) ⬆️": split_results[tasks.index("hendrycks")]["metric"],
        "TruthfulQA (0-shot) ⬆️": split_results[tasks.index("truthfulqa_mc")]["metric"]
    })
    final_total_results[-1]["Average ⬆️"] = round(np.array([
        final_total_results[-1]["ARC (25-shot) ⬆️"],
        final_total_results[-1]["HellaSwag (10-shot) ⬆️"],
        final_total_results[-1]["MMLU (5-shot) ⬆️"],
        final_total_results[-1]["TruthfulQA (0-shot) ⬆️"]
    ]).mean(), 2)

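# Build the "top unparseable responses" table: the ten raw answers that most
# often failed to clean, with how many times each occurred.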
df = pd.DataFrame(results_cumulative)
df = (
    df[df["cleaned_result"] == "N/A"]
    .groupby("result", as_index=False)
    .count()
    .sort_values(by=["id"], ascending=False)
    .head(10)[["result", "id"]]
    .rename(columns={"result": "Result", "id": "Count"})
)

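# Two-table UI: the leaderboard itself plus the N/A-response breakdown.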
demo = gr.Blocks()
with demo:
    with gr.Row():
        title = gr.Markdown(value="""# <p style="text-align: center;"> Bittensor LMEH Leaderboard</p>""")
    with gr.Row():
        table_1 = gr.Dataframe(pd.DataFrame(final_total_results))
    with gr.Row():
        subtitle = gr.Markdown(value="""# <p style="text-align: center;"> Analysis Of Top 10 N/A Responses</p>""")
    with gr.Row():
        table_2 = gr.Dataframe(df)
    # with gr.Row(visible=False):
    #     table_3 = gr.Dataframe(pd.DataFrame(final_split_results))

demo.queue(concurrency_count=5)
demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)