add auto eval
Browse files- app.py +10 -5
- eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +34 -0
- eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json +12 -12
- eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json +35 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json +0 -0
- eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json +1 -1
- src/about.py +5 -5
- src/envs.py +3 -2
- src/leaderboard/read_evals.py +1 -1
    	
        app.py
    CHANGED
    
    | @@ -24,7 +24,7 @@ from src.display.utils import ( | |
| 24 | 
             
                WeightType,
         | 
| 25 | 
             
                Precision
         | 
| 26 | 
             
            )
         | 
| 27 | 
            -
            from src.envs import API, EVAL_REQUESTS_PATH,  | 
| 28 | 
             
            from src.populate import get_evaluation_queue_df, get_leaderboard_df
         | 
| 29 | 
             
            from src.submission.submit import add_new_eval
         | 
| 30 |  | 
| @@ -41,7 +41,8 @@ try: | |
| 41 | 
             
            except Exception:
         | 
| 42 | 
             
                restart_space()
         | 
| 43 | 
             
            try:
         | 
| 44 | 
            -
                print( | 
|  | |
| 45 | 
             
                # snapshot_download(
         | 
| 46 | 
             
                #     repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
         | 
| 47 | 
             
                # )
         | 
| @@ -49,7 +50,8 @@ except Exception: | |
| 49 | 
             
                restart_space()
         | 
| 50 |  | 
| 51 |  | 
| 52 | 
            -
             | 
|  | |
| 53 |  | 
| 54 | 
             
            # (
         | 
| 55 | 
             
            #     finished_eval_queue_df,
         | 
| @@ -97,8 +99,11 @@ with demo: | |
| 97 | 
             
                gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
         | 
| 98 |  | 
| 99 | 
             
                with gr.Tabs(elem_classes="tab-buttons") as tabs:
         | 
| 100 | 
            -
                    with gr.TabItem(" | 
| 101 | 
            -
                        leaderboard = init_leaderboard( | 
|  | |
|  | |
|  | |
| 102 |  | 
| 103 | 
             
                    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         | 
| 104 | 
             
                        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
         | 
|  | |
| 24 | 
             
                WeightType,
         | 
| 25 | 
             
                Precision
         | 
| 26 | 
             
            )
         | 
| 27 | 
            +
            from src.envs import API, EVAL_REQUESTS_PATH, AUTO_RESULTS_PATH, HUMAN_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
         | 
| 28 | 
             
            from src.populate import get_evaluation_queue_df, get_leaderboard_df
         | 
| 29 | 
             
            from src.submission.submit import add_new_eval
         | 
| 30 |  | 
|  | |
| 41 | 
             
            except Exception:
         | 
| 42 | 
             
                restart_space()
         | 
| 43 | 
             
            try:
         | 
| 44 | 
            +
                print(AUTO_RESULTS_PATH)
         | 
| 45 | 
            +
                print(HUMAN_RESULTS_PATH)
         | 
| 46 | 
             
                # snapshot_download(
         | 
| 47 | 
             
                #     repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
         | 
| 48 | 
             
                # )
         | 
|  | |
| 50 | 
             
                restart_space()
         | 
| 51 |  | 
| 52 |  | 
| 53 | 
            +
            AUTO_LEADERBOARD_DF = get_leaderboard_df(AUTO_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
         | 
| 54 | 
            +
            HUMAN_LEADERBOARD_DF = get_leaderboard_df(HUMAN_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
         | 
| 55 |  | 
| 56 | 
             
            # (
         | 
| 57 | 
             
            #     finished_eval_queue_df,
         | 
|  | |
| 99 | 
             
                gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
         | 
| 100 |  | 
| 101 | 
             
                with gr.Tabs(elem_classes="tab-buttons") as tabs:
         | 
| 102 | 
            +
                    with gr.TabItem("πOmniEval-Human", elem_id="llm-benchmark-tab-table", id=0):
         | 
| 103 | 
            +
                        leaderboard = init_leaderboard(HUMAN_LEADERBOARD_DF)
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                    with gr.TabItem("🤖OmniEval-Auto", elem_id="llm-benchmark-tab-table", id=1):
         | 
| 106 | 
            +
                        leaderboard = init_leaderboard(AUTO_LEADERBOARD_DF)
         | 
| 107 |  | 
| 108 | 
             
                    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         | 
| 109 | 
             
                        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
         | 
    	
        eval-results/omnieval-auto/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.0,
         | 
| 5 | 
            +
                  "map": 0.0
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0011680767773708802,
         | 
| 9 | 
            +
                  "f1": 0.3709233008524321,
         | 
| 10 | 
            +
                  "rouge1": 0.2570830224992733,
         | 
| 11 | 
            +
                  "rouge2": 0.09085043984411759,
         | 
| 12 | 
            +
                  "rougeL": 0.1860727124152372,
         | 
| 13 | 
            +
                  "accuracy": 0.35869427958075517,
         | 
| 14 | 
            +
                  "completeness": 0.5755086661642803,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.0,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.11213720316622691
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "CLOSE_deepseek-v2-chat",
         | 
| 22 | 
            +
                "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
         | 
| 25 | 
            +
                  "num_params": 236,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "CLOSE",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "num_params": 0.0,
         | 
| 31 | 
            +
                  "open_source": true
         | 
| 32 | 
            +
                }
         | 
| 33 | 
            +
              }
         | 
| 34 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.0,
         | 
| 5 | 
            +
                  "map": 0.0
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0008839499936860714,
         | 
| 9 | 
            +
                  "f1": 0.39891051266403244,
         | 
| 10 | 
            +
                  "rouge1": 0.2679937299203498,
         | 
| 11 | 
            +
                  "rouge2": 0.09293819886242284,
         | 
| 12 | 
            +
                  "rougeL": 0.19931718897529843,
         | 
| 13 | 
            +
                  "accuracy": 0.3238413941154186,
         | 
| 14 | 
            +
                  "completeness": 0.52843637454982,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.0,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.06765619606489472
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "CLOSE_llama3-70b-instruct",
         | 
| 22 | 
            +
                "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 70.6,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "CLOSE",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "num_params": 0.0,
         | 
| 31 | 
            +
                  "open_source": true
         | 
| 32 | 
            +
                }
         | 
| 33 | 
            +
              }
         | 
| 34 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.0,
         | 
| 5 | 
            +
                  "map": 0.0
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0002525571410531633,
         | 
| 9 | 
            +
                  "f1": 0.32215271896313463,
         | 
| 10 | 
            +
                  "rouge1": 0.2352109086389165,
         | 
| 11 | 
            +
                  "rouge2": 0.08060449522198783,
         | 
| 12 | 
            +
                  "rougeL": 0.16073680618083347,
         | 
| 13 | 
            +
                  "accuracy": 0.37883571157974494,
         | 
| 14 | 
            +
                  "completeness": 0.6016923768159353,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.0,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.1255931667193926
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "CLOSE_qwen2-72b",
         | 
| 22 | 
            +
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 72.7,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "CLOSE",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "num_params": 0.0,
         | 
| 31 | 
            +
                  "open_source": true
         | 
| 32 | 
            +
                }
         | 
| 33 | 
            +
              }
         | 
| 34 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,34 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.0,
         | 
| 5 | 
            +
                  "map": 0.0
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0,
         | 
| 9 | 
            +
                  "f1": 0.06725057117657031,
         | 
| 10 | 
            +
                  "rouge1": 0.1277764944666756,
         | 
| 11 | 
            +
                  "rouge2": 0.03211441875898112,
         | 
| 12 | 
            +
                  "rougeL": 0.03257144660565082,
         | 
| 13 | 
            +
                  "accuracy": 0.15734309887612072,
         | 
| 14 | 
            +
                  "completeness": 0.5063249001331558,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.0,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.06932865291794647
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "CLOSE_yi15-34b",
         | 
| 22 | 
            +
                "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "01ai/Yi-1.5-34B-Chat-16K",
         | 
| 25 | 
            +
                  "num_params": 34.4,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "CLOSE",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "num_params": 0.0,
         | 
| 31 | 
            +
                  "open_source": true
         | 
| 32 | 
            +
                }
         | 
| 33 | 
            +
              }
         | 
| 34 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/bge-large-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.3097634381445468,
         | 
| 5 | 
            +
                  "map": 0.30402197247127166
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0026518499810582142,
         | 
| 9 | 
            +
                  "f1": 0.2480828824153542,
         | 
| 10 | 
            +
                  "rouge1": 0.2493538725800514,
         | 
| 11 | 
            +
                  "rouge2": 0.1235656068292625,
         | 
| 12 | 
            +
                  "rougeL": 0.16098924930699862,
         | 
| 13 | 
            +
                  "accuracy": 0.3906427579239803,
         | 
| 14 | 
            +
                  "completeness": 0.5930474914396308,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.5045650189122212,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.28149656401119877
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "bge-large-zh_qwen2-72b",
         | 
| 22 | 
            +
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 72.7,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "BAAI/bge-large-zh",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "BAAI/bge-large-zh",
         | 
| 31 | 
            +
                  "num_params": 0.326,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/bge-m3_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.33076566906595944,
         | 
| 5 | 
            +
                  "map": 0.32402765500694536
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.002525571410531633,
         | 
| 9 | 
            +
                  "f1": 0.2524796046548042,
         | 
| 10 | 
            +
                  "rouge1": 0.2542055585319881,
         | 
| 11 | 
            +
                  "rouge2": 0.12967013110722864,
         | 
| 12 | 
            +
                  "rougeL": 0.16623387811734364,
         | 
| 13 | 
            +
                  "accuracy": 0.0,
         | 
| 14 | 
            +
                  "completeness": 0.0,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.0,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.0
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "bge-m3_qwen2-72b",
         | 
| 22 | 
            +
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 72.7,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "BAAI/bge-m3",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "BAAI/bge-m3",
         | 
| 31 | 
            +
                  "num_params": 0.5,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/e5-mistral-7b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.26059266742433806,
         | 
| 5 | 
            +
                  "map": 0.25533526960474806
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.002146735698951888,
         | 
| 9 | 
            +
                  "f1": 0.24207930410773865,
         | 
| 10 | 
            +
                  "rouge1": 0.24073805243800728,
         | 
| 11 | 
            +
                  "rouge2": 0.1162276261848681,
         | 
| 12 | 
            +
                  "rougeL": 0.1534679545927458,
         | 
| 13 | 
            +
                  "accuracy": 0.37713095087763604,
         | 
| 14 | 
            +
                  "completeness": 0.5855007473841555,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.49136152656008253,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.2582123758594347
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "e5-mistral-7b_qwen2-72b",
         | 
| 22 | 
            +
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 72.7,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "intfloat/e5-mistral-7b-instruct",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "intfloat/e5-mistral-7b-instruct",
         | 
| 31 | 
            +
                  "num_params": 7.11,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.3406848507808225,
         | 
| 5 | 
            +
                  "map": 0.3337426863661236
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0035568464031653824,
         | 
| 9 | 
            +
                  "f1": 0.3226028700822056,
         | 
| 10 | 
            +
                  "rouge1": 0.29804464952499493,
         | 
| 11 | 
            +
                  "rouge2": 0.1619392409911174,
         | 
| 12 | 
            +
                  "rougeL": 0.21536150159516076,
         | 
| 13 | 
            +
                  "accuracy": 0.3783377209477247,
         | 
| 14 | 
            +
                  "completeness": 0.5935541629364369,
         | 
| 15 | 
            +
                  "hallucination": 0.06668379802132854,
         | 
| 16 | 
            +
                  "utilization": 0.48314821907315203,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.2761605035405193
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "gte-qwen2-1.5b_deepseek-v2-chat",
         | 
| 22 | 
            +
                "generative_model": "deepseek-ai/DeepSeek-V2-Chat-0628",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "deepseek-ai/DeepSeek-V2-Chat-0628",
         | 
| 25 | 
            +
                  "num_params": 236,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         | 
| 31 | 
            +
                  "num_params": 1.78,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.3406848507808225,
         | 
| 5 | 
            +
                  "map": 0.3337426863661236
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.030906680136380857,
         | 
| 9 | 
            +
                  "f1": 0.4704248712273675,
         | 
| 10 | 
            +
                  "rouge1": 0.3844331865430577,
         | 
| 11 | 
            +
                  "rouge2": 0.21544656691735142,
         | 
| 12 | 
            +
                  "rougeL": 0.3082188596657867,
         | 
| 13 | 
            +
                  "accuracy": 0.4181714862987751,
         | 
| 14 | 
            +
                  "completeness": 0.586105675146771,
         | 
| 15 | 
            +
                  "hallucination": 0.0880543450397334,
         | 
| 16 | 
            +
                  "utilization": 0.45601078859491395,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.2751721876024926
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "gte-qwen2-1.5b_llama3-70b-instruct",
         | 
| 22 | 
            +
                "generative_model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 70.6,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         | 
| 31 | 
            +
                  "num_params": 1.78,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/{demo-leaderboard → omnieval-auto}/gte-qwen2-1.5b_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | @@ -1,20 +1,20 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "results": {
         | 
| 3 | 
             
                "retrieval": {
         | 
| 4 | 
            -
                  "mrr": 0. | 
| 5 | 
            -
                  "map": 0. | 
| 6 | 
             
                },
         | 
| 7 | 
             
                "generation": {
         | 
| 8 | 
            -
                  "em": 0. | 
| 9 | 
            -
                  "f1": 0. | 
| 10 | 
            -
                  "rouge1": 0. | 
| 11 | 
            -
                  "rouge2": 0. | 
| 12 | 
            -
                  "rougeL": 0. | 
| 13 | 
            -
                  "accuracy": 0. | 
| 14 | 
            -
                  "completeness": 0. | 
| 15 | 
            -
                  "hallucination": 0. | 
| 16 | 
            -
                  "utilization":  | 
| 17 | 
            -
                  "numerical_accuracy": 0. | 
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "results": {
         | 
| 3 | 
             
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.3406848507808225,
         | 
| 5 | 
            +
                  "map": 0.3337426863661236
         | 
| 6 | 
             
                },
         | 
| 7 | 
             
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0028412678368480867,
         | 
| 9 | 
            +
                  "f1": 0.2477112059712835,
         | 
| 10 | 
            +
                  "rouge1": 0.25666135328401396,
         | 
| 11 | 
            +
                  "rouge2": 0.13256084364546591,
         | 
| 12 | 
            +
                  "rougeL": 0.1669344569228441,
         | 
| 13 | 
            +
                  "accuracy": 0.40573304710190683,
         | 
| 14 | 
            +
                  "completeness": 0.6131668895824045,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.5346272891410885,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.2971301335972291
         | 
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
    	
        eval-results/omnieval-auto/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.3406848507808225,
         | 
| 5 | 
            +
                  "map": 0.3337426863661236
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0,
         | 
| 9 | 
            +
                  "f1": 0.09732568803130702,
         | 
| 10 | 
            +
                  "rouge1": 0.1642342072893325,
         | 
| 11 | 
            +
                  "rouge2": 0.06542075931397044,
         | 
| 12 | 
            +
                  "rougeL": 0.059256539829821125,
         | 
| 13 | 
            +
                  "accuracy": 0.3304375804375804,
         | 
| 14 | 
            +
                  "completeness": 0.5735068912710567,
         | 
| 15 | 
            +
                  "hallucination": 0.06555017663221248,
         | 
| 16 | 
            +
                  "utilization": 0.4132755170113409,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.175
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "gte-qwen2-1.5b_yi15-34b",
         | 
| 22 | 
            +
                "generative_model": "01ai/Yi-1.5-34B-Chat-16K",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "01ai/Yi-1.5-34B-Chat-16K",
         | 
| 25 | 
            +
                  "num_params": 34.4,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
         | 
| 31 | 
            +
                  "num_params": 1.78,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/omnieval-auto/jina-zh_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    ADDED
    
    | @@ -0,0 +1,35 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "results": {
         | 
| 3 | 
            +
                "retrieval": {
         | 
| 4 | 
            +
                  "mrr": 0.25315906890600665,
         | 
| 5 | 
            +
                  "map": 0.24830681483352277
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                "generation": {
         | 
| 8 | 
            +
                  "em": 0.0026518499810582142,
         | 
| 9 | 
            +
                  "f1": 0.24837825152624493,
         | 
| 10 | 
            +
                  "rouge1": 0.24111819423215256,
         | 
| 11 | 
            +
                  "rouge2": 0.11665848753826197,
         | 
| 12 | 
            +
                  "rougeL": 0.1558018779014647,
         | 
| 13 | 
            +
                  "accuracy": 0.3705644652102538,
         | 
| 14 | 
            +
                  "completeness": 0.5820335932813437,
         | 
| 15 | 
            +
                  "hallucination": 0.0,
         | 
| 16 | 
            +
                  "utilization": 0.4738984364905027,
         | 
| 17 | 
            +
                  "numerical_accuracy": 0.24648820567187915
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "config": {
         | 
| 21 | 
            +
                "eval_name": "jina-zh_qwen2-72b",
         | 
| 22 | 
            +
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
            +
                "generative_model_args": {
         | 
| 24 | 
            +
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 25 | 
            +
                  "num_params": 72.7,
         | 
| 26 | 
            +
                  "open_source": true
         | 
| 27 | 
            +
                },
         | 
| 28 | 
            +
                "retrieval_model": "jinaai/jina-embeddings-v2-base-zh",
         | 
| 29 | 
            +
                "retrieval_model_args": {
         | 
| 30 | 
            +
                  "name": "jinaai/jina-embeddings-v2-base-zh",
         | 
| 31 | 
            +
                  "num_params": 0.161,
         | 
| 32 | 
            +
                  "open_source": true
         | 
| 33 | 
            +
                }
         | 
| 34 | 
            +
              }
         | 
| 35 | 
            +
            }
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/CLOSE_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/CLOSE_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/CLOSE_qwen2-72b/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/CLOSE_yi15-34b/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard/qwen2-72b_bge-large-zh → omnieval-human/bge-large-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | @@ -18,7 +18,7 @@ | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            -
                "eval_name": " | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
|  | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            +
                "eval_name": "bge-large-zh_qwen2-72b",
         | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
    	
        eval-results/{demo-leaderboard/qwen2-72b_bge-m3 → omnieval-human/bge-m3_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | @@ -18,7 +18,7 @@ | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            -
                "eval_name": " | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
|  | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            +
                "eval_name": "bge-m3_qwen2-72b",
         | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
    	
        eval-results/{demo-leaderboard/qwen2-72b_e5-mistral-7b → omnieval-human/e5-mistral-7b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | @@ -18,7 +18,7 @@ | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            -
                "eval_name": " | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
|  | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            +
                "eval_name": "e5-mistral-7b_qwen2-72b",
         | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_deepseek-v2-chat/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_llama3-70b-instruct/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard/qwen2-72b_gte-qwen2-1.5b → omnieval-human/gte-qwen2-1.5b_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | @@ -18,7 +18,7 @@ | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            -
                "eval_name": " | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
|  | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            +
                "eval_name": "gte-qwen2-1.5b_qwen2-72b",
         | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
    	
        eval-results/{demo-leaderboard → omnieval-human}/gte-qwen2-1.5b_yi15-34b/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | 
            File without changes
         | 
    	
        eval-results/{demo-leaderboard/qwen2-72b_jina-zh → omnieval-human/jina-zh_qwen2-72b}/results_2023-12-08 15:46:20.425378.json
    RENAMED
    
    | @@ -18,7 +18,7 @@ | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            -
                "eval_name": " | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
|  | |
| 18 | 
             
                }
         | 
| 19 | 
             
              },
         | 
| 20 | 
             
              "config": {
         | 
| 21 | 
            +
                "eval_name": "jina-zh_qwen2-72b",
         | 
| 22 | 
             
                "generative_model": "Qwen/Qwen2.5-72B-Instruct",
         | 
| 23 | 
             
                "generative_model_args": {
         | 
| 24 | 
             
                  "name": "Qwen/Qwen2.5-72B-Instruct",
         | 
    	
        src/about.py
    CHANGED
    
    | @@ -83,12 +83,12 @@ LLM_BENCHMARKS_TEXT = f""" | |
| 83 | 
             
            With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
         | 
| 84 |  | 
| 85 |  | 
| 86 | 
            -
            ##  | 
| 87 | 
             
            `conda env create -f environment.yml && conda activate finrag`
         | 
| 88 |  | 
| 89 | 
            -
            <!-- ##  | 
| 90 | 
             
            1. -->
         | 
| 91 | 
            -
            ##  | 
| 92 | 
             
            Notion:
         | 
| 93 | 
             
            1. The code run path is `./OpenFinBench`
         | 
| 94 | 
             
            2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
         | 
| @@ -136,11 +136,11 @@ Then conduct the model-based evaluate using the following codes, (change the par | |
| 136 | 
             
            sh evaluator/judgement/judger.sh 
         | 
| 137 | 
             
            ``` 
         | 
| 138 |  | 
| 139 | 
            -
            ##  | 
| 140 |  | 
| 141 | 
             
            OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
         | 
| 142 |  | 
| 143 | 
            -
            ##  | 
| 144 | 
             
            The paper is waiting to be released!
         | 
| 145 |  | 
| 146 | 
             
            <!-- # Check Infos
         | 
|  | |
| 83 | 
             
            With FlashRAG and provided resources, you can effortlessly reproduce existing SOTA works in the RAG domain or implement your custom RAG processes and components. -->
         | 
| 84 |  | 
| 85 |  | 
| 86 | 
            +
            ## 🔧 Installation
         | 
| 87 | 
             
            `conda env create -f environment.yml && conda activate finrag`
         | 
| 88 |  | 
| 89 | 
            +
            <!-- ## ✨ Features
         | 
| 90 | 
             
            1. -->
         | 
| 91 | 
            +
            ## 🚀 Quick-Start
         | 
| 92 | 
             
            Notion:
         | 
| 93 | 
             
            1. The code run path is `./OpenFinBench`
         | 
| 94 | 
             
            2. We provide our auto-generated evaluation dataset in <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
         | 
|  | |
| 136 | 
             
            sh evaluator/judgement/judger.sh 
         | 
| 137 | 
             
            ``` 
         | 
| 138 |  | 
| 139 | 
            +
            ## 🔖 License
         | 
| 140 |  | 
| 141 | 
             
            OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).
         | 
| 142 |  | 
| 143 | 
            +
            ## 🌟 Citation
         | 
| 144 | 
             
            The paper is waiting to be released!
         | 
| 145 |  | 
| 146 | 
             
            <!-- # Check Infos
         | 
    	
        src/envs.py
    CHANGED
    
    | @@ -6,7 +6,7 @@ from huggingface_hub import HfApi | |
| 6 | 
             
            # ----------------------------------
         | 
| 7 | 
             
            TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
         | 
| 8 |  | 
| 9 | 
            -
            OWNER = " | 
| 10 | 
             
            # ----------------------------------
         | 
| 11 |  | 
| 12 | 
             
            REPO_ID = f"{OWNER}/leaderboard"
         | 
| @@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".") | |
| 18 |  | 
| 19 | 
             
            # Local caches
         | 
| 20 | 
             
            EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
         | 
| 21 | 
            -
             | 
|  | |
| 22 | 
             
            EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
         | 
| 23 | 
             
            EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
         | 
| 24 |  | 
|  | |
| 6 | 
             
            # ----------------------------------
         | 
| 7 | 
             
            TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
         | 
| 8 |  | 
| 9 | 
            +
            OWNER = "RUC-NLPIR" # Change to your org - don't forget to create a results and request dataset, with the correct format!
         | 
| 10 | 
             
            # ----------------------------------
         | 
| 11 |  | 
| 12 | 
             
            REPO_ID = f"{OWNER}/leaderboard"
         | 
|  | |
| 18 |  | 
| 19 | 
             
            # Local caches
         | 
| 20 | 
             
            EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
         | 
| 21 | 
            +
            HUMAN_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-human")
         | 
| 22 | 
            +
            AUTO_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results", "omnieval-auto")
         | 
| 23 | 
             
            EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
         | 
| 24 | 
             
            EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
         | 
| 25 |  | 
    	
        src/leaderboard/read_evals.py
    CHANGED
    
    | @@ -183,7 +183,7 @@ def get_request_file_for_model(requests_path, model_name, precision): | |
| 183 | 
             
            def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         | 
| 184 | 
             
                """From the path of the results folder root, extract all needed info for results"""
         | 
| 185 | 
             
                model_result_filepaths = []
         | 
| 186 | 
            -
             | 
| 187 | 
             
                for root, _, files in os.walk(results_path):
         | 
| 188 | 
             
                    # We should only have json files in model results
         | 
| 189 | 
             
                    if len(files) == 0 or any([not f.endswith(".json") for f in files]):
         | 
|  | |
| 183 | 
             
            def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         | 
| 184 | 
             
                """From the path of the results folder root, extract all needed info for results"""
         | 
| 185 | 
             
                model_result_filepaths = []
         | 
| 186 | 
            +
                print(f"Reading results from {results_path}")
         | 
| 187 | 
             
                for root, _, files in os.walk(results_path):
         | 
| 188 | 
             
                    # We should only have json files in model results
         | 
| 189 | 
             
                    if len(files) == 0 or any([not f.endswith(".json") for f in files]):
         | 

