from lm_eval import tasks, evaluator, utils
from lm_eval.tasks import initialize_tasks, TaskManager
# Different lm-eval versions expose different helpers for registering task folders;
# fall back to whichever name is available.
try:
    from lm_eval.tasks import include_task_folder
except ImportError:
    from lm_eval.tasks import include_path
from src.backend.manage_requests import EvalRequest
# from src.backend.tasks.xsum.task import XSum
# from src.backend.tasks.xsum.task_v2 import XSumv2
# from src.backend.tasks.cnndm.task import CNNDM
# from src.backend.tasks.cnndm.task_v2 import CNNDMv2
# from src.backend.tasks.selfcheckgpt.task import SelfCheckGpt
def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
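    """Run the lm-eval harness for a single evaluation request.

    Args:
        eval_request: EvalRequest describing the model, revision and precision to evaluate.
        task_names: lm-eval task names to run.
        num_fewshot: number of few-shot examples per task.
        batch_size: batch size passed to the harness.
        device: device string passed to the harness (e.g. "cuda:0").
        use_cache: optional lm-eval cache location forwarded to `simple_evaluate`.
        limit: optional per-task example cap; for testing only.
        max_nb_samples: maximum number of logged samples kept per task in the returned dict.

    Returns:
        The results dict from `evaluator.simple_evaluate`, augmented with model metadata.
    """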
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
    # Legacy task registration, kept for reference (pre-TaskManager harness versions):
    # try:
    #     include_task_folder("src/backend/tasks/")
    # except ImportError:
    #     include_path("src/backend/tasks")
    # initialize_tasks('INFO')
    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
    # By default, TaskManager indexes all tasks from the `lm_eval/tasks` subdirectory.
    # Passing `include_path="path/to/my/custom/task/configs"` additionally registers
    # task configs from a separate directory.
    task_manager = TaskManager(include_path="src/backend/probing_tasks")
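    # Heuristic backend selection: model names containing "gpt" are routed to the
    # OpenAI chat-completions backend; everything else is loaded as a local
    # Hugging Face model ("hf-auto").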
    if "gpt" in eval_request.model:
        model = "openai-chat-completions"
    else:
        model = "hf-auto"
    print(f"Considered Tasks (after overriding): {task_names}")
    print(f"model_args: {eval_request.get_model_args()}")
    results = evaluator.simple_evaluate(model=model,  # alternative backends: "hf-causal-experimental", "hf-causal"
                                        model_args=eval_request.get_model_args(),
                                        task_manager=task_manager,
                                        tasks=task_names,
                                        num_fewshot=num_fewshot,
                                        batch_size=batch_size,
                                        max_batch_size=8,
                                        device=device,
                                        use_cache=use_cache,
                                        limit=limit,
                                        write_out=True)
    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision
    # Keep at most `max_nb_samples` logged samples per task so the returned payload stays small.
    if max_nb_samples is not None and "samples" in results:
        for task_name, task_samples in results["samples"].items():
            if len(task_samples) > max_nb_samples:
                results["samples"][task_name] = task_samples[:max_nb_samples]
    # print(evaluator.make_table(results))
    return results
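

# Illustrative usage sketch (kept commented out): how run_evaluation might be called.
# The EvalRequest constructor arguments below are assumptions for illustration and may
# not match the actual definition in src.backend.manage_requests; the task name is a
# placeholder.
#
# request = EvalRequest(model="mistralai/Mistral-7B-v0.1", revision="main", precision="float16")
# results = run_evaluation(
#     eval_request=request,
#     task_names=["pubmedqa"],
#     num_fewshot=0,
#     batch_size=1,
#     device="cuda:0",
#     limit=10,  # testing only
# )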
