# model-eval-be/svc/router.py
from datetime import datetime, timedelta
from fastapi import APIRouter, HTTPException, Depends
import logging
from lm_eval import evaluator
from svc.schemas import LMHarnessTaskRequest, TaskResponse, LoadModelRequest, DeepEvalSuiteRequest
from fastapi.security import OAuth2PasswordRequestForm
from auth.authentication import get_current_user, create_access_token
from dotenv import load_dotenv
import os
import json
from pathlib import Path
from src.deepeval.deepeval_task_manager import DeepEvalTaskManager
import torch
import gc
from time import time
from huggingface_hub import HfApi, ModelInfo
import threading
router = APIRouter()
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# Configure an HfApi client. The token is read from the environment and is not
# persisted on the machine.
hf_api = HfApi(
    token=HF_TOKEN,
)
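# Note (assumption, not enforced here): HF_TOKEN needs write access to the
# results dataset that deep_eval_suite uploads to below; a read-only token
# will fail at upload_file.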
@router.post("/token")
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
auth_token = os.getenv("AUTH_UUID")
if auth_token != form_data.password:
raise HTTPException(status_code=400, detail="Incorrect username or password")
access_token = create_access_token(data={"sub": form_data.username})
return {"access_token": access_token, "token_type": "bearer"}
@router.get("/protected")
async def protected_route(username: str = Depends(get_current_user)):
return {"message": f"Hello, {username}! This is a protected resource."}
@router.get("/deepeval/status")
async def deep_eval_status():
    # Liveness probe: return 200 with a static "running" payload.
return {"status": "running"}
@router.get("/deepeval/hardware")
def hardware_status():
info = get_gpu_tier()
return info
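# Example response (actual values depend on the host GPU; see get_gpu_tier below):
#   {"gpu": "Tesla T4", "tier": "t4-medium"}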
@router.post("/chat", response_model=TaskResponse)
def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
logger.info(request)
try:
logger.info("Inside")
results = evaluator.simple_evaluate(
model=request.model,
model_args=request.model_args,
tasks=request.tasks,
num_fewshot=request.num_fewshot,
batch_size=request.batch_size,
device=request.device,
limit=request.limit,
write_out=request.write_out # Whether to write out an example document and model input, for checking task integrity
)
    except Exception as e:
        logger.exception("lm-harness task execution failed")
        raise HTTPException(status_code=500, detail=f"lm-harness task execution failed for model: {request.model_args}") from e
    torch.cuda.empty_cache()
    results["config"]["model_dtype"] = request.precision
    # model_args follows the lm-eval convention "pretrained=<model>,<key>=<value>,...",
    # so the model name is the value of the leading pretrained pair.
    model_name = request.model_args.split(",")[0].split("=")[1]
    results["config"]["model_name"] = model_name
    results["config"]["model_sha"] = request.model_sha
dumped = json.dumps(results, indent=2)
logger.info("-------------------results------------------\n")
logger.info(dumped)
logger.info("-------------------results end------------------\n")
return TaskResponse(results=dumped)
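# Example /chat request body (field names follow how LMHarnessTaskRequest is used
# above; the values are illustrative assumptions, not defaults):
#   {
#     "model": "hf",
#     "model_args": "pretrained=org/model,revision=main",
#     "tasks": ["hellaswag"],
#     "num_fewshot": 0,
#     "batch_size": 8,
#     "device": "cuda:0",
#     "limit": 100,
#     "write_out": false,
#     "precision": "torch.float16",
#     "model_sha": "main"
#   }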
@router.post("/deepeval/eval", response_model=TaskResponse)
def deep_eval_suite(request: DeepEvalSuiteRequest):
def run_in_background():
try:
torch.cuda.empty_cache()
des = DeepEvalTaskManager(request.model_name, request.tasks)
start_time = time()
results = des.run_tasks()
end_time = time()
duration = round(end_time - start_time, 2)
model_info: ModelInfo = hf_api.model_info(request.model_name)
config = {
"model_source": "hf",
"num_fewshot": 0,
"batch_size": 8,
"device": "cuda:0",
"model_dtype": "torch.float16",
"model_name": request.model_name,
"model_sha": model_info.sha,
}
final_results = {
"results": results,
"config": config,
"total_evaluation_time_seconds": duration,
"start_time": start_time,
"end_time": end_time
}
# Save and upload
dumped = json.dumps(final_results, indent=2)
path = Path("/tmp", request.model_name, f"results_{datetime.now()}.json")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(dumped)
RESULTS_REPO = "metunlp/results"
hf_api.upload_file(
path_or_fileobj=path,
path_in_repo=path.relative_to("/tmp").as_posix(),
repo_id=RESULTS_REPO,
repo_type="dataset",
)
logger.info(f"βœ… Uploaded results to HF Hub for {request.model_name}")
except Exception as e:
logger.exception(f"❌ Background evaluation failed: {e}")
# πŸ” Start evaluation in background
threading.Thread(target=run_in_background, daemon=True).start()
# βœ… Immediately respond
return TaskResponse(results=json.dumps({"status": "Evaluation started in background"}))
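# Example /deepeval/eval request body (model_name and tasks per the
# DeepEvalSuiteRequest usage above; the values are illustrative):
#   {"model_name": "org/model", "tasks": ["task_a", "task_b"]}
# The call returns immediately; progress is only visible through the logs and the
# results file uploaded to the metunlp/results dataset.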
def get_gpu_tier():
    if not torch.cuda.is_available():
        return {"gpu": "CPU", "tier": "cpu"}

    gpu_name = torch.cuda.get_device_name(0).lower()

    # Map the reported GPU model to the tier labels used by this service.
    # Check "l40s" before "l4": "l4" is a substring of "l40s", so the broader
    # match would otherwise shadow it.
    if "t4" in gpu_name:
        return {"gpu": "Tesla T4", "tier": "t4-medium"}
    elif "l40s" in gpu_name:
        return {"gpu": "NVIDIA L40S", "tier": "l40sx1"}
    elif "l4" in gpu_name:
        return {"gpu": "NVIDIA L4", "tier": "l4x1"}
    elif "a10g" in gpu_name:
        return {"gpu": "NVIDIA A10G", "tier": "a10g"}
    else:
        return {"gpu": gpu_name, "tier": "unknown"}