ameerazam08's picture
Upload folder using huggingface_hub
03da825 verified
import logging
import os
import time
from typing import List
import torch
from eval import verification
from utils.utils_logging import AverageMeter
from torch.utils.tensorboard import SummaryWriter
from torch import distributed
class CallBackVerification(object):
def __init__(self, val_targets, rec_prefix, summary_writer=None, image_size=(112, 112), wandb_logger=None):
self.rank: int = distributed.get_rank()
self.highest_acc: float = 0.0
self.highest_acc_list: List[float] = [0.0] * len(val_targets)
self.ver_list: List[object] = []
self.ver_name_list: List[str] = []
if self.rank is 0:
self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size)
self.summary_writer = summary_writer
self.wandb_logger = wandb_logger
def ver_test(self, backbone: torch.nn.Module, global_step: int):
results = []
for i in range(len(self.ver_list)):
acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
self.ver_list[i], backbone, 10, 10)
logging.info('[%s][%d]XNorm: %f' % (self.ver_name_list[i], global_step, xnorm))
logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2))
self.summary_writer: SummaryWriter
self.summary_writer.add_scalar(tag=self.ver_name_list[i], scalar_value=acc2, global_step=global_step, )
if self.wandb_logger:
import wandb
self.wandb_logger.log({
f'Acc/val-Acc1 {self.ver_name_list[i]}': acc1,
f'Acc/val-Acc2 {self.ver_name_list[i]}': acc2,
# f'Acc/val-std1 {self.ver_name_list[i]}': std1,
# f'Acc/val-std2 {self.ver_name_list[i]}': acc2,
})
if acc2 > self.highest_acc_list[i]:
self.highest_acc_list[i] = acc2
logging.info(
'[%s][%d]Accuracy-Highest: %1.5f' % (self.ver_name_list[i], global_step, self.highest_acc_list[i]))
results.append(acc2)
def init_dataset(self, val_targets, data_dir, image_size):
for name in val_targets:
path = os.path.join(data_dir, name + ".bin")
if os.path.exists(path):
data_set = verification.load_bin(path, image_size)
self.ver_list.append(data_set)
self.ver_name_list.append(name)
def __call__(self, num_update, backbone: torch.nn.Module):
if self.rank is 0 and num_update > 0:
backbone.eval()
self.ver_test(backbone, num_update)
backbone.train()
class CallBackLogging(object):
def __init__(self, frequent, total_step, batch_size, start_step=0,writer=None):
self.frequent: int = frequent
self.rank: int = distributed.get_rank()
self.world_size: int = distributed.get_world_size()
self.time_start = time.time()
self.total_step: int = total_step
self.start_step: int = start_step
self.batch_size: int = batch_size
self.writer = writer
self.init = False
self.tic = 0
def __call__(self,
global_step: int,
loss: AverageMeter,
epoch: int,
fp16: bool,
learning_rate: float,
grad_scaler: torch.cuda.amp.GradScaler):
if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
if self.init:
try:
speed: float = self.frequent * self.batch_size / (time.time() - self.tic)
speed_total = speed * self.world_size
except ZeroDivisionError:
speed_total = float('inf')
#time_now = (time.time() - self.time_start) / 3600
#time_total = time_now / ((global_step + 1) / self.total_step)
#time_for_end = time_total - time_now
time_now = time.time()
time_sec = int(time_now - self.time_start)
time_sec_avg = time_sec / (global_step - self.start_step + 1)
eta_sec = time_sec_avg * (self.total_step - global_step - 1)
time_for_end = eta_sec/3600
if self.writer is not None:
self.writer.add_scalar('time_for_end', time_for_end, global_step)
self.writer.add_scalar('learning_rate', learning_rate, global_step)
self.writer.add_scalar('loss', loss.avg, global_step)
if fp16:
msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.6f Epoch: %d Global Step: %d " \
"Fp16 Grad Scale: %2.f Required: %1.f hours" % (
speed_total, loss.avg, learning_rate, epoch, global_step,
grad_scaler.get_scale(), time_for_end
)
else:
msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.6f Epoch: %d Global Step: %d " \
"Required: %1.f hours" % (
speed_total, loss.avg, learning_rate, epoch, global_step, time_for_end
)
logging.info(msg)
loss.reset()
self.tic = time.time()
else:
self.init = True
self.tic = time.time()