from typing import Optional, Union

import lightning.pytorch as pl
import torch
from lightning import LightningModule, Trainer
from lightning.pytorch.callbacks import Callback
from torch import Tensor, nn
from torch.utils._foreach_utils import (
    _group_tensors_by_device_and_dtype,
    _has_foreach_support,
)

def grad_norm(
    parameters: Union[Tensor, list[Tensor]],
    norm_type: float = 2.0,
) -> Optional[Tensor]:
    """
    Returns the norm of the gradients of the given parameters.

    Args:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        norm_type (float): type of the used p-norm.

    Returns:
        Total norm of the parameter gradients (viewed as a single vector),
        or ``None`` if none of the parameters have gradients.
    """  # noqa: E501
    if isinstance(parameters, Tensor):
        parameters = [parameters]
    grads = [p.grad for p in parameters if p.grad is not None]
    if len(grads) == 0:
        return None
    first_device = grads[0].device
    grouped_grads: dict[
        tuple[torch.device, torch.dtype], list[list[Tensor]]
    ] = _group_tensors_by_device_and_dtype(
        [[g.detach() for g in grads]]
    )  # type: ignore[assignment]

    norms = []
    for (device, _), ([device_grads], _) in grouped_grads.items():
        if _has_foreach_support(device_grads, device=device):
            # Fused multi-tensor norm when the device/dtype group supports it.
            norms.extend(torch._foreach_norm(device_grads, norm_type))
        else:
            norms.extend([torch.norm(g, norm_type) for g in device_grads])
    return torch.norm(
        torch.stack([norm.to(first_device) for norm in norms]), norm_type
    )
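

# --- Illustrative usage sketch (not part of the original module) ---
# Shows how grad_norm could be called after a backward pass. The toy linear
# model, tensor shapes, and the function name `_grad_norm_usage_example`
# are hypothetical and exist only to keep the example self-contained.
def _grad_norm_usage_example() -> None:
    model = nn.Linear(4, 2)
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    # Returns a scalar tensor with the global L2 norm of all parameter gradients.
    total_norm = grad_norm(list(model.parameters()), norm_type=2.0)
    print(total_norm)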


class GradNormMonitor(Callback):
    """
    Callback that computes the gradient norm of the model parameters.
    """

    def __init__(
        self,
        norm_type: float = 2.0,
        logging_interval: str = "step",
        sub_module: Optional[Union[str, list[str]]] = None,
    ) -> None:
        """
        Args:
            norm_type (float): type of the used p-norm.
            logging_interval (str): "step" or "epoch".
            sub_module (str or list[str], optional): name(s) of sub-module
                attributes of the LightningModule to monitor separately.
                If ``None``, the norm of the whole model is logged.
        """
        super().__init__()
        self.norm_type = norm_type
        self.logging_interval = logging_interval
        self.sub_module = sub_module

    def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
        """
        Computes the gradient norm of the model parameters and logs it to the logger.

        Args:
            trainer (Trainer): the trainer object
            model (LightningModule): the current LightningModule
        """
        lightning_model = model
        if self.sub_module is None:
            return self.log_sub_module_grad_norm(lightning_model, model, "")

        sub_modules = self.sub_module
        if isinstance(sub_modules, str):
            sub_modules = [sub_modules]
        for sub_module in sub_modules:
            self.log_sub_module_grad_norm(
                lightning_model, getattr(model, sub_module), f"/{sub_module}"
            )

    def log_sub_module_grad_norm(
        self, lightning_model: LightningModule, model: nn.Module, path: str
    ) -> None:
        grad_norm_val = grad_norm(model.parameters(), self.norm_type)
        if grad_norm_val is None:
            return
        on_step = self.logging_interval == "step"
        lightning_model.log(
            f"train{path}/grad_norm",
            grad_norm_val,
            on_step=on_step,
            on_epoch=not on_step,
        )
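

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal way to wire GradNormMonitor into a Lightning Trainer. The toy
# LightningModule, dataset, and hyperparameters below are hypothetical
# placeholders; only the callback construction mirrors the class above.
# With sub_module set, e.g. GradNormMonitor(sub_module=["encoder", "decoder"]),
# the callback would log "train/encoder/grad_norm" and "train/decoder/grad_norm"
# instead of a single "train/grad_norm" metric.
def _grad_norm_monitor_usage_example() -> None:
    from torch.utils.data import DataLoader, TensorDataset

    class _ToyModule(LightningModule):
        def __init__(self) -> None:
            super().__init__()
            self.layer = nn.Linear(4, 2)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return nn.functional.mse_loss(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)

    loader = DataLoader(
        TensorDataset(torch.randn(32, 4), torch.randn(32, 2)), batch_size=8
    )
    trainer = Trainer(
        max_epochs=1,
        callbacks=[GradNormMonitor(norm_type=2.0, logging_interval="step")],
    )
    trainer.fit(_ToyModule(), train_dataloaders=loader)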