add @register_model and @register_callback decorators

Refactor ClockTrainer to include Callback
2024-11-22 06:08:46 +00:00 · 2024-02-12 08:28:41 +00:00 · 2024-02-12 08:28:41 +00:00 · d6546c9026
parent f541badcb3
commit d6546c9026
12 changed files with 565 additions and 703 deletions
--- a/src/refiners/training_utils/init.py
+++ b/src/refiners/training_utils/init.py
@ -4,8 +4,20 @@ from importlib.metadata import requires
 from packaging.requirements import Requirement
-from refiners.training_utils.config import BaseConfig
+from refiners.training_utils.callback import Callback, CallbackConfig
-from refiners.training_utils.trainer import Trainer
+from refiners.training_utils.clock import ClockConfig
 from refiners.training_utils.config import (
    BaseConfig,
    ModelConfig,
    OptimizerConfig,
    Optimizers,
    SchedulerConfig,
    SchedulerType,
    TrainingConfig,
 )
 from refiners.training_utils.gradient_clipping import GradientClippingConfig
 from refiners.training_utils.trainer import Trainer, register_callback, register_model
 from refiners.training_utils.wandb import WandbConfig, WandbMixin
 refiners_requires = requires("refiners")
 assert refiners_requires is not None
@ -29,4 +41,18 @@ for dep in refiners_requires:
 __all__ = [
    "Trainer",
    "BaseConfig",
    "ModelConfig",
    "register_callback",
    "register_model",
    "Callback",
    "CallbackConfig",
    "WandbMixin",
    "WandbConfig",
    "SchedulerConfig",
    "OptimizerConfig",
    "TrainingConfig",
    "ClockConfig",
    "GradientClippingConfig",
    "Optimizers",
    "SchedulerType",
 ]
--- a/src/refiners/training_utils/callback.py
+++ b/src/refiners/training_utils/callback.py
@ -1,43 +1,22 @@
-from typing import TYPE_CHECKING, Any, Generic, Iterable, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, TypeVar
-from loguru import logger
+from pydantic import BaseModel, ConfigDict
 from torch import tensor
 from torch.nn import Parameter
 if TYPE_CHECKING:
    from refiners.training_utils.config import BaseConfig
    from refiners.training_utils.trainer import Trainer
-__all__ = [
+T = TypeVar("T", bound="Trainer[BaseConfig, Any]")
    "Callback",
    "GradientNormClipping",
    "GradientValueClipping",
    "ClockCallback",
 ]
-def clip_gradient_norm(parameters: Iterable[Parameter], total_norm: float, clip_norm: float = 1.0) -> None:
+class CallbackConfig(BaseModel):
    """
-    Clips the gradient norm of the parameters of a given model similar to `clip_grad_norm_`.
+    Base configuration for a callback.
    For your callback to be properly configured, you should inherit from this class and add your own configuration.
    """
    gradients = [p.grad.detach() for p in parameters if p.grad is not None]
    assert gradients, "The model has no gradients to clip."
    clip_coefficient = tensor(data=clip_norm / (total_norm + 1e-6)).clamp(max=1)
    for gradient in gradients:
        gradient.mul_(other=clip_coefficient)  # type: ignore
-
+    model_config = ConfigDict(extra="forbid")
 def clip_gradient_value(parameters: Iterable[Parameter], clip_value: float) -> None:
    """
    Clamp every available gradient element-wise into `[-clip_value, clip_value]`, in place.
    Behaves like `clip_grad_value_`: parameters whose `.grad` is `None` are ignored, and an
    `AssertionError` is raised when no parameter carries a gradient at all.
    """
    detached_grads = []
    for parameter in parameters:
        if parameter.grad is not None:
            detached_grads.append(parameter.grad.detach())
    assert detached_grads, "The model has no gradients to clip."
    lower_bound, upper_bound = -clip_value, clip_value
    for grad in detached_grads:
        # `detach()` shares storage with `.grad`, so this updates the real gradient.
        grad.clamp_(min=lower_bound, max=upper_bound)
 T = TypeVar("T")
 class Callback(Generic[T]):
@ -97,71 +76,3 @@ class Callback(Generic[T]):
    def on_checkpoint_save(self, trainer: T) -> None:
        ...
 class ClockCallback(Callback["Trainer[BaseConfig, Any]"]):
    """
    Callback that keeps `trainer.clock` in sync with the training loop and logs progress.
    Counters are advanced after the backward pass (step), after the optimizer step (iteration)
    and at the end of each epoch (epoch).
    """
    def on_train_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        clock = trainer.clock
        clock.reset()
        summary = (
            "Starting training for a total of: "
            f"{clock.num_steps} steps, "
            f"{clock.num_epochs} epochs, "
            f"{clock.num_iterations} iterations."
        )
        logger.info(summary)
        clock.start_timer()
    def on_train_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        clock = trainer.clock
        clock.stop_timer()
        summary = (
            "Training took: "
            f"{clock.time_elapsed} seconds, "
            f"{clock.iteration} iterations, "
            f"{clock.epoch} epochs, "
            f"{clock.step} steps."
        )
        logger.info(summary)
    def on_epoch_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        current_epoch = trainer.clock.epoch
        logger.info(f"Epoch {current_epoch} started.")
    def on_epoch_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        clock = trainer.clock
        clock.epoch += 1
        # A new epoch starts with no batch processed yet.
        clock.num_batches_processed = 0
    def on_batch_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        current_step = trainer.clock.step
        logger.info(f"Step {current_step} started.")
    def on_backward_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        clock = trainer.clock
        clock.step += 1
        clock.num_batches_processed += 1
        clock.num_minibatches_processed += 1
    def on_optimizer_step_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        clock = trainer.clock
        logger.info(f"Iteration {clock.iteration} ended.")
        clock.iteration += 1
        # Gradient accumulation restarts after each optimizer step.
        clock.num_minibatches_processed = 0
    def on_evaluate_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        logger.info("Evaluation started.")
    def on_evaluate_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        logger.info("Evaluation ended.")
 class GradientNormClipping(Callback["Trainer[BaseConfig, Any]"]):
    """Callback rescaling gradients after the backward pass when `training.clip_grad_norm` is set."""
    def on_backward_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        # `None` means norm clipping is disabled.
        if (max_norm := trainer.config.training.clip_grad_norm) is None:
            return
        clip_gradient_norm(
            parameters=trainer.learnable_parameters,
            total_norm=trainer.total_gradient_norm,
            clip_norm=max_norm,
        )
 class GradientValueClipping(Callback["Trainer[BaseConfig, Any]"]):
    """Callback clamping gradients after the backward pass when `training.clip_grad_value` is set."""
    def on_backward_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        # `None` means value clipping is disabled.
        if (max_value := trainer.config.training.clip_grad_value) is None:
            return
        clip_gradient_value(parameters=trainer.learnable_parameters, clip_value=max_value)
--- a/src/refiners/training_utils/clock.py
+++ b/src/refiners/training_utils/clock.py
@ -0,0 +1,193 @@
 import time
 from functools import cached_property
 from typing import TYPE_CHECKING, Any
 from refiners.training_utils.callback import Callback, CallbackConfig
 from refiners.training_utils.common import TimeUnit, TimeValue
 if TYPE_CHECKING:
    from refiners.training_utils.config import BaseConfig
    from refiners.training_utils.trainer import Trainer
 from loguru import logger
 from torch import Tensor
 class ClockConfig(CallbackConfig):
    """Configuration of the training clock callback."""
    # Presumably forwarded to `TrainingClock(verbose=...)`, where it gates the clock's
    # log messages — the wiring is not visible here, confirm in the trainer.
    verbose: bool = True
 class TrainingClock(Callback["Trainer[BaseConfig, Any]"]):
    """
    Callback tracking training progress (steps, iterations, epochs) and wall-clock time.

    Every duration or interval is given as a `TimeValue` and converted to a number of
    steps through `unit_to_steps`. The `step` counter is incremented after each backward
    pass, `iteration` after each optimizer step, and `epoch` at the end of each epoch.

    Args:
        dataset_length: number of items in the training dataset.
        batch_size: number of items per batch; `dataset_length // batch_size` full
            batches make up one epoch.
        training_duration: total length of the training.
        gradient_accumulation: how much work is accumulated before an optimizer step.
        evaluation_interval: how often an evaluation is due.
        lr_scheduler_interval: how often the learning-rate scheduler is due.
        verbose: when `False`, the clock's log messages are suppressed.
    """
    def __init__(
        self,
        dataset_length: int,
        batch_size: int,
        training_duration: TimeValue,
        gradient_accumulation: TimeValue,
        evaluation_interval: TimeValue,
        lr_scheduler_interval: TimeValue,
        verbose: bool = True,
    ) -> None:
        self.dataset_length = dataset_length
        self.batch_size = batch_size
        self.training_duration = training_duration
        self.gradient_accumulation = gradient_accumulation
        self.evaluation_interval = evaluation_interval
        self.lr_scheduler_interval = lr_scheduler_interval
        self.verbose = verbose
        # Floor division: a trailing partial batch is not counted.
        self.num_batches_per_epoch = dataset_length // batch_size
        self.start_time: float | None = None
        self.end_time: float | None = None
        self.step = 0
        self.epoch = 0
        self.iteration = 0
        self.num_batches_processed = 0
        self.num_minibatches_processed = 0
        self.loss: Tensor | None = None
    @cached_property
    def unit_to_steps(self) -> dict[TimeUnit, int]:
        """Number of steps represented by one unit of each `TimeUnit`."""
        # When accumulation is expressed in epochs, one iteration spans that many whole epochs.
        iteration_factor = self.num_batches_per_epoch if self.gradient_accumulation["unit"] == TimeUnit.EPOCH else 1
        return {
            TimeUnit.STEP: 1,
            TimeUnit.EPOCH: self.num_batches_per_epoch,
            TimeUnit.ITERATION: self.gradient_accumulation["number"] * iteration_factor,
        }
    def convert_time_unit_to_steps(self, number: int, unit: TimeUnit) -> int:
        """Convert `number` units of `unit` into steps."""
        return number * self.unit_to_steps[unit]
    def convert_steps_to_time_unit(self, steps: int, unit: TimeUnit) -> int:
        """Convert `steps` into whole units of `unit` (rounded down)."""
        return steps // self.unit_to_steps[unit]
    def convert_time_value(self, time_value: TimeValue, target_unit: TimeUnit) -> int:
        """Express `time_value` in `target_unit`, going through steps (rounded down)."""
        number, unit = time_value["number"], time_value["unit"]
        steps = self.convert_time_unit_to_steps(number=number, unit=unit)
        return self.convert_steps_to_time_unit(steps=steps, unit=target_unit)
    @cached_property
    def num_epochs(self) -> int:
        """Training duration in epochs."""
        return self.convert_time_value(time_value=self.training_duration, target_unit=TimeUnit.EPOCH)
    @cached_property
    def num_iterations(self) -> int:
        """Training duration in iterations."""
        return self.convert_time_value(time_value=self.training_duration, target_unit=TimeUnit.ITERATION)
    @cached_property
    def num_steps(self) -> int:
        """Training duration in steps."""
        return self.convert_time_value(time_value=self.training_duration, target_unit=TimeUnit.STEP)
    @cached_property
    def num_step_per_iteration(self) -> int:
        """Number of steps accumulated before each optimizer step."""
        return self.convert_time_unit_to_steps(
            number=self.gradient_accumulation["number"], unit=self.gradient_accumulation["unit"]
        )
    @cached_property
    def num_step_per_evaluation(self) -> int:
        """Alias of `evaluation_interval_steps`, kept for existing callers."""
        return self.evaluation_interval_steps
    def reset(self) -> None:
        """Clear the timer and set every progress counter back to zero."""
        self.start_time = None
        self.end_time = None
        self.step = 0
        self.epoch = 0
        self.iteration = 0
        self.num_batches_processed = 0
        self.num_minibatches_processed = 0
    def start_timer(self) -> None:
        """Start (or restart) the wall-clock timer."""
        self.start_time = time.time()
        # Drop a stale stop time so `time_elapsed` measures the running timer.
        self.end_time = None
    def stop_timer(self) -> None:
        """Freeze the timer at the current time."""
        self.end_time = time.time()
    @property
    def time_elapsed(self) -> int:
        """
        Whole seconds since `start_timer`, up to `stop_timer` if the timer was stopped.

        Raises:
            AssertionError: if the timer has not been started.
        """
        assert self.start_time is not None, "Timer has not been started yet."
        # Once stopped, report the frozen duration instead of a value that keeps growing.
        end_time = self.end_time if self.end_time is not None else time.time()
        return int(end_time - self.start_time)
    @cached_property
    def evaluation_interval_steps(self) -> int:
        """Number of steps between two evaluations."""
        return self.convert_time_unit_to_steps(
            number=self.evaluation_interval["number"], unit=self.evaluation_interval["unit"]
        )
    @cached_property
    def lr_scheduler_interval_steps(self) -> int:
        """Number of steps between two learning-rate scheduler updates."""
        return self.convert_time_unit_to_steps(
            number=self.lr_scheduler_interval["number"], unit=self.lr_scheduler_interval["unit"]
        )
    @property
    def is_optimizer_step(self) -> bool:
        """Whether enough steps have been accumulated for an optimizer step."""
        return self.num_minibatches_processed == self.num_step_per_iteration
    @property
    def is_lr_scheduler_step(self) -> bool:
        """Whether the current step is a multiple of the scheduler interval."""
        return self.step % self.lr_scheduler_interval_steps == 0
    @property
    def done(self) -> bool:
        """Whether the configured training duration has been reached."""
        return self.step >= self.num_steps
    @property
    def is_evaluation_step(self) -> bool:
        """Whether the current step is a multiple of the evaluation interval."""
        return self.step % self.evaluation_interval_steps == 0
    def log(self, message: str, /) -> None:
        """Log `message` at info level unless the clock is not verbose."""
        if self.verbose:
            logger.info(message)
    # NOTE(review): the hooks below update `trainer.clock` rather than `self`; presumably
    # both are the same object — confirm in the trainer.
    def on_train_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        trainer.clock.reset()
        self.log(
            (
                "Starting training for a total of: "
                f"{trainer.clock.num_steps} steps, "
                f"{trainer.clock.num_epochs} epochs, "
                f"{trainer.clock.num_iterations} iterations."
            )
        )
        trainer.clock.start_timer()
    def on_train_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        trainer.clock.stop_timer()
        self.log(
            (
                "Training took: "
                f"{trainer.clock.time_elapsed} seconds, "
                f"{trainer.clock.iteration} iterations, "
                f"{trainer.clock.epoch} epochs, "
                f"{trainer.clock.step} steps."
            )
        )
    def on_epoch_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        self.log(f"Epoch {trainer.clock.epoch} started.")
    def on_epoch_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        trainer.clock.epoch += 1
        trainer.clock.num_batches_processed = 0
    def on_batch_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        self.log(f"Step {trainer.clock.step} started.")
    def on_backward_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        trainer.clock.step += 1
        trainer.clock.num_batches_processed += 1
        trainer.clock.num_minibatches_processed += 1
    def on_optimizer_step_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        self.log(f"Iteration {trainer.clock.iteration} ended.")
        trainer.clock.iteration += 1
        # Gradient accumulation restarts after each optimizer step.
        trainer.clock.num_minibatches_processed = 0
    def on_evaluate_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        self.log("Evaluation started.")
    def on_evaluate_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        self.log("Evaluation ended.")
--- a/src/refiners/training_utils/common.py
+++ b/src/refiners/training_utils/common.py
@ -0,0 +1,103 @@
 import random
 from enum import Enum
 from functools import wraps
 from typing import Any, Callable, Iterable
 import numpy as np
 import torch
 from loguru import logger
 from torch import Tensor, cuda, nn
 from typing_extensions import TypedDict
 from refiners.fluxion.utils import manual_seed
 def compute_grad_norm(parameters: Iterable[nn.Parameter]) -> float:
    """
    Return the global gradient norm of `parameters`, i.e. the norm of the per-parameter
    gradient norms (the value `clip_grad_norm_` returns).
    Parameters without a gradient are ignored; an `AssertionError` is raised when none has one.
    """
    per_parameter_norms: list[Tensor] = []
    for parameter in parameters:
        if parameter.grad is None:
            continue
        per_parameter_norms.append(parameter.grad.detach().norm())
    assert per_parameter_norms, "The model has no gradients to compute the norm."
    global_norm = torch.stack(tensors=per_parameter_norms).norm()  # type: ignore
    return global_norm.item()  # type: ignore
 def count_learnable_parameters(parameters: Iterable[nn.Parameter]) -> int:
    """Return the total number of elements of the parameters that require gradients."""
    total = 0
    for parameter in parameters:
        if parameter.requires_grad:
            total += parameter.numel()
    return total
 def human_readable_number(number: int) -> str:
    """
    Format `number` with one decimal and a decimal magnitude suffix.

    The suffixes are "" (units), K, M, G, T and P; anything larger is reported with E.
    Examples: 999 -> "999.0", 1500 -> "1.5K", 999_999 -> "1.0M".
    """
    float_number = float(number)
    for unit in ["", "K", "M", "G", "T", "P"]:
        # Compare the value as it will be displayed: 999.999 prints as "1000.0", so it
        # must move on to the next unit instead of yielding "1000.0K".
        if abs(round(float_number, 1)) < 1000:
            return f"{float_number:.1f}{unit}"
        float_number /= 1000
    return f"{float_number:.1f}E"
 def seed_everything(seed: int | None = None) -> None:
    """
    Seed Python's `random`, NumPy, and the torch CPU and CUDA generators with `seed`.
    When `seed` is `None`, a random 32-bit seed is drawn and logged first.
    """
    if seed is None:
        seed = random.randint(0, 2**32 - 1)
        logger.info(f"Using random seed: {seed}")
    # Same order as always: random, numpy, torch, then every CUDA device.
    for seeder in (random.seed, np.random.seed, manual_seed, cuda.manual_seed_all):
        seeder(seed)
 def scoped_seed(seed: int | Callable[..., int] | None = None) -> Callable[..., Callable[..., Any]]:
    """
    Decorator for setting a random seed within the scope of a function.

    The random number generators of `random`, `numpy`, `torch` and `torch.cuda` are seeded
    before the decorated function runs, and their previous states are restored afterwards,
    including when the function raises. This gives reproducibility for specific parts of
    the code without affecting randomness elsewhere.

    Args:
        seed: the seed to use, or a callable receiving the positional arguments of the
            decorated call and returning the seed. `None` lets `seed_everything` draw a
            random seed.

    Returns:
        A decorator wrapping a function in the seeded scope.
    """
    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
            random_state = random.getstate()
            numpy_state = np.random.get_state()
            torch_state = torch.get_rng_state()
            # `seed_everything` reseeds every CUDA device, so save all of them; skip CUDA
            # entirely on hosts where it is unavailable.
            cuda_states = cuda.get_rng_state_all() if cuda.is_available() else []
            try:
                # Only positional arguments are forwarded to a callable seed.
                actual_seed = seed(*args) if callable(seed) else seed
                seed_everything(seed=actual_seed)
                return func(*args, **kwargs)
            finally:
                # Restore even when `func` raises, so a failure cannot leak the scoped seed.
                random.setstate(random_state)
                np.random.set_state(numpy_state)
                torch.set_rng_state(torch_state)
                if cuda_states:
                    cuda.set_rng_state_all(cuda_states)
        return inner_wrapper
    return decorator
 class TimeUnit(Enum):
    """Units in which training durations and intervals can be expressed."""
    STEP = "step"
    EPOCH = "epoch"
    ITERATION = "iteration"
    # Shares the value "step", so `Enum` makes this an alias: `TimeUnit.DEFAULT is TimeUnit.STEP`.
    DEFAULT = "step"
 class TimeValue(TypedDict):
    """A quantity of time: a count together with the unit it is expressed in."""
    # How many units.
    number: int
    # The unit `number` is counted in.
    unit: TimeUnit
 def parse_number_unit_field(value: str | int | dict[str, str | int]) -> TimeValue:
    match value:
        case str(value_str):
            number, unit = value_str.split(sep=":")
            return {"number": int(number.strip()), "unit": TimeUnit(value=unit.strip().lower())}
        case int(number):
            return {"number": number, "unit": TimeUnit.DEFAULT}
        case {"number": int(number), "unit": str(unit)}:
            return {"number": number, "unit": TimeUnit(value=unit.lower())}
        case _:
            raise ValueError(f"Unsupported value format: {value}")
--- a/src/refiners/training_utils/config.py
+++ b/src/refiners/training_utils/config.py
@ -9,50 +9,16 @@ from prodigyopt import Prodigy  # type: ignore
 from pydantic import BaseModel, ConfigDict, validator
 from torch import Tensor
 from torch.optim import SGD, Adam, AdamW, Optimizer
 from typing_extensions import TypedDict  # https://errors.pydantic.dev/2.0b3/u/typed-dict-version
-import refiners.fluxion.layers as fl
+from refiners.training_utils.clock import ClockConfig
-from refiners.training_utils.dropout import apply_dropout, apply_gyro_dropout
+from refiners.training_utils.common import TimeUnit, TimeValue, parse_number_unit_field
 from refiners.training_utils.gradient_clipping import GradientClippingConfig
 # PyTorch optimizer parameters type
 # TODO: replace with `from torch.optim.optimizer import ParamsT` when PyTorch 2.2+ is enforced
 # See https://github.com/pytorch/pytorch/pull/111114
 ParamsT = Iterable[Tensor] | Iterable[dict[str, Any]]
 __all__ = [
    "parse_number_unit_field",
    "TimeUnit",
    "TimeValue",
    "TrainingConfig",
    "OptimizerConfig",
    "Optimizers",
 ]
 class TimeUnit(Enum):
    """Units in which training durations and intervals can be expressed."""
    STEP = "step"
    EPOCH = "epoch"
    ITERATION = "iteration"
    # Shares the value "step", so `Enum` makes this an alias: `TimeUnit.DEFAULT is TimeUnit.STEP`.
    DEFAULT = "step"
 class TimeValue(TypedDict):
    """A quantity of time: a count together with the unit it is expressed in."""
    # How many units.
    number: int
    # The unit `number` is counted in.
    unit: TimeUnit
 def parse_number_unit_field(value: str | int | dict[str, str | int]) -> TimeValue:
    match value:
        case str(value_str):
            number, unit = value_str.split(sep=":")
            return {"number": int(number.strip()), "unit": TimeUnit(value=unit.strip().lower())}
        case int(number):
            return {"number": number, "unit": TimeUnit.DEFAULT}
        case {"number": int(number), "unit": str(unit)}:
            return {"number": number, "unit": TimeUnit(value=unit.lower())}
        case _:
            raise ValueError(f"Unsupported value format: {value}")
 class TrainingConfig(BaseModel):
    device: str = "cpu"
@ -61,8 +27,6 @@ class TrainingConfig(BaseModel):
    seed: int = 0
    batch_size: int = 1
    gradient_accumulation: TimeValue = {"number": 1, "unit": TimeUnit.STEP}
    clip_grad_norm: float | None = None
    clip_grad_value: float | None = None
    evaluation_interval: TimeValue = {"number": 1, "unit": TimeUnit.ITERATION}
    evaluation_seed: int = 0
@ -195,29 +159,6 @@ class ModelConfig(BaseModel):
    model_config = ConfigDict(extra="forbid")
 class GyroDropoutConfig(BaseModel):
    """
    Settings of gyro dropout. `DropoutConfig.apply_dropout` forwards every field of this
    model as a keyword argument to `apply_gyro_dropout`.
    """
    # NOTE(review): verify that `apply_gyro_dropout` accepts each field below as a keyword.
    total_subnetworks: int = 512
    concurrent_subnetworks: int = 64
    iters_per_epoch: int = 512
    num_features_threshold: float = 5e5
    # Unknown keys in the configuration are rejected.
    model_config = ConfigDict(extra="forbid")
 class DropoutConfig(BaseModel):
    """
    Dropout settings: plain dropout by default, gyro dropout when `gyro_dropout` is provided.
    Nothing is applied unless `dropout_probability` is strictly positive.
    """
    dropout_probability: float = 0.0
    gyro_dropout: GyroDropoutConfig | None = None
    model_config = ConfigDict(extra="forbid")
    def apply_dropout(self, model: fl.Chain) -> None:
        """Inject the configured dropout into `model`."""
        if not self.dropout_probability > 0.0:
            return
        gyro_settings = self.gyro_dropout
        if gyro_settings is None:
            apply_dropout(module=model, probability=self.dropout_probability)
            return
        apply_gyro_dropout(module=model, probability=self.dropout_probability, **gyro_settings.model_dump())
 T = TypeVar("T", bound="BaseConfig")
@ -226,7 +167,8 @@ class BaseConfig(BaseModel):
    training: TrainingConfig
    optimizer: OptimizerConfig
    scheduler: SchedulerConfig
-    dropout: DropoutConfig
+    clock: ClockConfig = ClockConfig()
    gradient_clipping: GradientClippingConfig = GradientClippingConfig()
    model_config = ConfigDict(extra="forbid")
--- a/src/refiners/training_utils/dropout.py
+++ b/src/refiners/training_utils/dropout.py
@ -1,200 +0,0 @@
 from typing import TYPE_CHECKING, Any, TypeVar
 from torch import Tensor, cat, rand, randint
 from torch.nn import Dropout as TorchDropout
 import refiners.fluxion.layers as fl
 from refiners.fluxion.adapters.adapter import Adapter
 from refiners.training_utils.callback import Callback
 if TYPE_CHECKING:
    from refiners.training_utils.config import BaseConfig
    from refiners.training_utils.trainer import Trainer
 __all__ = ["Dropout", "GyroDropout", "DropoutCallback"]
 class Dropout(TorchDropout, fl.Module):
    """`torch.nn.Dropout` exposed as a fluxion module, with `p` renamed to `probability`."""
    def __init__(self, probability: float = 0.5, inplace: bool = False) -> None:
        super().__init__(p=probability, inplace=inplace)
 class GyroDropout(fl.Module):
    """
    GyroDropout is a variant of dropout that maximizes the ensemble effect during neural network training.
    It pre-selects a fixed number of dropout masks and periodically selects a subset of them for training.
    This leads to increased robustness and diversity among the subnetworks, improving accuracy compared to conventional
    dropout.
    Parameters:
    -----------
    total_subnetworks:
        The total number of pre-selected subnetworks ('Sigma'). These subnetworks are dropout masks
        that are precomputed and stored.
    concurrent_subnetworks:
        The number of subnetworks to use concurrently in each forward pass ('Tau'). A random selection of
        masks from the precomputed set is used to dropout different portions of the input.
    dropout_probability: float, optional (default=0.5)
        The probability that an element will be zeroed by the dropout.
    iters_per_epoch:
        Number of iterations per epoch, used to determine how often the masks should be updated.
    num_features_threshold:
        If the number of features in the input is greater than this threshold, dropout is skipped. This is because
        gyro dropout mask size vram usage is proportional to the number of features in the input.
    """
    def __init__(
        self,
        total_subnetworks: int,
        concurrent_subnetworks: int,
        dropout_probability: float = 0.5,
        iters_per_epoch: int = 1,
        num_features_threshold: float = 5e5,
    ) -> None:
        super().__init__()
        # NOTE(review): the check allows equality although the message says "greater than".
        assert (
            iters_per_epoch >= total_subnetworks
        ), "The number of iterations per epoch must be greater than the number of masks"
        self.dropout_probability = dropout_probability
        self.iters_per_epoch = iters_per_epoch
        self.total_subnetworks = total_subnetworks
        self.concurrent_subnetworks = concurrent_subnetworks
        # Scale applied to the kept activations in `forward`.
        self.scale = 1 / (1 - self.dropout_probability)
        # Number of forward passes between two mask re-selections.
        self.mask_update_interval = int(self.iters_per_epoch / self.total_subnetworks) * self.concurrent_subnetworks
        # Both are created lazily from the first training input seen by `forward`.
        self.preselected_masks: Tensor | None = None
        self.dropout_mask = None
        self.training_step = 0
        self.num_features_threshold = num_features_threshold
        # Set on the first training call when the input is too large for gyro masks.
        self.skip_high_num_features = False
    def forward(self, x: Tensor) -> Tensor:
        """Apply gyro dropout to `x` in training mode; return `x` unchanged otherwise."""
        if not self.training:
            return x
        # Inputs above the threshold were detected earlier: use the regular dropout fallback.
        if self.skip_high_num_features:
            return self.basic_dropout(x)
        if self.training_step == 0:
            # Features per sample: product of dims 1 and 2 for 3D inputs, dim 1 otherwise.
            num_features = x.shape[1] * x.shape[2] if x.dim() == 3 else x.shape[1]
            if num_features > self.num_features_threshold:
                self.skip_high_num_features = True
                self.basic_dropout = Dropout(probability=self.dropout_probability)
                return self.basic_dropout(x)
            self.init_masks(x=x)
        if self.training_step % self.mask_update_interval == 0:
            self.update_dropout_mask(x=x)
        self.training_step += 1
        # NOTE(review): the mask is reused between updates, so this assumes the batch size
        # does not change in the meantime — confirm.
        return x * self.dropout_mask * self.scale
    def init_masks(self, x: Tensor) -> None:
        """Draw the `total_subnetworks` random keep-masks matching the per-sample shape of `x`."""
        if x.dim() == 2:
            self.preselected_masks = (
                rand(self.total_subnetworks, x.shape[1], device=x.device) > self.dropout_probability
            )
        if x.dim() == 3:
            self.preselected_masks = (
                rand(self.total_subnetworks, x.shape[1], x.shape[2], device=x.device) > self.dropout_probability
            )
        assert self.preselected_masks is not None, "The input tensor must have 2 or 3 dimensions"
        # Store as floats so the masks can be multiplied with the activations.
        self.preselected_masks = self.preselected_masks.float()
    def update_dropout_mask(self, x: Tensor) -> None:
        """Pick `concurrent_subnetworks` masks at random and tile them over the batch of `x`."""
        assert self.preselected_masks is not None
        indices = randint(low=0, high=self.total_subnetworks, size=(self.concurrent_subnetworks,), device=x.device)
        selected_masks = self.preselected_masks[indices]
        # Repeat the selection to cover the batch, then complete it with a partial copy.
        repeat_factor = x.shape[0] // self.concurrent_subnetworks
        remaining = x.shape[0] % self.concurrent_subnetworks
        repeated_masks = [selected_masks] * repeat_factor
        if remaining > 0:
            repeated_masks.append(selected_masks[:remaining])
        final_masks = cat(tensors=repeated_masks, dim=0)
        if x.dim() == 2:
            self.dropout_mask = final_masks
        if x.dim() == 3:
            self.dropout_mask = final_masks.expand_as(x)
 class DropoutAdapter(fl.Chain, Adapter[fl.Linear]):
    """Adapter replacing a `fl.Linear` with a chain of that linear followed by `Dropout`."""
    def __init__(self, target: fl.Linear, probability: float = 0.5):
        # The chain is built inside the `setup_adapter` context provided by `Adapter`.
        with self.setup_adapter(target):
            super().__init__(target, Dropout(probability=probability))
 class GyroDropoutAdapter(fl.Chain, Adapter[fl.Linear]):
    """Adapter replacing a `fl.Linear` with a chain of that linear followed by `GyroDropout`."""
    def __init__(
        self,
        target: fl.Linear,
        probability: float = 0.5,
        total_subnetworks: int = 512,
        concurrent_subnetworks: int = 64,
        iters_per_epoch: int = 512,
        num_features_threshold: float = 5e5,
    ) -> None:
        # Keep the settings on the adapter; `num_features_threshold` is only forwarded
        # to `GyroDropout`, not stored here.
        self.probability = probability
        self.total_subnetworks = total_subnetworks
        self.concurrent_subnetworks = concurrent_subnetworks
        self.iters_per_epoch = iters_per_epoch
        # The chain is built inside the `setup_adapter` context provided by `Adapter`.
        with self.setup_adapter(target):
            super().__init__(
                target,
                GyroDropout(
                    total_subnetworks=total_subnetworks,
                    concurrent_subnetworks=concurrent_subnetworks,
                    dropout_probability=probability,
                    iters_per_epoch=iters_per_epoch,
                    num_features_threshold=num_features_threshold,
                ),
            )
 def apply_dropout(module: fl.Chain, probability: float = 0.5) -> None:
    """
    Inject a `DropoutAdapter` on every `fl.Linear` of `module` whose weight requires gradients.
    Raises `AssertionError` if the parent of such a linear is itself a dropout layer.
    """
    for linear, parent in module.walk(fl.Linear):
        if linear.weight.requires_grad:
            assert not isinstance(parent, (Dropout, GyroDropout)), f"{linear} already has a dropout layer"
            DropoutAdapter(target=linear, probability=probability).inject(parent)
 def apply_gyro_dropout(
    module: fl.Chain,
    probability: float = 0.5,
    total_subnetworks: int = 32,
    concurrent_subnetworks: int = 16,
    iters_per_epoch: int = 32,
    num_features_threshold: float = 5e5,
 ) -> None:
    """
    Inject a `GyroDropoutAdapter` on every `fl.Linear` of `module` whose weight requires gradients.

    Args:
        module: the chain to walk.
        probability: dropout probability.
        total_subnetworks: number of pre-selected masks.
        concurrent_subnetworks: number of masks used concurrently.
        iters_per_epoch: iterations per epoch, used to schedule mask updates.
        num_features_threshold: above this number of features `GyroDropout` falls back to
            regular dropout. Accepted so that every `GyroDropoutConfig` field can be
            forwarded as a keyword; previously it raised `TypeError`.

    Raises:
        AssertionError: if the parent of a trainable linear is itself a dropout layer.
    """
    for linear, parent in module.walk(fl.Linear):
        if not linear.weight.requires_grad:
            continue
        assert not isinstance(parent, (Dropout, GyroDropout)), f"{linear} already has a dropout layer"
        GyroDropoutAdapter(
            target=linear,
            probability=probability,
            total_subnetworks=total_subnetworks,
            concurrent_subnetworks=concurrent_subnetworks,
            iters_per_epoch=iters_per_epoch,
            num_features_threshold=num_features_threshold,
        ).inject(parent)
 ConfigType = TypeVar("ConfigType", bound="BaseConfig")
 class DropoutCallback(Callback["Trainer[ConfigType, Any]"]):
    """Callback applying the configured dropout to every `fl.Chain` model when training begins."""
    def on_train_begin(self, trainer: "Trainer[ConfigType, Any]") -> None:
        dropout_config = trainer.config.dropout
        # Select the chain models first, then modify them.
        chain_models = tuple(filter(lambda model: isinstance(model, fl.Chain), trainer.models.values()))
        for chain_model in chain_models:
            dropout_config.apply_dropout(model=chain_model)
--- a/src/refiners/training_utils/gradient_clipping.py
+++ b/src/refiners/training_utils/gradient_clipping.py
@ -0,0 +1,52 @@
 from typing import TYPE_CHECKING, Any, Iterable
 import torch
 from torch import nn
 from refiners.training_utils.callback import Callback, CallbackConfig
 if TYPE_CHECKING:
    from refiners.training_utils.config import BaseConfig
    from refiners.training_utils.trainer import Trainer
 def clip_gradient_norm(parameters: Iterable[nn.Parameter], total_norm: float, clip_norm: float = 1.0) -> None:
    """
    Rescale the gradients in place so that their global norm does not exceed `clip_norm`,
    in the manner of `clip_grad_norm_`.
    `total_norm` is the current global gradient norm, computed by the caller. Parameters
    without a gradient are ignored; an `AssertionError` is raised when none has one.
    """
    detached_grads = []
    for parameter in parameters:
        if parameter.grad is not None:
            detached_grads.append(parameter.grad.detach())
    assert detached_grads, "The model has no gradients to clip."
    # The factor is capped at 1, so gradients are only ever scaled down; the epsilon
    # guards against a zero norm.
    scale = torch.tensor(data=clip_norm / (total_norm + 1e-6)).clamp(max=1)
    for grad in detached_grads:
        grad.mul_(other=scale)  # type: ignore
 def clip_gradient_value(parameters: Iterable[nn.Parameter], clip_value: float) -> None:
    """
    Clamp every available gradient element-wise into `[-clip_value, clip_value]`, in place.
    Behaves like `clip_grad_value_`: parameters whose `.grad` is `None` are ignored, and an
    `AssertionError` is raised when no parameter carries a gradient at all.
    """
    detached_grads = []
    for parameter in parameters:
        if parameter.grad is not None:
            detached_grads.append(parameter.grad.detach())
    assert detached_grads, "The model has no gradients to clip."
    lower_bound, upper_bound = -clip_value, clip_value
    for grad in detached_grads:
        # `detach()` shares storage with `.grad`, so this updates the real gradient.
        grad.clamp_(min=lower_bound, max=upper_bound)
 class GradientClippingConfig(CallbackConfig):
    """Configuration read by `GradientClipping`; each kind of clipping is disabled while left as `None`."""
    # Maximum global gradient norm, passed as `clip_norm` to `clip_gradient_norm`.
    clip_grad_norm: float | None = None
    # Element-wise bound, passed as `clip_value` to `clip_gradient_value`.
    clip_grad_value: float | None = None
 class GradientClipping(Callback["Trainer[BaseConfig, Any]"]):
    """
    Callback clipping the gradients of the trainer's learnable parameters after each backward pass.
    Norm clipping runs first, then value clipping; each is skipped when its limit is `None`.
    """
    def __init__(self, config: GradientClippingConfig) -> None:
        self.config = config
    def on_backward_end(self, trainer: "Trainer[BaseConfig, Any]") -> None:
        if (max_norm := self.config.clip_grad_norm) is not None:
            clip_gradient_norm(
                parameters=trainer.learnable_parameters,
                total_norm=trainer.total_gradient_norm,
                clip_norm=max_norm,
            )
        if (max_value := self.config.clip_grad_value) is not None:
            clip_gradient_value(parameters=trainer.learnable_parameters, clip_value=max_value)
--- a/src/refiners/training_utils/trainer.py
+++ b/src/refiners/training_utils/trainer.py
@ -1,15 +1,12 @@
 import random
 import time
 from abc import ABC, abstractmethod, abstractproperty
 from dataclasses import dataclass
 from functools import cached_property, wraps
-from typing import Any, Callable, Generic, Iterable, TypeVar, cast
+from typing import Any, Callable, Generic, Literal, TypeVar, cast
 import numpy as np
 import torch
 from loguru import logger
-from torch import Tensor, cuda, device as Device, dtype as DType, get_rng_state, set_rng_state, stack
+from torch import Tensor, device as Device, dtype as DType, nn
 from torch.autograd import backward
 from torch.nn import Parameter
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
@ -27,73 +24,20 @@ from torch.optim.lr_scheduler import (
 from torch.utils.data import DataLoader, Dataset
 from refiners.fluxion import layers as fl
-from refiners.fluxion.utils import manual_seed, no_grad
+from refiners.fluxion.utils import no_grad
 from refiners.training_utils.callback import (
    Callback,
-    ClockCallback,
+    CallbackConfig,
    GradientNormClipping,
    GradientValueClipping,
 )
-from refiners.training_utils.config import BaseConfig, SchedulerType, TimeUnit, TimeValue
+from refiners.training_utils.clock import ClockConfig, TrainingClock
-from refiners.training_utils.dropout import DropoutCallback
+from refiners.training_utils.common import (
-
+    compute_grad_norm,
-__all__ = ["seed_everything", "scoped_seed", "Trainer"]
+    count_learnable_parameters,
-
+    human_readable_number,
-
+    scoped_seed,
-def count_learnable_parameters(parameters: Iterable[Parameter]) -> int:
+)
-    return sum(p.numel() for p in parameters if p.requires_grad)
+from refiners.training_utils.config import BaseConfig, ModelConfig, SchedulerType
-
+from refiners.training_utils.gradient_clipping import GradientClipping, GradientClippingConfig
 def human_readable_number(number: int) -> str:
    """
    Format `number` with one decimal place and a metric suffix ("", K, M, G, T, P, E).

    The value is divided by 1000 until its displayed magnitude drops below 1000; anything that is
    still larger after the "P" suffix is reported with "E".
    """
    float_number = float(number)
    for unit in ["", "K", "M", "G", "T", "P"]:
        # Compare the value as it will be displayed (one decimal): without the rounding,
        # 999_999 would be rendered as "1000.0K" instead of "1.0M".
        if round(abs(float_number), 1) < 1000:
            return f"{float_number:.1f}{unit}"
        float_number /= 1000
    return f"{float_number:.1f}E"
 def seed_everything(seed: int | None = None) -> None:
    """
    Seed Python's `random`, NumPy, `manual_seed` and `cuda.manual_seed_all` with one shared value.

    When `seed` is `None`, a value is drawn with `random.randint(0, 2**32 - 1)` and logged before use.
    """
    chosen_seed = seed
    if chosen_seed is None:
        chosen_seed = random.randint(0, 2**32 - 1)
        logger.info(f"Using random seed: {chosen_seed}")
    random.seed(chosen_seed)
    np.random.seed(chosen_seed)
    manual_seed(seed=chosen_seed)
    cuda.manual_seed_all(seed=chosen_seed)
 def scoped_seed(seed: int | Callable[..., int] | None = None) -> Callable[..., Callable[..., Any]]:
    """
    Decorator for setting a random seed within the scope of a function.
    This decorator sets the random seed for Python's built-in `random` module,
    `numpy`, and `torch` and `torch.cuda` at the beginning of the decorated function. After the
    function is executed, it restores the state of the random number generators
    to what it was before the function was called. This is useful for ensuring
    reproducibility for specific parts of the code without affecting randomness
    elsewhere.

    `seed` may be an int, `None` (a random seed is then picked by `seed_everything`), or a callable
    that receives the decorated function's positional arguments and returns the seed.
    The saved generator states are restored even when the decorated function raises.
    """
    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
            # Snapshot every generator before touching any of them.
            random_state = random.getstate()
            numpy_state = np.random.get_state()
            torch_state = get_rng_state()
            cuda_torch_state = cuda.get_rng_state()
            try:
                actual_seed = seed(*args) if callable(seed) else seed
                seed_everything(seed=actual_seed)
                return func(*args, **kwargs)
            finally:
                # Runs on success and on failure, so an exception cannot leak the scoped seed.
                random.setstate(random_state)
                np.random.set_state(numpy_state)
                set_rng_state(torch_state)
                cuda.set_rng_state(cuda_torch_state)
        return inner_wrapper
    return decorator
 class WarmupScheduler(LRScheduler):
@ -117,135 +61,6 @@ class WarmupScheduler(LRScheduler):
            self._step_count += 1
 class TrainingClock:
    """
    Tracks training progress and converts durations between steps, epochs and iterations.

    Durations (`TimeValue`) are read as mappings with a `"number"` and a `"unit"` entry.
    """
    def __init__(
        self,
        dataset_length: int,
        batch_size: int,
        training_duration: TimeValue,
        gradient_accumulation: TimeValue,
        evaluation_interval: TimeValue,
        lr_scheduler_interval: TimeValue,
    ) -> None:
        # Schedule description, stored as given.
        self.dataset_length = dataset_length
        self.batch_size = batch_size
        self.training_duration = training_duration
        self.gradient_accumulation = gradient_accumulation
        self.evaluation_interval = evaluation_interval
        self.lr_scheduler_interval = lr_scheduler_interval
        # Floor division: a trailing partial batch does not count towards an epoch.
        self.num_batches_per_epoch = dataset_length // batch_size
        # Timer and progress counters start from scratch.
        self.start_time, self.end_time = None, None
        self.step = self.epoch = self.iteration = 0
        self.num_batches_processed = self.num_minibatches_processed = 0
        self.loss: Tensor | None = None
    @cached_property
    def unit_to_steps(self) -> dict[TimeUnit, int]:
        """Number of steps represented by one of each `TimeUnit` (computed once, then cached)."""
        accumulation = self.gradient_accumulation
        if accumulation["unit"] == TimeUnit.EPOCH:
            steps_per_accumulation_unit = self.num_batches_per_epoch
        else:
            steps_per_accumulation_unit = 1
        return {
            TimeUnit.STEP: 1,
            TimeUnit.EPOCH: self.num_batches_per_epoch,
            TimeUnit.ITERATION: accumulation["number"] * steps_per_accumulation_unit,
        }
    def convert_time_unit_to_steps(self, number: int, unit: TimeUnit) -> int:
        """Express `number` units of `unit` as a step count."""
        steps_per_unit = self.unit_to_steps[unit]
        return number * steps_per_unit
    def convert_steps_to_time_unit(self, steps: int, unit: TimeUnit) -> int:
        """Express `steps` in `unit`, rounding down to whole units."""
        steps_per_unit = self.unit_to_steps[unit]
        return steps // steps_per_unit
    def convert_time_value(self, time_value: TimeValue, target_unit: TimeUnit) -> int:
        """Convert `time_value` to `target_unit`, going through a step count."""
        as_steps = self.convert_time_unit_to_steps(number=time_value["number"], unit=time_value["unit"])
        return self.convert_steps_to_time_unit(steps=as_steps, unit=target_unit)
    @cached_property
    def num_epochs(self) -> int:
        """Training duration expressed in epochs."""
        return self.convert_time_value(time_value=self.training_duration, target_unit=TimeUnit.EPOCH)
    @cached_property
    def num_iterations(self) -> int:
        """Training duration expressed in iterations."""
        return self.convert_time_value(time_value=self.training_duration, target_unit=TimeUnit.ITERATION)
    @cached_property
    def num_steps(self) -> int:
        """Training duration expressed in steps."""
        return self.convert_time_value(time_value=self.training_duration, target_unit=TimeUnit.STEP)
    @cached_property
    def num_step_per_iteration(self) -> int:
        """Steps covered by one gradient-accumulation interval."""
        interval = self.gradient_accumulation
        return self.convert_time_unit_to_steps(number=interval["number"], unit=interval["unit"])
    @cached_property
    def num_step_per_evaluation(self) -> int:
        """Steps covered by one evaluation interval."""
        interval = self.evaluation_interval
        return self.convert_time_unit_to_steps(number=interval["number"], unit=interval["unit"])
    def reset(self) -> None:
        """Clear the timer and all progress counters; cached conversions and `loss` are left untouched."""
        self.start_time, self.end_time = None, None
        self.step = self.epoch = self.iteration = 0
        self.num_batches_processed = self.num_minibatches_processed = 0
    def start_timer(self) -> None:
        """Record the current wall-clock time as the start time."""
        self.start_time = time.time()
    def stop_timer(self) -> None:
        """Record the current wall-clock time as the end time."""
        self.end_time = time.time()
    @property
    def time_elapsed(self) -> int:
        """Whole seconds since `start_timer` was called; asserts that the timer was started."""
        started_at = self.start_time
        assert started_at is not None, "Timer has not been started yet."
        return int(time.time() - started_at)
    @cached_property
    def evaluation_interval_steps(self) -> int:
        """Evaluation interval in steps (same computation as `num_step_per_evaluation`)."""
        interval = self.evaluation_interval
        return self.convert_time_unit_to_steps(number=interval["number"], unit=interval["unit"])
    @cached_property
    def lr_scheduler_interval_steps(self) -> int:
        """Learning-rate scheduler interval in steps."""
        interval = self.lr_scheduler_interval
        return self.convert_time_unit_to_steps(number=interval["number"], unit=interval["unit"])
    @property
    def is_optimizer_step(self) -> bool:
        """True when the processed minibatch count equals `num_step_per_iteration`."""
        return self.num_step_per_iteration == self.num_minibatches_processed
    @property
    def is_lr_scheduler_step(self) -> bool:
        """True when `step` is a multiple of the scheduler interval."""
        return not self.step % self.lr_scheduler_interval_steps
    @property
    def done(self) -> bool:
        """True once `step` has reached `num_steps`."""
        return not self.step < self.num_steps
    @property
    def is_evaluation_step(self) -> bool:
        """True when `step` is a multiple of the evaluation interval."""
        return not self.step % self.evaluation_interval_steps
 def compute_grad_norm(parameters: Iterable[Parameter]) -> float:
    """
    Return the norm of the per-parameter gradient norms, similar to the value returned by `clip_grad_norm_`.

    Parameters whose `grad` is `None` are ignored; an assertion fails when no gradient is available.
    """
    gradients: list[Tensor] = []
    for parameter in parameters:
        if parameter.grad is not None:
            gradients.append(parameter.grad.detach())
    assert gradients, "The model has no gradients to compute the norm."
    per_parameter_norms = [gradient.norm() for gradient in gradients]
    total_norm = stack(tensors=per_parameter_norms).norm().item()  # type: ignore
    return total_norm  # type: ignore
 Batch = TypeVar("Batch")
 ConfigType = TypeVar("ConfigType", bound=BaseConfig)
@ -267,44 +82,91 @@ class _Dataset(Dataset[Batch]):
        return self.length
-class Trainer(Generic[ConfigType, Batch], ABC):
+@dataclass
-    def __init__(self, config: ConfigType, callbacks: list[Callback[Any]] | None = None) -> None:
+class ModelItem:
-        self.config = config
+    name: str
-        self.clock = TrainingClock(
+    config: ModelConfig
-            dataset_length=self.dataset_length,
+    model: fl.Module
-            batch_size=config.training.batch_size,
+    learnable_parameters: list[nn.Parameter]
-            training_duration=config.training.duration,
+
-            evaluation_interval=config.training.evaluation_interval,
+
-            gradient_accumulation=config.training.gradient_accumulation,
+ModelRegistry = dict[str, ModelItem]
-            lr_scheduler_interval=config.scheduler.update_interval,
+ModuleT = TypeVar("ModuleT", bound=fl.Module)
 def register_model():
    def decorator(func: Callable[[Any, ModelConfig], ModuleT]) -> ModuleT:
        @wraps(func)
        def wrapper(self: Trainer[BaseConfig, Any], config: ModelConfig) -> fl.Module:
            name = func.__name__
            model = func(self, config)
            model = model.to(self.device, dtype=self.dtype)
            if config.requires_grad is not None:
                model.requires_grad_(requires_grad=config.requires_grad)
            learnable_parameters = [param for param in model.parameters() if param.requires_grad]
            self.models[name] = ModelItem(
                name=name, config=config, model=model, learnable_parameters=learnable_parameters
            )
-        self.callbacks = callbacks or []
+            setattr(self, name, self.models[name].model)
-        self.callbacks += self.default_callbacks()
+            return func(self, config)
        return wrapper  # type: ignore
    return decorator
 CallbackRegistry = dict[str, Callback[Any]]
 CallbackT = TypeVar("CallbackT", bound=Callback[Any])
 def register_callback():
    """
    Build a decorator that turns a `(self, config) -> callback` method into a registering factory.

    The wrapped method builds the callback once, stores it under the method's name in
    `self.callbacks`, rebinds `self.<name>` to it and returns that same registered instance.
    """
    def decorator(func: Callable[[Any, Any], CallbackT]) -> CallbackT:
        @wraps(func)
        def wrapper(self: "Trainer[BaseConfig, Any]", config: Any) -> CallbackT:
            name = func.__name__
            callback = func(self, config)
            self.callbacks[name] = callback
            setattr(self, name, callback)
            # Return the registered instance; calling `func` again would construct a second callback
            # (repeating any work done in its constructor) that is not the one stored on the trainer.
            return callback
        return wrapper  # type: ignore
    return decorator
 class Trainer(Generic[ConfigType, Batch], ABC):
    def __init__(self, config: ConfigType) -> None:
        self._models: ModelRegistry = {}
        self._callbacks: CallbackRegistry = {}
        self.config = config
        self._load_callbacks()
        self._call_callbacks(event_name="on_init_begin")
-        self.load_models()
+        self._load_models()
        self.prepare_models()
        self._call_callbacks(event_name="on_init_end")
-    def default_callbacks(self) -> list[Callback[Any]]:
+    @register_callback()
-        callbacks: list[Callback[Any]] = [
+    def clock(self, config: ClockConfig) -> TrainingClock:
-            ClockCallback(),
+        return TrainingClock(
-            GradientValueClipping(),
+            dataset_length=self.dataset_length,
-            GradientNormClipping(),
+            batch_size=self.config.training.batch_size,
-            DropoutCallback(),
+            training_duration=self.config.training.duration,
-        ]
+            evaluation_interval=self.config.training.evaluation_interval,
            gradient_accumulation=self.config.training.gradient_accumulation,
            lr_scheduler_interval=self.config.scheduler.update_interval,
            verbose=config.verbose,
        )
-        # look for any Callback that might be a property of the Trainer
+    @register_callback()
-        for attr_name in dir(self):
+    def gradient_clipping(self, config: GradientClippingConfig) -> GradientClipping:
-            if "__" in attr_name:
+        return GradientClipping(config)
                continue
-            try:
+    @property
-                attr = getattr(self, attr_name)
+    def models(self) -> ModelRegistry:
-            except AssertionError:
+        return self._models
-                continue
+
-            if isinstance(attr, Callback):
+    @property
-                callbacks.append(cast(Callback[Any], attr))
+    def callbacks(self) -> CallbackRegistry:
-        return callbacks
+        return self._callbacks
    @cached_property
    def device(self) -> Device:
@ -320,38 +182,31 @@ class Trainer(Generic[ConfigType, Batch], ABC):
        return dtype
    @property
-    def parameters(self) -> list[Parameter]:
+    def learnable_parameters(self) -> list[nn.Parameter]:
        """Returns a list of all parameters in all models"""
        return [param for model in self.models.values() for param in model.parameters()]
    @property
    def learnable_parameters(self) -> list[Parameter]:
        """Returns a list of learnable parameters in all models"""
-        return [param for model in self.models.values() for param in model.parameters() if param.requires_grad]
+        return [param for item in self.models.values() for param in item.learnable_parameters]
-    @property
+    @cached_property
    def optimizer_parameters(self) -> list[dict[str, Any]]:
        """
        Returns a list of `dict`-s containing the params and optimizer options for each model.
        See https://pytorch.org/docs/stable/optim.html#per-parameter-options for more details
        """
        params: list[dict[str, Any]] = []
-        for model_name, model in self.models.items():
+        for item in self.models.values():
-            model_params = [param for param in model.parameters() if param.requires_grad]
+            config = item.config
            model_config = self.config.models[model_name]
            model_optim_conf: dict[str, Any] = {}
-            if model_config.learning_rate is not None:
+            if config.learning_rate is not None:
-                model_optim_conf["lr"] = model_config.learning_rate
+                model_optim_conf["lr"] = config.learning_rate
-            if model_config.weight_decay is not None:
+            if config.weight_decay is not None:
-                model_optim_conf["weight_decay"] = model_config.learning_rate
+                model_optim_conf["weight_decay"] = config.learning_rate
-            if model_config.betas is not None:
+            if config.betas is not None:
-                model_optim_conf["betas"] = model_config.learning_rate
+                model_optim_conf["betas"] = config.learning_rate
-            if model_config.eps is not None:
+            if config.eps is not None:
-                model_optim_conf["eps"] = model_config.learning_rate
+                model_optim_conf["eps"] = config.learning_rate
-            for param in model_params:
+            params.append({"params": item.learnable_parameters, **model_optim_conf})
                params.append({"params": param, **model_optim_conf})
        return params
@ -363,17 +218,12 @@ class Trainer(Generic[ConfigType, Batch], ABC):
    @property
    def gradients(self) -> list[Tensor]:
        """Returns a list of detached gradients for all learnable parameters in all models"""
-        return [
+        return [param.grad.detach() for param in self.learnable_parameters if param.grad is not None]
            param.grad.detach()
            for model in self.models.values()
            for param in model.parameters()
            if param.grad is not None
        ]
    @property
    def total_gradient_norm(self) -> float:
        """Returns the total gradient norm for all learnable parameters in all models"""
-        return compute_grad_norm(parameters=self.parameters)
+        return compute_grad_norm(parameters=self.learnable_parameters)
    @cached_property
    def optimizer(self) -> Optimizer:
@ -441,38 +291,6 @@ class Trainer(Generic[ConfigType, Batch], ABC):
        return lr_scheduler
    @cached_property
    def models(self) -> dict[str, fl.Module]:
        """Return the result of `load_models()`; `cached_property` evaluates it once per instance."""
        return self.load_models()
    def set_models_to_train_mode(self) -> None:
        """Call `train()` on every entry of `self.models`."""
        for model_name in self.models:
            self.models[model_name].train()
    def set_models_to_eval_mode(self) -> None:
        """Call `eval()` on every entry of `self.models`."""
        for model_name in self.models:
            self.models[model_name].eval()
    def prepare_model(self, model_name: str) -> None:
        """
        Prepare the model registered under `model_name` using its entry in `self.config.models`.

        Loads the configured checkpoint when there is one (otherwise logs that the model starts from
        scratch), applies `requires_grad` when it is set, then moves the model to `self.device` and
        zeroes its gradients.
        """
        module = self.models[model_name]
        checkpoint_path = self.config.models[model_name].checkpoint
        if checkpoint_path is None:
            logger.info(f"No checkpoint found. Initializing model `{model_name}` from scratch.")
        else:
            module.load_from_safetensors(tensors_path=checkpoint_path)
        trainable = self.config.models[model_name].requires_grad
        if trainable is not None:
            module.requires_grad_(requires_grad=trainable)
        module.to(self.device)
        module.zero_grad()
    def prepare_models(self) -> None:
        """Run `prepare_model` for every name in `self.models`; asserts that at least one model exists."""
        registered = self.models
        assert registered, "No models found."
        for registered_name in registered:
            self.prepare_model(model_name=registered_name)
    @abstractmethod
    def load_models(self) -> dict[str, fl.Module]:
        """Return the models keyed by name; abstract, so concrete trainers must implement it."""
        ...
    @abstractmethod
    def get_item(self, index: int) -> Batch:
        """
@ -563,7 +381,7 @@ class Trainer(Generic[ConfigType, Batch], ABC):
    @scoped_seed(seed=get_training_seed)
    def train(self) -> None:
        """Train the model."""
-        self.set_models_to_train_mode()
+        self.set_models_to_mode("train")
        self._call_callbacks(event_name="on_train_begin")
        assert self.learnable_parameters, "There are no learnable parameters in the models."
        self.evaluate()
@ -581,12 +399,43 @@ class Trainer(Generic[ConfigType, Batch], ABC):
    @scoped_seed(seed=get_evaluation_seed)
    def evaluate(self) -> None:
        """Evaluate the model."""
-        self.set_models_to_eval_mode()
+        self.set_models_to_mode(mode="eval")
        self._call_callbacks(event_name="on_evaluate_begin")
        self.compute_evaluation()
        self._call_callbacks(event_name="on_evaluate_end")
-        self.set_models_to_train_mode()
+        self.set_models_to_mode(mode="train")
    def set_models_to_mode(self, mode: Literal["train", "eval"]) -> None:
        """Call `train()` or `eval()` on the `model` of every registry item; other values do nothing."""
        for registered in self.models.values():
            if mode in ("train", "eval"):
                # The mode name is also the name of the method to call on the module.
                getattr(registered.model, mode)()
    def _call_callbacks(self, event_name: str) -> None:
-        for callback in self.callbacks:
+        for callback in self.callbacks.values():
            getattr(callback, event_name)(self)
    def _load_callbacks(self) -> None:
        """
        Call the trainer attribute named after each `CallbackConfig` entry found while iterating `self.config`.

        Entries that are not `CallbackConfig` instances are skipped. Raises `ValueError` when the
        trainer has no attribute with the entry's name.
        """
        for name, config in self.config:
            if isinstance(config, CallbackConfig):
                try:
                    callback_factory = getattr(self, name)
                except AttributeError:
                    raise ValueError(
                        f"Callback {name} is in the config but not registered in the Trainer. Create a method with the @register_callback decorator."
                    )
                else:
                    assert callable(callback_factory)
                    callback_factory(config)
    def _load_models(self) -> None:
        for name, config in self.config.models.items():
            try:
                registered_model = getattr(self, name)
            except AttributeError:
                raise ValueError(
                    f"Model {name} is in the config but not registered in the Trainer. Create a method with the @register_model decorator."
                )
            assert callable(registered_model)
            registered_model(config)
--- a/src/refiners/training_utils/wandb.py
+++ b/src/refiners/training_utils/wandb.py
@ -1,15 +1,13 @@
 from abc import ABC
 from functools import cached_property
 from pathlib import Path
 from typing import Any, Literal
 import wandb
 from PIL import Image
 from pydantic import BaseModel, ConfigDict
-from refiners.training_utils.callback import Callback
+from refiners.training_utils.callback import Callback, CallbackConfig
 from refiners.training_utils.config import BaseConfig
-from refiners.training_utils.trainer import Trainer
+from refiners.training_utils.trainer import Trainer, register_callback
 number = float | int
 WandbLoggable = number | Image.Image | list[number] | dict[str, list[number]]
@ -64,7 +62,7 @@ class WandbLogger:
        return self.wandb_run.name or ""  # type: ignore
-class WandbConfig(BaseModel):
+class WandbConfig(CallbackConfig):
    """
    Wandb configuration.
@ -87,18 +85,16 @@ class WandbConfig(BaseModel):
    anonymous: Literal["never", "allow", "must"] | None = None
    id: str | None = None
    model_config = ConfigDict(extra="forbid")
 AnyTrainer = Trainer[BaseConfig, Any]
 class WandbCallback(Callback["TrainerWithWandb"]):
-    epoch_losses: list[float]
+    def __init__(self, config: WandbConfig, /, trainer_config: dict[str, Any]) -> None:
-    iteration_losses: list[float]
+        self.config = config
-
+        self.epoch_losses: list[float] = []
-    def on_init_begin(self, trainer: "TrainerWithWandb") -> None:
+        self.iteration_losses: list[float] = []
-        trainer.load_wandb()
+        self.logger = WandbLogger({**config.model_dump(), "config": trainer_config})
    def on_train_begin(self, trainer: "TrainerWithWandb") -> None:
        self.epoch_losses = []
@ -131,19 +127,13 @@ class WandbMixin(ABC):
    config: Any
    wandb_logger: WandbLogger
-    def load_wandb(self) -> None:
+    @register_callback()
-        wandb_config = getattr(self.config, "wandb", None)
+    def wandb(self, config: WandbConfig) -> WandbCallback:
-        assert wandb_config is not None and isinstance(wandb_config, WandbConfig), "Wandb config is not set"
+        return WandbCallback(config, trainer_config=self.config.model_dump())
        init_config = {**wandb_config.model_dump(), "config": self.config.model_dump()}
        self.wandb_logger = WandbLogger(init_config=init_config)
    def wandb_log(self, data: dict[str, WandbLoggable]) -> None:
        assert isinstance(self, Trainer), "WandbMixin must be mixed with a Trainer"
-        self.wandb_logger.log(data=data, step=self.clock.step)
+        self.wandb.logger.log(data=data, step=self.clock.step)
    @cached_property
    def wandb_callback(self) -> WandbCallback:
        """Create a `WandbCallback` with no arguments on first access; `cached_property` reuses it afterwards."""
        return WandbCallback()
 class TrainerWithWandb(AnyTrainer, WandbMixin, ABC):
--- a/tests/training_utils/mock_config.toml
+++ b/tests/training_utils/mock_config.toml
@ -1,6 +1,11 @@
 [models.mock_model]
 requires_grad = true
 [clock]
 verbose = false
 [gradient_clipping]
 clip_grad_norm = 1.0
 [training]
 duration = "100:epoch"
@ -9,7 +14,6 @@ device = "cpu"
 dtype = "float32"
 batch_size = 4
 gradient_accumulation = "4:step"
 clip_grad_norm = 1.0
 evaluation_interval = "5:epoch"
 evaluation_seed = 1
@ -21,6 +25,3 @@ learning_rate = 1
 scheduler_type = "ConstantLR"
 update_interval = "1:step"
 warmup = "20:step"
 [dropout]
 dropout_probability = 0.0
--- a/tests/training_utils/mock_config_2_models.toml
+++ b/tests/training_utils/mock_config_2_models.toml
@ -5,12 +5,17 @@ learning_rate = 1e-5
 [models.mock_model2]
 requires_grad = true
 [clock]
 verbose = false
 [gradient_clipping]
 clip_grad_norm = 1.0
 [training]
 duration = "100:epoch"
 seed = 0
 batch_size = 4
 gradient_accumulation = "4:step"
 clip_grad_norm = 1.0
 evaluation_interval = "5:epoch"
 evaluation_seed = 1
@ -21,6 +26,3 @@ learning_rate = 1
 [scheduler]
 scheduler_type = "ConstantLR"
 update_interval = "1:step"
 [dropout]
 dropout_probability = 0.0
--- a/tests/training_utils/test_trainer.py
+++ b/tests/training_utils/test_trainer.py
@ -1,5 +1,4 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
 from typing import cast
@ -10,13 +9,15 @@ from torch.optim import SGD
 from refiners.fluxion import layers as fl
 from refiners.fluxion.utils import norm
-from refiners.training_utils.config import BaseConfig, TimeUnit
+from refiners.training_utils.common import TimeUnit, count_learnable_parameters, human_readable_number
 from refiners.training_utils.config import BaseConfig, ModelConfig
 from refiners.training_utils.trainer import (
    Trainer,
    TrainingClock,
    WarmupScheduler,
    count_learnable_parameters,
    human_readable_number,
    register_model,
 )
@ -55,13 +56,10 @@ class MockTrainer(Trainer[MockConfig, MockBatch]):
            targets=torch.cat([b.targets for b in batch]),
        )
-    @cached_property
+    @register_model()
-    def mock_model(self) -> MockModel:
+    def mock_model(self, config: ModelConfig) -> MockModel:
        return MockModel()
    def load_models(self) -> dict[str, fl.Module]:
        return {"mock_model": self.mock_model}
    def compute_loss(self, batch: MockBatch) -> Tensor:
        self.step_counter += 1
        inputs, targets = batch.inputs.to(self.device), batch.targets.to(self.device)
@ -217,17 +215,14 @@ def test_warmup_lr(warmup_scheduler: WarmupScheduler) -> None:
 class MockTrainerWith2Models(MockTrainer):
-    @cached_property
+    @register_model()
-    def mock_model1(self) -> MockModel:
+    def mock_model1(self, config: ModelConfig) -> MockModel:
        return MockModel()
-    @cached_property
+    @register_model()
-    def mock_model2(self) -> MockModel:
+    def mock_model2(self, config: ModelConfig) -> MockModel:
        return MockModel()
    def load_models(self) -> dict[str, fl.Module]:
        return {"mock_model1": self.mock_model1, "mock_model2": self.mock_model2}
    def compute_loss(self, batch: MockBatch) -> Tensor:
        self.step_counter += 1
        inputs, targets = batch.inputs.to(self.device), batch.targets.to(self.device)
@ -246,7 +241,5 @@ def mock_trainer_2_models(mock_config_2_models: MockConfig) -> MockTrainerWith2M
 def test_optimizer_parameters(mock_trainer_2_models: MockTrainerWith2Models) -> None:
-    assert (
+    assert len(mock_trainer_2_models.optimizer.param_groups) == 2
        len(mock_trainer_2_models.optimizer.param_groups) == 12
    )  # 12 == (3 [linear layers] * 2 [bias + weights]) * 2 [models]
    assert mock_trainer_2_models.optimizer.param_groups[0]["lr"] == 1e-5