diff --git a/src/refiners/training_utils/clock.py b/src/refiners/training_utils/clock.py index d206456..fb54e7e 100644 --- a/src/refiners/training_utils/clock.py +++ b/src/refiners/training_utils/clock.py @@ -25,7 +25,6 @@ class TrainingClock(Callback["Trainer[BaseConfig, Any]"]): batch_size: int, training_duration: TimeValue, gradient_accumulation: TimeValue, - evaluation_interval: TimeValue, lr_scheduler_interval: TimeValue, verbose: bool = True, ) -> None: @@ -37,7 +36,6 @@ class TrainingClock(Callback["Trainer[BaseConfig, Any]"]): self.batch_size = batch_size self.training_duration = training_duration self.gradient_accumulation = gradient_accumulation - self.evaluation_interval = evaluation_interval self.lr_scheduler_interval = lr_scheduler_interval self.verbose = verbose self.num_batches_per_epoch = dataset_length // batch_size @@ -85,10 +83,6 @@ class TrainingClock(Callback["Trainer[BaseConfig, Any]"]): def num_step_per_iteration(self) -> int: return self.convert_time_value_to_steps(self.gradient_accumulation) - @cached_property - def num_step_per_evaluation(self) -> int: - return self.convert_time_value_to_steps(self.evaluation_interval) - def is_due(self, interval: TimeValue) -> bool: return self.step % self.convert_time_value_to_steps(interval) == 0 @@ -171,9 +165,3 @@ class TrainingClock(Callback["Trainer[BaseConfig, Any]"]): self.log(f"Iteration {trainer.clock.iteration} ended.") trainer.clock.iteration += 1 trainer.clock.num_minibatches_processed = 0 - - def on_evaluate_begin(self, trainer: "Trainer[BaseConfig, Any]") -> None: - self.log("Evaluation started.") - - def on_evaluate_end(self, trainer: "Trainer[BaseConfig, Any]") -> None: - self.log("Evaluation ended.") diff --git a/src/refiners/training_utils/config.py b/src/refiners/training_utils/config.py index f1b654e..3fc248f 100644 --- a/src/refiners/training_utils/config.py +++ b/src/refiners/training_utils/config.py @@ -26,13 +26,11 @@ class TrainingConfig(BaseModel): seed: int = 0 batch_size: int = 1 gradient_accumulation: Step | Epoch = Step(1) - evaluation_interval: Iteration | Epoch = Iteration(1) gradient_clipping_max_norm: float | None = None - evaluation_seed: int = 0 model_config = ConfigDict(extra="forbid") - @field_validator("duration", "gradient_accumulation", "evaluation_interval", mode="before") + @field_validator("duration", "gradient_accumulation", mode="before") def parse_field(cls, value: TimeValueInput) -> TimeValue: return parse_number_unit_field(value) diff --git a/src/refiners/training_utils/trainer.py b/src/refiners/training_utils/trainer.py index 997f097..2776121 100644 --- a/src/refiners/training_utils/trainer.py +++ b/src/refiners/training_utils/trainer.py @@ -24,7 +24,6 @@ from torch.optim.lr_scheduler import ( from torch.utils.data import DataLoader, Dataset from refiners.fluxion import layers as fl -from refiners.fluxion.utils import no_grad from refiners.training_utils.callback import ( Callback, CallbackConfig, @@ -154,7 +153,6 @@ class Trainer(Generic[ConfigType, Batch], ABC): dataset_length=self.dataset_length, batch_size=self.config.training.batch_size, training_duration=self.config.training.duration, - evaluation_interval=self.config.training.evaluation_interval, gradient_accumulation=self.config.training.gradient_accumulation, lr_scheduler_interval=self.config.lr_scheduler.update_interval, verbose=config.verbose, @@ -345,9 +343,6 @@ class Trainer(Generic[ConfigType, Batch], ABC): @abstractmethod def compute_loss(self, batch: Batch) -> Tensor: ... - def compute_evaluation(self) -> None: - pass - def backward(self) -> None: """Backward pass on the loss.""" self._call_callbacks(event_name="on_backward_begin") @@ -365,8 +360,6 @@ class Trainer(Generic[ConfigType, Batch], ABC): self._call_callbacks(event_name="on_lr_scheduler_step_begin") self.lr_scheduler.step() self._call_callbacks(event_name="on_lr_scheduler_step_end") - if self.clock.is_due(self.config.training.evaluation_interval): - self.evaluate() def step(self, batch: Batch) -> None: """Perform a single training step.""" @@ -395,27 +388,12 @@ class Trainer(Generic[ConfigType, Batch], ABC): self.set_models_to_mode("train") self._call_callbacks(event_name="on_train_begin") assert self.learnable_parameters, "There are no learnable parameters in the models." - self.evaluate() while not self.clock.done: self._call_callbacks(event_name="on_epoch_begin") self.epoch() self._call_callbacks(event_name="on_epoch_end") self._call_callbacks(event_name="on_train_end") - @staticmethod - def get_evaluation_seed(instance: "Trainer[BaseConfig, Any]") -> int: - return instance.config.training.evaluation_seed - - @no_grad() - @scoped_seed(seed=get_evaluation_seed) - def evaluate(self) -> None: - """Evaluate the model.""" - self.set_models_to_mode(mode="eval") - self._call_callbacks(event_name="on_evaluate_begin") - self.compute_evaluation() - self._call_callbacks(event_name="on_evaluate_end") - self.set_models_to_mode(mode="train") - def set_models_to_mode(self, mode: Literal["train", "eval"]) -> None: for item in self.models.values(): if mode == "train": diff --git a/tests/training_utils/mock_config.toml b/tests/training_utils/mock_config.toml index a42a746..a147164 100644 --- a/tests/training_utils/mock_config.toml +++ b/tests/training_utils/mock_config.toml @@ -4,7 +4,6 @@ on_batch_end_seed = 42 on_optimizer_step_interval = "2:iteration" - [mock_model] requires_grad = true use_activation = true @@ -19,8 +18,6 @@ device = "cpu" dtype = "float32" batch_size = 4 gradient_accumulation = "4:step" -evaluation_interval = "5:epoch" -evaluation_seed = 1 gradient_clipping_max_norm = 1.0 [optimizer] diff --git a/tests/training_utils/mock_config_2_models.toml b/tests/training_utils/mock_config_2_models.toml index 361890a..641bef2 100644 --- a/tests/training_utils/mock_config_2_models.toml +++ b/tests/training_utils/mock_config_2_models.toml @@ -13,8 +13,6 @@ duration = "100:epoch" seed = 0 batch_size = 4 gradient_accumulation = "4:step" -evaluation_interval = "5:epoch" -evaluation_seed = 1 gradient_clipping_max_norm = 1.0 [optimizer] diff --git a/tests/training_utils/test_trainer.py b/tests/training_utils/test_trainer.py index fdfef5b..f7803d2 100644 --- a/tests/training_utils/test_trainer.py +++ b/tests/training_utils/test_trainer.py @@ -186,7 +186,6 @@ def training_clock() -> TrainingClock: batch_size=10, training_duration=Epoch(5), gradient_accumulation=Epoch(1), - evaluation_interval=Epoch(1), lr_scheduler_interval=Epoch(1), ) @@ -198,7 +197,6 @@ def test_small_dataset_error(): batch_size=10, training_duration=Epoch(5), gradient_accumulation=Epoch(1), - evaluation_interval=Epoch(1), lr_scheduler_interval=Epoch(1), ) @@ -210,7 +208,6 @@ def test_zero_batch_size_error(): batch_size=0, training_duration=Epoch(5), gradient_accumulation=Epoch(1), - evaluation_interval=Epoch(1), lr_scheduler_interval=Epoch(1), ) @@ -244,13 +241,6 @@ def test_timer_functionality(training_clock: TrainingClock) -> None: assert training_clock.time_elapsed >= 0 -def test_state_based_properties(training_clock: TrainingClock) -> None: - training_clock.step = 5 # Halfway through the first epoch - assert not training_clock.is_due(training_clock.evaluation_interval) # Assuming evaluation every epoch - training_clock.step = 10 # End of the first epoch - assert training_clock.is_due(training_clock.evaluation_interval) - - def test_mock_trainer_initialization(mock_config: MockConfig, mock_trainer: MockTrainer) -> None: assert mock_trainer.config == mock_config assert isinstance(mock_trainer, MockTrainer)