Add sample_noise staticmethod and modify add_noise to support batched steps

2024-11-21 21:58:47 +00:00 · 2024-04-18 09:42:31 +00:00 · 2024-04-18 09:42:31 +00:00 · 17246708b9
parent 7427c171f6
commit 17246708b9
5 changed files with 122 additions and 8 deletions
--- a/src/refiners/foundationals/latent_diffusion/model.py
+++ b/src/refiners/foundationals/latent_diffusion/model.py
@ -35,32 +35,68 @@ class LatentDiffusionModel(fl.Module, ABC):
    def set_inference_steps(self, num_steps: int, first_step: int = 0) -> None:
        self.solver = self.solver.rebuild(num_inference_steps=num_steps, first_inference_step=first_step)
    @staticmethod
    def sample_noise(
        size: tuple[int, ...],
        device: Device | None = None,
        dtype: DType | None = None,
        offset_noise: float | None = None,
    ) -> torch.Tensor:
        """Sample noise from a normal distribution with an optional offset.
        Args:
            size: The size of the noise tensor.
            device: The device to put the noise tensor on.
            dtype: The data type of the noise tensor.
            offset_noise: The offset of the noise tensor.
                Useful at training time, see https://www.crosslabs.org/blog/diffusion-with-offset-noise.
        """
        noise = torch.randn(size=size, device=device, dtype=dtype)
        if offset_noise is not None:
            noise += offset_noise * torch.randn(size=(size[0], size[1], 1, 1), device=device, dtype=dtype)
        return noise
    def init_latents(
        self,
        size: tuple[int, int],
        init_image: Image.Image | None = None,
        noise: Tensor | None = None,
    ) -> Tensor:
        """Initialize the latents for the diffusion process.
        Args:
            size: The size of the latent (in pixel space).
            init_image: The image to use as initialization for the latents.
            noise: The noise to add to the latents.
        """
        height, width = size
        latent_height = height // 8
        latent_width = width // 8
        if noise is None:
-            noise = torch.randn(1, 4, height // 8, width // 8, device=self.device)
+            noise = LatentDiffusionModel.sample_noise(
                size=(1, 4, latent_height, latent_width),
                device=self.device,
                dtype=self.dtype,
            )
        assert list(noise.shape[2:]) == [
-            height // 8,
+            latent_height,
-            width // 8,
+            latent_width,
        ], f"noise shape is not compatible: {noise.shape}, with size: {size}"
        if init_image is None:
-            x = noise
+            latent = noise
        else:
            resized = init_image.resize(size=(width, height))  # type: ignore
            encoded_image = self.lda.image_to_latents(resized)
-            x = self.solver.add_noise(
+            latent = self.solver.add_noise(
                x=encoded_image,
                noise=noise,
                step=self.solver.first_inference_step,
            )
-        return self.solver.scale_model_input(x, step=-1)
+        return self.solver.scale_model_input(latent, step=-1)
    @property
    def steps(self) -> list[int]:
--- a/src/refiners/foundationals/latent_diffusion/solvers/solver.py
+++ b/src/refiners/foundationals/latent_diffusion/solvers/solver.py
@ -4,7 +4,19 @@ from enum import Enum
 from typing import TypeVar
 import numpy as np
-from torch import Generator, Tensor, arange, device as Device, dtype as DType, float32, linspace, log, sqrt, tensor
+from torch import (
    Generator,
    Tensor,
    arange,
    device as Device,
    dtype as DType,
    float32,
    linspace,
    log,
    sqrt,
    stack,
    tensor,
 )
 from refiners.fluxion import layers as fl
@ -208,7 +220,7 @@ class Solver(fl.Module, ABC):
            offset=self.params.timesteps_offset,
        )
-    def add_noise(
+    def _add_noise(
        self,
        x: Tensor,
        noise: Tensor,
@ -227,9 +239,43 @@ class Solver(fl.Module, ABC):
        timestep = self.timesteps[step]
        cumulative_scale_factors = self.cumulative_scale_factors[timestep]
        noise_stds = self.noise_std[timestep]
        # noisify the latents, arXiv:2006.11239 Eq. 4
        noised_x = cumulative_scale_factors * x + noise_stds * noise
        return noised_x
    def add_noise(
        self,
        x: Tensor,
        noise: Tensor,
        step: int | list[int],
    ) -> Tensor:
        """Add noise to the input tensor using the solver's parameters.
        Args:
            x: The input tensor to add noise to.
            noise: The noise tensor to add to the input tensor.
            step: The current step(s) of the diffusion process.
        Returns:
            The input tensor with added noise.
        """
        if isinstance(step, list):
            assert len(x) == len(noise) == len(step), "x, noise, and step must have the same length"
            return stack(
                tensors=[
                    self._add_noise(
                        x=x[i],
                        noise=noise[i],
                        step=step[i],
                    )
                    for i in range(x.shape[0])
                ],
                dim=0,
            )
        return self._add_noise(x=x, noise=noise, step=step)
    def remove_noise(self, x: Tensor, noise: Tensor, step: int) -> Tensor:
        """Remove noise from the input tensor using the current step of the diffusion process.
--- a/tests/e2e/test_lightning_ref/expected_lightning_base_4step.png
+++ b/tests/e2e/test_lightning_ref/expected_lightning_base_4step.png
--- a/tests/foundationals/latent_diffusion/test_model.py
+++ b/tests/foundationals/latent_diffusion/test_model.py
@ -0,0 +1,14 @@
 import torch
 from refiners.fluxion.utils import manual_seed, no_grad
 from refiners.foundationals.latent_diffusion.model import LatentDiffusionModel
@no_grad()
 def test_sample_noise():
    manual_seed(2)
    latents_0 = LatentDiffusionModel.sample_noise(size=(1, 4, 64, 64))
    manual_seed(2)
    latents_1 = LatentDiffusionModel.sample_noise(size=(1, 4, 64, 64), offset_noise=0.0)
    assert torch.allclose(latents_0, latents_1, atol=1e-6, rtol=0)
--- a/tests/foundationals/latent_diffusion/test_solvers.py
+++ b/tests/foundationals/latent_diffusion/test_solvers.py
@ -198,6 +198,24 @@ def test_solver_device(test_device: Device):
    assert noised.device == test_device
 def test_solver_add_noise(test_device: Device):
    scheduler = DDIM(num_inference_steps=30, device=test_device)
    latent = randn(1, 4, 32, 32, device=test_device)
    noise = randn(1, 4, 32, 32, device=test_device)
    noised = scheduler.add_noise(
        x=latent,
        noise=noise,
        step=0,
    )
    noised_double = scheduler.add_noise(
        x=latent.repeat(2, 1, 1, 1),
        noise=noise.repeat(2, 1, 1, 1),
        step=[0, 0],
    )
    assert allclose(noised, noised_double[0])
    assert allclose(noised, noised_double[1])
@pytest.mark.parametrize("noise_schedule", [NoiseSchedule.UNIFORM, NoiseSchedule.QUADRATIC, NoiseSchedule.KARRAS])
 def test_solver_noise_schedules(noise_schedule: NoiseSchedule, test_device: Device):
    scheduler = DDIM(