implement StyleAlignedAdapter

2024-11-21 13:48:46 +00:00 · 2024-02-15 14:10:39 +00:00 · 2024-02-15 14:10:39 +00:00 · efa3988638
parent 432e32f94f
commit efa3988638
1 changed files with 329 additions and 0 deletions
--- a/src/refiners/foundationals/latent_diffusion/style_aligned.py
+++ b/src/refiners/foundationals/latent_diffusion/style_aligned.py
@ -0,0 +1,329 @@
+from functools import cached_property
+from typing import Generic, TypeVar
+
+import torch
+from jaxtyping import Float
+from torch import Tensor
+
+import refiners.fluxion.layers as fl
+from refiners.fluxion.adapters.adapter import Adapter
+from refiners.foundationals.latent_diffusion import SD1UNet, SDXLUNet
+
+T = TypeVar("T", bound="SD1UNet | SDXLUNet")
+
+
+class ExtractReferenceFeatures(fl.Module):
+    """Extract the reference features from the input features.
+
+    Note:
+        This layer expects the input features to be a concatenation of conditional and unconditional features,
+        as done when using Classifier-free guidance (CFG).
+
+    The reference features are the first features of the conditional and unconditional input features.
+    They are extracted, and repeated to match the batch size of the input features.
+
+    Receives:
+        features (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The input features.
+
+    Returns:
+        reference (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The reference features.
+    """
+
+    def forward(
+        self,
+        features: Float[Tensor, "cfg_batch_size sequence_length embedding_dim"],
+    ) -> Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]:
+        cfg_batch_size = features.shape[0]
+        batch_size = cfg_batch_size // 2
+
+        # split the cfg
+        features_cond, features_uncond = torch.chunk(features, 2, dim=0)
+        # -> 2 x (batch_size, sequence_length, embedding_dim)
+
+        # extract the reference features
+        features_ref = torch.stack(
+            (
+                features_cond[0],  # (sequence_length, embedding_dim)
+                features_uncond[0],  # (sequence_length, embedding_dim)
+            ),
+        )  # -> (2, sequence_length, embedding_dim)
+
+        # repeat the reference features to match the batch size
+        features_ref = features_ref.repeat_interleave(batch_size, dim=0)
+        # -> (cfg_batch_size, sequence_length, embedding_dim)
+
+        return features_ref
+
+
+class AdaIN(fl.Module):
+    """Apply Adaptive Instance Normalization (AdaIN) to the target features.
+
+    See [[arXiv:1703.06868] Arbitrary Style Transfer in Real-time with Adaptive Instance Normalization](https://arxiv.org/abs/1703.06868) for more details.
+
+    Receives:
+        reference (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The reference features.
+        targets (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The target features.
+
+    Returns:
+        reference (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The reference features (unchanged).
+        targets (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The target features, renormalized.
+    """
+
+    def __init__(self, epsilon: float = 1e-8) -> None:
+        """Initialize the AdaIN module.
+
+        Args:
+            epsilon: A small value to avoid division by zero.
+        """
+        super().__init__()
+        self.epsilon = epsilon
+
+    def forward(
+        self,
+        targets: Float[Tensor, "cfg_batch_size sequence_length embedding_dim"],
+        reference: Float[Tensor, "cfg_batch_size sequence_length embedding_dim"],
+    ) -> tuple[
+        Float[Tensor, "cfg_batch_size sequence_length embedding_dim"],  # targets (renormalized)
+        Float[Tensor, "cfg_batch_size sequence_length embedding_dim"],  # reference (unchanged)
+    ]:
+        targets_mean = torch.mean(targets, dim=-2, keepdim=True)
+        targets_std = torch.std(targets, dim=-2, keepdim=True)
+        targets_normalized = (targets - targets_mean) / (targets_std + self.epsilon)
+
+        reference_mean = torch.mean(reference, dim=-2, keepdim=True)
+        reference_std = torch.std(reference, dim=-2, keepdim=True)
+        targets_renormalized = targets_normalized * reference_std + reference_mean
+
+        return (
+            targets_renormalized,
+            reference,
+        )
+
+
+class ScaleReferenceFeatures(fl.Module):
+    """Scale the reference features.
+
+    Note:
+        This layer expects the input features to be a concatenation of conditional and unconditional features,
+        as done when using Classifier-free guidance (CFG).
+
+    This layer scales the reference features which will later be used (in the attention dot product) with the target features.
+
+    Receives:
+        features (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The input reference features.
+
+    Returns:
+        features (Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]): The rescaled reference features.
+    """
+
+    def __init__(
+        self,
+        scale: float = 1.0,
+    ) -> None:
+        """Initialize the ScaleReferenceFeatures module.
+
+        Args:
+            scale: The scaling factor.
+        """
+        super().__init__()
+        self.scale = scale
+
+    def forward(
+        self,
+        features: Float[Tensor, "cfg_batch_size sequence_length embedding_dim"],
+    ) -> Float[Tensor, "cfg_batch_size sequence_length embedding_dim"]:
+        cfg_batch_size = features.shape[0]
+        batch_size = cfg_batch_size // 2
+
+        # clone the features
+        # needed because all the following operations are in-place
+        features = features.clone()
+
+        # "stack" the cfg
+        features_cfg_stack = features.reshape(2, batch_size, *features.shape[1:])
+
+        # scale the reference features which will later be used (in the attention dot product) with the target features
+        features_cfg_stack[:, 1:] *= self.scale
+
+        # "unstack" the cfg
+        features = features_cfg_stack.reshape(features.shape)
+
+        return features
+
+
+class StyleAligned(fl.Chain):
+    """StyleAligned module.
+
+    This layer encapsulates the logic of the StyleAligned method,
+    as described in [[arXiv:2312.02133] Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133).
+
+    See also <https://blog.finegrain.ai/posts/implementing-style-aligned/>.
+
+    Receives:
+        features (Float[Tensor, "cfg_batch_size sequence_length_in embedding_dim"]): The input features.
+
+    Returns:
+        shared_features (Float[Tensor, "cfg_batch_size sequence_length_out embedding_dim"]): The transformed features.
+    """
+
+    def __init__(
+        self,
+        adain: bool,
+        concatenate: bool,
+        scale: float = 1.0,
+    ) -> None:
+        """Initialize the StyleAligned module.
+
+        Args:
+            adain: Whether to apply Adaptive Instance Normalization to the target features.
+            scale: The scaling factor for the reference features.
+            concatenate: Whether to concatenate the reference and target features.
+        """
+        super().__init__(
+            # (features): (cfg_batch_size sequence_length embedding_dim)
+            fl.Parallel(
+                fl.Identity(),
+                ExtractReferenceFeatures(),
+            ),
+            # (targets, reference)
+            AdaIN(),
+            # (targets_renormalized, reference)
+            fl.Distribute(
+                fl.Identity(),
+                ScaleReferenceFeatures(scale=scale),
+            ),
+            # (targets_renormalized, reference_scaled)
+            fl.Concatenate(
+                fl.GetArg(index=0),  # targets
+                fl.GetArg(index=1),  # reference
+                dim=-2,  # sequence_length
+            ),
+            # (features_with_shared_reference)
+        )
+
+        if not adain:
+            adain_module = self.ensure_find(AdaIN)
+            self.remove(adain_module)
+
+        if not concatenate:
+            concatenate_module = self.ensure_find(fl.Concatenate)
+            self.replace(
+                old_module=concatenate_module,
+                new_module=fl.GetArg(index=0),  # targets
+            )
+
+    @property
+    def scale(self) -> float:
+        """The scaling factor for the reference features."""
+        scale_reference = self.ensure_find(ScaleReferenceFeatures)
+        return scale_reference.scale
+
+    @scale.setter
+    def scale(self, scale: float) -> None:
+        scale_reference = self.ensure_find(ScaleReferenceFeatures)
+        scale_reference.scale = scale
+
+
+class SharedSelfAttentionAdapter(fl.Chain, Adapter[fl.SelfAttention]):
+    """Upgrades a `SelfAttention` layer into a `SharedSelfAttention` layer.
+
+    This adapter inserts 3 `StyleAligned` modules right after
+    the original Q, K, V `Linear`-s (wrapped inside a `fl.Distribute`).
+    """
+
+    def __init__(
+        self,
+        target: fl.SelfAttention,
+        scale: float = 1.0,
+    ) -> None:
+        with self.setup_adapter(target):
+            super().__init__(target)
+
+        self._style_aligned_layers = [
+            StyleAligned(  # Query
+                adain=True,
+                concatenate=False,
+                scale=scale,
+            ),
+            StyleAligned(  # Key
+                adain=True,
+                concatenate=True,
+                scale=scale,
+            ),
+            StyleAligned(  # Value
+                adain=False,
+                concatenate=True,
+                scale=scale,
+            ),
+        ]
+
+    @cached_property
+    def style_aligned_layers(self) -> fl.Distribute:
+        return fl.Distribute(*self._style_aligned_layers)
+
+    def inject(self, parent: fl.Chain | None = None) -> "SharedSelfAttentionAdapter":
+        self.target.insert_before_type(
+            module_type=fl.ScaledDotProductAttention,
+            new_module=self.style_aligned_layers,
+        )
+        return super().inject(parent)
+
+    def eject(self) -> None:
+        self.target.remove(self.style_aligned_layers)
+        super().eject()
+
+    @property
+    def scale(self) -> float:
+        return self.style_aligned_layers.layer(0, StyleAligned).scale
+
+    @scale.setter
+    def scale(self, scale: float) -> None:
+        for style_aligned_module in self.style_aligned_layers:
+            style_aligned_module.scale = scale
+
+
+class StyleAlignedAdapter(Generic[T], fl.Chain, Adapter[T]):
+    """Upgrade each `SelfAttention` layer of a UNet into a `SharedSelfAttention` layer."""
+
+    def __init__(
+        self,
+        target: T,
+        scale: float = 1.0,
+    ) -> None:
+        """Initialize the StyleAlignedAdapter.
+
+        Args:
+            target: The target module.
+            scale: The scaling factor for the reference features.
+        """
+        with self.setup_adapter(target):
+            super().__init__(target)
+
+        # create a SharedSelfAttentionAdapter for each SelfAttention module
+        self.shared_self_attention_adapters = tuple(
+            SharedSelfAttentionAdapter(
+                target=self_attention,
+                scale=scale,
+            )
+            for self_attention in self.target.layers(fl.SelfAttention)
+        )
+
+    def inject(self, parent: fl.Chain | None = None) -> "StyleAlignedAdapter[T]":
+        for shared_self_attention_adapter in self.shared_self_attention_adapters:
+            shared_self_attention_adapter.inject()
+        return super().inject(parent)
+
+    def eject(self) -> None:
+        for shared_self_attention_adapter in self.shared_self_attention_adapters:
+            shared_self_attention_adapter.eject()
+        super().eject()
+
+    @property
+    def scale(self) -> float:
+        """The scaling factor for the reference features."""
+        return self.shared_self_attention_adapters[0].scale
+
+    @scale.setter
+    def scale(self, scale: float) -> None:
+        for shared_self_attention_adapter in self.shared_self_attention_adapters:
+            shared_self_attention_adapter.scale = scale