implement ControlLora and ControlLoraAdapter

2024-11-21 21:58:47 +00:00 · 2024-02-14 15:26:14 +00:00 · 2024-02-14 15:26:14 +00:00 · 41a5ce2052
parent a54808e757
commit 41a5ce2052
1 changed files with 401 additions and 0 deletions
--- a/src/refiners/foundationals/latent_diffusion/stable_diffusion_xl/control_lora.py
+++ b/src/refiners/foundationals/latent_diffusion/stable_diffusion_xl/control_lora.py
@ -0,0 +1,401 @@
 from torch import Tensor, device as Device, dtype as DType
 from refiners.fluxion.adapters.adapter import Adapter
 from refiners.fluxion.adapters.lora import Lora, LoraAdapter
 from refiners.fluxion.context import Contexts
 from refiners.fluxion.layers import Chain, Conv2d, Multiply, Passthrough, Residual, SiLU, UseContext
 from refiners.fluxion.layers.module import WeightedModule
 from refiners.foundationals.latent_diffusion.range_adapter import RangeAdapter2d
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.unet import ResidualAccumulator
 from refiners.foundationals.latent_diffusion.stable_diffusion_xl.unet import SDXLUNet
 from refiners.foundationals.latent_diffusion.unet import ResidualBlock
 class ConditionEncoder(Chain):
     """Convolutional encoder turning a condition image into a latent feature map.
     The encoder is made of a stem convolution, one stage per consecutive pair of
     `intermediate_channels` (each stage ends with a stride-2 convolution), and a
     final projection convolution to `out_channels`.
     Receives:
         (Float[Tensor, "batch in_channels width height"]): The input image.
     Returns:
         (Float[Tensor, "batch out_channels latent_width latent_height"]): The condition latent tensor.
     """
     def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 320,
         intermediate_channels: tuple[int, ...] = (16, 32, 96, 256),
         device: Device | str | None = None,
         dtype: DType | None = None,
     ) -> None:
         """Build the encoder layers.
         Args:
             in_channels: Channel count of the incoming image tensor.
             out_channels: Channel count of the produced condition latent.
             intermediate_channels: Channel counts used between the stem and the
                 output projection; N entries produce N - 1 strided stages.
             device: The PyTorch device to use.
             dtype: The PyTorch data type to use.
         """
         # every convolution shares the same placement arguments
         placement = {"device": device, "dtype": dtype}
         # stem: lift the image to the first intermediate width, resolution unchanged
         stem = Chain(
             Conv2d(
                 in_channels=in_channels,
                 out_channels=intermediate_channels[0],
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 **placement,
             ),
             SiLU(),
         )
         # stages: a same-width convolution followed by a stride-2 convolution
         # that moves to the next intermediate width
         stages = [
             Chain(
                 Conv2d(
                     in_channels=current_width,
                     out_channels=current_width,
                     kernel_size=3,
                     padding=1,
                     **placement,
                 ),
                 SiLU(),
                 Conv2d(
                     in_channels=current_width,
                     out_channels=next_width,
                     kernel_size=3,
                     stride=2,
                     padding=1,
                     **placement,
                 ),
                 SiLU(),
             )
             for current_width, next_width in zip(intermediate_channels, intermediate_channels[1:])
         ]
         # projection: map the last intermediate width to the requested output width
         projection = Conv2d(
             in_channels=intermediate_channels[-1],
             out_channels=out_channels,
             kernel_size=3,
             padding=1,
             **placement,
         )
         super().__init__(stem, *stages, projection)
 class ZeroConvolution(Passthrough):
     """Transform and store the ControlLora's residuals in the context of the original UNet.
     The block chains a 1x1 convolution, a `Multiply` layer holding the scale, and a
     `ResidualAccumulator` targeting the residual at `residual_index`.
     Receives:
         (Float[Tensor, "batch in_channels width height"]): The input tensor to transform and store.
     Returns: Updates context:
         (Tensor): Add the residual to the nth residual of the target's UNet.
             (context="unet", key="residuals")
     """
     def __init__(
         self,
         in_channels: int,
         out_channels: int,
         residual_index: int,
         scale: float = 1.0,
         device: Device | str | None = None,
         dtype: DType | None = None,
     ) -> None:
         """Initialize the ZeroConvolution.
         Args:
             in_channels: The number of channels of the input tensor.
             out_channels: The number of channels of the output tensor/residual.
             residual_index: The index of the residual to store in the target's UNet.
             scale: The scale to multiply the residuals by.
             device: The PyTorch device to use.
             dtype: The PyTorch data type to use.
         """
         # The scale is deliberately NOT mirrored on `self`: a separate copy would
         # drift from the Multiply layer as soon as it is reassigned, leaving the
         # applied factor stuck at its construction-time value.
         super().__init__(
             Conv2d(
                 in_channels=in_channels,
                 out_channels=out_channels,
                 kernel_size=1,
                 device=device,
                 dtype=dtype,
             ),
             Multiply(scale=scale),
             ResidualAccumulator(n=residual_index),
         )
     @property
     def scale(self) -> float:
         """The scale applied to the residual, read from the inner Multiply layer."""
         return self.ensure_find(Multiply).scale
     @scale.setter
     def scale(self, value: float) -> None:
         # write through to the Multiply layer so the new value takes effect
         self.ensure_find(Multiply).scale = value
 class ControlLora(Passthrough):
     """ControlLora is a Half-UNet clone of the target UNet, patched with LoRAs.
     Like ControlNet, it injects residual tensors into the target UNet.
     See https://github.com/HighCWu/control-lora-v2 for more details.
     Receives: Gets context:
         (Float[Tensor, "batch condition_channels width height"]): The input image.
             (context="control_lora_{name}", key="condition")
     Returns: Sets context:
         (list[Tensor]): The residuals to be added to the target UNet's residuals.
             (context="unet", key="residuals")
     """
     def __init__(
         self,
         name: str,
         unet: SDXLUNet,
         scale: float = 1.0,
         condition_channels: int = 3,
     ) -> None:
         """Initialize the ControlLora.
         Args:
             name: The name of the ControlLora.
             unet: The target UNet.
             scale: The scale to multiply the residuals by.
             condition_channels: The number of channels of the input condition tensor.
         """
         self.name = name
         # structural copies of the target's TimestepEncoder, DownBlocks and MiddleBlock
         super().__init__(
             timestep_encoder := unet.layer("TimestepEncoder", Chain).structural_copy(),
             downblocks := unet.layer("DownBlocks", Chain).structural_copy(),
             middle_block := unet.layer("MiddleBlock", Chain).structural_copy(),
         )
         # modify the context_key of the copied TimestepEncoder to avoid conflicts
         timestep_encoder.context_key = f"timestep_embedding_control_lora_{name}"
         # modify the context_key of each RangeAdapter2d to avoid conflicts
         for range_adapter in self.layers(RangeAdapter2d):
             range_adapter.context_key = f"timestep_embedding_control_lora_{name}"
         # insert the ConditionEncoder in the first DownBlock; its output width
         # matches the out_channels of that block's first convolution
         first_downblock = downblocks.layer(0, Chain)
         out_channels = first_downblock.layer(0, Conv2d).out_channels
         first_downblock.append(
             Residual(
                 UseContext(f"control_lora_{name}", "condition"),
                 ConditionEncoder(
                     in_channels=condition_channels,
                     out_channels=out_channels,
                     device=unet.device,
                     dtype=unet.dtype,
                 ),
             )
         )
         # replace each ResidualAccumulator by a ZeroConvolution
         # (snapshot the matches first: the tree is modified inside the loop, and
         # every ZeroConvolution itself contains a ResidualAccumulator)
         for residual_accumulator in list(self.layers(ResidualAccumulator)):
             downblock = self.ensure_find_parent(residual_accumulator)
             first_layer = downblock[0]
             assert hasattr(first_layer, "out_channels"), f"{first_layer} has no out_channels attribute"
             block_channels = first_layer.out_channels
             assert isinstance(block_channels, int)
             downblock.replace(
                 residual_accumulator,
                 ZeroConvolution(
                     scale=scale,
                     residual_index=residual_accumulator.n,
                     in_channels=block_channels,
                     out_channels=block_channels,
                     device=unet.device,
                     dtype=unet.dtype,
                 ),
             )
         # append a ZeroConvolution to middle_block, stored at residual index
         # len(downblocks) (presumably the slot right after the down blocks' residuals)
         middle_block_channels = middle_block.layer(0, ResidualBlock).out_channels
         middle_block.append(
             ZeroConvolution(
                 scale=scale,
                 residual_index=len(downblocks),
                 in_channels=middle_block_channels,
                 out_channels=middle_block_channels,
                 device=unet.device,
                 dtype=unet.dtype,
             )
         )
     @property
     def scale(self) -> float:
         """The scale of the injected residuals."""
         zero_convolution_module = self.ensure_find(ZeroConvolution)
         return zero_convolution_module.scale
     @scale.setter
     def scale(self, value: float) -> None:
         for zero_convolution_module in self.layers(ZeroConvolution):
             zero_convolution_module.scale = value
             # The Multiply layer received its own copy of the scale when the
             # ZeroConvolution was built; update it as well, otherwise the new
             # value would be recorded but never applied to the residuals.
             zero_convolution_module.ensure_find(Multiply).scale = value
 class ControlLoraAdapter(Chain, Adapter[SDXLUNet]):
     """Adapter attaching a ControlLora to an SDXL UNet.
     On injection, the ControlLora is inserted as the first layer of the target
     UNet; on ejection, it is removed again.
     """
     def __init__(
         self,
         name: str,
         target: SDXLUNet,
         scale: float = 1.0,
         condition_channels: int = 3,
         weights: dict[str, Tensor] | None = None,
     ) -> None:
         """Initialize the ControlLoraAdapter.
         Args:
             name: The name of the ControlLora (also used to build its context name).
             target: The UNet to adapt.
             scale: The scale to multiply the residuals by.
             condition_channels: The number of channels of the input condition tensor.
             weights: Optional state_dict loaded right after construction.
         """
         with self.setup_adapter(target):
             self.name = name
             control_lora = ControlLora(
                 name=name,
                 unet=target,
                 scale=scale,
                 condition_channels=condition_channels,
             )
             # kept inside a list rather than as a direct attribute
             # (presumably so it is not registered as a sub-module of the adapter)
             self._control_lora = [control_lora]
             super().__init__(target)
         if not weights:
             return
         self.load_weights(weights)
     @property
     def control_lora(self) -> ControlLora:
         """The ControlLora model."""
         (control_lora,) = self._control_lora[:1]
         return control_lora
     def init_context(self) -> Contexts:
         """Declare the context holding the condition tensor, initially unset."""
         context_name = f"control_lora_{self.name}"
         return {context_name: {"condition": None}}
     def inject(self, parent: Chain | None = None) -> "ControlLoraAdapter":
         """Insert the ControlLora at index 0 of the target, then inject the adapter."""
         control_lora = self.control_lora
         self.target.insert(index=0, module=control_lora)
         return super().inject(parent)
     def eject(self) -> None:
         """Remove the ControlLora from the target, then eject the adapter."""
         control_lora = self.control_lora
         self.target.remove(control_lora)
         return super().eject()
     def structural_copy(self) -> "ControlLoraAdapter":
         """Always raise: the adapter does not support structural copies."""
         raise RuntimeError("ControlLoraAdapter cannot be copied, eject it first.")
     @property
     def scale(self) -> float:
         """The scale of the injected residuals."""
         return self.control_lora.scale
     @scale.setter
     def scale(self, value: float) -> None:
         self.control_lora.scale = value
     def set_condition(self, condition: Tensor) -> None:
         """Store the condition tensor in this ControlLora's context."""
         context_name = f"control_lora_{self.name}"
         self.set_context(context=context_name, value={"condition": condition})
     def load_weights(
         self,
         state_dict: dict[str, Tensor],
     ) -> None:
         """Load the weights from the state_dict into the ControlLora.
         The LoRA layers, the ZeroConvolution layers and the ConditionEncoder are
         loaded in that order.
         Args:
             state_dict: The state_dict containing the weights to load.
         """
         control_lora = self.control_lora
         ControlLoraAdapter.load_lora_layers(self.name, state_dict, control_lora)
         ControlLoraAdapter.load_zero_convolution_layers(state_dict, control_lora)
         ControlLoraAdapter.load_condition_encoder(state_dict, control_lora)
     @staticmethod
     def load_lora_layers(
         name: str,
         state_dict: dict[str, Tensor],
         control_lora: ControlLora,
     ) -> None:
         """Load the LoRA layers from the state_dict into the ControlLora.
         Args:
             name: The name of the ControlLora.
             state_dict: The state_dict containing the LoRA layers to load.
             control_lora: The ControlLora to load the LoRA layers into.
         """
         # keep the "ControlLora" entries: strip the prefix, append ".weight",
         # and move each tensor to the dtype/device of the ControlLora
         lora_weights = {
             f"{key.removeprefix('ControlLora.')}.weight": tensor.to(
                 dtype=control_lora.dtype,
                 device=control_lora.device,
             )
             for key, tensor in state_dict.items()
             if "ControlLora" in key
         }
         # build every LoRA layer from the filtered state_dict
         loras = Lora.from_dict(name, state_dict=lora_weights)
         # first create all the adapters, while the layer paths are still intact...
         pending: list[LoraAdapter] = []
         for path, lora in loras.items():
             module = control_lora.layer(path.split("."), WeightedModule)
             assert lora.is_compatible(module)
             pending.append(LoraAdapter(module, lora))
         # ...then inject them into the ControlLora
         for lora_adapter in pending:
             lora_adapter.inject(control_lora)
     @staticmethod
     def load_zero_convolution_layers(
         state_dict: dict[str, Tensor],
         control_lora: ControlLora,
     ):
         """Load the ZeroConvolution layers from the state_dict into the ControlLora.
         The n-th ZeroConvolution found in the ControlLora (counting from 1) receives
         the entries whose key contains "ZeroConvolution_{n:02d}".
         Args:
             state_dict: The state_dict containing the ZeroConvolution layers to load.
             control_lora: The ControlLora to load the ZeroConvolution layers into.
         """
         zero_convolutions = list(control_lora.layers(ZeroConvolution))
         for number, zero_convolution in enumerate(zero_convolutions, start=1):
             tag = f"ZeroConvolution_{number:02d}"
             layer_weights: dict[str, Tensor] = {}
             for key, tensor in state_dict.items():
                 if tag in key:
                     layer_weights[key.removeprefix(f"{tag}.")] = tensor
             zero_convolution.load_state_dict(layer_weights)
     @staticmethod
     def load_condition_encoder(
         state_dict: dict[str, Tensor],
         control_lora: ControlLora,
     ):
         """Load the ConditionEncoder layers from the state_dict into the ControlLora.
         Args:
             state_dict: The state_dict containing the ConditionEncoder layers to load.
             control_lora: The ControlLora to load the ConditionEncoder layers into.
         """
         encoder = control_lora.ensure_find(ConditionEncoder)
         encoder_weights: dict[str, Tensor] = {}
         for key, tensor in state_dict.items():
             if "ConditionEncoder" in key:
                 encoder_weights[key.removeprefix("ConditionEncoder.")] = tensor
         encoder.load_state_dict(encoder_weights)