mirror of
https://github.com/finegrain-ai/refiners.git
synced 2024-11-24 15:18:46 +00:00
implement ControlLora
and ControlLoraAdapter
This commit is contained in:
parent
a54808e757
commit
41a5ce2052
|
@ -0,0 +1,401 @@
|
|||
from torch import Tensor, device as Device, dtype as DType
|
||||
|
||||
from refiners.fluxion.adapters.adapter import Adapter
|
||||
from refiners.fluxion.adapters.lora import Lora, LoraAdapter
|
||||
from refiners.fluxion.context import Contexts
|
||||
from refiners.fluxion.layers import Chain, Conv2d, Multiply, Passthrough, Residual, SiLU, UseContext
|
||||
from refiners.fluxion.layers.module import WeightedModule
|
||||
from refiners.foundationals.latent_diffusion.range_adapter import RangeAdapter2d
|
||||
from refiners.foundationals.latent_diffusion.stable_diffusion_1.unet import ResidualAccumulator
|
||||
from refiners.foundationals.latent_diffusion.stable_diffusion_xl.unet import SDXLUNet
|
||||
from refiners.foundationals.latent_diffusion.unet import ResidualBlock
|
||||
|
||||
|
||||
class ConditionEncoder(Chain):
    """Convolutional encoder mapping a condition image to a latent-resolution feature map.

    Layout: a stem convolution, then one downsampling stage per pair of consecutive
    `intermediate_channels` (each stage ends with a stride-2 convolution), then an output projection.

    Receives:
        (Float[Tensor, "batch in_channels width height"]): The condition image.

    Returns:
        (Float[Tensor, "batch out_channels latent_width latent_height"]): The encoded condition.
    """

    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 320,
        intermediate_channels: tuple[int, ...] = (16, 32, 96, 256),
        device: Device | str | None = None,
        dtype: DType | None = None,
    ) -> None:
        """Build the encoder.

        Args:
            in_channels: Channel count of the incoming condition image.
            out_channels: Channel count of the produced condition tensor.
            intermediate_channels: Channel widths of the hidden stages; the first entry is the
                stem width, and every following entry adds one downsampling stage.
            device: The PyTorch device to create the layers on.
            dtype: The PyTorch data type to create the layers with.
        """
        # Stem: lifts the image to the first hidden width without changing the resolution.
        stem = Chain(
            Conv2d(
                in_channels=in_channels,
                out_channels=intermediate_channels[0],
                kernel_size=3,
                stride=1,
                padding=1,
                device=device,
                dtype=dtype,
            ),
            SiLU(),
        )

        # One stage per consecutive (narrow, wide) pair: a same-width convolution followed by
        # a stride-2 convolution that widens the channels.
        stages = [
            Chain(
                Conv2d(
                    in_channels=narrow,
                    out_channels=narrow,
                    kernel_size=3,
                    padding=1,
                    device=device,
                    dtype=dtype,
                ),
                SiLU(),
                Conv2d(
                    in_channels=narrow,
                    out_channels=wide,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    device=device,
                    dtype=dtype,
                ),
                SiLU(),
            )
            for narrow, wide in zip(intermediate_channels, intermediate_channels[1:])
        ]

        # Head: projects the last hidden width to the requested output width (no activation).
        head = Conv2d(
            in_channels=intermediate_channels[-1],
            out_channels=out_channels,
            kernel_size=3,
            padding=1,
            device=device,
            dtype=dtype,
        )

        super().__init__(stem, *stages, head)
|
||||
|
||||
|
||||
class ZeroConvolution(Passthrough):
    """Transform and store the ControlLora's residuals in the context of the original UNet.

    The input goes through a 1x1 convolution, is multiplied by `scale`, and is then handed to a
    `ResidualAccumulator`. Being a `Passthrough`, the layer is used for that side effect.

    Receives:
        (Float[Tensor, "batch in_channels width height"]): The input tensor to transform and store.

    Updates context:
        (Tensor): Add the residual to the nth residual of the target's UNet.
            (context="unet", key="residuals")
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        residual_index: int,
        scale: float = 1.0,
        device: Device | str | None = None,
        dtype: DType | None = None,
    ) -> None:
        """Initialize the ZeroConvolution.

        Args:
            in_channels: The number of channels of the input tensor.
            out_channels: The number of channels of the output tensor/residual.
            residual_index: The index of the residual to store in the target's UNet.
            scale: The scale to multiply the residuals by.
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        # Kept under a private name: the public `scale` is a property so that later
        # assignments also reach the Multiply layer that actually applies the factor.
        self._scale = scale

        super().__init__(
            Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                device=device,
                dtype=dtype,
            ),
            Multiply(scale=scale),
            ResidualAccumulator(n=residual_index),
        )

    @property
    def scale(self) -> float:
        """The scale the residual is multiplied by before being accumulated."""
        return self._scale

    @scale.setter
    def scale(self, value: float) -> None:
        # Updating only the stored number would leave the Multiply layer using the value it
        # was constructed with, so both are kept in sync here.
        self._scale = value
        self.ensure_find(Multiply).scale = value
|
||||
|
||||
|
||||
class ControlLora(Passthrough):
    """ControlLora is a Half-UNet clone of the target UNet, patched with LoRAs.

    Like ControlNet, it injects residual tensors into the target UNet.
    See https://github.com/HighCWu/control-lora-v2 for more details.

    The clone is made of structural copies of the target's `TimestepEncoder`, `DownBlocks` and
    `MiddleBlock`. Its residual accumulators are swapped for `ZeroConvolution` layers, and a
    `ConditionEncoder` branch is appended to its first down block.

    Gets context:
        (Float[Tensor, "batch condition_channels width height"]): The input image.
            (context=f"control_lora_{name}", key="condition")

    Sets context:
        (list[Tensor]): The residuals to be added to the target UNet's residuals.
            (context="unet", key="residuals")
    """

    def __init__(
        self,
        name: str,
        unet: SDXLUNet,
        scale: float = 1.0,
        condition_channels: int = 3,
    ) -> None:
        """Initialize the ControlLora.

        Args:
            name: The name of the ControlLora; it is used to build its context names and keys.
            unet: The target UNet.
            scale: The scale to multiply the residuals by.
            condition_channels: The number of channels of the input condition tensor.
        """
        self.name = name

        super().__init__(
            timestep_encoder := unet.layer("TimestepEncoder", Chain).structural_copy(),
            downblocks := unet.layer("DownBlocks", Chain).structural_copy(),
            middle_block := unet.layer("MiddleBlock", Chain).structural_copy(),
        )

        # The copied TimestepEncoder and every RangeAdapter2d get a context key specific to
        # this ControlLora, to avoid conflicts with the keys used by the target UNet.
        timestep_context_key = f"timestep_embedding_control_lora_{name}"
        timestep_encoder.context_key = timestep_context_key
        for range_adapter in self.layers(RangeAdapter2d):
            range_adapter.context_key = timestep_context_key

        # Append the condition branch to the first DownBlock: the condition is read from the
        # context, encoded to the width of that block's first convolution, and wrapped in a
        # Residual.
        first_downblock = downblocks.layer(0, Chain)
        out_channels = first_downblock.layer(0, Conv2d).out_channels
        first_downblock.append(
            Residual(
                UseContext(f"control_lora_{name}", "condition"),
                ConditionEncoder(
                    in_channels=condition_channels,
                    out_channels=out_channels,
                    device=unet.device,
                    dtype=unet.dtype,
                ),
            )
        )

        # Replace each ResidualAccumulator by a ZeroConvolution targeting the same residual index.
        for residual_accumulator in self.layers(ResidualAccumulator):
            downblock = self.ensure_find_parent(residual_accumulator)

            # The channel count of the block is taken from its first layer.
            first_layer = downblock[0]
            assert hasattr(first_layer, "out_channels"), f"{first_layer} has no out_channels attribute"

            block_channels = first_layer.out_channels
            assert isinstance(block_channels, int)

            downblock.replace(
                residual_accumulator,
                ZeroConvolution(
                    scale=scale,
                    residual_index=residual_accumulator.n,
                    in_channels=block_channels,
                    out_channels=block_channels,
                    device=unet.device,
                    dtype=unet.dtype,
                ),
            )

        # Append a ZeroConvolution to the MiddleBlock; its residual index comes right after
        # the indices 0..len(downblocks)-1.
        middle_block_channels = middle_block.layer(0, ResidualBlock).out_channels
        middle_block.append(
            ZeroConvolution(
                scale=scale,
                residual_index=len(downblocks),
                in_channels=middle_block_channels,
                out_channels=middle_block_channels,
                device=unet.device,
                dtype=unet.dtype,
            )
        )

    @property
    def scale(self) -> float:
        """The scale of the injected residuals (read from the first ZeroConvolution found)."""
        zero_convolution_module = self.ensure_find(ZeroConvolution)
        return zero_convolution_module.scale

    @scale.setter
    def scale(self, value: float) -> None:
        for zero_convolution_module in self.layers(ZeroConvolution):
            zero_convolution_module.scale = value
            # The Multiply layer is what actually scales the residual; it was built with the
            # initial scale, so it must be updated as well for the new value to take effect.
            zero_convolution_module.ensure_find(Multiply).scale = value
|
||||
|
||||
|
||||
class ControlLoraAdapter(Chain, Adapter[SDXLUNet]):
    """Adapter for ControlLora.

    This adapter simply prepends a ControlLora model inside the target's UNet.
    The condition image is provided through the `control_lora_{name}` context, see `set_condition`.
    """

    def __init__(
        self,
        name: str,
        target: SDXLUNet,
        scale: float = 1.0,
        condition_channels: int = 3,
        weights: dict[str, Tensor] | None = None,
    ) -> None:
        """Initialize the ControlLoraAdapter.

        Args:
            name: The name of the ControlLora; also used to name its context.
            target: The UNet to adapt.
            scale: The scale to multiply the residuals by.
            condition_channels: The number of channels of the input condition tensor.
            weights: Optional state_dict to load right away (see `load_weights`);
                `None` or an empty dict loads nothing.
        """
        with self.setup_adapter(target):
            self.name = name
            # NOTE(review): the ControlLora is wrapped in a list, presumably so that it is not
            # registered as a child module of the adapter itself — confirm. It only becomes
            # part of the target when `inject` is called.
            self._control_lora = [
                ControlLora(
                    name=name,
                    unet=target,
                    scale=scale,
                    condition_channels=condition_channels,
                ),
            ]

            super().__init__(target)

        if weights:
            self.load_weights(weights)

    @property
    def control_lora(self) -> ControlLora:
        """The ControlLora model."""
        return self._control_lora[0]

    def init_context(self) -> Contexts:
        """Declare the context holding the condition, initially unset."""
        return {
            f"control_lora_{self.name}": {
                "condition": None,
            }
        }

    def inject(self, parent: Chain | None = None) -> "ControlLoraAdapter":
        """Insert the ControlLora as the first layer of the target, then inject the adapter."""
        self.target.insert(index=0, module=self.control_lora)
        return super().inject(parent)

    def eject(self) -> None:
        """Remove the ControlLora from the target, then eject the adapter."""
        self.target.remove(self.control_lora)
        return super().eject()

    def structural_copy(self) -> "ControlLoraAdapter":
        """Always raises: copying an injected ControlLoraAdapter is not supported.

        Raises:
            RuntimeError: In every case.
        """
        raise RuntimeError("ControlLoraAdapter cannot be copied, eject it first.")

    @property
    def scale(self) -> float:
        """The scale of the injected residuals."""
        return self.control_lora.scale

    @scale.setter
    def scale(self, value: float) -> None:
        self.control_lora.scale = value

    def set_condition(self, condition: Tensor) -> None:
        """Store the condition tensor in the context read by the ControlLora.

        Args:
            condition: The condition image tensor.
        """
        self.set_context(
            context=f"control_lora_{self.name}",
            value={"condition": condition},
        )

    def load_weights(
        self,
        state_dict: dict[str, Tensor],
    ) -> None:
        """Load the weights from the state_dict into the ControlLora.

        The state_dict is expected to hold three groups of keys: `ControlLora.*` (LoRA layers),
        `ZeroConvolution_XX.*` and `ConditionEncoder.*`.

        Args:
            state_dict: The state_dict containing the weights to load.
        """
        ControlLoraAdapter.load_lora_layers(self.name, state_dict, self.control_lora)
        ControlLoraAdapter.load_zero_convolution_layers(state_dict, self.control_lora)
        ControlLoraAdapter.load_condition_encoder(state_dict, self.control_lora)

    @staticmethod
    def load_lora_layers(
        name: str,
        state_dict: dict[str, Tensor],
        control_lora: ControlLora,
    ) -> None:
        """Load the LoRA layers from the state_dict into the ControlLora.

        Args:
            name: The name of the ControlLora.
            state_dict: The state_dict containing the LoRA layers to load.
            control_lora: The ControlLora to load the LoRA layers into.
        """
        # Keep only the LoRA entries, strip their "ControlLora." prefix, append the ".weight"
        # suffix, and move the tensors to the device and dtype of the ControlLora.
        lora_weights = {
            f"{key.removeprefix('ControlLora.')}.weight": value.to(
                dtype=control_lora.dtype,
                device=control_lora.device,
            )
            for key, value in state_dict.items()
            if "ControlLora" in key
        }

        # build every LoRA layer from the filtered state_dict
        loras = Lora.from_dict(name, state_dict=lora_weights)

        # Resolve every target and build its adapter first; injection happens in a second pass,
        # so that all the dotted paths are looked up on the not-yet-adapted module tree.
        adapters: list[LoraAdapter] = []
        for key, lora in loras.items():
            target = control_lora.layer(key.split("."), WeightedModule)
            assert lora.is_compatible(target), f"LoRA {key!r} is not compatible with its target layer {target}"
            adapters.append(LoraAdapter(target, lora))

        for adapter in adapters:
            adapter.inject(control_lora)

    @staticmethod
    def load_zero_convolution_layers(
        state_dict: dict[str, Tensor],
        control_lora: ControlLora,
    ) -> None:
        """Load the ZeroConvolution layers from the state_dict into the ControlLora.

        The i-th ZeroConvolution found in the ControlLora (counting from 1) receives the entries
        whose key contains `ZeroConvolution_{i:02d}`.

        Args:
            state_dict: The state_dict containing the ZeroConvolution layers to load.
            control_lora: The ControlLora to load the ZeroConvolution layers into.
        """
        for index, zero_convolution_layer in enumerate(control_lora.layers(ZeroConvolution), start=1):
            tag = f"ZeroConvolution_{index:02d}"
            zero_convolution_state_dict = {
                key.removeprefix(f"{tag}."): value for key, value in state_dict.items() if tag in key
            }
            zero_convolution_layer.load_state_dict(zero_convolution_state_dict)

    @staticmethod
    def load_condition_encoder(
        state_dict: dict[str, Tensor],
        control_lora: ControlLora,
    ) -> None:
        """Load the ConditionEncoder layers from the state_dict into the ControlLora.

        Args:
            state_dict: The state_dict containing the ConditionEncoder layers to load.
            control_lora: The ControlLora to load the ConditionEncoder layers into.
        """
        condition_encoder_layer = control_lora.ensure_find(ConditionEncoder)
        condition_encoder_state_dict = {
            key.removeprefix("ConditionEncoder."): value
            for key, value in state_dict.items()
            if "ConditionEncoder" in key
        }
        condition_encoder_layer.load_state_dict(condition_encoder_state_dict)
|
Loading…
Reference in a new issue