mirror of https://github.com/finegrain-ai/refiners.git (synced 2024-11-21 13:48:46 +00:00)

delete old conversion scripts

This commit is contained in:
parent 2796117d2d
commit 189cfa1a69

@@ -1 +0,0 @@
::: refiners.fluxion.model_converter

@@ -1,81 +0,0 @@
import argparse
from pathlib import Path

import torch
from diffusers import AutoencoderKL  # type: ignore
from torch import nn

from refiners.fluxion.model_converter import ModelConverter
from refiners.foundationals.latent_diffusion.auto_encoder import LatentDiffusionAutoencoder


class Args(argparse.Namespace):
    source_path: str
    output_path: str | None
    use_half: bool
    verbose: bool


def setup_converter(args: Args) -> ModelConverter:
    target = LatentDiffusionAutoencoder()
    # low_cpu_mem_usage=False stops some annoying console messages telling us to `pip install accelerate`
    source: nn.Module = AutoencoderKL.from_pretrained(  # type: ignore
        pretrained_model_name_or_path=args.source_path,
        subfolder=args.subfolder,
        low_cpu_mem_usage=False,
    )  # type: ignore
    x = torch.randn(1, 3, 512, 512)
    converter = ModelConverter(source_model=source, target_model=target, skip_output_check=True, verbose=args.verbose)
    if not converter.run(source_args=(x,)):
        raise RuntimeError("Model conversion failed")
    return converter


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert a pretrained diffusers AutoencoderKL model to a refiners Latent Diffusion Autoencoder"
    )
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        default="runwayml/stable-diffusion-v1-5",
        help="Path to the source pretrained model (default: 'runwayml/stable-diffusion-v1-5').",
    )
    parser.add_argument(
        "--subfolder",
        type=str,
        dest="subfolder",
        default="vae",
        help="Subfolder in the source path where the model is located inside the Hub (default: 'vae')",
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Path to save the converted model (extension will be .safetensors). If not specified, the output path will"
            " be the source path with the extension changed to .safetensors."
        ),
    )
    parser.add_argument(
        "--half",
        action="store_true",
        dest="use_half",
        default=False,
        help="Use this flag to save the output file as half precision (default: full precision).",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        help="Use this flag to print verbose output during conversion.",
    )
    args = parser.parse_args(namespace=Args())
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}-autoencoder.safetensors"
    assert args.output_path is not None
    converter = setup_converter(args=args)
    converter.save_to_safetensors(path=args.output_path, half=args.use_half)

@@ -1,233 +0,0 @@
# pyright: reportPrivateUsage=false
import argparse
from pathlib import Path

import torch
from diffusers import ControlNetModel  # type: ignore
from torch import nn

from refiners.fluxion.model_converter import ModelConverter
from refiners.fluxion.utils import no_grad, save_to_safetensors
from refiners.foundationals.latent_diffusion import (
    DPMSolver,
    SD1ControlnetAdapter,
    SD1UNet,
)


class Args(argparse.Namespace):
    source_path: str
    output_path: str | None


@no_grad()
def convert(args: Args) -> dict[str, torch.Tensor]:
    # low_cpu_mem_usage=False stops some annoying console messages telling us to `pip install accelerate`
    controlnet_src: nn.Module = ControlNetModel.from_pretrained(  # type: ignore
        pretrained_model_name_or_path=args.source_path,
        low_cpu_mem_usage=False,
    )
    unet = SD1UNet(in_channels=4)
    adapter = SD1ControlnetAdapter(unet, name="mycn").inject()
    controlnet = adapter.controlnet

    condition = torch.randn(1, 3, 512, 512)
    adapter.set_controlnet_condition(condition=condition)

    clip_text_embedding = torch.rand(1, 77, 768)
    unet.set_clip_text_embedding(clip_text_embedding=clip_text_embedding)

    solver = DPMSolver(num_inference_steps=10)
    timestep = solver.timesteps[0].unsqueeze(dim=0)
    unet.set_timestep(timestep=timestep.unsqueeze(dim=0))

    x = torch.randn(1, 4, 64, 64)

    # We need the hack below because our implementation is not strictly equivalent
    # to diffusers in terms of execution order, since we compute the residuals inline
    # instead of in a separate step.

    converter = ModelConverter(
        source_model=controlnet_src, target_model=controlnet, skip_output_check=True, verbose=False
    )

    source_order = converter._trace_module_execution_order(
        module=controlnet_src, args=(x, timestep, clip_text_embedding, condition), keys_to_skip=[]
    )
    target_order = converter._trace_module_execution_order(module=controlnet, args=(x,), keys_to_skip=[])

    broken_k = (nn.Conv2d, (torch.Size([320, 320, 1, 1]), torch.Size([320])))

    expected_source_order = [
        "down_blocks.0.attentions.0.proj_in",
        "down_blocks.0.attentions.0.proj_out",
        "down_blocks.0.attentions.1.proj_in",
        "down_blocks.0.attentions.1.proj_out",
        "controlnet_down_blocks.0",
        "controlnet_down_blocks.1",
        "controlnet_down_blocks.2",
        "controlnet_down_blocks.3",
    ]

    expected_target_order = [
        "DownBlocks.Chain_1.Passthrough.Conv2d",
        "DownBlocks.Chain_2.CLIPLCrossAttention.Chain_1.Conv2d",
        "DownBlocks.Chain_2.CLIPLCrossAttention.Chain_3.Conv2d",
        "DownBlocks.Chain_2.Passthrough.Conv2d",
        "DownBlocks.Chain_3.CLIPLCrossAttention.Chain_1.Conv2d",
        "DownBlocks.Chain_3.CLIPLCrossAttention.Chain_3.Conv2d",
        "DownBlocks.Chain_3.Passthrough.Conv2d",
        "DownBlocks.Chain_4.Passthrough.Conv2d",
    ]

    fixed_source_order = [
        "controlnet_down_blocks.0",
        "down_blocks.0.attentions.0.proj_in",
        "down_blocks.0.attentions.0.proj_out",
        "controlnet_down_blocks.1",
        "down_blocks.0.attentions.1.proj_in",
        "down_blocks.0.attentions.1.proj_out",
        "controlnet_down_blocks.2",
        "controlnet_down_blocks.3",
    ]

    assert source_order[broken_k] == expected_source_order
    assert target_order[broken_k] == expected_target_order
    source_order[broken_k] = fixed_source_order

    broken_k = (nn.Conv2d, (torch.Size([640, 640, 1, 1]), torch.Size([640])))

    expected_source_order = [
        "down_blocks.1.attentions.0.proj_in",
        "down_blocks.1.attentions.0.proj_out",
        "down_blocks.1.attentions.1.proj_in",
        "down_blocks.1.attentions.1.proj_out",
        "controlnet_down_blocks.4",
        "controlnet_down_blocks.5",
        "controlnet_down_blocks.6",
    ]

    expected_target_order = [
        "DownBlocks.Chain_5.CLIPLCrossAttention.Chain_1.Conv2d",
        "DownBlocks.Chain_5.CLIPLCrossAttention.Chain_3.Conv2d",
        "DownBlocks.Chain_5.Passthrough.Conv2d",
        "DownBlocks.Chain_6.CLIPLCrossAttention.Chain_1.Conv2d",
        "DownBlocks.Chain_6.CLIPLCrossAttention.Chain_3.Conv2d",
        "DownBlocks.Chain_6.Passthrough.Conv2d",
        "DownBlocks.Chain_7.Passthrough.Conv2d",
    ]

    fixed_source_order = [
        "down_blocks.1.attentions.0.proj_in",
        "down_blocks.1.attentions.0.proj_out",
        "controlnet_down_blocks.4",
        "down_blocks.1.attentions.1.proj_in",
        "down_blocks.1.attentions.1.proj_out",
        "controlnet_down_blocks.5",
        "controlnet_down_blocks.6",
    ]

    assert source_order[broken_k] == expected_source_order
    assert target_order[broken_k] == expected_target_order
    source_order[broken_k] = fixed_source_order

    broken_k = (nn.Conv2d, (torch.Size([1280, 1280, 1, 1]), torch.Size([1280])))

    expected_source_order = [
        "down_blocks.2.attentions.0.proj_in",
        "down_blocks.2.attentions.0.proj_out",
        "down_blocks.2.attentions.1.proj_in",
        "down_blocks.2.attentions.1.proj_out",
        "mid_block.attentions.0.proj_in",
        "mid_block.attentions.0.proj_out",
        "controlnet_down_blocks.7",
        "controlnet_down_blocks.8",
        "controlnet_down_blocks.9",
        "controlnet_down_blocks.10",
        "controlnet_down_blocks.11",
        "controlnet_mid_block",
    ]

    expected_target_order = [
        "DownBlocks.Chain_8.CLIPLCrossAttention.Chain_1.Conv2d",
        "DownBlocks.Chain_8.CLIPLCrossAttention.Chain_3.Conv2d",
        "DownBlocks.Chain_8.Passthrough.Conv2d",
        "DownBlocks.Chain_9.CLIPLCrossAttention.Chain_1.Conv2d",
        "DownBlocks.Chain_9.CLIPLCrossAttention.Chain_3.Conv2d",
        "DownBlocks.Chain_9.Passthrough.Conv2d",
        "DownBlocks.Chain_10.Passthrough.Conv2d",
        "DownBlocks.Chain_11.Passthrough.Conv2d",
        "DownBlocks.Chain_12.Passthrough.Conv2d",
        "MiddleBlock.CLIPLCrossAttention.Chain_1.Conv2d",
        "MiddleBlock.CLIPLCrossAttention.Chain_3.Conv2d",
        "MiddleBlock.Passthrough.Conv2d",
    ]

    fixed_source_order = [
        "down_blocks.2.attentions.0.proj_in",
        "down_blocks.2.attentions.0.proj_out",
        "controlnet_down_blocks.7",
        "down_blocks.2.attentions.1.proj_in",
        "down_blocks.2.attentions.1.proj_out",
        "controlnet_down_blocks.8",
        "controlnet_down_blocks.9",
        "controlnet_down_blocks.10",
        "controlnet_down_blocks.11",
        "mid_block.attentions.0.proj_in",
        "mid_block.attentions.0.proj_out",
        "controlnet_mid_block",
    ]

    assert source_order[broken_k] == expected_source_order
    assert target_order[broken_k] == expected_target_order
    source_order[broken_k] = fixed_source_order

    assert converter._assert_shapes_aligned(source_order=source_order, target_order=target_order), "Shapes not aligned"

    mapping: dict[str, str] = {}
    for model_type_shape in source_order:
        source_keys = source_order[model_type_shape]
        target_keys = target_order[model_type_shape]
        mapping.update(zip(target_keys, source_keys))

    state_dict = converter._convert_state_dict(
        source_state_dict=controlnet_src.state_dict(),
        target_state_dict=controlnet.state_dict(),
        state_dict_mapping=mapping,
    )

    return {k: v.half() for k, v in state_dict.items()}


def main() -> None:
    parser = argparse.ArgumentParser(description="Convert a diffusers ControlNet model to a Refiners ControlNet model")
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        default="lllyasviel/sd-controlnet-depth",
        help=(
            "Can be a path to a .bin, a .safetensors file, or a model identifier from Hugging Face Hub. Defaults to"
            " lllyasviel/sd-controlnet-depth"
        ),
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        required=False,
        default=None,
        help=(
            "Output path (.safetensors) for converted model. If not provided, the output path will be the same as the"
            " source path."
        ),
    )
    args = parser.parse_args(namespace=Args())
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}-controlnet.safetensors"
    state_dict = convert(args=args)
    save_to_safetensors(path=args.output_path, tensors=state_dict)


if __name__ == "__main__":
    main()

@@ -1,154 +0,0 @@
import argparse
from pathlib import Path

import torch

from refiners.fluxion.utils import save_to_safetensors
from refiners.foundationals.latent_diffusion import SD1IPAdapter, SD1UNet, SDXLIPAdapter, SDXLUNet

# Running:
#
# from diffusers import UNet2DConditionModel
# unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
# for k in unet.attn_processors.keys():
#     print(k)
#
# Gives:
#
# down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor
# down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor
# ...
# down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor
# up_blocks.1.attentions.0.transformer_blocks.0.attn1.processor
# up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor
# ...
# up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor
# mid_block.attentions.0.transformer_blocks.0.attn1.processor
# mid_block.attentions.0.transformer_blocks.0.attn2.processor
#
# With attn1=self-attention and attn2=cross-attention, and middle block in last position. So in terms of increasing
# indices:
#
# DownBlocks -> [1, 3, 5, 7, 9, 11]
# MiddleBlock -> [31]
# UpBlocks -> [13, 15, 17, 19, 21, 23, 25, 27, 29]
#
# Same for SDXL with more layers (70 cross-attentions vs. 16)
CROSS_ATTN_MAPPING: dict[str, list[int]] = {
    "sd15": list(range(1, 12, 2)) + [31] + list(range(13, 30, 2)),
    "sdxl": list(range(1, 48, 2)) + list(range(121, 140, 2)) + list(range(49, 120, 2)),
}
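
# Illustrative sanity check, added for clarity and not part of the original script:
# the index lists above should contain exactly 16 cross-attention indices for SD 1.5
# (6 down + 1 middle + 9 up) and 70 for SDXL, matching the counts in the comment.
assert len(CROSS_ATTN_MAPPING["sd15"]) == 16
assert len(CROSS_ATTN_MAPPING["sdxl"]) == 70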


def main() -> None:
    parser = argparse.ArgumentParser(description="Converts an IP-Adapter diffusers model to refiners.")
    parser.add_argument(
        "--from",
        type=str,
        required=True,
        dest="source_path",
        help="Path to the source model. (e.g.: 'ip-adapter_sd15.bin').",
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Path to save the converted model. If not specified, the output path will be the source path with the"
            " extension changed to .safetensors."
        ),
    )
    parser.add_argument("--verbose", action="store_true", dest="verbose")
    parser.add_argument("--half", action="store_true", dest="half")
    args = parser.parse_args()
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}.safetensors"

    # Do not use `load_tensors`: first-level values are not tensors.
    weights: dict[str, dict[str, torch.Tensor]] = torch.load(args.source_path, "cpu")  # type: ignore
    assert isinstance(weights, dict)
    assert sorted(weights.keys()) == ["image_proj", "ip_adapter"]

    image_proj_weights = weights["image_proj"]
    ip_adapter_weights = weights["ip_adapter"]

    fine_grained = "latents" in image_proj_weights  # aka IP-Adapter plus

    match len(ip_adapter_weights):
        case 32:
            ip_adapter = SD1IPAdapter(target=SD1UNet(in_channels=4), fine_grained=fine_grained)
            cross_attn_mapping = CROSS_ATTN_MAPPING["sd15"]
        case 140:
            ip_adapter = SDXLIPAdapter(target=SDXLUNet(in_channels=4), fine_grained=fine_grained)
            cross_attn_mapping = CROSS_ATTN_MAPPING["sdxl"]
        case _:
            raise ValueError("Unexpected number of keys in input checkpoint")

    # Manual conversion to avoid any runtime dependency on IP-Adapter[1] custom classes
    # [1]: https://github.com/tencent-ailab/IP-Adapter

    state_dict: dict[str, torch.Tensor] = {}

    image_proj_state_dict: dict[str, torch.Tensor]

    if fine_grained:
        w = image_proj_weights
        image_proj_state_dict = {
            "LatentsToken.Parameter.weight": w["latents"].squeeze(0),  # drop batch dim = 1
            "Linear_1.weight": w["proj_in.weight"],
            "Linear_1.bias": w["proj_in.bias"],
            "Linear_2.weight": w["proj_out.weight"],
            "Linear_2.bias": w["proj_out.bias"],
            "LayerNorm.weight": w["norm_out.weight"],
            "LayerNorm.bias": w["norm_out.bias"],
        }
        for i in range(4):
            t_pfx, s_pfx = f"Transformer.TransformerLayer_{i+1}.Residual_", f"layers.{i}."
            image_proj_state_dict.update(
                {
                    f"{t_pfx}1.PerceiverAttention.Distribute.LayerNorm_1.weight": w[f"{s_pfx}0.norm1.weight"],
                    f"{t_pfx}1.PerceiverAttention.Distribute.LayerNorm_1.bias": w[f"{s_pfx}0.norm1.bias"],
                    f"{t_pfx}1.PerceiverAttention.Distribute.LayerNorm_2.weight": w[f"{s_pfx}0.norm2.weight"],
                    f"{t_pfx}1.PerceiverAttention.Distribute.LayerNorm_2.bias": w[f"{s_pfx}0.norm2.bias"],
                    f"{t_pfx}1.PerceiverAttention.Parallel.Chain_2.Linear.weight": w[f"{s_pfx}0.to_q.weight"],
                    f"{t_pfx}1.PerceiverAttention.Parallel.Chain_1.Linear.weight": w[f"{s_pfx}0.to_kv.weight"],
                    f"{t_pfx}1.PerceiverAttention.Linear.weight": w[f"{s_pfx}0.to_out.weight"],
                    f"{t_pfx}2.LayerNorm.weight": w[f"{s_pfx}1.0.weight"],
                    f"{t_pfx}2.LayerNorm.bias": w[f"{s_pfx}1.0.bias"],
                    f"{t_pfx}2.FeedForward.Linear_1.weight": w[f"{s_pfx}1.1.weight"],
                    f"{t_pfx}2.FeedForward.Linear_2.weight": w[f"{s_pfx}1.3.weight"],
                }
            )
    else:
        image_proj_state_dict = {
            "Linear.weight": image_proj_weights["proj.weight"],
            "Linear.bias": image_proj_weights["proj.bias"],
            "LayerNorm.weight": image_proj_weights["norm.weight"],
            "LayerNorm.bias": image_proj_weights["norm.bias"],
        }
    ip_adapter.image_proj.load_state_dict(state_dict=image_proj_state_dict)

    for k, v in image_proj_state_dict.items():
        state_dict[f"image_proj.{k}"] = v

    assert len(ip_adapter.sub_adapters) == len(ip_adapter_weights.keys()) // 2

    for i, _ in enumerate(ip_adapter.sub_adapters):
        cross_attn_index = cross_attn_mapping[i]
        k_ip = f"{cross_attn_index}.to_k_ip.weight"
        v_ip = f"{cross_attn_index}.to_v_ip.weight"

        # the name of the key is not checked at runtime, so we keep the original name
        state_dict[f"ip_adapter.{i:03d}.to_k_ip.weight"] = ip_adapter_weights[k_ip]
        state_dict[f"ip_adapter.{i:03d}.to_v_ip.weight"] = ip_adapter_weights[v_ip]

    if args.half:
        state_dict = {key: value.half() for key, value in state_dict.items()}
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}.safetensors"
    save_to_safetensors(path=args.output_path, tensors=state_dict)


if __name__ == "__main__":
    main()

@@ -1,63 +0,0 @@
import argparse
from pathlib import Path

import torch
from diffusers import T2IAdapter  # type: ignore
from torch import nn

from refiners.fluxion.model_converter import ModelConverter
from refiners.foundationals.latent_diffusion.t2i_adapter import ConditionEncoder, ConditionEncoderXL

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a pretrained diffusers T2I-Adapter model to refiners")
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        required=True,
        help="Path or repository name of the source model. (e.g.: 'ip-adapter_sd15.bin').",
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Path to save the converted model (extension will be .safetensors). If not specified, the output path will"
            " be the source path with the extension changed to .safetensors."
        ),
    )
    parser.add_argument(
        "--half",
        action="store_true",
        dest="use_half",
        default=False,
        help="Use this flag to save the output file as half precision (default: full precision).",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        help="Use this flag to print verbose output during conversion.",
    )
    args = parser.parse_args()
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).name}.safetensors"
    assert args.output_path is not None

    sdxl = "xl" in args.source_path
    target = ConditionEncoderXL() if sdxl else ConditionEncoder()
    # low_cpu_mem_usage=False stops some annoying console messages telling us to `pip install accelerate`
    source: nn.Module = T2IAdapter.from_pretrained(  # type: ignore
        pretrained_model_name_or_path=args.source_path,
        low_cpu_mem_usage=False,
    )
    assert isinstance(source, nn.Module), "Source model is not a nn.Module"

    x = torch.randn(1, 3, 1024, 1024) if sdxl else torch.randn(1, 3, 512, 512)
    converter = ModelConverter(source_model=source, target_model=target, verbose=args.verbose)
    if not converter.run(source_args=(x,)):
        raise RuntimeError("Model conversion failed")

    converter.save_to_safetensors(path=args.output_path, half=args.use_half)

@@ -1,148 +0,0 @@
import argparse
from pathlib import Path
from typing import Any

import torch
from diffusers import UNet2DConditionModel  # type: ignore
from torch import nn

from refiners.fluxion.model_converter import ModelConverter
from refiners.fluxion.utils import load_from_safetensors, load_tensors
from refiners.foundationals.latent_diffusion import SD1UNet, SDXLUNet
from refiners.foundationals.latent_diffusion.stable_diffusion_xl.lcm import SDXLLcmAdapter


class Args(argparse.Namespace):
    source_path: str
    output_path: str | None
    subfolder: str
    half: bool
    verbose: bool
    skip_init_check: bool
    override_weights: str | None


def setup_converter(args: Args) -> ModelConverter:
    # low_cpu_mem_usage=False stops some annoying console messages telling us to `pip install accelerate`
    source: nn.Module = UNet2DConditionModel.from_pretrained(  # type: ignore
        pretrained_model_name_or_path=args.source_path,
        subfolder=args.subfolder,
        low_cpu_mem_usage=False,
    )
    if args.override_weights is not None:
        if args.override_weights.endswith(".pth"):
            sd = load_tensors(args.override_weights)
        elif args.override_weights.endswith(".safetensors"):
            sd = load_from_safetensors(args.override_weights)
        else:
            raise ValueError(f"Unsupported file format: {args.override_weights}")
        source.load_state_dict(sd)
    source_in_channels: int = source.config.in_channels  # type: ignore
    source_clip_embedding_dim: int = source.config.cross_attention_dim  # type: ignore
    source_has_time_ids: bool = source.config.addition_embed_type == "text_time"  # type: ignore
    source_is_lcm: bool = source.config.time_cond_proj_dim is not None

    if source_has_time_ids:
        target = SDXLUNet(in_channels=source_in_channels)
    else:
        target = SD1UNet(in_channels=source_in_channels)

    if source_is_lcm:
        assert isinstance(target, SDXLUNet)
        SDXLLcmAdapter(target=target).inject()

    x = torch.randn(1, source_in_channels, 32, 32)
    timestep = torch.tensor(data=[0])
    clip_text_embeddings = torch.randn(1, 77, source_clip_embedding_dim)

    target.set_timestep(timestep=timestep)
    target.set_clip_text_embedding(clip_text_embedding=clip_text_embeddings)
    added_cond_kwargs = {}
    if isinstance(target, SDXLUNet):
        added_cond_kwargs = {"text_embeds": torch.randn(1, 1280), "time_ids": torch.randn(1, 6)}
        target.set_time_ids(time_ids=added_cond_kwargs["time_ids"])
        target.set_pooled_text_embedding(pooled_text_embedding=added_cond_kwargs["text_embeds"])

    target_args = (x,)

    source_kwargs: dict[str, Any] = {}
    if source_has_time_ids:
        source_kwargs["added_cond_kwargs"] = added_cond_kwargs
    if source_is_lcm:
        source_kwargs["timestep_cond"] = torch.randn(1, source.config.time_cond_proj_dim)

    source_args = {
        "positional": (x, timestep, clip_text_embeddings),
        "keyword": source_kwargs,
    }

    converter = ModelConverter(
        source_model=source,
        target_model=target,
        skip_init_check=args.skip_init_check,
        skip_output_check=True,
        verbose=args.verbose,
    )
    if not converter.run(
        source_args=source_args,
        target_args=target_args,
    ):
        raise RuntimeError("Model conversion failed")
    return converter


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Converts a Diffusion UNet model to a Refiners SD1UNet or SDXLUNet model"
    )
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        default="runwayml/stable-diffusion-v1-5",
        help=(
            "Can be a path to a .bin file, a .safetensors file or a model name from the HuggingFace Hub. Default:"
            " runwayml/stable-diffusion-v1-5"
        ),
    )
    parser.add_argument(
        "--override-weights",
        type=str,
        default=None,
        help=(
            "Path to a weights file to override the source model (keeping its config). "
            "This is useful for models distributed as .pth files."
        ),
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Output path (.safetensors) for converted model. If not provided, the output path will be the same as the"
            " source path."
        ),
    )
    parser.add_argument("--subfolder", type=str, default="unet", help="Subfolder. Default: unet.")
    parser.add_argument(
        "--skip-init-check",
        action="store_true",
        help="Skip check that source and target have the same layers count.",
    )
    parser.add_argument("--half", action="store_true", help="Convert to half precision.")
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Prints additional information during conversion. Default: False",
    )
    args = parser.parse_args(namespace=Args())
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}-unet.safetensors"
    converter = setup_converter(args=args)
    converter.save_to_safetensors(path=args.output_path, half=args.half)


if __name__ == "__main__":
    main()

@@ -1,176 +0,0 @@
import argparse
from pathlib import Path

import torch

from refiners.fluxion.utils import load_tensors, save_to_safetensors


def convert_dinov2_facebook(weights: dict[str, torch.Tensor]) -> None:
    """Convert DINOv2 weights from facebook to refiners."""
    # get depth from "blocks" keys
    depth = max([int(k.split(".")[1]) for k in weights.keys() if k.startswith("blocks.")]) + 1

    # only needed when pre-training
    del weights["mask_token"]

    # squeeze cls_token and position_embeddings
    weights["cls_token"] = weights["cls_token"].squeeze(0)
    weights["pos_embed"] = weights["pos_embed"].squeeze(0)

    # rename "w12" to "fc1" and "w3" to "fc2", only for giant model
    for key in list(weights.keys()):
        if "w3" in key:
            new_key = key.replace("w3", "fc2")
            weights[new_key] = weights.pop(key)
        elif "w12" in key:
            # we swap w1 and w2 because of the difference between our GLU implementation and theirs
            # see https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/layers/swiglu_ffn.py#L31-L34
            # and https://github.com/finegrain-ai/refiners/blob/a2ee70578361e4d84a65a8708564480a9b0ec67e/src/refiners/fluxion/layers/activations.py#L158-L160
            weight = weights.pop(key)
            w1, w2 = weight.chunk(2, dim=0)
            w21 = torch.cat([w2, w1], dim=0)
            new_key = key.replace("w12", "fc1")
            weights[new_key] = w21

    rename_keys: list[tuple[str, str]] = [
        ("cls_token", "Concatenate.ClassToken.Parameter.weight"),
        ("pos_embed", "PositionalEncoder.PositionalEmbedding.Parameter.weight"),
        ("patch_embed.proj.weight", "Concatenate.PatchEncoder.Conv2d.weight"),
        ("patch_embed.proj.bias", "Concatenate.PatchEncoder.Conv2d.bias"),
        ("norm.weight", "LayerNorm.weight"),
        ("norm.bias", "LayerNorm.bias"),
    ]
    for i in range(depth):
        rename_keys.append(
            (
                f"blocks.{i}.norm1.weight",
                f"Transformer.TransformerLayer_{i+1}.Residual_1.LayerNorm.weight",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.norm1.bias",
                f"Transformer.TransformerLayer_{i+1}.Residual_1.LayerNorm.bias",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.attn.proj.weight",
                f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Linear.weight",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.attn.proj.bias",
                f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Linear.bias",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.ls1.gamma",
                f"Transformer.TransformerLayer_{i+1}.Residual_1.LayerScale.weight",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.norm2.weight",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.LayerNorm.weight",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.norm2.bias",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.LayerNorm.bias",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.mlp.fc1.weight",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.FeedForward.Linear_1.weight",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.mlp.fc1.bias",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.FeedForward.Linear_1.bias",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.mlp.fc2.weight",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.FeedForward.Linear_2.weight",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.mlp.fc2.bias",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.FeedForward.Linear_2.bias",
            ),
        )
        rename_keys.append(
            (
                f"blocks.{i}.ls2.gamma",
                f"Transformer.TransformerLayer_{i+1}.Residual_2.LayerScale.weight",
            ),
        )

    if "register_tokens" in weights:
        weights["register_tokens"] = weights["register_tokens"].squeeze(0)
        rename_keys.append(("register_tokens", "Registers.Parameter.weight"))

    # rename keys
    for old_key, new_key in rename_keys:
        weights[new_key] = weights.pop(old_key)

    # split the qkv weights and biases
    for i in range(depth):
        qkv_weight = weights.pop(f"blocks.{i}.attn.qkv.weight")
        q_weight, k_weight, v_weight = qkv_weight.chunk(3, dim=0)
        weights[f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Distribute.Linear_1.weight"] = q_weight
        weights[f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Distribute.Linear_2.weight"] = k_weight
        weights[f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Distribute.Linear_3.weight"] = v_weight

        qkv_bias = weights.pop(f"blocks.{i}.attn.qkv.bias")
        q_bias, k_bias, v_bias = qkv_bias.chunk(3, dim=0)
        weights[f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Distribute.Linear_1.bias"] = q_bias
        weights[f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Distribute.Linear_2.bias"] = k_bias
        weights[f"Transformer.TransformerLayer_{i+1}.Residual_1.SelfAttention.Distribute.Linear_3.bias"] = v_bias


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--from",
        type=str,
        required=True,
        dest="source_path",
        help=(
            "Official checkpoint from https://github.com/facebookresearch/dinov2"
            " e.g. /path/to/dinov2_vits14_pretrain.pth"
        ),
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Path to save the converted model. If not specified, the output path will be the source path with the"
            " extension changed to .safetensors."
        ),
    )
    parser.add_argument("--half", action="store_true", dest="half")
    args = parser.parse_args()

    weights = load_tensors(args.source_path)
    convert_dinov2_facebook(weights)
    if args.half:
        weights = {key: value.half() for key, value in weights.items()}
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}.safetensors"
    save_to_safetensors(path=args.output_path, tensors=weights)


if __name__ == "__main__":
    main()

@@ -1,102 +0,0 @@
import argparse
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download  # type: ignore

from refiners.fluxion.utils import load_from_safetensors, save_to_safetensors


class Args(argparse.Namespace):
    source_path: str
    output_path: str | None
    use_half: bool


def convert(args: Args) -> dict[str, torch.Tensor]:
    if Path(args.source_path).suffix != ".safetensors":
        args.source_path = hf_hub_download(
            repo_id=args.source_path, filename="ella-sd1.5-tsc-t5xl.safetensors", local_dir="tests/weights/ELLA-Adapter"
        )
    weights = load_from_safetensors(args.source_path)

    for key in list(weights.keys()):
        if "latents" in key:
            new_key = "PerceiverResampler.Latents.ParameterInitialized.weight"
            weights[new_key] = weights.pop(key)
        elif "time_embedding" in key:
            new_key = key.replace("time_embedding", "TimestepEncoder.RangeEncoder").replace("linear", "Linear")
            weights[new_key] = weights.pop(key)
        elif "proj_in" in key:
            new_key = f"PerceiverResampler.Linear.{key.split('.')[-1]}"
            weights[new_key] = weights.pop(key)
        elif "time_aware" in key:
            new_key = f"PerceiverResampler.Residual.Linear.{key.split('.')[-1]}"
            weights[new_key] = weights.pop(key)
        elif "attn.in_proj" in key:
            layer_num = int(key.split(".")[2])
            query_param, key_param, value_param = weights.pop(key).chunk(3, dim=0)
            param_type = "weight" if "weight" in key else "bias"
            for i, param in enumerate([query_param, key_param, value_param]):
                new_key = f"PerceiverResampler.Transformer.TransformerLayer_{layer_num+1}.Residual_1.PerceiverAttention.Attention.Distribute.Linear_{i+1}.{param_type}"
                weights[new_key] = param
        elif "attn.out_proj" in key:
            layer_num = int(key.split(".")[2])
            new_key = f"PerceiverResampler.Transformer.TransformerLayer_{layer_num+1}.Residual_1.PerceiverAttention.Attention.Linear.{key.split('.')[-1]}"
            weights[new_key] = weights.pop(key)
        elif "ln_ff" in key:
            layer_num = int(key.split(".")[2])
            new_key = f"PerceiverResampler.Transformer.TransformerLayer_{layer_num+1}.Residual_2.AdaLayerNorm.Parallel.Chain.Linear.{key.split('.')[-1]}"
            weights[new_key] = weights.pop(key)
        elif "ln_1" in key or "ln_2" in key:
            layer_num = int(key.split(".")[2])
            n = 1 if int(key.split(".")[3].split("_")[-1]) == 2 else 2
            new_key = f"PerceiverResampler.Transformer.TransformerLayer_{layer_num+1}.Residual_1.PerceiverAttention.Distribute.AdaLayerNorm_{n}.Parallel.Chain.Linear.{key.split('.')[-1]}"
            weights[new_key] = weights.pop(key)
        elif "mlp" in key:
            layer_num = int(key.split(".")[2])
            n = 1 if "c_fc" in key else 2
            new_key = f"PerceiverResampler.Transformer.TransformerLayer_{layer_num+1}.Residual_2.FeedForward.Linear_{n}.{key.split('.')[-1]}"
            weights[new_key] = weights.pop(key)

    if args.use_half:
        weights = {key: value.half() for key, value in weights.items()}

    return weights


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a pretrained ELLA Adapter to the refiners implementation")
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        default="QQGYLab/ELLA",
        help=(
            "A path to a local .safetensors weights file. If not provided, a repo from the Hugging Face Hub will be"
            " used. Defaults to QQGYLab/ELLA."
        ),
    )

    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Path to save the converted model (extension will be .safetensors). If not specified, the output path will"
            " be the source path with the prefix set to refiners."
        ),
    )
    parser.add_argument(
        "--half",
        action="store_true",
        dest="use_half",
        default=True,
        help="Use this flag to save the output file as half precision (default: full precision).",
    )
    args = parser.parse_args(namespace=Args())
    weights = convert(args)
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}-refiners.safetensors"
    save_to_safetensors(path=args.output_path, tensors=weights)

@@ -1,348 +0,0 @@
import argparse
import logging
from logging import info
from pathlib import Path

from huggingface_hub import hf_hub_download  # type: ignore
from torch import Tensor
from torch.nn import Parameter as TorchParameter

from refiners.fluxion.adapters.lora import Lora, LoraAdapter, auto_attach_loras
from refiners.fluxion.layers import Conv2d
from refiners.fluxion.layers.linear import Linear
from refiners.fluxion.utils import load_from_safetensors, save_to_safetensors
from refiners.foundationals.latent_diffusion.lora import SDLoraManager
from refiners.foundationals.latent_diffusion.stable_diffusion_xl.control_lora import (
    ConditionEncoder,
    ControlLora,
    ControlLoraAdapter,
    ZeroConvolution,
)
from refiners.foundationals.latent_diffusion.stable_diffusion_xl.model import StableDiffusion_XL


def sort_keys(key: str, /) -> tuple[str, int]:
    """Compute the score of a key, relative to its suffix.

    When used by [`sorted`][sorted], the keys will only be sorted "at the suffix level".

    Args:
        key: The key to sort.

    Returns:
        The padded suffix of the key.
        The score of the key's suffix.
    """
    if "time_embed" in key:  # HACK: will place the "time_embed" layers at the very start of the list
        return ("", -2)

    if "label_emb" in key:  # HACK: will place the "label_emb" layers right after "time_embed"
        return ("", -1)

    if "proj_out" in key:  # HACK: will place the "proj_out" layers at the end of each "transformer_blocks"
        return (key.removesuffix("proj_out") + "transformer_blocks.99.ff.net.2", 10)

    return SDLoraManager.sort_keys(key)


def load_lora_layers(
    name: str,
    state_dict: dict[str, Tensor],
    control_lora: ControlLora,
) -> dict[str, Lora[Linear | Conv2d]]:
    """Load the LoRA layers from the state_dict into the ControlLora.

    Args:
        name: The name of the LoRA.
        state_dict: The state_dict of the LoRA.
        control_lora: The ControlLora to load the LoRA layers into.
    """
    # filter from the state_dict the layers that will be used for the LoRA layers
    lora_weights = {f"{key}.weight": value for key, value in state_dict.items() if ".up" in key or ".down" in key}

    # move the tensors to the device and dtype of the ControlLora
    lora_weights = {
        key: value.to(
            dtype=control_lora.dtype,
            device=control_lora.device,
        )
        for key, value in lora_weights.items()
    }

    # load every LoRA layer from the filtered state_dict
    lora_layers = Lora.from_dict(name, state_dict=lora_weights)

    # sort all the LoRA's keys using the `sort_keys` method
    lora_layers = {
        key: lora_layers[key]
        for key in sorted(
            lora_layers.keys(),
            key=sort_keys,
        )
    }

    # auto-attach the LoRA layers to the U-Net
    auto_attach_loras(lora_layers, control_lora, exclude=["ZeroConvolution", "ConditionEncoder"])

    # eject all the LoRA adapters from the U-Net,
    # because we need each target path as if the adapter wasn't injected
    for lora_layer in lora_layers.values():
        lora_adapter = lora_layer.parent
        assert isinstance(lora_adapter, LoraAdapter)
        lora_adapter.eject()

    return lora_layers


def load_condition_encoder(
    state_dict: dict[str, Tensor],
    control_lora: ControlLora,
) -> None:
    """Load the ConditionEncoder's Conv2d layers from the state_dict into the ControlLora.

    Args:
        state_dict: The state_dict of the ConditionEncoder.
        control_lora: The control_lora to load the ConditionEncoder's Conv2d layers into.
    """
    # filter from the state_dict the layers that will be used for the ConditionEncoder
    condition_encoder_tensors = {key: value for key, value in state_dict.items() if "input_hint_block" in key}

    # move the tensors to the device and dtype of the ControlLora
    condition_encoder_tensors = {
        key: value.to(
            dtype=control_lora.dtype,
            device=control_lora.device,
        )
        for key, value in condition_encoder_tensors.items()
    }

    # find the ConditionEncoder's Conv2d layers
    condition_encoder_layer = control_lora.ensure_find(ConditionEncoder)
    condition_encoder_conv2ds = list(condition_encoder_layer.layers(Conv2d))

    # replace the Conv2d layers' weights and biases with the ones from the state_dict
    for i, layer in enumerate(condition_encoder_conv2ds):
        layer.weight = TorchParameter(condition_encoder_tensors[f"input_hint_block.{i*2}.weight"])
        layer.bias = TorchParameter(condition_encoder_tensors[f"input_hint_block.{i*2}.bias"])


def load_zero_convolutions(
    state_dict: dict[str, Tensor],
    control_lora: ControlLora,
) -> None:
    """Load the ZeroConvolution's Conv2d layers from the state_dict into the ControlLora.

    Args:
        state_dict: The state_dict of the ZeroConvolution.
        control_lora: The ControlLora to load the ZeroConvolution's Conv2d layers into.
    """
    # filter from the state_dict the layers that will be used for the ZeroConvolution layers
    zero_convolution_tensors = {key: value for key, value in state_dict.items() if "zero_convs" in key}
    n = len(zero_convolution_tensors) // 2
    zero_convolution_tensors[f"zero_convs.{n}.0.weight"] = state_dict["middle_block_out.0.weight"]
    zero_convolution_tensors[f"zero_convs.{n}.0.bias"] = state_dict["middle_block_out.0.bias"]

    # move the tensors to the device and dtype of the ControlLora
    zero_convolution_tensors = {
        key: value.to(
            dtype=control_lora.dtype,
            device=control_lora.device,
        )
        for key, value in zero_convolution_tensors.items()
    }

    # find the ZeroConvolution's Conv2d layers
    zero_convolution_layers = list(control_lora.layers(ZeroConvolution))
    zero_convolution_conv2ds = [layer.ensure_find(Conv2d) for layer in zero_convolution_layers]

    # replace the Conv2d layers' weights and biases with the ones from the state_dict
    for i, layer in enumerate(zero_convolution_conv2ds):
        layer.weight = TorchParameter(zero_convolution_tensors[f"zero_convs.{i}.0.weight"])
        layer.bias = TorchParameter(zero_convolution_tensors[f"zero_convs.{i}.0.bias"])


def simplify_key(key: str, prefix: str, index: int | None = None) -> str:
    """Simplify a key by stripping everything to the left of the prefix.

    Also optionally add a zero-padded index to the prefix.

    Example:
        >>> simplify_key("foo.bar.ControlLora.something", "ControlLora", 1)
        "ControlLora_01.something"

        >>> simplify_key("foo.bar.ControlLora.DownBlocks.something", "ControlLora")
        "ControlLora.DownBlocks.something"

    Args:
        key: The key to simplify.
        prefix: The prefix to remove.
        index: The index to add.
    """
    _, right = key.split(prefix, maxsplit=1)
    if index:
        return f"{prefix}_{index:02d}{right}"
    else:
        return f"{prefix}{right}"


def convert_lora_layers(
    lora_layers: dict[str, Lora[Linear | Conv2d]],
    control_lora: ControlLora,
    refiners_state_dict: dict[str, Tensor],
) -> None:
    """Convert the LoRA layers to the refiners format.

    Args:
        lora_layers: The LoRA layers to convert.
        control_lora: The ControlLora to convert the LoRA layers from.
        refiners_state_dict: The refiners state dict to update with the converted LoRA layers.
    """
    for lora_layer in lora_layers.values():
        # get the adapter associated with the LoRA layer
        lora_adapter = lora_layer.parent
        assert isinstance(lora_adapter, LoraAdapter)

        # get the path of the adapter's target in the ControlLora
        target = lora_adapter.target
        path = target.get_path(parent=control_lora.ensure_find_parent(target))

        state_dict = {
            f"{path}.down": lora_layer.down.weight,
            f"{path}.up": lora_layer.up.weight,
        }
        state_dict = {simplify_key(key, "ControlLora."): param for key, param in state_dict.items()}
        refiners_state_dict.update(state_dict)


def convert_zero_convolutions(
    control_lora: ControlLora,
    refiners_state_dict: dict[str, Tensor],
) -> None:
    """Convert the ZeroConvolution layers to the refiners format.

    Args:
        control_lora: The ControlLora to convert the ZeroConvolution layers from.
        refiners_state_dict: The refiners state dict to update with the converted ZeroConvolution layers.
    """
    zero_convolution_layers = list(control_lora.layers(ZeroConvolution))
    for i, zero_convolution_layer in enumerate(zero_convolution_layers):
        state_dict = zero_convolution_layer.state_dict()
        path = zero_convolution_layer.get_path()
        state_dict = {f"{path}.{key}": param for key, param in state_dict.items()}
        state_dict = {simplify_key(key, "ZeroConvolution", i + 1): param for key, param in state_dict.items()}
        refiners_state_dict.update(state_dict)


def convert_condition_encoder(
    control_lora: ControlLora,
    refiners_state_dict: dict[str, Tensor],
) -> None:
    """Convert the ConditionEncoder to the refiners format.

    Args:
        control_lora: The ControlLora to convert the ConditionEncoder from.
        refiners_state_dict: The refiners state dict to update with the converted ConditionEncoder.
    """
    condition_encoder_layer = control_lora.ensure_find(ConditionEncoder)
    path = condition_encoder_layer.get_path()
    state_dict = condition_encoder_layer.state_dict()
    state_dict = {f"{path}.{key}": param for key, param in state_dict.items()}
    state_dict = {simplify_key(key, "ConditionEncoder"): param for key, param in state_dict.items()}
    refiners_state_dict.update(state_dict)


def convert(
    name: str,
    state_dict_path: Path,
    output_path: Path,
) -> None:
    sdxl = StableDiffusion_XL()
    info("Stable Diffusion XL model initialized")

    fooocus_state_dict = load_from_safetensors(state_dict_path)
    info(f"Fooocus weights loaded from: {state_dict_path}")

    control_lora_adapter = ControlLoraAdapter(target=sdxl.unet, name=name).inject()
    control_lora = control_lora_adapter.control_lora
    info("ControlLoraAdapter initialized")

    lora_layers = load_lora_layers(name, fooocus_state_dict, control_lora)
    info("LoRA layers loaded")

    load_zero_convolutions(fooocus_state_dict, control_lora)
    info("ZeroConvolution layers loaded")

    load_condition_encoder(fooocus_state_dict, control_lora)
    info("ConditionEncoder loaded")

    refiners_state_dict: dict[str, Tensor] = {}
    convert_lora_layers(lora_layers, control_lora, refiners_state_dict)
    info("LoRA layers converted to refiners format")

    convert_zero_convolutions(control_lora, refiners_state_dict)
    info("ZeroConvolution layers converted to refiners format")

    convert_condition_encoder(control_lora, refiners_state_dict)
    info("ConditionEncoder converted to refiners format")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    save_to_safetensors(path=output_path, tensors=refiners_state_dict)
    info(f"Converted ControlLora state dict saved to disk at: {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert ControlLora (from Fooocus) weights to refiners.",
    )

    parser.add_argument(
        "--from",
        type=Path,
        dest="source_path",
        default="lllyasviel/misc:control-lora-canny-rank128.safetensors",
        help="Path to the state_dict of the ControlLora, or a Hugging Face model ID.",
    )

    parser.add_argument(
        "--to",
        type=Path,
        dest="output_path",
        help=(
            "Path to save the converted model (extension will be .safetensors). "
            "If not specified, the output path will be the source path with the extension changed to .safetensors."
        ),
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        help="Use this flag to print verbose output during conversion.",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(
            level=logging.INFO,
            format="%(levelname)s: %(message)s",
        )

    if not args.source_path.exists():
        repo_id, filename = str(args.source_path).split(":")
        args.source_path = Path(
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
            )
        )

    if args.output_path is None:
        args.output_path = Path(f"refiners_{args.source_path.stem}.safetensors")

    convert(
        name=args.source_path.stem,
        state_dict_path=args.source_path,
        output_path=args.output_path,
    )

@@ -1,81 +0,0 @@
import argparse

from torch import Tensor

from refiners.fluxion.utils import load_tensors, save_to_safetensors


def main() -> None:
    parser = argparse.ArgumentParser(description="Convert HQ SAM model to Refiners state_dict format")
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        required=True,
        default="sam_hq_vit_h.pth",
        help="Path to the source model checkpoint.",
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        required=True,
        default="refiners_sam_hq_vit_h.safetensors",
        help="Path to save the converted model in Refiners format.",
    )
    args = parser.parse_args()

    source_state_dict = load_tensors(args.source_path)

    state_dict: dict[str, Tensor] = {}

    for suffix in ["weight", "bias"]:
        state_dict[f"HQFeatures.CompressViTFeat.ConvTranspose2d_1.{suffix}"] = source_state_dict[
            f"mask_decoder.compress_vit_feat.0.{suffix}"
        ]
        state_dict[f"HQFeatures.EmbeddingEncoder.ConvTranspose2d_1.{suffix}"] = source_state_dict[
            f"mask_decoder.embedding_encoder.0.{suffix}"
        ]
        state_dict[f"EmbeddingMaskfeature.Conv2d_1.{suffix}"] = source_state_dict[
            f"mask_decoder.embedding_maskfeature.0.{suffix}"
        ]

        state_dict[f"HQFeatures.CompressViTFeat.LayerNorm2d.{suffix}"] = source_state_dict[
            f"mask_decoder.compress_vit_feat.1.{suffix}"
        ]
        state_dict[f"HQFeatures.EmbeddingEncoder.LayerNorm2d.{suffix}"] = source_state_dict[
            f"mask_decoder.embedding_encoder.1.{suffix}"
        ]
        state_dict[f"EmbeddingMaskfeature.LayerNorm2d.{suffix}"] = source_state_dict[
            f"mask_decoder.embedding_maskfeature.1.{suffix}"
        ]

        state_dict[f"HQFeatures.CompressViTFeat.ConvTranspose2d_2.{suffix}"] = source_state_dict[
            f"mask_decoder.compress_vit_feat.3.{suffix}"
        ]
        state_dict[f"HQFeatures.EmbeddingEncoder.ConvTranspose2d_2.{suffix}"] = source_state_dict[
            f"mask_decoder.embedding_encoder.3.{suffix}"
        ]
        state_dict[f"EmbeddingMaskfeature.Conv2d_2.{suffix}"] = source_state_dict[
            f"mask_decoder.embedding_maskfeature.3.{suffix}"
        ]

    state_dict = {f"Chain.HQSAMMaskPrediction.Chain.DenseEmbeddingUpscalingHQ.{k}": v for k, v in state_dict.items()}

    # HQ Token
    state_dict["MaskDecoderTokensExtender.hq_token.weight"] = source_state_dict["mask_decoder.hf_token.weight"]

    # HQ MLP
    for i in range(3):
        state_dict[f"Chain.HQSAMMaskPrediction.HQTokenMLP.MultiLinear.Linear_{i+1}.weight"] = source_state_dict[
            f"mask_decoder.hf_mlp.layers.{i}.weight"
        ]
        state_dict[f"Chain.HQSAMMaskPrediction.HQTokenMLP.MultiLinear.Linear_{i+1}.bias"] = source_state_dict[
            f"mask_decoder.hf_mlp.layers.{i}.bias"
        ]

    save_to_safetensors(path=args.output_path, tensors=state_dict)


if __name__ == "__main__":
    main()

@@ -1,89 +0,0 @@
import argparse
from pathlib import Path

from convert_diffusers_unet import Args as UNetArgs, setup_converter as setup_unet_converter
from huggingface_hub import hf_hub_download  # type: ignore

from refiners.fluxion.utils import load_from_safetensors, save_to_safetensors


class Args(argparse.Namespace):
    source_path: str
    output_path: str | None
    subfolder: str
    half: bool
    verbose: bool
    reference_unet_path: str


def main() -> None:
    parser = argparse.ArgumentParser(description="Converts IC-Light patch weights to work with Refiners")
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        default="lllyasviel/ic-light",
        help=(
            "Can be a path to a .bin file, a .safetensors file or a model name from the Hugging Face Hub. Default:"
            " lllyasviel/ic-light"
        ),
    )
    parser.add_argument("--filename", type=str, default="iclight_sd15_fc.safetensors", help="Filename inside the hub.")
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Output path (.safetensors) for converted model. If not provided, the output path will be the same as the"
            " source path."
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Prints additional information during conversion. Default: False",
    )
    parser.add_argument(
        "--reference-unet-path",
        type=str,
        dest="reference_unet_path",
        default="runwayml/stable-diffusion-v1-5",
        help="Path to the reference UNet weights.",
    )
    args = parser.parse_args(namespace=Args())
    if args.output_path is None:
        args.output_path = f"{Path(args.filename).stem}-refiners.safetensors"

    patch_file = (
        Path(args.source_path)
        if args.source_path.endswith(".safetensors")
        else Path(
            hf_hub_download(
                repo_id=args.source_path,
                filename=args.filename,
            )
        )
    )
    patch_weights = load_from_safetensors(patch_file)

    unet_args = UNetArgs(
        source_path=args.reference_unet_path,
        subfolder="unet",
        half=False,
        verbose=False,
        skip_init_check=True,
        override_weights=None,
    )
    converter = setup_unet_converter(args=unet_args)
    result = converter._convert_state_dict(  # pyright: ignore[reportPrivateUsage]
        source_state_dict=patch_weights,
        target_state_dict=converter.target_model.state_dict(),
        state_dict_mapping=converter.get_mapping(),
    )
    save_to_safetensors(path=args.output_path, tensors=result)


if __name__ == "__main__":
    main()
|
|
@ -1,65 +0,0 @@
|
|||
import argparse
|
||||
from typing import cast
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from refiners.fluxion.model_converter import ModelConverter
|
||||
from refiners.fluxion.utils import load_tensors
|
||||
from refiners.foundationals.latent_diffusion.preprocessors.informative_drawings import InformativeDrawings
|
||||
|
||||
try:
|
||||
from model import Generator # type: ignore
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please download the model.py file from https://github.com/carolineec/informative-drawings and add it to your"
|
||||
" PYTHONPATH"
|
||||
)
|
||||
|
||||
|
||||
class Args(argparse.Namespace):
|
||||
source_path: str
|
||||
output_path: str
|
||||
verbose: bool
|
||||
half: bool
|
||||
|
||||
|
||||
def setup_converter(args: Args) -> ModelConverter:
|
||||
source = cast(nn.Module, Generator(3, 1, 3))
|
||||
source.load_state_dict(state_dict=load_tensors(args.source_path))
|
||||
source.eval()
|
||||
target = InformativeDrawings()
|
||||
x = torch.randn(1, 3, 512, 512)
|
||||
converter = ModelConverter(source_model=source, target_model=target, skip_output_check=True, verbose=args.verbose)
|
||||
if not converter.run(source_args=(x,)):
|
||||
raise RuntimeError("Model conversion failed")
|
||||
return converter
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Converts a pretrained Informative Drawings model to a refiners Informative Drawings model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--from",
|
||||
type=str,
|
||||
dest="source_path",
|
||||
default="model2.pth",
|
||||
help="Path to the source model. (default: 'model2.pth').",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--to",
|
||||
type=str,
|
||||
dest="output_path",
|
||||
default="informative-drawings.safetensors",
|
||||
help="Path to save the converted model. (default: 'informative-drawings.safetensors').",
|
||||
)
|
||||
parser.add_argument("--verbose", action="store_true", dest="verbose")
|
||||
parser.add_argument("--half", action="store_true", dest="half")
|
||||
args = parser.parse_args(namespace=Args())
|
||||
converter = setup_converter(args=args)
|
||||
converter.save_to_safetensors(path=args.output_path, half=args.half)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,40 +0,0 @@
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from refiners.fluxion.utils import load_tensors, save_to_safetensors
|
||||
from refiners.foundationals.swin.mvanet.converter import convert_weights
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--from",
|
||||
type=str,
|
||||
required=True,
|
||||
dest="source_path",
|
||||
help="A MVANet checkpoint. One can be found at https://github.com/qianyu-dlut/MVANet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--to",
|
||||
type=str,
|
||||
dest="output_path",
|
||||
default=None,
|
||||
help=(
|
||||
"Path to save the converted model. If not specified, the output path will be the source path with the"
|
||||
" extension changed to .safetensors."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--half", action="store_true", dest="half")
|
||||
args = parser.parse_args()
|
||||
|
||||
src_weights = load_tensors(args.source_path)
|
||||
weights = convert_weights(src_weights)
|
||||
if args.half:
|
||||
weights = {key: value.half() for key, value in weights.items()}
|
||||
if args.output_path is None:
|
||||
args.output_path = f"{Path(args.source_path).stem}.safetensors"
|
||||
save_to_safetensors(path=args.output_path, tensors=weights)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,268 +0,0 @@
|
|||
import argparse
|
||||
import types
|
||||
from typing import Any, Callable, cast
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from segment_anything import build_sam_vit_h # type: ignore
|
||||
from segment_anything.modeling.common import LayerNorm2d # type: ignore
|
||||
from torch import Tensor
|
||||
|
||||
import refiners.fluxion.layers as fl
|
||||
from refiners.fluxion.model_converter import ModelConverter
|
||||
from refiners.fluxion.utils import load_tensors, manual_seed, save_to_safetensors
|
||||
from refiners.foundationals.segment_anything.image_encoder import PositionalEncoder, SAMViTH
|
||||
from refiners.foundationals.segment_anything.mask_decoder import MaskDecoder
|
||||
from refiners.foundationals.segment_anything.prompt_encoder import MaskEncoder, PointEncoder
|
||||
|
||||
|
||||
class FacebookSAM(nn.Module):
|
||||
image_encoder: nn.Module
|
||||
prompt_encoder: nn.Module
|
||||
mask_decoder: nn.Module
|
||||
|
||||
|
||||
build_sam_vit_h = cast(Callable[[], FacebookSAM], build_sam_vit_h)
|
||||
|
||||
|
||||
assert issubclass(LayerNorm2d, nn.Module)
|
||||
custom_layers = {LayerNorm2d: fl.LayerNorm2d}
|
||||
|
||||
|
||||
class Args(argparse.Namespace):
|
||||
source_path: str
|
||||
output_path: str
|
||||
half: bool
|
||||
verbose: bool
|
||||
|
||||
|
||||
def convert_mask_encoder(prompt_encoder: nn.Module) -> dict[str, Tensor]:
|
||||
manual_seed(seed=0)
|
||||
refiners_mask_encoder = MaskEncoder()
|
||||
|
||||
converter = ModelConverter(
|
||||
source_model=prompt_encoder.mask_downscaling,
|
||||
target_model=refiners_mask_encoder,
|
||||
custom_layer_mapping=custom_layers, # type: ignore
|
||||
)
|
||||
|
||||
x = torch.randn(1, 256, 256)
|
||||
mapping = converter.map_state_dicts(source_args=(x,))
|
||||
assert mapping
|
||||
|
||||
source_state_dict = prompt_encoder.mask_downscaling.state_dict()
|
||||
target_state_dict = refiners_mask_encoder.state_dict()
|
||||
|
||||
# Mapping handled manually (see below) because nn.Parameter is a special case
|
||||
del target_state_dict["no_mask_embedding"]
|
||||
|
||||
converted_source = converter._convert_state_dict( # pyright: ignore[reportPrivateUsage]
|
||||
source_state_dict=source_state_dict, target_state_dict=target_state_dict, state_dict_mapping=mapping
|
||||
)
|
||||
|
||||
state_dict: dict[str, Tensor] = {
|
||||
"no_mask_embedding": nn.Parameter(data=prompt_encoder.no_mask_embed.weight.clone()), # type: ignore
|
||||
}
|
||||
|
||||
state_dict.update(converted_source)
|
||||
|
||||
refiners_mask_encoder.load_state_dict(state_dict=state_dict)
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
def convert_point_encoder(prompt_encoder: nn.Module) -> dict[str, Tensor]:
|
||||
manual_seed(seed=0)
|
||||
point_embeddings: list[Tensor] = [pe.weight for pe in prompt_encoder.point_embeddings] + [
|
||||
prompt_encoder.not_a_point_embed.weight
|
||||
] # type: ignore
|
||||
pe = prompt_encoder.pe_layer.positional_encoding_gaussian_matrix # type: ignore
|
||||
assert isinstance(pe, Tensor)
|
||||
state_dict: dict[str, Tensor] = {
|
||||
"Residual.PointTypeEmbedding.weight": nn.Parameter(data=torch.cat(tensors=point_embeddings, dim=0)),
|
||||
"CoordinateEncoder.Linear.weight": nn.Parameter(data=pe.T.contiguous()),
|
||||
}
|
||||
|
||||
refiners_prompt_encoder = PointEncoder()
|
||||
refiners_prompt_encoder.load_state_dict(state_dict=state_dict)
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
def convert_vit(vit: nn.Module) -> dict[str, Tensor]:
|
||||
manual_seed(seed=0)
|
||||
refiners_sam_vit_h = SAMViTH()
|
||||
|
||||
converter = ModelConverter(
|
||||
source_model=vit,
|
||||
target_model=refiners_sam_vit_h,
|
||||
custom_layer_mapping=custom_layers, # type: ignore
|
||||
)
|
||||
converter.skip_init_check = True
|
||||
|
||||
x = torch.randn(1, 3, 1024, 1024)
|
||||
mapping = converter.map_state_dicts(source_args=(x,))
|
||||
assert mapping
|
||||
|
||||
mapping["PositionalEncoder.Parameter.weight"] = "pos_embed"
|
||||
|
||||
target_state_dict = refiners_sam_vit_h.state_dict()
|
||||
del target_state_dict["PositionalEncoder.Parameter.weight"]
|
||||
|
||||
source_state_dict = vit.state_dict()
|
||||
pos_embed = source_state_dict["pos_embed"]
|
||||
del source_state_dict["pos_embed"]
|
||||
|
||||
target_rel_keys = [
|
||||
(
|
||||
f"Transformer.TransformerLayer_{i}.Residual_1.FusedSelfAttention.RelativePositionAttention.horizontal_embedding",
|
||||
f"Transformer.TransformerLayer_{i}.Residual_1.FusedSelfAttention.RelativePositionAttention.vertical_embedding",
|
||||
)
|
||||
for i in range(1, 33)
|
||||
]
|
||||
source_rel_keys = [(f"blocks.{i}.attn.rel_pos_w", f"blocks.{i}.attn.rel_pos_h") for i in range(32)]
|
||||
|
||||
rel_items: dict[str, Tensor] = {}
|
||||
|
||||
for (key_w, key_h), (target_key_w, target_key_h) in zip(source_rel_keys, target_rel_keys):
|
||||
rel_items[target_key_w] = source_state_dict[key_w]
|
||||
rel_items[target_key_h] = source_state_dict[key_h]
|
||||
del source_state_dict[key_w]
|
||||
del source_state_dict[key_h]
|
||||
del target_state_dict[target_key_w]
|
||||
del target_state_dict[target_key_h]
|
||||
|
||||
converted_source = converter._convert_state_dict( # pyright: ignore[reportPrivateUsage]
|
||||
source_state_dict=source_state_dict, target_state_dict=target_state_dict, state_dict_mapping=mapping
|
||||
)
|
||||
|
||||
positional_encoder = refiners_sam_vit_h.layer("PositionalEncoder", PositionalEncoder)
|
||||
embed = pos_embed.reshape_as(positional_encoder.layer("Parameter", fl.Parameter).weight)
|
||||
converted_source["PositionalEncoder.Parameter.weight"] = embed # type: ignore
|
||||
converted_source.update(rel_items)
|
||||
|
||||
refiners_sam_vit_h.load_state_dict(state_dict=converted_source)
|
||||
assert converter.compare_models((x,), threshold=1e-2)
|
||||
|
||||
return converted_source
|
||||
|
||||
|
||||
def convert_mask_decoder(mask_decoder: nn.Module) -> dict[str, Tensor]:
|
||||
manual_seed(seed=0)
|
||||
|
||||
refiners_mask_decoder = MaskDecoder()
|
||||
|
||||
image_embedding = torch.randn(1, 256, 64, 64)
|
||||
dense_positional_embedding = torch.randn(1, 256, 64, 64)
|
||||
point_embedding = torch.randn(1, 3, 256)
|
||||
mask_embedding = torch.randn(1, 256, 64, 64)
|
||||
|
||||
from segment_anything.modeling.common import LayerNorm2d # type: ignore
|
||||
|
||||
import refiners.fluxion.layers as fl
|
||||
|
||||
assert issubclass(LayerNorm2d, nn.Module)
|
||||
custom_layers = {LayerNorm2d: fl.LayerNorm2d}
|
||||
|
||||
converter = ModelConverter(
|
||||
source_model=mask_decoder,
|
||||
target_model=refiners_mask_decoder,
|
||||
custom_layer_mapping=custom_layers, # type: ignore
|
||||
)
|
||||
|
||||
inputs = {
|
||||
"image_embeddings": image_embedding,
|
||||
"image_pe": dense_positional_embedding,
|
||||
"sparse_prompt_embeddings": point_embedding,
|
||||
"dense_prompt_embeddings": mask_embedding,
|
||||
"multimask_output": True,
|
||||
}
|
||||
|
||||
refiners_mask_decoder.set_image_embedding(image_embedding)
|
||||
refiners_mask_decoder.set_point_embedding(point_embedding)
|
||||
refiners_mask_decoder.set_mask_embedding(mask_embedding)
|
||||
refiners_mask_decoder.set_dense_positional_embedding(dense_positional_embedding)
|
||||
|
||||
mapping = converter.map_state_dicts(source_args=inputs, target_args={})
|
||||
assert mapping is not None
|
||||
mapping["MaskDecoderTokens.Parameter"] = "iou_token"
|
||||
|
||||
state_dict = converter._convert_state_dict( # type: ignore
|
||||
source_state_dict=mask_decoder.state_dict(),
|
||||
target_state_dict=refiners_mask_decoder.state_dict(),
|
||||
state_dict_mapping=mapping,
|
||||
)
|
||||
state_dict["MaskDecoderTokens.Parameter.weight"] = torch.cat(
|
||||
tensors=[mask_decoder.iou_token.weight, mask_decoder.mask_tokens.weight], dim=0
|
||||
) # type: ignore
|
||||
refiners_mask_decoder.load_state_dict(state_dict=state_dict)
|
||||
|
||||
refiners_mask_decoder.set_image_embedding(image_embedding)
|
||||
refiners_mask_decoder.set_point_embedding(point_embedding)
|
||||
refiners_mask_decoder.set_mask_embedding(mask_embedding)
|
||||
refiners_mask_decoder.set_dense_positional_embedding(dense_positional_embedding)
|
||||
|
||||
# Perform (1) upscaling then (2) mask prediction in this order (= like in the official implementation) to make
|
||||
# `compare_models` happy (MaskPrediction's Matmul runs those in the reverse order by default)
|
||||
matmul = refiners_mask_decoder.ensure_find(fl.Matmul)
|
||||
|
||||
def forward_swapped_order(self: Any, *args: Any) -> Any:
|
||||
y = self[1](*args) # (1)
|
||||
x = self[0](*args) # (2)
|
||||
return torch.matmul(input=x, other=y)
|
||||
|
||||
matmul.forward = types.MethodType(forward_swapped_order, matmul)
|
||||
|
||||
assert converter.compare_models(source_args=inputs, target_args={}, threshold=1e-3)
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Converts a Segment Anything ViT model to a Refiners SAMViTH model")
|
||||
parser.add_argument(
|
||||
"--from",
|
||||
type=str,
|
||||
dest="source_path",
|
||||
default="sam_vit_h_4b8939.pth",
|
||||
# required=True,
|
||||
help="Path to the Segment Anything model weights",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--to",
|
||||
type=str,
|
||||
dest="output_path",
|
||||
default="segment-anything-h.safetensors",
|
||||
help="Output path for converted model (as safetensors).",
|
||||
)
|
||||
parser.add_argument("--half", action="store_true", default=False, help="Convert to half precision. Default: False")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Prints additional information during conversion. Default: False",
|
||||
)
|
||||
args = parser.parse_args(namespace=Args())
|
||||
|
||||
sam_h = build_sam_vit_h() # type: ignore
|
||||
sam_h.load_state_dict(state_dict=load_tensors(args.source_path))
|
||||
|
||||
vit_state_dict = convert_vit(vit=sam_h.image_encoder)
|
||||
mask_decoder_state_dict = convert_mask_decoder(mask_decoder=sam_h.mask_decoder)
|
||||
point_encoder_state_dict = convert_point_encoder(prompt_encoder=sam_h.prompt_encoder)
|
||||
mask_encoder_state_dict = convert_mask_encoder(prompt_encoder=sam_h.prompt_encoder)
|
||||
|
||||
output_state_dict = {
|
||||
**{f"SAMViTH.{key}": value for key, value in vit_state_dict.items()},
|
||||
**{f"MaskDecoder.{key}": value for key, value in mask_decoder_state_dict.items()},
|
||||
**{f"PointEncoder.{key}": value for key, value in point_encoder_state_dict.items()},
|
||||
**{f"MaskEncoder.{key}": value for key, value in mask_encoder_state_dict.items()},
|
||||
}
|
||||
if args.half:
|
||||
output_state_dict = {key: value.half() for key, value in output_state_dict.items()}
|
||||
|
||||
save_to_safetensors(path=args.output_path, tensors=output_state_dict)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,149 +0,0 @@
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple, cast
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import CLIPVisionModelWithProjection # type: ignore
|
||||
|
||||
import refiners.fluxion.layers as fl
|
||||
from refiners.fluxion.model_converter import ModelConverter
|
||||
from refiners.fluxion.utils import save_to_safetensors
|
||||
from refiners.foundationals.clip.image_encoder import CLIPImageEncoder
|
||||
|
||||
|
||||
class Args(argparse.Namespace):
|
||||
source_path: str
|
||||
subfolder: str
|
||||
output_path: str | None
|
||||
half: bool
|
||||
verbose: bool
|
||||
threshold: float
|
||||
|
||||
|
||||
class CLIPImageEncoderConfig(NamedTuple):
|
||||
architectures: list[str]
|
||||
num_channels: int
|
||||
hidden_size: int
|
||||
hidden_act: str
|
||||
image_size: int
|
||||
projection_dim: int
|
||||
patch_size: int
|
||||
num_hidden_layers: int
|
||||
num_attention_heads: int
|
||||
intermediate_size: int
|
||||
layer_norm_eps: float
|
||||
|
||||
|
||||
def setup_converter(args: Args) -> ModelConverter:
|
||||
# low_cpu_mem_usage=False stops some annoying console messages us to `pip install accelerate`
|
||||
source: nn.Module = CLIPVisionModelWithProjection.from_pretrained( # type: ignore
|
||||
pretrained_model_name_or_path=args.source_path,
|
||||
subfolder=args.subfolder,
|
||||
low_cpu_mem_usage=False,
|
||||
)
|
||||
assert isinstance(source, nn.Module), "Source model is not a nn.Module"
|
||||
config = cast(CLIPImageEncoderConfig, source.config) # pyright: ignore[reportArgumentType, reportUnknownMemberType]
|
||||
|
||||
assert (
|
||||
config.architectures[0] == "CLIPVisionModelWithProjection"
|
||||
), f"Unsupported architecture: {config.architectures[0]}"
|
||||
assert config.num_channels == 3, f"Expected 3 input channels, got {config.num_channels}"
|
||||
assert config.hidden_act == "gelu", f"Unsupported activation: {config.hidden_act}"
|
||||
|
||||
target = CLIPImageEncoder(
|
||||
image_size=config.image_size,
|
||||
embedding_dim=config.hidden_size,
|
||||
output_dim=config.projection_dim,
|
||||
patch_size=config.patch_size,
|
||||
num_layers=config.num_hidden_layers,
|
||||
num_attention_heads=config.num_attention_heads,
|
||||
feedforward_dim=config.intermediate_size,
|
||||
layer_norm_eps=config.layer_norm_eps,
|
||||
)
|
||||
|
||||
x = torch.randn(1, 3, config.image_size, config.image_size)
|
||||
|
||||
converter = ModelConverter(source_model=source, target_model=target, verbose=True)
|
||||
|
||||
# Custom conversion logic since the class embedding (fl.Parameter layer) is not supported out-of-the-box by the
|
||||
# converter
|
||||
mapping = converter.map_state_dicts((x,))
|
||||
assert mapping is not None
|
||||
|
||||
source_state_dict = source.state_dict()
|
||||
target_state_dict = target.state_dict()
|
||||
|
||||
# Remove the class embedding from state dict since it was not mapped by the model converter
|
||||
class_embedding = target.ensure_find(fl.Parameter)
|
||||
class_embedding_key = next((n for n, p in target.named_parameters() if id(p) == id(class_embedding.weight)), None)
|
||||
assert class_embedding_key is not None
|
||||
assert class_embedding_key in target_state_dict
|
||||
del target_state_dict[class_embedding_key]
|
||||
|
||||
converted_state_dict = converter._convert_state_dict( # type: ignore[reportPrivateUsage]
|
||||
source_state_dict=source_state_dict, target_state_dict=target_state_dict, state_dict_mapping=mapping
|
||||
)
|
||||
target.load_state_dict(state_dict=converted_state_dict, strict=False)
|
||||
|
||||
# Ad hoc post-conversion steps
|
||||
embed = source.vision_model.embeddings.class_embedding
|
||||
class_embedding.weight = torch.nn.Parameter(embed.clone().reshape_as(class_embedding.weight)) # type: ignore
|
||||
|
||||
assert converter.compare_models((x,), threshold=args.threshold)
|
||||
|
||||
return converter
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Converts a CLIPImageEncoder from the library transformers from the HuggingFace Hub to refiners."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--from",
|
||||
type=str,
|
||||
dest="source_path",
|
||||
default="stabilityai/stable-diffusion-2-1-unclip",
|
||||
help=(
|
||||
"Can be a path to a .bin file, a .safetensors file or a model name from the HuggingFace Hub. Default:"
|
||||
" stabilityai/stable-diffusion-2-1-unclip"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--subfolder",
|
||||
type=str,
|
||||
dest="subfolder",
|
||||
default="image_encoder",
|
||||
help="Subfolder in the source path where the model is located inside the Hub. Default: image_encoder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--to",
|
||||
type=str,
|
||||
dest="output_path",
|
||||
default=None,
|
||||
help=(
|
||||
"Output path (.safetensors) for converted model. If not provided, the output path will be the same as the"
|
||||
" source path."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--half", action="store_true", help="Convert to half precision.")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Prints additional information during conversion. Default: False",
|
||||
)
|
||||
parser.add_argument("--threshold", type=float, default=1e-2, help="Threshold for model comparison. Default: 1e-2")
|
||||
args = parser.parse_args(namespace=Args())
|
||||
if args.output_path is None:
|
||||
args.output_path = f"{Path(args.source_path).stem}-{args.subfolder}.safetensors"
|
||||
converter = setup_converter(args=args)
|
||||
# Do not use converter.save_to_safetensors since it is not in a valid state due to the ad hoc conversion
|
||||
state_dict = converter.target_model.state_dict()
|
||||
if args.half:
|
||||
state_dict = {key: value.half() for key, value in state_dict.items()}
|
||||
save_to_safetensors(path=args.output_path, tensors=state_dict)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,150 +0,0 @@
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple, cast
|
||||
|
||||
from torch import nn
|
||||
from transformers import CLIPTextModel, CLIPTextModelWithProjection # type: ignore
|
||||
|
||||
import refiners.fluxion.layers as fl
|
||||
from refiners.fluxion.model_converter import ModelConverter
|
||||
from refiners.fluxion.utils import save_to_safetensors
|
||||
from refiners.foundationals.clip.text_encoder import CLIPTextEncoder, CLIPTextEncoderG, CLIPTextEncoderL
|
||||
from refiners.foundationals.clip.tokenizer import CLIPTokenizer
|
||||
from refiners.foundationals.latent_diffusion.stable_diffusion_xl.text_encoder import DoubleTextEncoder
|
||||
|
||||
|
||||
class Args(argparse.Namespace):
|
||||
source_path: str
|
||||
subfolder: str
|
||||
output_path: str | None
|
||||
half: bool
|
||||
verbose: bool
|
||||
|
||||
|
||||
class CLIPTextEncoderConfig(NamedTuple):
|
||||
architectures: list[str]
|
||||
vocab_size: int
|
||||
hidden_size: int
|
||||
intermediate_size: int
|
||||
num_hidden_layers: int
|
||||
num_attention_heads: int
|
||||
hidden_act: str
|
||||
layer_norm_eps: float
|
||||
projection_dim: int
|
||||
|
||||
|
||||
def setup_converter(args: Args, with_projection: bool = False) -> ModelConverter:
|
||||
# low_cpu_mem_usage=False stops some annoying console messages us to `pip install accelerate`
|
||||
cls = CLIPTextModelWithProjection if with_projection else CLIPTextModel
|
||||
source: nn.Module = cls.from_pretrained( # type: ignore
|
||||
pretrained_model_name_or_path=args.source_path,
|
||||
subfolder=args.subfolder,
|
||||
low_cpu_mem_usage=False,
|
||||
)
|
||||
assert isinstance(source, nn.Module), "Source model is not a nn.Module"
|
||||
config = cast(CLIPTextEncoderConfig, source.config) # pyright: ignore[reportArgumentType, reportUnknownMemberType]
|
||||
architecture: str = config.architectures[0]
|
||||
embedding_dim: int = config.hidden_size
|
||||
projection_dim: int = config.projection_dim
|
||||
use_quick_gelu = config.hidden_act == "quick_gelu"
|
||||
assert architecture in ("CLIPTextModel", "CLIPTextModelWithProjection"), f"Unsupported architecture: {architecture}"
|
||||
target = CLIPTextEncoder(
|
||||
embedding_dim=config.hidden_size,
|
||||
num_layers=config.num_hidden_layers,
|
||||
num_attention_heads=config.num_attention_heads,
|
||||
feedforward_dim=config.intermediate_size,
|
||||
use_quick_gelu=use_quick_gelu,
|
||||
)
|
||||
if architecture == "CLIPTextModelWithProjection":
|
||||
target.append(module=fl.Linear(in_features=embedding_dim, out_features=projection_dim, bias=False))
|
||||
text = "What a nice cat you have there!"
|
||||
tokenizer = target.ensure_find(CLIPTokenizer)
|
||||
tokens = tokenizer(text)
|
||||
converter = ModelConverter(source_model=source, target_model=target, skip_output_check=True, verbose=args.verbose)
|
||||
if not converter.run(source_args=(tokens,), target_args=(text,)):
|
||||
raise RuntimeError("Model conversion failed")
|
||||
return converter
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Converts a CLIPTextEncoder from the library transformers from the HuggingFace Hub to refiners."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--from",
|
||||
type=str,
|
||||
dest="source_path",
|
||||
default="runwayml/stable-diffusion-v1-5",
|
||||
help=(
|
||||
"Can be a path to a .bin file, a .safetensors file or a model name from the HuggingFace Hub. Default:"
|
||||
" runwayml/stable-diffusion-v1-5"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--subfolder",
|
||||
type=str,
|
||||
dest="subfolder",
|
||||
default="text_encoder",
|
||||
help=(
|
||||
"Subfolder in the source path where the model is located inside the Hub. Default: text_encoder (for"
|
||||
" CLIPTextModel)"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--subfolder2",
|
||||
type=str,
|
||||
dest="subfolder2",
|
||||
default=None,
|
||||
help="Additional subfolder for the 2nd text encoder (useful for SDXL). Default: None",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--to",
|
||||
type=str,
|
||||
dest="output_path",
|
||||
default=None,
|
||||
help=(
|
||||
"Output path (.safetensors) for converted model. If not provided, the output path will be the same as the"
|
||||
" source path."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--half", action="store_true", help="Convert to half precision.")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Prints additional information during conversion. Default: False",
|
||||
)
|
||||
args = parser.parse_args(namespace=Args())
|
||||
if args.output_path is None:
|
||||
args.output_path = f"{Path(args.source_path).stem}-{args.subfolder}.safetensors"
|
||||
converter = setup_converter(args=args)
|
||||
if args.subfolder2 is not None:
|
||||
# Assume this is the second text encoder of Stable Diffusion XL
|
||||
args.subfolder = args.subfolder2
|
||||
converter2 = setup_converter(args=args, with_projection=True)
|
||||
|
||||
text_encoder_l = CLIPTextEncoderL()
|
||||
text_encoder_l.load_state_dict(state_dict=converter.get_state_dict())
|
||||
|
||||
projection = cast(CLIPTextEncoder, converter2.target_model)[-1]
|
||||
assert isinstance(projection, fl.Linear)
|
||||
text_encoder_g_with_projection = CLIPTextEncoderG()
|
||||
text_encoder_g_with_projection.append(module=projection)
|
||||
text_encoder_g_with_projection.load_state_dict(state_dict=converter2.get_state_dict())
|
||||
|
||||
projection = text_encoder_g_with_projection.pop(index=-1)
|
||||
assert isinstance(projection, fl.Linear)
|
||||
double_text_encoder = DoubleTextEncoder(
|
||||
text_encoder_l=text_encoder_l, text_encoder_g=text_encoder_g_with_projection, projection=projection
|
||||
)
|
||||
|
||||
state_dict = double_text_encoder.state_dict()
|
||||
if args.half:
|
||||
state_dict = {key: value.half() for key, value in state_dict.items()}
|
||||
save_to_safetensors(path=args.output_path, tensors=state_dict)
|
||||
else:
|
||||
converter.save_to_safetensors(path=args.output_path, half=args.half)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,945 +0,0 @@
|
|||
"""
|
||||
Download and convert weights for testing
|
||||
|
||||
To see what weights will be downloaded and converted, run:
|
||||
DRY_RUN=1 python scripts/prepare_test_weights.py
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import gdown
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
# Set the base directory to the parent directory of the script
|
||||
project_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
test_weights_dir = os.path.join(project_dir, "tests", "weights")
|
||||
|
||||
previous_line = "\033[F"
|
||||
|
||||
download_count = 0
|
||||
bytes_count = 0
|
||||
|
||||
|
||||
def die(message: str) -> None:
|
||||
print(message, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def rel(path: str) -> str:
|
||||
return os.path.relpath(path, project_dir)
|
||||
|
||||
|
||||
def calc_hash(filepath: str) -> str:
|
||||
with open(filepath, "rb") as f:
|
||||
data = f.read()
|
||||
found = hashlib.blake2b(data, digest_size=int(32 / 8)).hexdigest()
|
||||
return found
|
||||
|
||||
|
||||
def check_hash(path: str, expected: str) -> str:
|
||||
found = calc_hash(path)
|
||||
if found != expected:
|
||||
die(f"❌ Invalid hash for {path} ({found} != {expected})")
|
||||
return found
|
||||
|
||||
|
||||
def download_file(
|
||||
url: str,
|
||||
dest_folder: str,
|
||||
dry_run: bool | None = None,
|
||||
skip_existing: bool = True,
|
||||
expected_hash: str | None = None,
|
||||
filename: str | None = None,
|
||||
):
|
||||
"""
|
||||
Downloads a file
|
||||
|
||||
Features:
|
||||
- shows a progress bar
|
||||
- skips existing files
|
||||
- uses a temporary file to prevent partial downloads
|
||||
- can do a dry run to check the url is valid
|
||||
- displays the downloaded file hash
|
||||
|
||||
"""
|
||||
global download_count, bytes_count
|
||||
filename = os.path.basename(urlparse(url).path) if filename is None else filename
|
||||
dest_filename = os.path.join(dest_folder, filename)
|
||||
temp_filename = dest_filename + ".part"
|
||||
dry_run = bool(os.environ.get("DRY_RUN") == "1") if dry_run is None else dry_run
|
||||
|
||||
is_downloaded = os.path.exists(dest_filename)
|
||||
if is_downloaded and skip_existing:
|
||||
skip_icon = "✖️ "
|
||||
else:
|
||||
skip_icon = "🔽"
|
||||
|
||||
if dry_run:
|
||||
response = requests.head(url, allow_redirects=True)
|
||||
readable_size = ""
|
||||
|
||||
if response.status_code == 200:
|
||||
content_length = response.headers.get("content-length")
|
||||
|
||||
if content_length:
|
||||
size_in_bytes = int(content_length)
|
||||
readable_size = human_readable_size(size_in_bytes)
|
||||
download_count += 1
|
||||
bytes_count += size_in_bytes
|
||||
print(f"✅{skip_icon} {response.status_code} READY {readable_size:<8} {url}")
|
||||
|
||||
else:
|
||||
print(f"❌{skip_icon} {response.status_code} ERROR {readable_size:<8} {url}")
|
||||
return
|
||||
|
||||
if skip_existing and is_downloaded:
|
||||
print(f"{skip_icon}️ Skipping previously downloaded {url}")
|
||||
if expected_hash is not None:
|
||||
check_hash(dest_filename, expected_hash)
|
||||
return
|
||||
|
||||
os.makedirs(dest_folder, exist_ok=True)
|
||||
|
||||
print(f"🔽 Downloading {url} => '{rel(dest_filename)}'", end="\n")
|
||||
response = requests.get(url, stream=True)
|
||||
if response.status_code != 200:
|
||||
print(response.content[:1000])
|
||||
die(f"Failed to download {url}. Status code: {response.status_code}")
|
||||
total = int(response.headers.get("content-length", 0))
|
||||
bar = tqdm(
|
||||
desc=filename,
|
||||
total=total,
|
||||
unit="iB",
|
||||
unit_scale=True,
|
||||
unit_divisor=1024,
|
||||
leave=False,
|
||||
)
|
||||
with open(temp_filename, "wb") as f, bar:
|
||||
for data in response.iter_content(chunk_size=1024 * 1000):
|
||||
size = f.write(data)
|
||||
bar.update(size)
|
||||
|
||||
os.rename(temp_filename, dest_filename)
|
||||
calculated_hash = calc_hash(dest_filename)
|
||||
|
||||
print(f"{previous_line}✅ Downloaded {calculated_hash} {url} => '{rel(dest_filename)}' ")
|
||||
if expected_hash is not None:
|
||||
check_hash(dest_filename, expected_hash)
|
||||
|
||||
|
||||
def download_files(urls: list[str], dest_folder: str):
|
||||
for url in urls:
|
||||
download_file(url, dest_folder)
|
||||
|
||||
|
||||
def human_readable_size(size: int | float, decimal_places: int = 2) -> str:
|
||||
for unit in ["B", "KB", "MB", "GB", "TB", "PB"]:
|
||||
if size < 1024.0:
|
||||
break
|
||||
size /= 1024.0
|
||||
return f"{size:.{decimal_places}f}{unit}" # type: ignore
|
||||
|
||||
|
||||
def download_sd_text_encoder(hf_repo_id: str = "runwayml/stable-diffusion-v1-5", subdir: str = "text_encoder"):
|
||||
encoder_filename = "model.safetensors" if "inpainting" not in hf_repo_id else "model.fp16.safetensors"
|
||||
base_url = f"https://huggingface.co/{hf_repo_id}"
|
||||
download_files(
|
||||
urls=[
|
||||
f"{base_url}/raw/main/{subdir}/config.json",
|
||||
f"{base_url}/resolve/main/{subdir}/{encoder_filename}",
|
||||
],
|
||||
dest_folder=os.path.join(test_weights_dir, hf_repo_id, subdir),
|
||||
)
|
||||
|
||||
|
||||
def download_sd_tokenizer(hf_repo_id: str = "runwayml/stable-diffusion-v1-5", subdir: str = "tokenizer"):
|
||||
download_files(
|
||||
urls=[
|
||||
f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/merges.txt",
|
||||
f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/special_tokens_map.json",
|
||||
f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/tokenizer_config.json",
|
||||
f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/vocab.json",
|
||||
],
|
||||
dest_folder=os.path.join(test_weights_dir, hf_repo_id, subdir),
|
||||
)
|
||||
|
||||
|
||||
def download_sd_base(hf_repo_id: str = "runwayml/stable-diffusion-v1-5"):
|
||||
is_inpainting = "inpainting" in hf_repo_id
|
||||
ext = "safetensors" if not is_inpainting else "bin"
|
||||
base_folder = os.path.join(test_weights_dir, hf_repo_id)
|
||||
download_file(f"https://huggingface.co/{hf_repo_id}/raw/main/model_index.json", base_folder)
|
||||
download_file(
|
||||
f"https://huggingface.co/{hf_repo_id}/raw/main/scheduler/scheduler_config.json",
|
||||
os.path.join(base_folder, "scheduler"),
|
||||
)
|
||||
|
||||
for subdir in ["unet", "vae"]:
|
||||
subdir_folder = os.path.join(base_folder, subdir)
|
||||
download_file(f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/config.json", subdir_folder)
|
||||
download_file(
|
||||
f"https://huggingface.co/{hf_repo_id}/resolve/main/{subdir}/diffusion_pytorch_model.{ext}", subdir_folder
|
||||
)
|
||||
# we only need the unet for the inpainting model
|
||||
if not is_inpainting:
|
||||
download_sd_text_encoder(hf_repo_id, "text_encoder")
|
||||
download_sd_tokenizer(hf_repo_id, "tokenizer")
|
||||
|
||||
|
||||
def download_sd15(hf_repo_id: str = "runwayml/stable-diffusion-v1-5"):
|
||||
download_sd_base(hf_repo_id)
|
||||
base_folder = os.path.join(test_weights_dir, hf_repo_id)
|
||||
|
||||
subdir = "feature_extractor"
|
||||
download_file(
|
||||
f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/preprocessor_config.json",
|
||||
os.path.join(base_folder, subdir),
|
||||
)
|
||||
|
||||
if "inpainting" not in hf_repo_id:
|
||||
subdir = "safety_checker"
|
||||
subdir_folder = os.path.join(base_folder, subdir)
|
||||
download_file(f"https://huggingface.co/{hf_repo_id}/raw/main/{subdir}/config.json", subdir_folder)
|
||||
download_file(f"https://huggingface.co/{hf_repo_id}/resolve/main/{subdir}/model.safetensors", subdir_folder)
|
||||
|
||||
|
||||
def download_sdxl(hf_repo_id: str = "stabilityai/stable-diffusion-xl-base-1.0"):
|
||||
download_sd_base(hf_repo_id)
|
||||
download_sd_text_encoder(hf_repo_id, "text_encoder_2")
|
||||
download_sd_tokenizer(hf_repo_id, "tokenizer_2")
|
||||
|
||||
|
||||
def download_vae_fp16_fix():
|
||||
download_files(
|
||||
urls=[
|
||||
"https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/raw/main/config.json",
|
||||
"https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/resolve/main/diffusion_pytorch_model.safetensors",
|
||||
],
|
||||
dest_folder=os.path.join(test_weights_dir, "madebyollin", "sdxl-vae-fp16-fix"),
|
||||
)
|
||||
|
||||
|
||||
def download_vae_ft_mse():
|
||||
download_files(
|
||||
urls=[
|
||||
"https://huggingface.co/stabilityai/sd-vae-ft-mse/raw/main/config.json",
|
||||
"https://huggingface.co/stabilityai/sd-vae-ft-mse/resolve/main/diffusion_pytorch_model.safetensors",
|
||||
],
|
||||
dest_folder=os.path.join(test_weights_dir, "stabilityai", "sd-vae-ft-mse"),
|
||||
)
|
||||
|
||||
|
||||
def download_loras():
|
||||
dest_folder = os.path.join(test_weights_dir, "loras", "pokemon-lora")
|
||||
download_file(
|
||||
"https://huggingface.co/pcuenq/pokemon-lora/resolve/main/pytorch_lora_weights.bin",
|
||||
dest_folder,
|
||||
expected_hash="89992ea6",
|
||||
)
|
||||
|
||||
dest_folder = os.path.join(test_weights_dir, "loras", "dpo-lora")
|
||||
download_file(
|
||||
"https://huggingface.co/radames/sdxl-DPO-LoRA/resolve/main/pytorch_lora_weights.safetensors",
|
||||
dest_folder,
|
||||
expected_hash="a51e9144",
|
||||
)
|
||||
|
||||
dest_folder = os.path.join(test_weights_dir, "loras", "sliders")
|
||||
download_file("https://sliders.baulab.info/weights/xl_sliders/age.pt", dest_folder, expected_hash="908f07d3")
|
||||
download_file(
|
||||
"https://sliders.baulab.info/weights/xl_sliders/cartoon_style.pt", dest_folder, expected_hash="25652004"
|
||||
)
|
||||
download_file("https://sliders.baulab.info/weights/xl_sliders/eyesize.pt", dest_folder, expected_hash="ee170e4d")
|
||||
|
||||
dest_folder = os.path.join(test_weights_dir, "loras")
|
||||
download_file(
|
||||
"https://civitai.com/api/download/models/140624",
|
||||
filename="Sci-fi_Environments_sdxl.safetensors",
|
||||
dest_folder=dest_folder,
|
||||
expected_hash="6a4afda8",
|
||||
)
|
||||
download_file(
|
||||
"https://civitai.com/api/download/models/135931",
|
||||
filename="pixel-art-xl-v1.1.safetensors",
|
||||
dest_folder=dest_folder,
|
||||
expected_hash="71aaa6ca",
|
||||
)
|
||||
|
||||
|
||||
def download_preprocessors():
|
||||
dest_folder = os.path.join(test_weights_dir, "carolineec", "informativedrawings")
|
||||
download_file("https://huggingface.co/spaces/carolineec/informativedrawings/resolve/main/model2.pth", dest_folder)
|
||||
|
||||
|
||||
def download_controlnet():
|
||||
base_folder = os.path.join(test_weights_dir, "lllyasviel")
|
||||
controlnets = [
|
||||
"control_v11p_sd15_canny",
|
||||
"control_v11f1p_sd15_depth",
|
||||
"control_v11p_sd15_normalbae",
|
||||
"control_v11p_sd15_lineart",
|
||||
]
|
||||
for net in controlnets:
|
||||
net_folder = os.path.join(base_folder, net)
|
||||
urls = [
|
||||
f"https://huggingface.co/lllyasviel/{net}/raw/main/config.json",
|
||||
f"https://huggingface.co/lllyasviel/{net}/resolve/main/diffusion_pytorch_model.safetensors",
|
||||
]
|
||||
download_files(urls, net_folder)
|
||||
|
||||
tile_folder = os.path.join(base_folder, "control_v11f1e_sd15_tile")
|
||||
urls = [
|
||||
"https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile/raw/main/config.json",
|
||||
"https://huggingface.co/lllyasviel/control_v11f1e_sd15_tile/resolve/main/diffusion_pytorch_model.bin",
|
||||
]
|
||||
download_files(urls, tile_folder)
|
||||
|
||||
mfidabel_folder = os.path.join(test_weights_dir, "mfidabel", "controlnet-segment-anything")
|
||||
urls = [
|
||||
"https://huggingface.co/mfidabel/controlnet-segment-anything/raw/main/config.json",
|
||||
"https://huggingface.co/mfidabel/controlnet-segment-anything/resolve/main/diffusion_pytorch_model.bin",
|
||||
]
|
||||
download_files(urls, mfidabel_folder)
|
||||
|
||||
|
||||
def download_control_lora_fooocus():
|
||||
base_folder = os.path.join(test_weights_dir, "lllyasviel", "misc")
|
||||
|
||||
download_file(
|
||||
url=f"https://huggingface.co/lllyasviel/misc/resolve/main/control-lora-canny-rank128.safetensors",
|
||||
dest_folder=base_folder,
|
||||
expected_hash="fec9e32b",
|
||||
)
|
||||
|
||||
download_file(
|
||||
url=f"https://huggingface.co/lllyasviel/misc/resolve/main/fooocus_xl_cpds_128.safetensors",
|
||||
dest_folder=base_folder,
|
||||
expected_hash="fc04b120",
|
||||
)
|
||||
|
||||
|
||||
def download_unclip():
|
||||
base_folder = os.path.join(test_weights_dir, "stabilityai", "stable-diffusion-2-1-unclip")
|
||||
download_file(
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/raw/main/model_index.json", base_folder
|
||||
)
|
||||
image_encoder_folder = os.path.join(base_folder, "image_encoder")
|
||||
urls = [
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/raw/main/image_encoder/config.json",
|
||||
"https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/resolve/main/image_encoder/model.safetensors",
|
||||
]
|
||||
download_files(urls, image_encoder_folder)
|
||||
|
||||
|
||||
def download_ip_adapter():
|
||||
base_folder = os.path.join(test_weights_dir, "h94", "IP-Adapter")
|
||||
models_folder = os.path.join(base_folder, "models")
|
||||
urls = [
|
||||
"https://huggingface.co/h94/IP-Adapter/resolve/main/models/ip-adapter_sd15.bin",
|
||||
"https://huggingface.co/h94/IP-Adapter/resolve/main/models/ip-adapter-plus_sd15.bin",
|
||||
]
|
||||
download_files(urls, models_folder)
|
||||
|
||||
sdxl_models_folder = os.path.join(base_folder, "sdxl_models")
|
||||
urls = [
|
||||
"https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter_sdxl_vit-h.bin",
|
||||
"https://huggingface.co/h94/IP-Adapter/resolve/main/sdxl_models/ip-adapter-plus_sdxl_vit-h.bin",
|
||||
]
|
||||
download_files(urls, sdxl_models_folder)
|
||||
|
||||
|
||||
def download_t5xl_fp16():
|
||||
base_folder = os.path.join(test_weights_dir, "QQGYLab", "T5XLFP16")
|
||||
urls = [
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/models--google--flan-t5-xl--text_encoder/config.json",
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/models--google--flan-t5-xl--text_encoder/model.safetensors",
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/models--google--flan-t5-xl--text_encoder/special_tokens_map.json",
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/models--google--flan-t5-xl--text_encoder/spiece.model",
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/models--google--flan-t5-xl--text_encoder/tokenizer.json",
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/models--google--flan-t5-xl--text_encoder/tokenizer_config.json",
|
||||
]
|
||||
download_files(urls, base_folder)
|
||||
|
||||
|
||||
def download_ella_adapter():
|
||||
download_t5xl_fp16()
|
||||
base_folder = os.path.join(test_weights_dir, "QQGYLab", "ELLA")
|
||||
download_file(
|
||||
"https://huggingface.co/QQGYLab/ELLA/resolve/main/ella-sd1.5-tsc-t5xl.safetensors",
|
||||
base_folder,
|
||||
expected_hash="5af7b200",
|
||||
)
|
||||
|
||||
|
||||
def download_t2i_adapter():
|
||||
base_folder = os.path.join(test_weights_dir, "TencentARC", "t2iadapter_depth_sd15v2")
|
||||
urls = [
|
||||
"https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2/raw/main/config.json",
|
||||
"https://huggingface.co/TencentARC/t2iadapter_depth_sd15v2/resolve/main/diffusion_pytorch_model.bin",
|
||||
]
|
||||
download_files(urls, base_folder)
|
||||
|
||||
canny_sdxl_folder = os.path.join(test_weights_dir, "TencentARC", "t2i-adapter-canny-sdxl-1.0")
|
||||
urls = [
|
||||
"https://huggingface.co/TencentARC/t2i-adapter-canny-sdxl-1.0/raw/main/config.json",
|
||||
"https://huggingface.co/TencentARC/t2i-adapter-canny-sdxl-1.0/resolve/main/diffusion_pytorch_model.safetensors",
|
||||
]
|
||||
download_files(urls, canny_sdxl_folder)
|
||||
|
||||
|
||||
def download_sam():
|
||||
weights_folder = os.path.join(test_weights_dir)
|
||||
download_file(
|
||||
"https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", weights_folder, expected_hash="06785e66"
|
||||
)
|
||||
|
||||
|
||||
def download_hq_sam():
|
||||
weights_folder = os.path.join(test_weights_dir)
|
||||
download_file(
|
||||
"https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_h.pth", weights_folder, expected_hash="66da2472"
|
||||
)
|
||||
|
||||
|
||||
def download_dinov2():
|
||||
# For conversion
|
||||
weights_folder = os.path.join(test_weights_dir)
|
||||
urls = [
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_pretrain.pth",
|
||||
"https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_pretrain.pth",
|
||||
]
|
||||
download_files(urls, weights_folder)
|
||||
|
||||
|
||||
def download_lcm_base():
|
||||
base_folder = os.path.join(test_weights_dir, "latent-consistency/lcm-sdxl")
|
||||
download_file(f"https://huggingface.co/latent-consistency/lcm-sdxl/raw/main/config.json", base_folder)
|
||||
download_file(
|
||||
f"https://huggingface.co/latent-consistency/lcm-sdxl/resolve/main/diffusion_pytorch_model.safetensors",
|
||||
base_folder,
|
||||
)
|
||||
|
||||
|
||||
def download_lcm_lora():
|
||||
download_file(
|
||||
"https://huggingface.co/latent-consistency/lcm-lora-sdxl/resolve/main/pytorch_lora_weights.safetensors",
|
||||
dest_folder=test_weights_dir,
|
||||
filename="sdxl-lcm-lora.safetensors",
|
||||
expected_hash="6312a30a",
|
||||
)
|
||||
|
||||
|
||||
def download_sdxl_lightning_base():
|
||||
base_folder = os.path.join(test_weights_dir, "ByteDance/SDXL-Lightning")
|
||||
download_file(
|
||||
f"https://huggingface.co/ByteDance/SDXL-Lightning/resolve/main/sdxl_lightning_4step_unet.safetensors",
|
||||
base_folder,
|
||||
expected_hash="1b76cca3",
|
||||
)
|
||||
download_file(
|
||||
f"https://huggingface.co/ByteDance/SDXL-Lightning/resolve/main/sdxl_lightning_1step_unet_x0.safetensors",
|
||||
base_folder,
|
||||
expected_hash="38e605bd",
|
||||
)
|
||||
|
||||
|
||||
def download_sdxl_lightning_lora():
|
||||
download_file(
|
||||
"https://huggingface.co/ByteDance/SDXL-Lightning/resolve/main/sdxl_lightning_4step_lora.safetensors",
|
||||
dest_folder=test_weights_dir,
|
||||
expected_hash="9783edac",
|
||||
)
|
||||
|
||||
|
||||
def download_ic_light():
|
||||
download_file(
|
||||
"https://huggingface.co/lllyasviel/ic-light/resolve/main/iclight_sd15_fc.safetensors",
|
||||
dest_folder=test_weights_dir,
|
||||
expected_hash="bce70123",
|
||||
)
|
||||
|
||||
|
||||
def download_mvanet():
|
||||
fn = "Model_80.pth"
|
||||
dest_folder = os.path.join(test_weights_dir, "mvanet")
|
||||
dest_filename = os.path.join(dest_folder, fn)
|
||||
|
||||
if os.environ.get("DRY_RUN") == "1":
|
||||
return
|
||||
|
||||
if os.path.exists(dest_filename):
|
||||
print(f"✖️ ️ Skipping previously downloaded mvanet/{fn}")
|
||||
else:
|
||||
os.makedirs(dest_folder, exist_ok=True)
|
||||
print(f"🔽 Downloading mvanet/{fn} => '{rel(dest_filename)}'", end="\n")
|
||||
gdown.download(id="1_gabQXOF03MfXnf3EWDK1d_8wKiOemOv", output=dest_filename, quiet=True)
|
||||
print(f"{previous_line}✅ Downloaded mvanet/{fn} => '{rel(dest_filename)}' ")
|
||||
|
||||
check_hash(dest_filename, "b915d492")
|
||||
|
||||
|
||||
def download_box_segmenter():
|
||||
download_file(
|
||||
"https://huggingface.co/finegrain/finegrain-box-segmenter/resolve/v0.1/model.safetensors",
|
||||
dest_folder=test_weights_dir,
|
||||
filename="finegrain-box-segmenter-v0-1.safetensors",
|
||||
expected_hash="e0450e8c",
|
||||
)
|
||||
|
||||
|
||||
def printg(msg: str):
|
||||
"""print in green color"""
|
||||
print("\033[92m" + msg + "\033[0m")
|
||||
|
||||
|
||||
def run_conversion_script(
|
||||
script_filename: str,
|
||||
from_weights: str,
|
||||
to_weights: str,
|
||||
half: bool = False,
|
||||
expected_hash: str | None = None,
|
||||
additional_args: list[str] | None = None,
|
||||
skip_existing: bool = True,
|
||||
):
|
||||
if skip_existing and expected_hash and os.path.exists(to_weights):
|
||||
found_hash = check_hash(to_weights, expected_hash)
|
||||
if expected_hash == found_hash:
|
||||
printg(f"✖️ Skipping converted from {from_weights} to {to_weights} (hash {found_hash} confirmed) ")
|
||||
return
|
||||
|
||||
msg = f"Converting {from_weights} to {to_weights}"
|
||||
printg(msg)
|
||||
|
||||
args = ["python", f"scripts/conversion/{script_filename}", "--from", from_weights, "--to", to_weights]
|
||||
if half:
|
||||
args.append("--half")
|
||||
if additional_args:
|
||||
args.extend(additional_args)
|
||||
|
||||
subprocess.run(args, check=True)
|
||||
if expected_hash is not None:
|
||||
found_hash = check_hash(to_weights, expected_hash)
|
||||
printg(f"✅ Converted from {from_weights} to {to_weights} (hash {found_hash} confirmed) ")
|
||||
else:
|
||||
printg(f"✅⚠️ Converted from {from_weights} to {to_weights} (no hash check performed)")
|
||||
|
||||
|
||||
def convert_sd15():
|
||||
run_conversion_script(
|
||||
script_filename="convert_transformers_clip_text_model.py",
|
||||
from_weights="tests/weights/runwayml/stable-diffusion-v1-5",
|
||||
to_weights="tests/weights/CLIPTextEncoderL.safetensors",
|
||||
half=True,
|
||||
expected_hash="6c9cbc59",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_autoencoder_kl.py",
|
||||
"tests/weights/runwayml/stable-diffusion-v1-5",
|
||||
"tests/weights/lda.safetensors",
|
||||
expected_hash="329e369c",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_unet.py",
|
||||
"tests/weights/runwayml/stable-diffusion-v1-5",
|
||||
"tests/weights/unet.safetensors",
|
||||
half=True,
|
||||
expected_hash="f81ac65a",
|
||||
)
|
||||
os.makedirs("tests/weights/inpainting", exist_ok=True)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_unet.py",
|
||||
"tests/weights/runwayml/stable-diffusion-inpainting",
|
||||
"tests/weights/inpainting/unet.safetensors",
|
||||
half=True,
|
||||
expected_hash="c07a8c61",
|
||||
)
|
||||
|
||||
|
||||
def convert_sdxl():
|
||||
run_conversion_script(
|
||||
"convert_transformers_clip_text_model.py",
|
||||
"tests/weights/stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"tests/weights/DoubleCLIPTextEncoder.safetensors",
|
||||
half=True,
|
||||
expected_hash="7f99c30b",
|
||||
additional_args=["--subfolder2", "text_encoder_2"],
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_autoencoder_kl.py",
|
||||
"tests/weights/stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"tests/weights/sdxl-lda.safetensors",
|
||||
half=True,
|
||||
expected_hash="7464e9dc",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_unet.py",
|
||||
"tests/weights/stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"tests/weights/sdxl-unet.safetensors",
|
||||
half=True,
|
||||
expected_hash="2e5c4911",
|
||||
)
|
||||
|
||||
|
||||
def convert_vae_ft_mse():
|
||||
run_conversion_script(
|
||||
"convert_diffusers_autoencoder_kl.py",
|
||||
"tests/weights/stabilityai/sd-vae-ft-mse",
|
||||
"tests/weights/lda_ft_mse.safetensors",
|
||||
half=True,
|
||||
expected_hash="4d0bae7e",
|
||||
)
|
||||
|
||||
|
||||
def convert_vae_fp16_fix():
|
||||
run_conversion_script(
|
||||
"convert_diffusers_autoencoder_kl.py",
|
||||
"tests/weights/madebyollin/sdxl-vae-fp16-fix",
|
||||
"tests/weights/sdxl-lda-fp16-fix.safetensors",
|
||||
additional_args=["--subfolder", "''"],
|
||||
half=True,
|
||||
expected_hash="98c7e998",
|
||||
)
|
||||
|
||||
|
||||
def convert_preprocessors():
|
||||
subprocess.run(
|
||||
[
|
||||
"curl",
|
||||
"-L",
|
||||
"https://raw.githubusercontent.com/carolineec/informative-drawings/main/model.py",
|
||||
"-o",
|
||||
"src/model.py",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_informative_drawings.py",
|
||||
"tests/weights/carolineec/informativedrawings/model2.pth",
|
||||
"tests/weights/informative-drawings.safetensors",
|
||||
expected_hash="93dca207",
|
||||
)
|
||||
os.remove("src/model.py")
|
||||
|
||||
|
||||
def convert_controlnet():
|
||||
os.makedirs("tests/weights/controlnet", exist_ok=True)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_controlnet.py",
|
||||
"tests/weights/lllyasviel/control_v11p_sd15_canny",
|
||||
"tests/weights/controlnet/lllyasviel_control_v11p_sd15_canny.safetensors",
|
||||
expected_hash="9a1a48cf",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_controlnet.py",
|
||||
"tests/weights/lllyasviel/control_v11f1p_sd15_depth",
|
||||
"tests/weights/controlnet/lllyasviel_control_v11f1p_sd15_depth.safetensors",
|
||||
expected_hash="bbe7e5a6",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_controlnet.py",
|
||||
"tests/weights/lllyasviel/control_v11p_sd15_normalbae",
|
||||
"tests/weights/controlnet/lllyasviel_control_v11p_sd15_normalbae.safetensors",
|
||||
expected_hash="9fa88ed5",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_controlnet.py",
|
||||
"tests/weights/lllyasviel/control_v11p_sd15_lineart",
|
||||
"tests/weights/controlnet/lllyasviel_control_v11p_sd15_lineart.safetensors",
|
||||
expected_hash="c29e8c03",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_controlnet.py",
|
||||
"tests/weights/mfidabel/controlnet-segment-anything",
|
||||
"tests/weights/controlnet/mfidabel_controlnet-segment-anything.safetensors",
|
||||
expected_hash="d536eebb",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_controlnet.py",
|
||||
"tests/weights/lllyasviel/control_v11f1e_sd15_tile",
|
||||
"tests/weights/controlnet/lllyasviel_control_v11f1e_sd15_tile.safetensors",
|
||||
expected_hash="42463af8",
|
||||
)
|
||||
|
||||
|
||||
def convert_unclip():
|
||||
run_conversion_script(
|
||||
"convert_transformers_clip_image_model.py",
|
||||
"tests/weights/stabilityai/stable-diffusion-2-1-unclip",
|
||||
"tests/weights/CLIPImageEncoderH.safetensors",
|
||||
half=True,
|
||||
expected_hash="4ddb44d2",
|
||||
)
|
||||
|
||||
|
||||
def convert_ip_adapter():
|
||||
run_conversion_script(
|
||||
"convert_diffusers_ip_adapter.py",
|
||||
"tests/weights/h94/IP-Adapter/models/ip-adapter_sd15.bin",
|
||||
"tests/weights/ip-adapter_sd15.safetensors",
|
||||
expected_hash="3fb0472e",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_ip_adapter.py",
|
||||
"tests/weights/h94/IP-Adapter/sdxl_models/ip-adapter_sdxl_vit-h.bin",
|
||||
"tests/weights/ip-adapter_sdxl_vit-h.safetensors",
|
||||
half=True,
|
||||
expected_hash="860518fe",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_ip_adapter.py",
|
||||
"tests/weights/h94/IP-Adapter/models/ip-adapter-plus_sd15.bin",
|
||||
"tests/weights/ip-adapter-plus_sd15.safetensors",
|
||||
half=True,
|
||||
expected_hash="aba8503b",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_ip_adapter.py",
|
||||
"tests/weights/h94/IP-Adapter/sdxl_models/ip-adapter-plus_sdxl_vit-h.bin",
|
||||
"tests/weights/ip-adapter-plus_sdxl_vit-h.safetensors",
|
||||
half=True,
|
||||
expected_hash="545d5ce7",
|
||||
)
|
||||
|
||||
|
||||
def convert_ella_adapter():
|
||||
os.makedirs("tests/weights/ELLA-Adapter", exist_ok=True)
|
||||
run_conversion_script(
|
||||
"convert_ella_adapter.py",
|
||||
"tests/weights/QQGYLab/ELLA/ella-sd1.5-tsc-t5xl.safetensors",
|
||||
"tests/weights/ELLA-Adapter/ella-sd1.5-tsc-t5xl.safetensors",
|
||||
half=True,
|
||||
expected_hash="b8244cb6",
|
||||
)
|
||||
|
||||
|
||||
def convert_t2i_adapter():
|
||||
os.makedirs("tests/weights/T2I-Adapter", exist_ok=True)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_t2i_adapter.py",
|
||||
"tests/weights/TencentARC/t2iadapter_depth_sd15v2",
|
||||
"tests/weights/T2I-Adapter/t2iadapter_depth_sd15v2.safetensors",
|
||||
half=True,
|
||||
expected_hash="bb2b3115",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_diffusers_t2i_adapter.py",
|
||||
"tests/weights/TencentARC/t2i-adapter-canny-sdxl-1.0",
|
||||
"tests/weights/T2I-Adapter/t2i-adapter-canny-sdxl-1.0.safetensors",
|
||||
half=True,
|
||||
expected_hash="f07249a6",
|
||||
)
|
||||
|
||||
|
||||
def convert_sam():
|
||||
run_conversion_script(
|
||||
"convert_segment_anything.py",
|
||||
"tests/weights/sam_vit_h_4b8939.pth",
|
||||
"tests/weights/segment-anything-h.safetensors",
|
||||
expected_hash="5ffb976f",
|
||||
)
|
||||
|
||||
|
||||
def convert_hq_sam():
|
||||
run_conversion_script(
|
||||
"convert_hq_segment_anything.py",
|
||||
"tests/weights/sam_hq_vit_h.pth",
|
||||
"tests/weights/refiners-sam-hq-vit-h.safetensors",
|
||||
expected_hash="b2f5e79f",
|
||||
)
|
||||
|
||||
|
||||
def convert_dinov2():
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vits14_pretrain.pth",
|
||||
"tests/weights/dinov2_vits14_pretrain.safetensors",
|
||||
expected_hash="af000ded",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vitb14_pretrain.pth",
|
||||
"tests/weights/dinov2_vitb14_pretrain.safetensors",
|
||||
expected_hash="d6294087",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vitl14_pretrain.pth",
|
||||
"tests/weights/dinov2_vitl14_pretrain.safetensors",
|
||||
expected_hash="ddd4819f",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vitg14_pretrain.pth",
|
||||
"tests/weights/dinov2_vitg14_pretrain.safetensors",
|
||||
expected_hash="880c61f5",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vits14_reg4_pretrain.pth",
|
||||
"tests/weights/dinov2_vits14_reg4_pretrain.safetensors",
|
||||
expected_hash="080247c7",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vitb14_reg4_pretrain.pth",
|
||||
"tests/weights/dinov2_vitb14_reg4_pretrain.safetensors",
|
||||
expected_hash="5cd4d408",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vitl14_reg4_pretrain.pth",
|
||||
"tests/weights/dinov2_vitl14_reg4_pretrain.safetensors",
|
||||
expected_hash="b1221702",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_dinov2.py",
|
||||
"tests/weights/dinov2_vitg14_reg4_pretrain.pth",
|
||||
"tests/weights/dinov2_vitg14_reg4_pretrain.safetensors",
|
||||
expected_hash="639398eb",
|
||||
)
|
||||
|
||||
|
||||
def convert_control_lora_fooocus():
|
||||
run_conversion_script(
|
||||
"convert_fooocus_control_lora.py",
|
||||
"tests/weights/lllyasviel/misc/control-lora-canny-rank128.safetensors",
|
||||
"tests/weights/control-loras/refiners_control-lora-canny-rank128.safetensors",
|
||||
expected_hash="4d505134",
|
||||
)
|
||||
run_conversion_script(
|
||||
"convert_fooocus_control_lora.py",
|
||||
"tests/weights/lllyasviel/misc/fooocus_xl_cpds_128.safetensors",
|
||||
"tests/weights/control-loras/refiners_fooocus_xl_cpds_128.safetensors",
|
||||
expected_hash="d81aa461",
|
||||
)
|
||||
|
||||
|
||||
def convert_lcm_base():
|
||||
run_conversion_script(
|
||||
"convert_diffusers_unet.py",
|
||||
"tests/weights/latent-consistency/lcm-sdxl",
|
||||
"tests/weights/sdxl-lcm-unet.safetensors",
|
||||
half=True,
|
||||
expected_hash="e161b20c",
|
||||
)
|
||||
|
||||
|
||||
def convert_sdxl_lightning_base():
|
||||
run_conversion_script(
|
||||
"convert_diffusers_unet.py",
|
||||
"tests/weights/stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"tests/weights/sdxl_lightning_4step_unet.safetensors",
|
||||
additional_args=[
|
||||
"--override-weights",
|
||||
"tests/weights/ByteDance/SDXL-Lightning/sdxl_lightning_4step_unet.safetensors",
|
||||
],
|
||||
half=True,
|
||||
expected_hash="cfdc46da",
|
||||
)
|
||||
|
||||
run_conversion_script(
|
||||
"convert_diffusers_unet.py",
|
||||
"tests/weights/stabilityai/stable-diffusion-xl-base-1.0",
|
||||
"tests/weights/sdxl_lightning_1step_unet_x0.safetensors",
|
||||
additional_args=[
|
||||
"--override-weights",
|
||||
"tests/weights/ByteDance/SDXL-Lightning/sdxl_lightning_1step_unet_x0.safetensors",
|
||||
],
|
||||
half=True,
|
||||
expected_hash="21166a64",
|
||||
)
|
||||
|
||||
|
||||
def convert_ic_light():
|
||||
run_conversion_script(
|
||||
"convert_ic_light.py",
|
||||
"tests/weights/iclight_sd15_fc.safetensors",
|
||||
"tests/weights/iclight_sd15_fc-refiners.safetensors",
|
||||
half=False,
|
||||
expected_hash="be315c1f",
|
||||
)
|
||||
|
||||
|
||||
def convert_mvanet():
|
||||
run_conversion_script(
|
||||
"convert_mvanet.py",
|
||||
"tests/weights/mvanet/Model_80.pth",
|
||||
"tests/weights/mvanet/mvanet.safetensors",
|
||||
half=True,
|
||||
expected_hash="bf9ae4cb",
|
||||
)
|
||||
|
||||
|
||||
def download_all():
|
||||
print(f"\nAll weights will be downloaded to {test_weights_dir}\n")
|
||||
download_sd15("runwayml/stable-diffusion-v1-5")
|
||||
download_sd15("runwayml/stable-diffusion-inpainting")
|
||||
download_sdxl("stabilityai/stable-diffusion-xl-base-1.0")
|
||||
download_vae_ft_mse()
|
||||
download_vae_fp16_fix()
|
||||
download_loras()
|
||||
download_preprocessors()
|
||||
download_controlnet()
|
||||
download_unclip()
|
||||
download_ip_adapter()
|
||||
download_t2i_adapter()
|
||||
download_ella_adapter()
|
||||
download_sam()
|
||||
download_hq_sam()
|
||||
download_dinov2()
|
||||
download_control_lora_fooocus()
|
||||
download_lcm_base()
|
||||
download_lcm_lora()
|
||||
download_sdxl_lightning_base()
|
||||
download_sdxl_lightning_lora()
|
||||
download_ic_light()
|
||||
download_mvanet()
|
||||
download_box_segmenter()
|
||||
|
||||
|
||||
def convert_all():
|
||||
convert_sd15()
|
||||
convert_sdxl()
|
||||
convert_vae_ft_mse()
|
||||
convert_vae_fp16_fix()
|
||||
# Note: no convert loras: this is done at runtime by `SDLoraManager`
|
||||
convert_preprocessors()
|
||||
convert_controlnet()
|
||||
convert_unclip()
|
||||
convert_ip_adapter()
|
||||
convert_t2i_adapter()
|
||||
convert_ella_adapter()
|
||||
convert_sam()
|
||||
convert_hq_sam()
|
||||
convert_dinov2()
|
||||
convert_control_lora_fooocus()
|
||||
convert_lcm_base()
|
||||
convert_sdxl_lightning_base()
|
||||
convert_ic_light()
|
||||
convert_mvanet()
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
download_all()
|
||||
print(f"{download_count} files ({human_readable_size(bytes_count)})\n")
|
||||
if not bool(os.environ.get("DRY_RUN") == "1"):
|
||||
printg("Converting weights to refiners format\n")
|
||||
convert_all()
|
||||
except KeyboardInterrupt:
|
||||
print("Stopped")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
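For context, a minimal sketch of how this deleted script was presumably invoked, based on the `DRY_RUN` check in `main` above; the script path is an assumption, not something stated in this diff, and the download helpers defined earlier in the file may also consult `DRY_RUN`:

```py
import os
import subprocess

# Hypothetical invocation of the deleted script (path assumed).
env = dict(os.environ, DRY_RUN="1")  # "1" downloads the weights but skips convert_all()
subprocess.run(["python", "scripts/prepare_test_weights.py"], check=True, env=env)
```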
@ -1,654 +0,0 @@
from collections import defaultdict
from enum import Enum, auto
from pathlib import Path
from typing import Any, DefaultDict, TypedDict

import torch
from torch import Tensor, nn
from torch.utils.hooks import RemovableHandle

from refiners.fluxion.utils import no_grad, norm, save_to_safetensors

TORCH_BASIC_LAYERS: list[type[nn.Module]] = [
    nn.Conv1d,
    nn.Conv2d,
    nn.Conv3d,
    nn.ConvTranspose1d,
    nn.ConvTranspose2d,
    nn.ConvTranspose3d,
    nn.Linear,
    nn.BatchNorm1d,
    nn.BatchNorm2d,
    nn.BatchNorm3d,
    nn.LayerNorm,
    nn.GroupNorm,
    nn.Embedding,
    nn.MaxPool2d,
    nn.AvgPool2d,
    nn.AdaptiveAvgPool2d,
]


ModelTypeShape = tuple[type[nn.Module], tuple[torch.Size, ...]]


class ModuleArgsDict(TypedDict):
    """Represents positional and keyword arguments passed to a module.

    - `positional`: A tuple of positional arguments.
    - `keyword`: A dictionary of keyword arguments.
    """

    positional: tuple[Any, ...]
    keyword: dict[str, Any]
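For orientation (not part of the original file), a hedged sketch of the three shapes a `ModuleArgs` value can take, reusing the imports and `ModuleArgsDict` defined above; `ModuleArgsDict` is the fully explicit form, and the keyword name is illustrative since it depends on the traced module's `forward` signature:

```py
x = torch.randn(1, 3, 512, 512)  # placeholder input tensor

args_as_tuple = (x,)                                                 # positional arguments only
args_as_kwargs = {"x": x}                                            # keyword arguments only (name illustrative)
args_as_dict: ModuleArgsDict = {"positional": (x,), "keyword": {}}   # both, spelled out
```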
class ConversionStage(Enum):
    """Represents the current stage of the conversion process.

    Attributes:
        INIT: The conversion process has not started.
        BASIC_LAYERS_MATCH: The source and target models have the same number of basic layers.
        SHAPE_AND_LAYERS_MATCH: The shapes of the basic layers in both models agree.
        MODELS_OUTPUT_AGREE: The outputs of the source and target models agree.
    """

    INIT = auto()
    BASIC_LAYERS_MATCH = auto()
    SHAPE_AND_LAYERS_MATCH = auto()
    MODELS_OUTPUT_AGREE = auto()


class ModelConverter:
    """Converts a model's state_dict to match another model's state_dict.

    Note: The conversion process consists of four stages
        1. Verify that the source and target models have the same number of basic layers.
        2. Find matching shapes and layers between the source and target models.
        3. Convert the source model's state_dict to match the target model's state_dict.
        4. Compare the outputs of the source and target models.

    The conversion process can be run multiple times, and will resume from the last stage.

    Example:
        ```py
        source = ...
        target = ...

        converter = ModelConverter(
            source_model=source,
            target_model=target,
            threshold=0.1,
            verbose=False
        )

        is_converted = converter.run(source_args=args)
        if is_converted:
            converter.save_to_safetensors(path="converted_model.safetensors")
        ```
    """

    ModuleArgs = tuple[Any, ...] | dict[str, Any] | ModuleArgsDict
    stage: ConversionStage = ConversionStage.INIT
    _stored_mapping: dict[str, str] | None = None

    def __init__(
        self,
        source_model: nn.Module,
        target_model: nn.Module,
        source_keys_to_skip: list[str] | None = None,
        target_keys_to_skip: list[str] | None = None,
        custom_layer_mapping: dict[type[nn.Module], type[nn.Module]] | None = None,
        threshold: float = 1e-5,
        skip_output_check: bool = False,
        skip_init_check: bool = False,
        verbose: bool = True,
    ) -> None:
        """Initializes the ModelConverter.

        Args:
            source_model: The model to convert from.
            target_model: The model to convert to.
            source_keys_to_skip: A list of keys to skip when tracing the source model.
            target_keys_to_skip: A list of keys to skip when tracing the target model.
            custom_layer_mapping: A dictionary mapping custom layer types between the source and target models.
            threshold: The threshold for comparing outputs between the source and target models.
            skip_output_check: Whether to skip comparing the outputs of the source and target models.
            skip_init_check: Whether to skip checking that the source and target models have the same number of
                basic layers.
            verbose: Whether to print messages during the conversion process.
        """
        self.source_model = source_model
        self.target_model = target_model
        self.source_keys_to_skip = source_keys_to_skip or []
        self.target_keys_to_skip = target_keys_to_skip or []
        self.custom_layer_mapping = custom_layer_mapping or {}
        self.threshold = threshold
        self.skip_output_check = skip_output_check
        self.skip_init_check = skip_init_check
        self.verbose = verbose
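A hedged sketch of the less obvious constructor options; `source`, `target`, `SourceAttention`, `TargetAttention` and the skipped key are hypothetical placeholders, not names from this file:

```py
converter = ModelConverter(
    source_model=source,                                      # placeholder models
    target_model=target,
    custom_layer_mapping={SourceAttention: TargetAttention},  # treat these two layer types as equivalent
    source_keys_to_skip=["text_projection"],                  # do not trace this sub-module (illustrative key)
    skip_output_check=True,                                   # stop once the state_dict has been applied
)
```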
    def __repr__(self) -> str:
        return (
            f"ModelConverter(source_model={self.source_model.__class__.__name__},"
            f" target_model={self.target_model.__class__.__name__}, stage={self.stage})"
        )

    def __bool__(self) -> bool:
        return self.stage.value >= 2 if self.skip_output_check else self.stage.value >= 3

    def run(self, source_args: ModuleArgs, target_args: ModuleArgs | None = None) -> bool:
        """Run the conversion process.

        Args:
            source_args: The arguments to pass to the source model; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys. If
                `target_args` is not provided, these arguments will also be passed to the target model.
            target_args: The arguments to pass to the target model; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys.

        Returns:
            True if the conversion process is done and the models agree.
        """
        if target_args is None:
            target_args = source_args

        match self.stage:
            case ConversionStage.MODELS_OUTPUT_AGREE:
                self._increment_stage()
                return True

            case ConversionStage.SHAPE_AND_LAYERS_MATCH if self._run_shape_and_layers_match_stage(
                source_args=source_args, target_args=target_args
            ):
                self._increment_stage()
                return True

            case ConversionStage.BASIC_LAYERS_MATCH if self._run_basic_layers_match_stage(
                source_args=source_args, target_args=target_args
            ):
                self._increment_stage()
                return self.run(source_args=source_args, target_args=target_args)

            case ConversionStage.INIT if self._run_init_stage():
                self._increment_stage()
                return self.run(source_args=source_args, target_args=target_args)

            case _:
                self._log(message=f"Conversion failed at stage {self.stage.value}")
                return False

    def _increment_stage(self) -> None:
        """Increment the stage of the conversion process."""
        match self.stage:
            case ConversionStage.INIT:
                self.stage = ConversionStage.BASIC_LAYERS_MATCH
                self._log(
                    message=(
                        "Stage 0 -> 1 - Models have the same number of basic layers. Finding matching shapes and"
                        " layers..."
                    )
                )
            case ConversionStage.BASIC_LAYERS_MATCH:
                self.stage = ConversionStage.SHAPE_AND_LAYERS_MATCH
                self._log(
                    message=(
                        "Stage 1 -> 2 - Shape of both models agree. Applying state_dict to target model. Comparing"
                        " models..."
                    )
                )

            case ConversionStage.SHAPE_AND_LAYERS_MATCH:
                if self.skip_output_check:
                    self._log(
                        message=(
                            "Stage 2 - Nothing to do. Skipping output check. If you want to compare the outputs, set"
                            " `skip_output_check` to `False`"
                        )
                    )
                else:
                    self.stage = ConversionStage.MODELS_OUTPUT_AGREE
                    self._log(
                        message=(
                            "Stage 2 -> 3 - Conversion is done and source and target models agree: you can export the"
                            " converted model using `save_to_safetensors`"
                        )
                    )
            case ConversionStage.MODELS_OUTPUT_AGREE:
                self._log(
                    message=(
                        "Stage 3 - Nothing to do. Conversion is done and source and target models agree: you can export"
                        " the converted model using `save_to_safetensors`"
                    )
                )
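As a small illustration of the resumable staging above (all names are placeholders): calling `run` again does not restart from `INIT` but from the stage reached by the previous call.

```py
x = torch.randn(1, 3, 512, 512)                  # placeholder input
converter = ModelConverter(source_model=source, target_model=target)
converter.run(source_args=(x,))                  # may stop early, e.g. if outputs diverge
converter.run(source_args=(x,))                  # resumes from converter.stage instead of starting over
```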
    def get_state_dict(self) -> dict[str, Tensor]:
        """Get the converted state_dict."""
        if not self:
            raise ValueError("The conversion process is not done yet. Run `converter.run(...)` first.")
        return self.target_model.state_dict()

    def get_mapping(self) -> dict[str, str]:
        """Get the mapping between the source and target models' state_dicts."""
        if not self:
            raise ValueError("The conversion process is not done yet. Run `converter.run(...)` first.")
        assert self._stored_mapping is not None, "Mapping is not stored"
        return self._stored_mapping

    def save_to_safetensors(self, path: Path | str, metadata: dict[str, str] | None = None, half: bool = False) -> None:
        """Save the converted model to a SafeTensors file.

        Warning:
            This method can only be called after the conversion process is done.

        Args:
            path: The path to save the converted model to.
            metadata: Metadata to save with the converted model.
            half: Whether to save the converted model as half precision.

        Raises:
            ValueError: If the conversion process is not done yet. Run `converter.run(...)` first.
        """
        if not self:
            raise ValueError("The conversion process is not done yet. Run `converter.run(...)` first.")
        state_dict = self.get_state_dict()
        if half:
            state_dict = {key: value.half() for key, value in state_dict.items()}
        save_to_safetensors(path=path, tensors=state_dict, metadata=metadata)

    def map_state_dicts(
        self,
        source_args: ModuleArgs,
        target_args: ModuleArgs | None = None,
    ) -> dict[str, str] | None:
        """Find a mapping between the source and target models' state_dicts.

        Args:
            source_args: The arguments to pass to the source model; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys. If
                `target_args` is not provided, these arguments will also be passed to the target model.
            target_args: The arguments to pass to the target model; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys.

        Returns:
            A dictionary mapping keys in the target model's state_dict to keys in the source model's state_dict.
        """
        if target_args is None:
            target_args = source_args

        source_order = self._trace_module_execution_order(
            module=self.source_model, args=source_args, keys_to_skip=self.source_keys_to_skip
        )
        target_order = self._trace_module_execution_order(
            module=self.target_model, args=target_args, keys_to_skip=self.target_keys_to_skip
        )

        if not self._assert_shapes_aligned(source_order=source_order, target_order=target_order):
            return None

        mapping: dict[str, str] = {}
        for source_type_shape in source_order:
            source_keys = source_order[source_type_shape]
            target_type_shape = source_type_shape
            if not self._is_torch_basic_layer(module_type=source_type_shape[0]):
                for source_custom_type, target_custom_type in self.custom_layer_mapping.items():
                    if source_custom_type == source_type_shape[0]:
                        target_type_shape = (target_custom_type, source_type_shape[1])
                        break

            target_keys = target_order[target_type_shape]
            mapping.update(zip(target_keys, source_keys))

        return mapping

    def compare_models(
        self,
        source_args: ModuleArgs,
        target_args: ModuleArgs | None = None,
        threshold: float = 1e-5,
    ) -> bool:
        """Compare the outputs of the source and target models.

        Args:
            source_args: The arguments to pass to the source model; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys. If
                `target_args` is not provided, these arguments will also be passed to the target model.
            target_args: The arguments to pass to the target model; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys.
            threshold: The threshold for comparing outputs between the source and target models.

        Returns:
            True if the outputs of the source and target models agree.
        """
        if target_args is None:
            target_args = source_args

        source_outputs = self._collect_layers_outputs(
            module=self.source_model, args=source_args, keys_to_skip=self.source_keys_to_skip
        )
        target_outputs = self._collect_layers_outputs(
            module=self.target_model, args=target_args, keys_to_skip=self.target_keys_to_skip
        )

        diff, prev_source_key, prev_target_key = None, None, None
        for (source_key, source_output), (target_key, target_output) in zip(source_outputs, target_outputs):
            diff = norm(source_output - target_output.reshape(shape=source_output.shape)).item()
            if diff > threshold:
                self._log(
                    f"Models diverged between {prev_source_key} and {source_key}, and between {prev_target_key} and"
                    f" {target_key}, difference in norm: {diff}"
                )
                return False
            prev_source_key, prev_target_key = source_key, target_key

        self._log(message=f"Models agree. Difference in norm: {diff}")

        return True

    def _run_init_stage(self) -> bool:
        """Run the init stage of the conversion process."""
        if self.skip_init_check:
            self._log(
                message=(
                    "Skipping init check. If you want to check the number of basic layers, set `skip_init_check` to"
                    " `False`"
                )
            )
            return True

        is_count_correct = self._verify_basic_layers_count()
        is_not_missing_layers = self._verify_missing_basic_layers()

        return is_count_correct and is_not_missing_layers

    def _run_basic_layers_match_stage(self, source_args: ModuleArgs, target_args: ModuleArgs | None) -> bool:
        """Run the basic layers match stage of the conversion process."""
        mapping = self.map_state_dicts(source_args=source_args, target_args=target_args)
        self._stored_mapping = mapping
        if mapping is None:
            self._log(message="Models do not have matching shapes.")
            return False

        source_state_dict = self.source_model.state_dict()
        target_state_dict = self.target_model.state_dict()
        converted_state_dict = self._convert_state_dict(
            source_state_dict=source_state_dict, target_state_dict=target_state_dict, state_dict_mapping=mapping
        )
        self.target_model.load_state_dict(state_dict=converted_state_dict)

        return True

    def _run_shape_and_layers_match_stage(self, source_args: ModuleArgs, target_args: ModuleArgs | None) -> bool:
        """Run the shape and layers match stage of the conversion process."""
        if self.skip_output_check:
            self._log(
                message="Skipping output check. If you want to compare the outputs, set `skip_output_check` to `False`"
            )
            return True

        try:
            if self.compare_models(source_args=source_args, target_args=target_args, threshold=self.threshold):
                self._log(message="Models agree. You can export the converted model using `save_to_safetensors`")
                return True
            else:
                self._log(message="Models do not agree. Try to increase the threshold or modify the models.")
                return False
        except Exception as e:
            self._log(message=f"An error occurred while comparing the models: {e}")
            return False

    def _log(self, message: str) -> None:
        """Print a message if `verbose` is `True`."""
        if self.verbose:
            print(message)

    def _debug_print_shapes(
        self,
        shape: ModelTypeShape,
        source_keys: list[str],
        target_keys: list[str],
    ) -> None:
        """Print the shapes of the sub-modules in `source_keys` and `target_keys`."""
        self._log(message=f"{shape}")
        max_len = max(len(source_keys), len(target_keys))
        for i in range(max_len):
            source_key = source_keys[i] if i < len(source_keys) else "---"
            target_key = target_keys[i] if i < len(target_keys) else "---"
            self._log(f"\t{source_key}\t{target_key}")

    @staticmethod
    def _unpack_module_args(module_args: ModuleArgs) -> tuple[tuple[Any, ...], dict[str, Any]]:
        """Unpack the positional and keyword arguments passed to a module."""
        match module_args:
            case tuple(positional_args):
                keyword_args: dict[str, Any] = {}
            case {"positional": positional_args, "keyword": keyword_args}:
                pass
            case _:
                positional_args = ()
                keyword_args = dict(**module_args)

        return positional_args, keyword_args

    def _is_torch_basic_layer(self, module_type: type[nn.Module]) -> bool:
        """Check if a module type is a subclass of a torch basic layer."""
        return any(issubclass(module_type, torch_basic_layer) for torch_basic_layer in TORCH_BASIC_LAYERS)

    def _infer_basic_layer_type(self, module: nn.Module) -> type[nn.Module] | None:
        """Infer the type of a basic layer."""
        layer_types = (
            set(self.custom_layer_mapping.keys()) | set(self.custom_layer_mapping.values()) | set(TORCH_BASIC_LAYERS)
        )
        for layer_type in layer_types:
            if isinstance(module, layer_type):
                return layer_type

        return None

    def get_module_signature(self, module: nn.Module) -> ModelTypeShape:
        """Get the signature of a module."""
        layer_type = self._infer_basic_layer_type(module=module)
        assert layer_type is not None, f"Module {module} is not a basic layer"
        param_shapes = [p.shape for p in module.parameters()]
        return (layer_type, tuple(param_shapes))
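To make the signature format concrete, a hedged example for a plain linear layer (`converter` is a placeholder instance): the signature is the inferred basic layer type plus the shapes of its parameters, in `parameters()` order.

```py
linear = nn.Linear(in_features=5, out_features=10)
signature = converter.get_module_signature(module=linear)
# signature == (nn.Linear, (torch.Size([10, 5]), torch.Size([10])))  # weight shape, then bias shape
```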
    def _count_basic_layers(self, module: nn.Module) -> dict[type[nn.Module], int]:
        """Count the number of basic layers in a module."""
        count: DefaultDict[type[nn.Module], int] = defaultdict(int)
        for submodule in module.modules():
            layer_type = self._infer_basic_layer_type(module=submodule)
            if layer_type is not None:
                count[layer_type] += 1

        return count

    def _verify_basic_layers_count(self) -> bool:
        """Verify that the source and target models have the same number of basic layers."""
        source_layers = self._count_basic_layers(module=self.source_model)
        target_layers = self._count_basic_layers(module=self.target_model)

        reverse_mapping = {v: k for k, v in self.custom_layer_mapping.items()}

        diff: dict[type[nn.Module], tuple[int, int]] = {}
        for layer_type, source_count in source_layers.items():
            target_type = self.custom_layer_mapping.get(layer_type, layer_type)
            target_count = target_layers.get(target_type, 0)

            if source_count != target_count:
                diff[layer_type] = (source_count, target_count)

        for layer_type, target_count in target_layers.items():
            source_type = reverse_mapping.get(layer_type, layer_type)
            source_count = source_layers.get(source_type, 0)

            if source_count != target_count:
                diff[layer_type] = (source_count, target_count)

        if diff:
            message = "Models do not have the same number of basic layers:\n"
            for layer_type, counts in diff.items():
                message += f" {layer_type}: Source {counts[0]} - Target {counts[1]}\n"
            self._log(message=message.strip())
            return False

        return True

    def _is_weighted_leaf_module(self, module: nn.Module) -> bool:
        """Check if a module is a leaf module with weights."""
        return next(module.parameters(), None) is not None and next(module.children(), None) is None

    def _check_for_missing_basic_layers(self, module: nn.Module) -> list[type[nn.Module]]:
        """Check if a module has weighted leaf modules that are not basic layers."""
        return [
            type(submodule)
            for submodule in module.modules()
            if self._is_weighted_leaf_module(module=submodule) and not self._infer_basic_layer_type(module=submodule)
        ]

    def _verify_missing_basic_layers(self) -> bool:
        """Verify that the source and target models do not have missing basic layers."""
        missing_source_layers = self._check_for_missing_basic_layers(module=self.source_model)
        missing_target_layers = self._check_for_missing_basic_layers(module=self.target_model)

        if missing_source_layers or missing_target_layers:
            self._log(
                message=(
                    "Models might have missing basic layers. If you want to skip this check, set"
                    f" `skip_init_check` to `True`: {missing_source_layers}, {missing_target_layers}"
                )
            )
            return False

        return True

    @no_grad()
    def _trace_module_execution_order(
        self,
        module: nn.Module,
        args: ModuleArgs,
        keys_to_skip: list[str],
    ) -> dict[ModelTypeShape, list[str]]:
        """Execute a forward pass and store the order of execution of specific sub-modules.

        Args:
            module: The module to trace.
            args: The arguments to pass to the module; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys.
            keys_to_skip: A list of keys to skip when tracing the module.

        Returns:
            A dictionary mapping the signature of each sub-module to a list of keys in the module's `named_modules`.
        """
        submodule_to_key: dict[nn.Module, str] = {}
        execution_order: defaultdict[ModelTypeShape, list[str]] = defaultdict(list)

        def collect_execution_order_hook(layer: nn.Module, *_: Any) -> None:
            layer_signature = self.get_module_signature(module=layer)
            execution_order[layer_signature].append(submodule_to_key[layer])

        hooks: list[RemovableHandle] = []
        named_modules: list[tuple[str, nn.Module]] = module.named_modules()  # type: ignore
        for name, submodule in named_modules:
            if (self._infer_basic_layer_type(module=submodule) is not None) and name not in keys_to_skip:
                submodule_to_key[submodule] = name  # type: ignore
                hook = submodule.register_forward_hook(hook=collect_execution_order_hook)
                hooks.append(hook)

        positional_args, keyword_args = self._unpack_module_args(module_args=args)
        module(*positional_args, **keyword_args)

        for hook in hooks:
            hook.remove()

        return dict(execution_order)

    def _assert_shapes_aligned(
        self, source_order: dict[ModelTypeShape, list[str]], target_order: dict[ModelTypeShape, list[str]]
    ) -> bool:
        """Assert that the shapes of the sub-modules in `source_order` and `target_order` are aligned."""
        model_type_shapes = set(source_order.keys()) | set(target_order.keys())

        default_type_shapes = [
            type_shape for type_shape in model_type_shapes if self._is_torch_basic_layer(module_type=type_shape[0])
        ]

        shape_mismatched = False

        for model_type_shape in default_type_shapes:
            source_keys = source_order.get(model_type_shape, [])
            target_keys = target_order.get(model_type_shape, [])

            if len(source_keys) != len(target_keys):
                shape_mismatched = True
                self._debug_print_shapes(shape=model_type_shape, source_keys=source_keys, target_keys=target_keys)

        for source_custom_type in self.custom_layer_mapping.keys():
            # iterate over all type_shapes that have the same type as source_custom_type
            for source_type_shape in [
                type_shape for type_shape in model_type_shapes if type_shape[0] == source_custom_type
            ]:
                source_keys = source_order.get(source_type_shape, [])
                target_custom_type = self.custom_layer_mapping[source_custom_type]
                target_type_shape = (target_custom_type, source_type_shape[1])
                target_keys = target_order.get(target_type_shape, [])

                if len(source_keys) != len(target_keys):
                    shape_mismatched = True
                    self._debug_print_shapes(shape=source_type_shape, source_keys=source_keys, target_keys=target_keys)

        return not shape_mismatched

    @staticmethod
    def _convert_state_dict(
        source_state_dict: dict[str, Tensor], target_state_dict: dict[str, Tensor], state_dict_mapping: dict[str, str]
    ) -> dict[str, Tensor]:
        """Convert the source model's state_dict to match the target model's state_dict."""
        converted_state_dict: dict[str, Tensor] = {}
        for target_key in target_state_dict:
            target_prefix, suffix = target_key.rsplit(sep=".", maxsplit=1)
            source_prefix = state_dict_mapping[target_prefix]
            source_key = ".".join([source_prefix, suffix])
            converted_state_dict[target_key] = source_state_dict[source_key]

        return converted_state_dict
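A small worked example of the remapping performed by `_convert_state_dict` (keys are hypothetical): the mapping relates module prefixes, and the parameter suffix is carried over unchanged.

```py
state_dict_mapping = {"encoder.proj": "down_blocks.0.conv"}  # target prefix -> source prefix (illustrative)
# target key "encoder.proj.weight" -> source key "down_blocks.0.conv.weight"
# target key "encoder.proj.bias"   -> source key "down_blocks.0.conv.bias"
```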
    @no_grad()
    def _collect_layers_outputs(
        self, module: nn.Module, args: ModuleArgs, keys_to_skip: list[str]
    ) -> list[tuple[str, Tensor]]:
        """Execute a forward pass and store the output of specific sub-modules.

        Args:
            module: The module to trace.
            args: The arguments to pass to the module; either a tuple of positional arguments,
                a dictionary of keyword arguments, or a dictionary with `positional` and `keyword` keys.
            keys_to_skip: A list of keys to skip when tracing the module.

        Returns:
            A list of tuples containing the key of each sub-module and its output.

        Note:
            The output of each sub-module is cloned to avoid memory leaks.
        """
        submodule_to_key: dict[nn.Module, str] = {}
        execution_order: list[tuple[str, Tensor]] = []

        def collect_execution_order_hook(layer: nn.Module, _: Any, output: Tensor) -> None:
            execution_order.append((submodule_to_key[layer], output.clone()))

        hooks: list[RemovableHandle] = []
        named_modules: list[tuple[str, nn.Module]] = module.named_modules()  # type: ignore
        for name, submodule in named_modules:
            if (self._infer_basic_layer_type(module=submodule) is not None) and name not in keys_to_skip:
                submodule_to_key[submodule] = name  # type: ignore
                hook = submodule.register_forward_hook(hook=collect_execution_order_hook)
                hooks.append(hook)

        positional_args, keyword_args = self._unpack_module_args(module_args=args)
        module(*positional_args, **keyword_args)

        for hook in hooks:
            hook.remove()

        return execution_order