Add Multi-View Aggregation Network (MVANet)

Co-authored-by: Pierre Colle <piercus@gmail.com>
2024-11-21 13:48:46 +00:00 · 2024-08-10 15:17:36 +02:00 · 2024-08-10 15:17:36 +02:00 · 10dfa73a09
parent 58c1cc7cd4
commit 10dfa73a09
19 changed files with 1525 additions and 1 deletions
--- a/docs/reference/SUMMARY.md
+++ b/docs/reference/SUMMARY.md
@ -9,4 +9,4 @@
    * [<code class="doc-symbol doc-symbol-nav doc-symbol-module"></code> DINOv2](foundationals/dinov2.md)
    * [<code class="doc-symbol doc-symbol-nav doc-symbol-module"></code> Latent Diffusion](foundationals/latent_diffusion.md)
    * [<code class="doc-symbol doc-symbol-nav doc-symbol-module"></code> Segment Anything](foundationals/segment_anything.md)
-
+    * [<code class="doc-symbol doc-symbol-nav doc-symbol-module"></code> Swin Transformers](foundationals/swin.md)
--- a/docs/reference/foundationals/swin.md
+++ b/docs/reference/foundationals/swin.md
@ -0,0 +1,2 @@
 ::: refiners.foundationals.swin.swin_transformer
 ::: refiners.foundationals.swin.mvanet
--- a/pyproject.toml
+++ b/pyproject.toml
@ -58,6 +58,7 @@ conversion = [
    "segment-anything-py>=1.0",
    "requests>=2.26.0",
    "tqdm>=4.62.3",
    "gdown>=5.2.0",
 ]
 doc = [
    # required by mkdocs to format the signatures
--- a/requirements.lock
+++ b/requirements.lock
@ -31,6 +31,8 @@ babel==2.15.0
    # via mkdocs-material
 backports-strenum==1.3.1
    # via griffe
 beautifulsoup4==4.12.3
    # via gdown
 bitsandbytes==0.43.3
    # via refiners
 black==24.4.2
@ -70,6 +72,7 @@ docker-pycreds==0.4.0
 filelock==3.15.4
    # via datasets
    # via diffusers
    # via gdown
    # via huggingface-hub
    # via torch
    # via transformers
@ -85,6 +88,8 @@ fsspec==2024.5.0
    # via torch
 future==1.0.0
    # via neptune
 gdown==5.2.0
    # via refiners
 ghp-import==2.1.0
    # via mkdocs
 gitdb==4.0.11
@ -274,6 +279,8 @@ pyjwt==2.9.0
 pymdown-extensions==10.9
    # via mkdocs-material
    # via mkdocstrings
 pysocks==1.7.1
    # via requests
 python-dateutil==2.9.0.post0
    # via arrow
    # via botocore
@ -311,6 +318,7 @@ requests==2.32.3
    # via bravado-core
    # via datasets
    # via diffusers
    # via gdown
    # via huggingface-hub
    # via mkdocs-material
    # via neptune
@ -356,6 +364,8 @@ six==1.16.0
    # via rfc3339-validator
 smmap==5.0.1
    # via gitdb
 soupsieve==2.6
    # via beautifulsoup4
 swagger-spec-validator==3.0.4
    # via bravado-core
    # via neptune
@ -383,6 +393,7 @@ torchvision==0.19.0
    # via timm
 tqdm==4.66.4
    # via datasets
    # via gdown
    # via huggingface-hub
    # via refiners
    # via transformers
--- a/scripts/conversion/convert_mvanet.py
+++ b/scripts/conversion/convert_mvanet.py
@ -0,0 +1,40 @@
 import argparse
 from pathlib import Path
 from refiners.fluxion.utils import load_tensors, save_to_safetensors
 from refiners.foundationals.swin.mvanet.converter import convert_weights
 def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--from",
        type=str,
        required=True,
        dest="source_path",
        help="A MVANet checkpoint. One can be found at https://github.com/qianyu-dlut/MVANet",
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Path to save the converted model. If not specified, the output path will be the source path with the"
            " extension changed to .safetensors."
        ),
    )
    parser.add_argument("--half", action="store_true", dest="half")
    args = parser.parse_args()
    src_weights = load_tensors(args.source_path)
    weights = convert_weights(src_weights)
    if args.half:
        weights = {key: value.half() for key, value in weights.items()}
    if args.output_path is None:
        args.output_path = f"{Path(args.source_path).stem}.safetensors"
    save_to_safetensors(path=args.output_path, tensors=weights)
 if __name__ == "__main__":
    main()
--- a/scripts/prepare_test_weights.py
+++ b/scripts/prepare_test_weights.py
@ -11,6 +11,7 @@ import subprocess
 import sys
 from urllib.parse import urlparse
 import gdown
 import requests
 from tqdm import tqdm
@ -446,6 +447,25 @@ def download_ic_light():
    )
 def download_mvanet():
    fn = "Model_80.pth"
    dest_folder = os.path.join(test_weights_dir, "mvanet")
    dest_filename = os.path.join(dest_folder, fn)
    if os.environ.get("DRY_RUN") == "1":
        return
    if os.path.exists(dest_filename):
        print(f"✖️ ️ Skipping previously downloaded mvanet/{fn}")
    else:
        os.makedirs(dest_folder, exist_ok=True)
        print(f"🔽 Downloading mvanet/{fn} => '{rel(dest_filename)}'", end="\n")
        gdown.download(id="1_gabQXOF03MfXnf3EWDK1d_8wKiOemOv", output=dest_filename, quiet=True)
        print(f"{previous_line}✅ Downloaded mvanet/{fn} => '{rel(dest_filename)}' ")
    check_hash(dest_filename, "b915d492")
 def printg(msg: str):
    """print in green color"""
    print("\033[92m" + msg + "\033[0m")
@ -808,6 +828,16 @@ def convert_ic_light():
    )
 def convert_mvanet():
    run_conversion_script(
        "convert_mvanet.py",
        "tests/weights/mvanet/Model_80.pth",
        "tests/weights/mvanet/mvanet.safetensors",
        half=True,
        expected_hash="bf9ae4cb",
    )
 def download_all():
    print(f"\nAll weights will be downloaded to {test_weights_dir}\n")
    download_sd15("runwayml/stable-diffusion-v1-5")
@ -830,6 +860,7 @@ def download_all():
    download_sdxl_lightning_base()
    download_sdxl_lightning_lora()
    download_ic_light()
    download_mvanet()
 def convert_all():
@ -850,6 +881,7 @@ def convert_all():
    convert_lcm_base()
    convert_sdxl_lightning_base()
    convert_ic_light()
    convert_mvanet()
 def main():
--- a/src/refiners/foundationals/swin/init.py
+++ b/src/refiners/foundationals/swin/init.py
@ -0,0 +1,3 @@
 from .swin_transformer import SwinTransformer
 __all__ = ["SwinTransformer"]
--- a/src/refiners/foundationals/swin/mvanet/init.py
+++ b/src/refiners/foundationals/swin/mvanet/init.py
@ -0,0 +1,3 @@
 from .mvanet import MVANet
 __all__ = ["MVANet"]
--- a/src/refiners/foundationals/swin/mvanet/converter.py
+++ b/src/refiners/foundationals/swin/mvanet/converter.py
@ -0,0 +1,138 @@
 import re
 from torch import Tensor
 def convert_weights(official_state_dict: dict[str, Tensor]) -> dict[str, Tensor]:
    rm_list = [
        # Official weights contains useless keys
        # See https://github.com/qianyu-dlut/MVANet/issues/3#issuecomment-2105650425
        r"multifieldcrossatt.linear[56]",
        r"multifieldcrossatt.attention.5",
        r"dec_blk\d+\.linear[12]",
        r"dec_blk[1234]\.attention\.[4567]",
        # We don't need the sideout weights
        r"sideout\d+",
    ]
    state_dict = {k: v for k, v in official_state_dict.items() if not any(re.match(rm, k) for rm in rm_list)}
    keys_map: dict[str, str] = {}
    for k in state_dict.keys():
        v: str = k
        def rpfx(s: str, src: str, dst: str) -> str:
            if not s.startswith(src):
                return s
            return s.replace(src, dst, 1)
        # Swin Transformer backbone
        v = rpfx(v, "backbone.patch_embed.proj.", "SwinTransformer.PatchEmbedding.Conv2d.")
        v = rpfx(v, "backbone.patch_embed.norm.", "SwinTransformer.PatchEmbedding.LayerNorm.")
        if m := re.match(r"backbone\.layers\.(\d+)\.downsample\.(.*)", v):
            s = m.group(2).replace("reduction.", "Linear.").replace("norm.", "LayerNorm.")
            v = f"SwinTransformer.Chain_{int(m.group(1)) + 1}.PatchMerging.{s}"
        if m := re.match(r"backbone\.layers\.(\d+)\.blocks\.(\d+)\.(.*)", v):
            s = m.group(3)
            s = s.replace("norm1.", "Residual_1.LayerNorm.")
            s = s.replace("norm2.", "Residual_2.LayerNorm.")
            s = s.replace("attn.qkv.", "Residual_1.WindowAttention.Linear_1.")
            s = s.replace("attn.proj.", "Residual_1.WindowAttention.Linear_2.")
            s = s.replace("attn.relative_position", "Residual_1.WindowAttention.WindowSDPA.rpb.relative_position")
            s = s.replace("mlp.fc", "Residual_2.Linear_")
            v = ".".join(
                [
                    f"SwinTransformer.Chain_{int(m.group(1)) + 1}",
                    f"BasicLayer.SwinTransformerBlock_{int(m.group(2)) + 1}",
                    s,
                ]
            )
        if m := re.match(r"backbone\.norm(\d+)\.(.*)", v):
            v = f"SwinTransformer.Chain_{int(m.group(1)) + 1}.Passthrough.LayerNorm.{m.group(2)}"
        # MVANet
        def mclm(s: str, pfx_src: str, pfx_dst: str) -> str:
            pca = f"{pfx_dst}Residual.PatchwiseCrossAttention"
            s = rpfx(s, f"{pfx_src}linear1.", f"{pfx_dst}FeedForward_1.Linear_1.")
            s = rpfx(s, f"{pfx_src}linear2.", f"{pfx_dst}FeedForward_1.Linear_2.")
            s = rpfx(s, f"{pfx_src}linear3.", f"{pfx_dst}FeedForward_2.Linear_1.")
            s = rpfx(s, f"{pfx_src}linear4.", f"{pfx_dst}FeedForward_2.Linear_2.")
            s = rpfx(s, f"{pfx_src}norm1.", f"{pfx_dst}LayerNorm_1.")
            s = rpfx(s, f"{pfx_src}norm2.", f"{pfx_dst}LayerNorm_2.")
            s = rpfx(s, f"{pfx_src}attention.0.", f"{pfx_dst}GlobalAttention.Sum.Chain.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.1.", f"{pca}.Concatenate.Chain_1.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.2.", f"{pca}.Concatenate.Chain_2.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.3.", f"{pca}.Concatenate.Chain_3.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.4.", f"{pca}.Concatenate.Chain_4.MultiheadAttention.")
            return s
        def mcrm(s: str, pfx_src: str, pfx_dst: str) -> str:
            # Note: there are no linear{1,2}, see https://github.com/qianyu-dlut/MVANet/issues/3#issuecomment-2105650425
            tca = f"{pfx_dst}Parallel_3.TiledCrossAttention"
            pca = f"{tca}.Sum.Chain_2.PatchwiseCrossAttention"
            s = rpfx(s, f"{pfx_src}linear3.", f"{tca}.FeedForward.Linear_1.")
            s = rpfx(s, f"{pfx_src}linear4.", f"{tca}.FeedForward.Linear_2.")
            s = rpfx(s, f"{pfx_src}norm1.", f"{tca}.LayerNorm_1.")
            s = rpfx(s, f"{pfx_src}norm2.", f"{tca}.LayerNorm_2.")
            s = rpfx(s, f"{pfx_src}attention.0.", f"{pca}.Concatenate.Chain_1.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.1.", f"{pca}.Concatenate.Chain_2.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.2.", f"{pca}.Concatenate.Chain_3.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}attention.3.", f"{pca}.Concatenate.Chain_4.MultiheadAttention.")
            s = rpfx(s, f"{pfx_src}sal_conv.", f"{pfx_dst}Parallel_2.Multiply.Chain.Conv2d.")
            return s
        def cbr(s: str, pfx_src: str, pfx_dst: str, shift: int = 0) -> str:
            s = rpfx(s, f"{pfx_src}{shift}.", f"{pfx_dst}Conv2d.")
            s = rpfx(s, f"{pfx_src}{shift + 1}.", f"{pfx_dst}BatchNorm2d.")
            s = rpfx(s, f"{pfx_src}{shift + 2}.", f"{pfx_dst}PReLU.")
            return s
        def cbg(s: str, pfx_src: str, pfx_dst: str) -> str:
            s = rpfx(s, f"{pfx_src}0.", f"{pfx_dst}Conv2d.")
            s = rpfx(s, f"{pfx_src}1.", f"{pfx_dst}BatchNorm2d.")
            return s
        v = rpfx(v, "shallow.0.", "ComputeShallow.Conv2d.")
        v = cbr(v, "output1.", "Pyramid.Sum.Chain.CBR.")
        v = cbr(v, "output2.", "Pyramid.Sum.PyramidL2.Sum.Chain.CBR.")
        v = cbr(v, "output3.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.Sum.Chain.CBR.")
        v = cbr(v, "output4.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.Sum.PyramidL4.Sum.Chain.CBR.")
        v = cbr(v, "output5.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.Sum.PyramidL4.Sum.PyramidL5.CBR.")
        v = cbr(v, "conv1.", "Pyramid.CBR.")
        v = cbr(v, "conv2.", "Pyramid.Sum.PyramidL2.CBR.")
        v = cbr(v, "conv3.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.CBR.")
        v = cbr(v, "conv4.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.Sum.PyramidL4.CBR.")
        v = mclm(v, "multifieldcrossatt.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.Sum.PyramidL4.Sum.PyramidL5.MCLM.")
        v = mcrm(v, "dec_blk1.", "Pyramid.MCRM.")
        v = mcrm(v, "dec_blk2.", "Pyramid.Sum.PyramidL2.MCRM.")
        v = mcrm(v, "dec_blk3.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.MCRM.")
        v = mcrm(v, "dec_blk4.", "Pyramid.Sum.PyramidL2.Sum.PyramidL3.Sum.PyramidL4.MCRM.")
        v = cbr(v, "insmask_head.", "RearrangeMultiView.Chain.CBR_1.")
        v = cbr(v, "insmask_head.", "RearrangeMultiView.Chain.CBR_2.", shift=3)
        v = rpfx(v, "insmask_head.6.", "RearrangeMultiView.Chain.Conv2d.")
        v = cbg(v, "upsample1.", "ShallowUpscaler.Sum_2.Chain_1.CBG.")
        v = cbg(v, "upsample2.", "ShallowUpscaler.CBG.")
        v = rpfx(v, "output.0.", "Conv2d.")
        if v != k:
            keys_map[k] = v
    for key, new_key in keys_map.items():
        state_dict[new_key] = state_dict[key]
        state_dict.pop(key)
    return state_dict
--- a/src/refiners/foundationals/swin/mvanet/mclm.py
+++ b/src/refiners/foundationals/swin/mvanet/mclm.py
@ -0,0 +1,211 @@
 # Multi-View Complementary Localization
 import math
 import torch
 from torch import Tensor, device as Device
 import refiners.fluxion.layers as fl
 from refiners.fluxion.context import Contexts
 from .utils import FeedForward, MultiheadAttention, MultiPool, PatchMerge, PatchwiseCrossAttention, Unflatten
 class PerPixel(fl.Chain):
    """(B, C, H, W) -> H*W, B, C"""
    def __init__(self):
        super().__init__(
            fl.Permute(2, 3, 0, 1),
            fl.Flatten(0, 1),
        )
 class PositionEmbeddingSine(fl.Module):
    """
    Non-trainable position embedding, originally from https://github.com/facebookresearch/detr
    """
    def __init__(self, num_pos_feats: int, device: Device | None = None):
        super().__init__()
        self.device = device
        temperature = 10000
        self.dim_t = torch.arange(0, num_pos_feats, dtype=torch.float32, device=self.device)
        self.dim_t = temperature ** (2 * (self.dim_t // 2) / num_pos_feats)
    def __call__(self, h: int, w: int) -> Tensor:
        mask = torch.ones([1, h, w, 1], dtype=torch.bool, device=self.device)
        y_embed = mask.cumsum(dim=1, dtype=torch.float32)
        x_embed = mask.cumsum(dim=2, dtype=torch.float32)
        eps, scale = 1e-6, 2 * math.pi
        y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * scale
        x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * scale
        pos_x = x_embed / self.dim_t
        pos_y = y_embed / self.dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        return torch.cat((pos_y, pos_x), dim=3).permute(1, 2, 0, 3).flatten(0, 1)
 class MultiPoolPos(fl.Module):
    def __init__(self, pool_ratios: list[int], positional_embedding: PositionEmbeddingSine):
        super().__init__()
        self.pool_ratios = pool_ratios
        self.positional_embedding = positional_embedding
    def forward(self, *args: int) -> Tensor:
        h, w = args
        return torch.cat([self.positional_embedding(h // ratio, w // ratio) for ratio in self.pool_ratios])
 class Repeat(fl.Module):
    def __init__(self, dim: int = 0):
        self.dim = dim
        super().__init__()
    def forward(self, x: Tensor, n: int) -> Tensor:
        return torch.repeat_interleave(x, n, dim=self.dim)
 class _MHA_Arg(fl.Sum):
    def __init__(self, offset: int):
        self.offset = offset
        super().__init__(
            fl.GetArg(offset),  # value
            fl.Chain(
                fl.Parallel(
                    fl.GetArg(self.offset + 1),  # position embedding
                    fl.Lambda(self._batch_size),
                ),
                Repeat(1),
            ),
        )
    def _batch_size(self, *args: Tensor) -> int:
        return args[self.offset].size(1)
 class GlobalAttention(fl.Chain):
    # Input must be a 4-tuple: (global, global pos. emb, pools, pools pos. emb.)
    def __init__(
        self,
        emb_dim: int,
        num_heads: int = 1,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Sum(
                fl.GetArg(0),  # global
                fl.Chain(
                    fl.Parallel(
                        _MHA_Arg(0),  # Q: global + pos. emb
                        _MHA_Arg(2),  # K: pools + pos. emb
                        fl.GetArg(2),  # V: pools
                    ),
                    MultiheadAttention(emb_dim, num_heads, device=device),
                ),
            ),
        )
 class MCLM(fl.Chain):
    """Multi-View Complementary Localization Module
    Inputs:
        tensor: (b, 5, e, h, h)
    Outputs:
        tensor: (b, 5, e, h, h)
    """
    def __init__(
        self,
        emb_dim: int,
        num_heads: int = 1,
        pool_ratios: list[int] | None = None,
        device: Device | None = None,
    ):
        if pool_ratios is None:
            pool_ratios = [2, 8, 16]
        positional_embedding = PositionEmbeddingSine(num_pos_feats=emb_dim // 2, device=device)
        # LayerNorms in MCLM share their weights.
        ln1 = fl.LayerNorm(emb_dim, device=device)
        ln2 = fl.LayerNorm(emb_dim, device=device)
        def proxy(m: fl.Module) -> fl.Module:
            def f(x: Tensor) -> Tensor:
                return m(x)
            return fl.Lambda(f)
        super().__init__(
            fl.Parallel(
                fl.Chain(  # global
                    fl.Slicing(dim=1, start=4),
                    fl.Squeeze(1),
                    fl.Parallel(
                        PerPixel(),  # glb
                        fl.Chain(  # g_pos
                            fl.Lambda(lambda x: x.shape[-2:]),  # type: ignore
                            positional_embedding,
                        ),
                    ),
                ),
                fl.Chain(  # local
                    fl.Slicing(dim=1, end=4),
                    fl.SetContext("mclm", "local"),
                    PatchMerge(),
                    fl.Parallel(
                        fl.Chain(  # pool
                            MultiPool(pool_ratios),
                            fl.Squeeze(0),
                        ),
                        fl.Chain(  # pool_pos
                            fl.Lambda(lambda x: x.shape[-2:]),  # type: ignore
                            MultiPoolPos(pool_ratios, positional_embedding),
                        ),
                    ),
                ),
            ),
            fl.Lambda(lambda t1, t2: (*t1, *t2)),  # type: ignore
            GlobalAttention(emb_dim, num_heads, device=device),
            ln1,
            FeedForward(emb_dim, device=device),
            ln2,
            fl.SetContext("mclm", "global"),
            fl.UseContext("mclm", "local"),
            fl.Flatten(-2, -1),
            fl.Permute(1, 3, 0, 2),
            fl.Residual(
                fl.Parallel(
                    fl.Identity(),
                    fl.Chain(
                        fl.UseContext("mclm", "global"),
                        Unflatten(0, (2, 8, 2, 8)),  # 2, h/2, 2, h/2
                        fl.Permute(0, 2, 1, 3, 4, 5),
                        fl.Flatten(0, 1),
                        fl.Flatten(1, 2),
                    ),
                ),
                PatchwiseCrossAttention(emb_dim, num_heads, device=device),
            ),
            proxy(ln1),
            FeedForward(emb_dim, device=device),
            proxy(ln2),
            fl.Concatenate(
                fl.Identity(),
                fl.Chain(
                    fl.UseContext("mclm", "global"),
                    fl.Unsqueeze(0),
                ),
            ),
            Unflatten(1, (16, 16)),  # h, h
            fl.Permute(3, 0, 4, 1, 2),
        )
    def init_context(self) -> Contexts:
        return {"mclm": {"global": None, "local": None}}
--- a/src/refiners/foundationals/swin/mvanet/mcrm.py
+++ b/src/refiners/foundationals/swin/mvanet/mcrm.py
@ -0,0 +1,119 @@
 # Multi-View Complementary Refinement
 import torch
 from torch import Tensor, device as Device
 import refiners.fluxion.layers as fl
 from .utils import FeedForward, Interpolate, MultiPool, PatchMerge, PatchSplit, PatchwiseCrossAttention, Unflatten
 class Multiply(fl.Chain):
    def __init__(self, o1: fl.Module, o2: fl.Module) -> None:
        super().__init__(o1, o2)
    def forward(self, *args: Tensor) -> Tensor:
        return torch.mul(self[0](*args), self[1](*args))
 class TiledCrossAttention(fl.Chain):
    def __init__(
        self,
        emb_dim: int,
        dim: int,
        num_heads: int = 1,
        pool_ratios: list[int] | None = None,
        device: Device | None = None,
    ):
        # Input must be a 4-tuple: (local, global)
        if pool_ratios is None:
            pool_ratios = [1, 2, 4]
        super().__init__(
            fl.Distribute(
                fl.Chain(  # local
                    fl.Flatten(-2, -1),
                    fl.Permute(1, 3, 0, 2),
                ),
                fl.Chain(  # global
                    PatchSplit(),
                    fl.Squeeze(0),
                    MultiPool(pool_ratios),
                ),
            ),
            fl.Sum(
                fl.Chain(
                    fl.GetArg(0),
                    fl.Permute(2, 1, 0, 3),
                ),
                fl.Chain(
                    PatchwiseCrossAttention(emb_dim, num_heads, device=device),
                    fl.Permute(2, 1, 0, 3),
                ),
            ),
            fl.LayerNorm(emb_dim, device=device),
            FeedForward(emb_dim, device=device),
            fl.LayerNorm(emb_dim, device=device),
            fl.Permute(0, 2, 3, 1),
            Unflatten(-1, (dim, dim)),
        )
 class MCRM(fl.Chain):
    """Multi-View Complementary Refinement"""
    def __init__(
        self,
        emb_dim: int,
        size: int,
        num_heads: int = 1,
        pool_ratios: list[int] | None = None,
        device: Device | None = None,
    ):
        if pool_ratios is None:
            pool_ratios = [1, 2, 4]
        super().__init__(
            fl.Parallel(
                fl.Chain(  # local
                    fl.Slicing(dim=1, end=4),
                ),
                fl.Chain(  # global
                    fl.Slicing(dim=1, start=4),
                    fl.Squeeze(1),
                ),
            ),
            fl.Parallel(
                Multiply(
                    fl.GetArg(0),
                    fl.Chain(
                        fl.GetArg(1),
                        fl.Conv2d(emb_dim, 1, 1, device=device),
                        fl.Sigmoid(),
                        Interpolate((size * 2, size * 2), "nearest"),
                        PatchSplit(),
                    ),
                ),
                fl.GetArg(1),
            ),
            fl.Parallel(
                TiledCrossAttention(emb_dim, size, num_heads, pool_ratios, device=device),
                fl.GetArg(1),
            ),
            fl.Concatenate(
                fl.GetArg(0),
                fl.Chain(
                    fl.Sum(
                        fl.GetArg(1),
                        fl.Chain(
                            fl.GetArg(0),
                            PatchMerge(),
                            Interpolate((size, size), "nearest"),
                        ),
                    ),
                    fl.Unsqueeze(1),
                ),
                dim=1,
            ),
        )
--- a/src/refiners/foundationals/swin/mvanet/mvanet.py
+++ b/src/refiners/foundationals/swin/mvanet/mvanet.py
@ -0,0 +1,337 @@
 # Multi-View Aggregation Network (arXiv:2404.07445)
 from torch import device as Device
 import refiners.fluxion.layers as fl
 from refiners.fluxion.context import Contexts
 from refiners.foundationals.swin.swin_transformer import SwinTransformer
 from .mclm import MCLM  # Multi-View Complementary Localization
 from .mcrm import MCRM  # Multi-View Complementary Refinement
 from .utils import BatchNorm2d, Interpolate, PatchMerge, PatchSplit, PReLU, Rescale, Unflatten
 class CBG(fl.Chain):
    """(C)onvolution + (B)atchNorm + (G)eLU"""
    def __init__(
        self,
        in_dim: int,
        out_dim: int | None = None,
        device: Device | None = None,
    ):
        out_dim = out_dim or in_dim
        super().__init__(
            fl.Conv2d(in_dim, out_dim, kernel_size=3, padding=1, device=device),
            BatchNorm2d(out_dim, device=device),
            fl.GeLU(),
        )
 class CBR(fl.Chain):
    """(C)onvolution + (B)atchNorm + Parametric (R)eLU"""
    def __init__(
        self,
        in_dim: int,
        out_dim: int | None = None,
        device: Device | None = None,
    ):
        out_dim = out_dim or in_dim
        super().__init__(
            fl.Conv2d(in_dim, out_dim, kernel_size=3, padding=1, device=device),
            BatchNorm2d(out_dim, device=device),
            PReLU(device=device),
        )
 class SplitMultiView(fl.Chain):
    """
    Split a hd tensor into 5 ld views, (5 = 1 global + 4 tiles)
    See also the reverse Module [`RearrangeMultiView`][refiners.foundationals.swin.mvanet.RearrangeMultiView]
    Inputs:
        single_view (b, c, H, W)
    Outputs:
        multi_view (b, 5, c, H/2, W/2)
    """
    def __init__(self):
        super().__init__(
            fl.Concatenate(
                PatchSplit(),  # global features
                fl.Chain(  # local features
                    Rescale(scale_factor=0.5, mode="bilinear"),
                    fl.Unsqueeze(1),
                ),
                dim=1,
            )
        )
 class ShallowUpscaler(fl.Chain):
    """4x Upscaler reusing the image as input to upscale the feature
    See [[arXiv:2108.10257] SwinIR: Image Restoration Using Swin Transformer](https://arxiv.org/abs/2108.10257)
    Args:
        embedding_dim (int): the embedding dimension
    Inputs:
        feature (b, E, image_size/4, image_size/4)
    Output:
        upscaled tensor (b, E, image_size, image_size)
    """
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Sum(
                fl.Identity(),
                fl.Chain(
                    fl.UseContext("mvanet", "shallow"),
                    Interpolate((256, 256)),
                ),
            ),
            fl.Sum(
                fl.Chain(
                    Rescale(2),
                    CBG(embedding_dim, device=device),
                ),
                fl.Chain(
                    fl.UseContext("mvanet", "shallow"),
                    Interpolate((512, 512)),
                ),
            ),
            Rescale(2),
            CBG(embedding_dim, device=device),
        )
 class PyramidL5(fl.Chain):
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.GetArg(0),  # output5
            fl.Flatten(0, 1),
            CBR(1024, embedding_dim, device=device),
            Unflatten(0, (-1, 5)),
            MCLM(embedding_dim, device=device),
            fl.Flatten(0, 1),
            Interpolate((32, 32)),
        )
 class PyramidL4(fl.Chain):
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Sum(
                PyramidL5(embedding_dim=embedding_dim, device=device),
                fl.Chain(
                    fl.GetArg(1),
                    fl.Flatten(0, 1),
                    CBR(512, embedding_dim, device=device),  # output4
                    Unflatten(0, (-1, 5)),
                ),
            ),
            MCRM(embedding_dim, 32, device=device),  # dec_blk4
            fl.Flatten(0, 1),
            CBR(embedding_dim, device=device),  # conv4
            Interpolate((64, 64)),
        )
 class PyramidL3(fl.Chain):
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Sum(
                PyramidL4(embedding_dim=embedding_dim, device=device),
                fl.Chain(
                    fl.GetArg(2),
                    fl.Flatten(0, 1),
                    CBR(256, embedding_dim, device=device),  # output3
                    Unflatten(0, (-1, 5)),
                ),
            ),
            MCRM(embedding_dim, 64, device=device),  # dec_blk3
            fl.Flatten(0, 1),
            CBR(embedding_dim, device=device),  # conv3
            Interpolate((128, 128)),
        )
 class PyramidL2(fl.Chain):
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        embedding_dim = 128
        super().__init__(
            fl.Sum(
                PyramidL3(embedding_dim=embedding_dim, device=device),
                fl.Chain(
                    fl.GetArg(3),
                    fl.Flatten(0, 1),
                    CBR(128, embedding_dim, device=device),  # output2
                    Unflatten(0, (-1, 5)),
                ),
            ),
            MCRM(embedding_dim, 128, device=device),  # dec_blk2
            fl.Flatten(0, 1),
            CBR(embedding_dim, device=device),  # conv2
            Interpolate((128, 128)),
        )
 class Pyramid(fl.Chain):
    """
    Recursive Pyramidal Network calling MCLM and MCRM blocks
    It acts as a FPN (Feature Pyramid Network) Neck for MVANet
    see [[arXiv:1612.03144] Feature Pyramid Networks for Object Detection](https://arxiv.org/abs/1612.03144)
    Inputs:
        features: a pyramid of N = 5 tensors
            shapes are (b, 5, E_{0}, S_{0}, S_{0}), ..., (b, 5, E_{1}, S_{i}, S_{i}), ..., (b, 5, E_{N-1}, S_{N-1}, S_{N-1})
            with S_{i} = S_{i-1} or S_{i} = 2*S_{i-1} for 0 < i < N
    Outputs:
        output (b, 5, E, S_{N-1}, S_{N-1})
    """
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Sum(
                PyramidL2(embedding_dim=embedding_dim, device=device),
                fl.Chain(
                    fl.GetArg(4),
                    fl.Flatten(0, 1),
                    CBR(128, embedding_dim, device=device),  # output1
                    Unflatten(0, (-1, 5)),
                ),
            ),
            MCRM(embedding_dim, 128, device=device),  # dec_blk1
            fl.Flatten(0, 1),
            CBR(embedding_dim, device=device),  # conv1
            Unflatten(0, (-1, 5)),
        )
 class RearrangeMultiView(fl.Chain):
    """
    Inputs:
        multi_view (b, 5, E, H, W)
    Outputs:
        single_view (b, E, H*2, W*2)
    Fusion a multi view tensor into a single view tensor, using convolutions
    See also the reverse Module [`SplitMultiView`][refiners.foundationals.swin.mvanet.SplitMultiView]
    """
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Sum(
                fl.Chain(  # local features
                    fl.Slicing(dim=1, end=4),
                    PatchMerge(),
                ),
                fl.Chain(  # global feature
                    fl.Slicing(dim=1, start=4),
                    fl.Squeeze(1),
                    Interpolate((256, 256)),
                ),
            ),
            fl.Chain(  # conv head
                CBR(embedding_dim, 384, device=device),
                CBR(384, device=device),
                fl.Conv2d(384, embedding_dim, kernel_size=3, padding=1, device=device),
            ),
        )
 class ComputeShallow(fl.Passthrough):
    def __init__(
        self,
        embedding_dim: int = 128,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Conv2d(3, embedding_dim, kernel_size=3, padding=1, device=device),
            fl.SetContext("mvanet", "shallow"),
        )
 class MVANet(fl.Chain):
    """Multi-view Aggregation Network for Dichotomous Image Segmentation
    See [[arXiv:2404.07445] Multi-view Aggregation Network for Dichotomous Image Segmentation](https://arxiv.org/abs/2404.07445) for more details.
    Args:
        embedding_dim (int): embedding dimension
        n_logits (int): the number of output logits (default to 1)
            1 logit is used for alpha matting/foreground-background segmentation/sod segmentation
        depths (list[int]): see [`SwinTransformer`][refiners.foundationals.swin.swin_transformer.SwinTransformer]
        num_heads (list[int]): see [`SwinTransformer`][refiners.foundationals.swin.swin_transformer.SwinTransformer]
        window_size (int): default to 12, see [`SwinTransformer`][refiners.foundationals.swin.swin_transformer.SwinTransformer]
        device (Device | None): the device to use
    """
    def __init__(
        self,
        embedding_dim: int = 128,
        n_logits: int = 1,
        depths: list[int] | None = None,
        num_heads: list[int] | None = None,
        window_size: int = 12,
        device: Device | None = None,
    ):
        if depths is None:
            depths = [2, 2, 18, 2]
        if num_heads is None:
            num_heads = [4, 8, 16, 32]
        super().__init__(
            ComputeShallow(embedding_dim=embedding_dim, device=device),
            SplitMultiView(),
            fl.Flatten(0, 1),
            SwinTransformer(
                embedding_dim=embedding_dim,
                depths=depths,
                num_heads=num_heads,
                window_size=window_size,
                device=device,
            ),
            fl.Distribute(*(Unflatten(0, (-1, 5)) for _ in range(5))),
            Pyramid(embedding_dim=embedding_dim, device=device),
            RearrangeMultiView(embedding_dim=embedding_dim, device=device),
            ShallowUpscaler(embedding_dim, device=device),
            fl.Conv2d(embedding_dim, n_logits, kernel_size=3, padding=1, device=device),
        )
    def init_context(self) -> Contexts:
        return {"mvanet": {"shallow": None}}
--- a/src/refiners/foundationals/swin/mvanet/utils.py
+++ b/src/refiners/foundationals/swin/mvanet/utils.py
@ -0,0 +1,173 @@
 import torch
 from torch import Size, Tensor
 from torch.nn.functional import (
    adaptive_avg_pool2d,
    interpolate,  # type: ignore
 )
 import refiners.fluxion.layers as fl
 class Unflatten(fl.Module):
    def __init__(self, dim: int, sizes: tuple[int, ...]) -> None:
        super().__init__()
        self.dim = dim
        self.sizes = Size(sizes)
    def forward(self, x: Tensor) -> Tensor:
        return torch.unflatten(input=x, dim=self.dim, sizes=self.sizes)
 class Interpolate(fl.Module):
    def __init__(self, size: tuple[int, ...], mode: str = "bilinear"):
        super().__init__()
        self.size = Size(size)
        self.mode = mode
    def forward(self, x: Tensor) -> Tensor:
        return interpolate(x, size=self.size, mode=self.mode)  # type: ignore
 class Rescale(fl.Module):
    def __init__(self, scale_factor: float, mode: str = "nearest"):
        super().__init__()
        self.scale_factor = scale_factor
        self.mode = mode
    def forward(self, x: Tensor) -> Tensor:
        return interpolate(x, scale_factor=self.scale_factor, mode=self.mode)  # type: ignore
 class BatchNorm2d(torch.nn.BatchNorm2d, fl.WeightedModule):
    def __init__(self, num_features: int, device: torch.device | None = None):
        super().__init__(num_features=num_features, device=device)  # type: ignore
 class PReLU(torch.nn.PReLU, fl.WeightedModule, fl.Activation):
    def __init__(self, device: torch.device | None = None):
        super().__init__(device=device)  # type: ignore
 class PatchSplit(fl.Chain):
    """(B, N, H, W) -> B, 4, N, H/2, W/2"""
    def __init__(self):
        super().__init__(
            Unflatten(-2, (2, -1)),
            Unflatten(-1, (2, -1)),
            fl.Permute(0, 2, 4, 1, 3, 5),
            fl.Flatten(1, 2),
        )
 class PatchMerge(fl.Chain):
    """B, 4, N, H, W -> (B, N, 2*H, 2*W)"""
    def __init__(self):
        super().__init__(
            Unflatten(1, (2, 2)),
            fl.Permute(0, 3, 1, 4, 2, 5),
            fl.Flatten(-2, -1),
            fl.Flatten(-3, -2),
        )
 class FeedForward(fl.Residual):
    def __init__(self, emb_dim: int, device: torch.device | None = None) -> None:
        super().__init__(
            fl.Linear(in_features=emb_dim, out_features=2 * emb_dim, device=device),
            fl.ReLU(),
            fl.Linear(in_features=2 * emb_dim, out_features=emb_dim, device=device),
        )
 class _GetArgs(fl.Parallel):
    def __init__(self, n: int):
        super().__init__(
            fl.Chain(
                fl.GetArg(0),
                fl.Slicing(dim=0, start=n, end=n + 1),
                fl.Squeeze(0),
            ),
            fl.Chain(
                fl.GetArg(1),
                fl.Slicing(dim=0, start=n, end=n + 1),
                fl.Squeeze(0),
            ),
            fl.Chain(
                fl.GetArg(1),
                fl.Slicing(dim=0, start=n, end=n + 1),
                fl.Squeeze(0),
            ),
        )
 class MultiheadAttention(torch.nn.MultiheadAttention, fl.WeightedModule):
    def __init__(self, embedding_dim: int, num_heads: int, device: torch.device | None = None):
        super().__init__(embed_dim=embedding_dim, num_heads=num_heads, device=device)  # type: ignore
    @property
    def weight(self) -> Tensor:  # type: ignore
        return self.in_proj_weight
    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:  # type: ignore
        return super().forward(q, k, v)[0]
 class PatchwiseCrossAttention(fl.Chain):
    # Input is 2 tensors of sizes (4, HW, B, C) and (4, HW', B, C),
    # output is size (4, HW, B, C).
    def __init__(
        self,
        d_model: int,
        num_heads: int,
        device: torch.device | None = None,
    ):
        super().__init__(
            fl.Concatenate(
                fl.Chain(
                    _GetArgs(0),
                    MultiheadAttention(d_model, num_heads, device=device),
                ),
                fl.Chain(
                    _GetArgs(1),
                    MultiheadAttention(d_model, num_heads, device=device),
                ),
                fl.Chain(
                    _GetArgs(2),
                    MultiheadAttention(d_model, num_heads, device=device),
                ),
                fl.Chain(
                    _GetArgs(3),
                    MultiheadAttention(d_model, num_heads, device=device),
                ),
            ),
            Unflatten(0, (4, -1)),
        )
 class Pool(fl.Module):
    def __init__(self, ratio: int) -> None:
        super().__init__()
        self.ratio = ratio
    def forward(self, x: Tensor) -> Tensor:
        b, _, h, w = x.shape
        assert h % self.ratio == 0 and w % self.ratio == 0
        r = adaptive_avg_pool2d(x, (h // self.ratio, w // self.ratio))
        return torch.unflatten(r, 0, (b, -1))
 class MultiPool(fl.Concatenate):
    def __init__(self, pool_ratios: list[int]) -> None:
        super().__init__(
            *(
                fl.Chain(
                    Pool(pool_ratio),
                    fl.Flatten(-2, -1),
                    fl.Permute(0, 3, 1, 2),
                )
                for pool_ratio in pool_ratios
            ),
            dim=1,
        )
--- a/src/refiners/foundationals/swin/swin_transformer.py
+++ b/src/refiners/foundationals/swin/swin_transformer.py
@ -0,0 +1,391 @@
 # Swin Transformer (arXiv:2103.14030)
 #
 # Specific to MVANet, only supports square inputs.
 # Originally adapted from the version in MVANet and InSPyReNet (https://github.com/plemeri/InSPyReNet)
 # Original implementation by Microsoft at https://github.com/microsoft/Swin-Transformer
 import functools
 from math import isqrt
 import torch
 from torch import Tensor, device as Device
 import refiners.fluxion.layers as fl
 from refiners.fluxion.context import Contexts
 def to_windows(x: Tensor, window_size: int) -> Tensor:
    B, H, W, C = x.shape
    assert W == H and H % window_size == 0
    x = x.reshape(B, H // window_size, window_size, W // window_size, window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(B, -1, window_size * window_size, C)
 class ToWindows(fl.Module):
    def __init__(self, window_size: int):
        super().__init__()
        self.window_size = window_size
    def forward(self, x: Tensor) -> Tensor:
        return to_windows(x, self.window_size)
 class FromWindows(fl.Module):
    def forward(self, x: Tensor) -> Tensor:
        B, num_windows, window_size_2, C = x.shape
        window_size = isqrt(window_size_2)
        H = isqrt(num_windows * window_size_2)
        x = x.reshape(B, H // window_size, H // window_size, window_size, window_size, C)
        return x.permute(0, 1, 3, 2, 4, 5).reshape(B, H, H, C)
@functools.cache
 def get_attn_mask(H: int, window_size: int, device: Device | None = None) -> Tensor:
    assert H % window_size == 0
    shift_size = window_size // 2
    img_mask = torch.zeros((1, H, H, 1), device=device)
    h_slices = (
        slice(0, -window_size),
        slice(-window_size, -shift_size),
        slice(-shift_size, None),
    )
    w_slices = (
        slice(0, -window_size),
        slice(-window_size, -shift_size),
        slice(-shift_size, None),
    )
    cnt = 0
    for h in h_slices:
        for w in w_slices:
            img_mask[:, h, w, :] = cnt
            cnt += 1
    mask_windows = to_windows(img_mask, window_size).squeeze()  # B, nW, window_size * window_size, [1]
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    attn_mask.masked_fill_(attn_mask != 0, -100.0).masked_fill_(attn_mask == 0, 0.0)
    return attn_mask
 class Pad(fl.Module):
    def __init__(self, step: int):
        super().__init__()
        self.step = step
    def forward(self, x: Tensor) -> Tensor:
        B, H, W, C = x.shape
        assert W == H
        if H % self.step == 0:
            return x
        p = self.step * ((H + self.step - 1) // self.step)
        padded = torch.zeros(B, p, p, C, device=x.device, dtype=x.dtype)
        padded[:, :H, :H, :] = x
        return padded
 class StatefulPad(fl.Chain):
    def __init__(self, context: str, key: str, step: int) -> None:
        super().__init__(
            fl.SetContext(context=context, key=key, callback=self._push),
            Pad(step=step),
        )
    def _push(self, sizes: list[int], x: Tensor) -> None:
        sizes.append(x.size(1))
 class StatefulUnpad(fl.Chain):
    def __init__(self, context: str, key: str) -> None:
        super().__init__(
            fl.Parallel(
                fl.Identity(),
                fl.UseContext(context=context, key=key).compose(lambda x: x.pop()),
            ),
            fl.Lambda(self._unpad),
        )
    @staticmethod
    def _unpad(x: Tensor, size: int) -> Tensor:
        return x[:, :size, :size, :]
 class SquareUnflatten(fl.Module):
    # ..., L^2, ... -> ..., L, L, ...
    def __init__(self, dim: int = 0) -> None:
        super().__init__()
        self.dim = dim
    def forward(self, x: Tensor) -> Tensor:
        d = isqrt(x.shape[self.dim])
        return torch.unflatten(x, self.dim, (d, d))
 class WindowUnflatten(fl.Module):
    # ..., H, ... -> ..., H // ws, ws, ...
    def __init__(self, window_size: int, dim: int = 0) -> None:
        super().__init__()
        self.window_size = window_size
        self.dim = dim
    def forward(self, x: Tensor) -> Tensor:
        assert x.shape[self.dim] % self.window_size == 0
        H = x.shape[self.dim]
        return torch.unflatten(x, self.dim, (H // self.window_size, self.window_size))
 class Roll(fl.Module):
    def __init__(self, *shifts: tuple[int, int]):
        super().__init__()
        self.shifts = shifts
        self._dims = tuple(s[0] for s in shifts)
        self._shifts = tuple(s[1] for s in shifts)
    def forward(self, x: Tensor) -> Tensor:
        return torch.roll(x, self._shifts, self._dims)
 class RelativePositionBias(fl.Module):
    relative_position_index: Tensor
    def __init__(self, window_size: int, num_heads: int, device: Device | None = None):
        super().__init__()
        self.relative_position_bias_table = torch.nn.Parameter(
            torch.empty(
                (2 * window_size - 1) * (2 * window_size - 1),
                num_heads,
                device=device,
            )
        )
        relative_position_index = torch.empty(
            window_size**2,
            window_size**2,
            device=device,
            dtype=torch.int64,
        )
        self.register_buffer("relative_position_index", relative_position_index)
    def forward(self) -> Tensor:
        # Yes, this is a (trainable) constant.
        return self.relative_position_bias_table[self.relative_position_index].permute(2, 0, 1).unsqueeze(0)
 class WindowSDPA(fl.Module):
    def __init__(
        self,
        dim: int,
        window_size: int,
        num_heads: int,
        shift: bool = False,
        device: Device | None = None,
    ):
        super().__init__()
        self.window_size = window_size
        self.num_heads = num_heads
        self.shift = shift
        self.rpb = RelativePositionBias(window_size, num_heads, device=device)
    def forward(self, x: Tensor):
        B, num_windows, N, _C = x.shape
        assert _C % (3 * self.num_heads) == 0
        C = _C // 3
        x = torch.reshape(x, (B * num_windows, N, 3, self.num_heads, C // self.num_heads))
        q, k, v = x.permute(2, 0, 3, 1, 4)
        attn_mask = self.rpb()
        if self.shift:
            mask = get_attn_mask(isqrt(num_windows * (self.window_size**2)), self.window_size, x.device)
            mask = mask.reshape(1, num_windows, 1, N, N)
            mask = mask.expand(B, -1, self.num_heads, -1, -1)
            attn_mask = attn_mask + mask.reshape(-1, self.num_heads, N, N)
        x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask)
        x = x.transpose(1, 2).reshape(B, num_windows, N, C)
        return x
 class WindowAttention(fl.Chain):
    """
    Window-based Multi-head Self-Attenion (W-MSA), optionally shifted (SW-MSA).
    It has a trainable relative position bias (RelativePositionBias).
    The input projection is stored as a single Linear for q, k and v.
    """
    def __init__(
        self,
        dim: int,
        window_size: int,
        num_heads: int,
        shift: bool = False,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Linear(dim, dim * 3, bias=True, device=device),
            WindowSDPA(dim, window_size, num_heads, shift, device=device),
            fl.Linear(dim, dim, device=device),
        )
 class SwinTransformerBlock(fl.Chain):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        window_size: int = 7,
        shift_size: int = 0,
        mlp_ratio: float = 4.0,
        device: Device | None = None,
    ):
        assert 0 <= shift_size < window_size, "shift_size must in [0, window_size["
        super().__init__(
            fl.Residual(
                fl.LayerNorm(dim, device=device),
                SquareUnflatten(1),
                StatefulPad(context="padding", key="sizes", step=window_size),
                Roll((1, -shift_size), (2, -shift_size)),
                ToWindows(window_size),
                WindowAttention(
                    dim,
                    window_size=window_size,
                    num_heads=num_heads,
                    shift=shift_size > 0,
                    device=device,
                ),
                FromWindows(),
                Roll((1, shift_size), (2, shift_size)),
                StatefulUnpad(context="padding", key="sizes"),
                fl.Flatten(1, 2),
            ),
            fl.Residual(
                fl.LayerNorm(dim, device=device),
                fl.Linear(dim, int(dim * mlp_ratio), device=device),
                fl.GeLU(),
                fl.Linear(int(dim * mlp_ratio), dim, device=device),
            ),
        )
    def init_context(self) -> Contexts:
        return {"padding": {"sizes": []}}
 class PatchMerging(fl.Chain):
    def __init__(self, dim: int, device: Device | None = None):
        super().__init__(
            SquareUnflatten(1),
            Pad(2),
            WindowUnflatten(2, 2),
            WindowUnflatten(2, 1),
            fl.Permute(0, 1, 3, 4, 2, 5),
            fl.Flatten(3),
            fl.Flatten(1, 2),
            fl.LayerNorm(4 * dim, device=device),
            fl.Linear(4 * dim, 2 * dim, bias=False, device=device),
        )
 class BasicLayer(fl.Chain):
    def __init__(
        self,
        dim: int,
        depth: int,
        num_heads: int,
        window_size: int = 7,
        mlp_ratio: float = 4.0,
        device: Device | None = None,
    ):
        super().__init__(
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                device=device,
            )
            for i in range(depth)
        )
 class PatchEmbedding(fl.Chain):
    def __init__(
        self,
        patch_size: tuple[int, int] = (4, 4),
        in_chans: int = 3,
        embedding_dim: int = 96,
        device: Device | None = None,
    ):
        super().__init__(
            fl.Conv2d(in_chans, embedding_dim, kernel_size=patch_size, stride=patch_size, device=device),
            fl.Flatten(2),
            fl.Transpose(1, 2),
            fl.LayerNorm(embedding_dim, device=device),
        )
 class SwinTransformer(fl.Chain):
    """Swin Transformer (arXiv:2103.14030)
    Currently specific to MVANet, only supports square inputs.
    """
    def __init__(
        self,
        patch_size: tuple[int, int] = (4, 4),
        in_chans: int = 3,
        embedding_dim: int = 96,
        depths: list[int] | None = None,
        num_heads: list[int] | None = None,
        window_size: int = 7,  # image size is 32 * this
        mlp_ratio: float = 4.0,
        device: Device | None = None,
    ):
        if depths is None:
            depths = [2, 2, 6, 2]
        if num_heads is None:
            num_heads = [3, 6, 12, 24]
        self.num_layers = len(depths)
        assert len(num_heads) == self.num_layers
        super().__init__(
            PatchEmbedding(
                patch_size=patch_size,
                in_chans=in_chans,
                embedding_dim=embedding_dim,
                device=device,
            ),
            fl.Passthrough(
                fl.Transpose(1, 2),
                SquareUnflatten(2),
                fl.SetContext("swin", "outputs", callback=lambda t, x: t.append(x)),
            ),
            *(
                fl.Chain(
                    BasicLayer(
                        dim=int(embedding_dim * 2**i),
                        depth=depths[i],
                        num_heads=num_heads[i],
                        window_size=window_size,
                        mlp_ratio=mlp_ratio,
                        device=device,
                    ),
                    fl.Passthrough(
                        fl.LayerNorm(int(embedding_dim * 2**i), device=device),
                        fl.Transpose(1, 2),
                        SquareUnflatten(2),
                        fl.SetContext("swin", "outputs", callback=lambda t, x: t.insert(0, x)),
                    ),
                    PatchMerging(dim=int(embedding_dim * 2**i), device=device)
                    if i < self.num_layers - 1
                    else fl.UseContext("swin", "outputs").compose(lambda t: tuple(t)),
                )
                for i in range(self.num_layers)
            ),
        )
    def init_context(self) -> Contexts:
        return {"swin": {"outputs": []}}
--- a/tests/e2e/test_mvanet.py
+++ b/tests/e2e/test_mvanet.py
@ -0,0 +1,59 @@
 from pathlib import Path
 from warnings import warn
 import pytest
 import torch
 from PIL import Image
 from tests.utils import ensure_similar_images
 from refiners.fluxion.utils import image_to_tensor, no_grad, normalize, tensor_to_image
 from refiners.foundationals.swin.mvanet import MVANet
 def _img_open(path: Path) -> Image.Image:
    return Image.open(path)  # type: ignore
@pytest.fixture(scope="module")
 def ref_path(test_e2e_path: Path) -> Path:
    return test_e2e_path / "test_mvanet_ref"
@pytest.fixture(scope="module")
 def ref_cactus(ref_path: Path) -> Image.Image:
    return _img_open(ref_path / "cactus.png").convert("RGB")
@pytest.fixture
 def expected_cactus_mask(ref_path: Path) -> Image.Image:
    return _img_open(ref_path / "expected_cactus_mask.png")
@pytest.fixture(scope="module")
 def mvanet_weights(test_weights_path: Path) -> Path:
    weights = test_weights_path / "mvanet" / "mvanet.safetensors"
    if not weights.is_file():
        warn(f"could not find weights at {test_weights_path}, skipping")
        pytest.skip(allow_module_level=True)
    return weights
@pytest.fixture
 def mvanet_model(mvanet_weights: Path, test_device: torch.device) -> MVANet:
    model = MVANet(device=test_device).eval()  # .eval() is important!
    model.load_from_safetensors(mvanet_weights)
    return model
@no_grad()
 def test_mvanet(
    mvanet_model: MVANet,
    ref_cactus: Image.Image,
    expected_cactus_mask: Image.Image,
    test_device: torch.device,
 ):
    in_t = image_to_tensor(ref_cactus.resize((1024, 1024), Image.Resampling.BILINEAR)).squeeze()
    in_t = normalize(in_t, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]).unsqueeze(0)
    prediction: torch.Tensor = mvanet_model(in_t.to(test_device)).sigmoid()
    cactus_mask = tensor_to_image(prediction).resize(ref_cactus.size, Image.Resampling.BILINEAR)
    ensure_similar_images(cactus_mask.convert("RGB"), expected_cactus_mask.convert("RGB"))
--- a/tests/e2e/test_mvanet_ref/README.md
+++ b/tests/e2e/test_mvanet_ref/README.md
@ -0,0 +1,3 @@
 `cactus.png` is cropped from this image: https://www.freepik.com/free-photo/laptop-notebook-pen-coffee-cup-plants-wooden-desk_269339828.htm
 `expected_cactus_mask.png` has been generated using the [official MVANet codebase](https://github.com/qianyu-dlut/MVANet) and weights.
--- a/tests/e2e/test_mvanet_ref/cactus.png
+++ b/tests/e2e/test_mvanet_ref/cactus.png
--- a/tests/e2e/test_mvanet_ref/expected_cactus_mask.png
+++ b/tests/e2e/test_mvanet_ref/expected_cactus_mask.png
--- a/typings/gdown/init.pyi
+++ b/typings/gdown/init.pyi
@ -0,0 +1 @@
 def download(id: str, output: str, quiet: bool = False) -> str: ...
		`@ -0,0 +1,2 @@`
							`::: refiners.foundationals.swin.swin_transformer`
							`::: refiners.foundationals.swin.mvanet`
		`@ -0,0 +1,3 @@`
							`from .swin_transformer import SwinTransformer`

							`__all__ = ["SwinTransformer"]`
		`@ -0,0 +1,3 @@`
							`from .mvanet import MVANet`

							`__all__ = ["MVANet"]`
		`@ -0,0 +1,3 @@`
							`cactus.png` is cropped from this image: https://www.freepik.com/free-photo/laptop-notebook-pen-coffee-cup-plants-wooden-desk_269339828.htm

							`expected_cactus_mask.png` has been generated using the [official MVANet codebase](https://github.com/qianyu-dlut/MVANet) and weights.
		`@ -0,0 +1 @@`
							`def download(id: str, output: str, quiet: bool = False) -> str: ...`