write StyleAligned e2e test

2024-11-21 13:48:46 +00:00 · 2024-02-15 14:11:11 +00:00 · 2024-02-15 14:11:11 +00:00 · da3c3602fb
parent 60c0780fe7
commit da3c3602fb
3 changed files with 83 additions and 0 deletions
--- a/tests/e2e/test_diffusion.py
+++ b/tests/e2e/test_diffusion.py
@ -30,6 +30,7 @@ from refiners.foundationals.latent_diffusion.solvers import DDIM, Euler, NoiseSc
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.multi_diffusion import SD1MultiDiffusion
 from refiners.foundationals.latent_diffusion.stable_diffusion_xl.control_lora import ControlLoraAdapter
 from refiners.foundationals.latent_diffusion.stable_diffusion_xl.model import StableDiffusion_XL
+from refiners.foundationals.latent_diffusion.style_aligned import StyleAlignedAdapter
 from tests.utils import ensure_similar_images


@ -150,6 +151,11 @@ def expected_sdxl_euler_random_init(ref_path: Path) -> Image.Image:
    return Image.open(ref_path / "expected_cutecat_sdxl_euler_random_init.png").convert("RGB")


+@pytest.fixture
+def expected_style_aligned(ref_path: Path) -> Image.Image:
+    return Image.open(fp=ref_path / "expected_style_aligned.png").convert(mode="RGB")
+
+
@pytest.fixture(scope="module", params=["canny", "depth", "lineart", "normals", "sam"])
 def controlnet_data(
    ref_path: Path, test_weights_path: Path, request: pytest.FixtureRequest
@ -2140,3 +2146,79 @@ def test_hello_world(
    predicted_image = sdxl.lda.latents_to_image(x)

    ensure_similar_images(predicted_image, expected_image)
+
+
+@no_grad()
+def test_style_aligned(
+    sdxl_ddim_lda_fp16_fix: StableDiffusion_XL,
+    expected_style_aligned: Image.Image,
+):
+    sdxl = sdxl_ddim_lda_fp16_fix.to(dtype=torch.float16)
+    sdxl.dtype = torch.float16  # FIXME: should not be necessary
+
+    style_aligned_adapter = StyleAlignedAdapter(sdxl.unet)
+    style_aligned_adapter.inject()
+
+    set_of_prompts = [
+        "a toy train. macro photo. 3d game asset",
+        "a toy airplane. macro photo. 3d game asset",
+        "a toy bicycle. macro photo. 3d game asset",
+        "a toy car. macro photo. 3d game asset",
+        "a toy boat. macro photo. 3d game asset",
+    ]
+
+    # create (context) embeddings from prompts
+    # TODO: replace this logic with https://github.com/finegrain-ai/refiners/pull/263 when it gets merged
+    unconds: list[torch.Tensor] = []
+    conds: list[torch.Tensor] = []
+    pooled_unconds: list[torch.Tensor] = []
+    pooled_conds: list[torch.Tensor] = []
+    for prompt in set_of_prompts:
+        clip_text_embedding, pooled_text_embedding = sdxl.compute_clip_text_embedding(text=prompt)
+
+        uncond, cond = clip_text_embedding.chunk(2)
+        pooled_uncond, pooled_cond = pooled_text_embedding.chunk(2)
+
+        unconds.append(uncond)
+        conds.append(cond)
+        pooled_unconds.append(pooled_uncond)
+        pooled_conds.append(pooled_cond)
+
+    uncond = torch.cat(unconds, dim=0)
+    cond = torch.cat(conds, dim=0)
+    pooled_uncond = torch.cat(pooled_unconds, dim=0)
+    pooled_cond = torch.cat(pooled_conds, dim=0)
+
+    clip_text_embedding = torch.cat((uncond, cond), dim=0)
+    pooled_text_embedding = torch.cat((pooled_uncond, pooled_cond), dim=0)
+
+    time_ids = sdxl.default_time_ids.repeat(len(set_of_prompts), 1)
+
+    # initialize latents
+    manual_seed(seed=2)
+    x = torch.randn(
+        (len(set_of_prompts), 4, 128, 128),
+        device=sdxl.device,
+        dtype=sdxl.dtype,
+    )
+
+    # denoise
+    for step in sdxl.steps:
+        x = sdxl(
+            x,
+            step=step,
+            clip_text_embedding=clip_text_embedding,
+            pooled_text_embedding=pooled_text_embedding,
+            time_ids=time_ids,
+        )
+
+    # decode latents
+    predicted_images = [sdxl.lda.decode_latents(latent.unsqueeze(0)) for latent in x]
+
+    # tile all images horizontally
+    merged_image = Image.new("RGB", (1024 * len(predicted_images), 1024))
+    for i in range(len(predicted_images)):
+        merged_image.paste(predicted_images[i], (i * 1024, 0))
+
+    # compare against reference image
+    ensure_similar_images(merged_image, expected_style_aligned, min_psnr=35, min_ssim=0.99)
--- a/tests/e2e/test_diffusion_ref/README.md
+++ b/tests/e2e/test_diffusion_ref/README.md
@ -56,6 +56,7 @@ Special cases:
    - `expected_controllora_PyraCanny.png`
    - `expected_controllora_PyraCanny+CPDS.png`
    - `expected_controllora_disabled.png`
+    - `expected_style_aligned.png`

 ## Other images

--- a/tests/e2e/test_diffusion_ref/expected_style_aligned.png
+++ b/tests/e2e/test_diffusion_ref/expected_style_aligned.png