implement abstract MultiDiffusion class
parent e319f13d05
commit b86521da2f
@@ -0,0 +1,98 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Generic, TypeVar

import torch
from PIL import Image
from torch import Tensor, device as Device, dtype as DType

from refiners.foundationals.latent_diffusion.model import LatentDiffusionModel

MAX_STEPS = 1000


@dataclass
class DiffusionTarget:
    # A rectangular view into the full latent canvas, with its own prompt
    # embedding, guidance scale, optional initial latents and soft mask.
    size: tuple[int, int]
    offset: tuple[int, int]
    clip_text_embedding: Tensor
    init_latents: Tensor | None = None
    mask_latent: Tensor | None = None
    weight: int = 1
    condition_scale: float = 7.5
    start_step: int = 0
    end_step: int = MAX_STEPS

    def crop(self, tensor: Tensor, /) -> Tensor:
        # Extract this target's window from a (N, C, H, W) latent tensor.
        height, width = self.size
        top_offset, left_offset = self.offset
        return tensor[:, :, top_offset : top_offset + height, left_offset : left_offset + width]

    def paste(self, tensor: Tensor, /, crop: Tensor) -> Tensor:
        # Write `crop` back into this target's window, in place.
        height, width = self.size
        top_offset, left_offset = self.offset
        tensor[:, :, top_offset : top_offset + height, left_offset : left_offset + width] = crop
        return tensor


T = TypeVar("T", bound=LatentDiffusionModel)
D = TypeVar("D", bound=DiffusionTarget)


@dataclass
class MultiDiffusion(Generic[T, D], ABC):
    ldm: T

    def __call__(self, x: Tensor, /, noise: Tensor, step: int, targets: list[D]) -> Tensor:
        # Run one denoising step on each target's view, then fuse the views
        # back into the canvas as a weighted average of overlapping updates.
        num_updates = torch.zeros_like(input=x)
        cumulative_values = torch.zeros_like(input=x)

        for target in targets:
            match step:
                case step if step == target.start_step and target.init_latents is not None:
                    # First step for this target: noise its initial latents.
                    noise_view = target.crop(noise)
                    view = self.ldm.scheduler.add_noise(
                        x=target.init_latents,
                        noise=noise_view,
                        step=step,
                    )
                case step if target.start_step <= step <= target.end_step:
                    view = target.crop(x)
                case _:
                    # Target is inactive at this step.
                    continue
            view = self.diffuse_target(x=view, step=step, target=target)
            weight = target.weight * target.mask_latent if target.mask_latent is not None else target.weight
            num_updates = target.paste(num_updates, crop=target.crop(num_updates) + weight)
            cumulative_values = target.paste(cumulative_values, crop=target.crop(cumulative_values) + weight * view)

        # Regions covered by no target keep their previous value.
        return torch.where(condition=num_updates > 0, input=cumulative_values / num_updates, other=x)

    @abstractmethod
    def diffuse_target(self, x: Tensor, step: int, target: D) -> Tensor: ...

    @property
    def steps(self) -> list[int]:
        return self.ldm.steps

    @property
    def device(self) -> Device:
        return self.ldm.device

    @property
    def dtype(self) -> DType:
        return self.ldm.dtype

    def decode_latents(self, x: Tensor) -> Image.Image:
        return self.ldm.lda.decode_latents(x=x)

    @staticmethod
    def generate_offset_grid(size: tuple[int, int], stride: int = 8) -> list[tuple[int, int]]:
        # Enumerate top-left offsets, `stride` latent pixels apart, at which
        # a 64x64 latent tile fits entirely inside the canvas.
        height, width = size

        return [
            (y, x)
            for y in range(0, height, stride)
            for x in range(0, width, stride)
            if y + 64 <= height and x + 64 <= width
        ]
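
For context, a minimal sketch of what a concrete subclass can look like. It assumes the wrapped LatentDiffusionModel is callable with a latent view, the current step, and the target's clip_text_embedding and condition_scale; the subclass name is illustrative, not part of this commit:

# Hypothetical concrete subclass: a single denoising step on a view is
# delegated to the wrapped latent diffusion model.
class TiledMultiDiffusion(MultiDiffusion[LatentDiffusionModel, DiffusionTarget]):
    def diffuse_target(self, x: Tensor, step: int, target: DiffusionTarget) -> Tensor:
        return self.ldm(
            x,
            step=step,
            clip_text_embedding=target.clip_text_embedding,
            condition_scale=target.condition_scale,
        )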
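Likewise, a rough driving loop built on generate_offset_grid for panorama-style tiling. Everything below is illustrative rather than part of the commit: `ldm` and `text_embedding` are assumed to come from an already-configured refiners pipeline, the 4-channel latent shape matches Stable Diffusion's latent space, and the 64x64 tile size mirrors the bound hard-coded in generate_offset_grid:

# Illustrative usage (assumes `ldm` and `text_embedding` already exist).
md = TiledMultiDiffusion(ldm=ldm)

canvas = (64, 192)  # latent height x width, i.e. a 512x1536 pixel panorama
targets = [
    DiffusionTarget(
        size=(64, 64),  # matches the 64x64 tile bound in generate_offset_grid
        offset=offset,
        clip_text_embedding=text_embedding,
    )
    for offset in MultiDiffusion.generate_offset_grid(size=canvas, stride=16)
]

x = torch.randn(1, 4, *canvas, device=md.device, dtype=md.dtype)
noise = x.clone()  # only consumed by targets that carry init_latents
for step in md.steps:
    x = md(x, noise=noise, step=step, targets=targets)
image = md.decode_latents(x)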