From 78e9f7728e37241da3a22bb1f27180d99bd7e245 Mon Sep 17 00:00:00 2001
From: Laurent
Date: Fri, 2 Feb 2024 12:30:41 +0000
Subject: [PATCH] (doc/fluxion/ld) add `StableDiffusion_1` docstrings

---
 .../stable_diffusion_1/__init__.py |  2 +
 .../stable_diffusion_1/model.py    | 85 +++++++++++++++++++
 .../stable_diffusion_xl/model.py   |  4 +-
 3 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/__init__.py b/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/__init__.py
index 4064f4b..f8cd4c7 100644
--- a/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/__init__.py
+++ b/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/__init__.py
@@ -1,6 +1,7 @@
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.controlnet import SD1ControlnetAdapter
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.image_prompt import SD1IPAdapter
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.model import (
+    SD1Autoencoder,
     StableDiffusion_1,
     StableDiffusion_1_Inpainting,
 )
@@ -10,6 +11,7 @@ from refiners.foundationals.latent_diffusion.stable_diffusion_1.unet import SD1U
 __all__ = [
     "StableDiffusion_1",
     "StableDiffusion_1_Inpainting",
+    "SD1Autoencoder",
     "SD1UNet",
     "SD1ControlnetAdapter",
     "SD1IPAdapter",
diff --git a/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/model.py b/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/model.py
index a4c312d..d42252e 100644
--- a/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/model.py
+++ b/src/refiners/foundationals/latent_diffusion/stable_diffusion_1/model.py
@@ -13,10 +13,24 @@ from refiners.foundationals.latent_diffusion.stable_diffusion_1.unet import SD1U
 
 
 class SD1Autoencoder(LatentDiffusionAutoencoder):
+    """Stable Diffusion 1.5 autoencoder model.
+
+    Attributes:
+        encoder_scale: The scale factor applied to latents produced by the encoder (0.18215 for SD 1.5).
+    """
+
     encoder_scale: float = 0.18215
 
 
 class StableDiffusion_1(LatentDiffusionModel):
+    """Stable Diffusion 1.5 model.
+
+    Attributes:
+        unet: The U-Net model.
+        clip_text_encoder: The text encoder.
+        lda: The image autoencoder.
+    """
+
     unet: SD1UNet
     clip_text_encoder: CLIPTextEncoderL
     lda: SD1Autoencoder
@@ -30,6 +44,16 @@
         device: Device | str = "cpu",
         dtype: DType = torch.float32,
     ) -> None:
+        """Initialize the model.
+
+        Args:
+            unet: The SD1UNet U-Net model to use.
+            lda: The SD1Autoencoder image autoencoder to use.
+            clip_text_encoder: The CLIPTextEncoderL text encoder to use.
+            solver: The solver to use.
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         unet = unet or SD1UNet(in_channels=4)
         lda = lda or SD1Autoencoder()
         clip_text_encoder = clip_text_encoder or CLIPTextEncoderL()
@@ -45,6 +69,13 @@
     )
 
     def compute_clip_text_embedding(self, text: str, negative_text: str = "") -> Tensor:
+        """Compute the CLIP text embedding associated with the given prompt and negative prompt.
+
+        Args:
+            text: The prompt to compute the CLIP text embedding of.
+            negative_text: The negative prompt to compute the CLIP text embedding of.
+                If not provided, the negative prompt is assumed to be empty (i.e., `""`).
+ """ conditional_embedding = self.clip_text_encoder(text) if text == negative_text: return torch.cat(tensors=(conditional_embedding, conditional_embedding), dim=0) @@ -53,10 +84,22 @@ class StableDiffusion_1(LatentDiffusionModel): return torch.cat(tensors=(negative_embedding, conditional_embedding), dim=0) def set_unet_context(self, *, timestep: Tensor, clip_text_embedding: Tensor, **_: Tensor) -> None: + """Set the various context parameters required by the U-Net model. + + Args: + timestep: The timestep tensor to use. + clip_text_embedding: The CLIP text embedding tensor to use. + """ self.unet.set_timestep(timestep=timestep) self.unet.set_clip_text_embedding(clip_text_embedding=clip_text_embedding) def set_self_attention_guidance(self, enable: bool, scale: float = 1.0) -> None: + """Set whether to enable self-attention guidance. + + Args: + enable: Whether to enable self-attention guidance. + scale: The scale to use. + """ if enable: if sag := self._find_sag_adapter(): sag.scale = scale @@ -67,9 +110,11 @@ class StableDiffusion_1(LatentDiffusionModel): sag.eject() def has_self_attention_guidance(self) -> bool: + """Whether the model has self-attention guidance or not.""" return self._find_sag_adapter() is not None def _find_sag_adapter(self) -> SD1SAGAdapter | None: + """Finds the self-attention guidance adapter.""" for p in self.unet.get_parents(): if isinstance(p, SD1SAGAdapter): return p @@ -78,6 +123,17 @@ class StableDiffusion_1(LatentDiffusionModel): def compute_self_attention_guidance( self, x: Tensor, noise: Tensor, step: int, *, clip_text_embedding: Tensor, **kwargs: Tensor ) -> Tensor: + """Compute the self-attention guidance. + + Args: + x: The input tensor. + noise: The noise tensor. + step: The step to compute the self-attention guidance at. + clip_text_embedding: The CLIP text embedding to compute the self-attention guidance with. + + Returns: + The computed self-attention guidance. + """ sag = self._find_sag_adapter() assert sag is not None @@ -106,6 +162,14 @@ class StableDiffusion_1(LatentDiffusionModel): class StableDiffusion_1_Inpainting(StableDiffusion_1): + """Stable Diffusion 1.5 inpainting model. + + Attributes: + unet: The U-Net model. + clip_text_encoder: The text encoder. + lda: The image autoencoder. + """ + def __init__( self, unet: SD1UNet | None = None, @@ -140,6 +204,16 @@ class StableDiffusion_1_Inpainting(StableDiffusion_1): mask: Image.Image, latents_size: tuple[int, int] = (64, 64), ) -> tuple[Tensor, Tensor]: + """Set the inpainting conditions. + + Args: + target_image: The target image to inpaint. + mask: The mask to use for inpainting. + latents_size: The size of the latents to use. + + Returns: + The mask latents and the target image latents. + """ target_image = target_image.convert(mode="RGB") mask = mask.convert(mode="L") @@ -156,6 +230,17 @@ class StableDiffusion_1_Inpainting(StableDiffusion_1): def compute_self_attention_guidance( self, x: Tensor, noise: Tensor, step: int, *, clip_text_embedding: Tensor, **kwargs: Tensor ) -> Tensor: + """Compute the self-attention guidance. + + Args: + x: The input tensor. + noise: The noise tensor. + step: The step to compute the self-attention guidance at. + clip_text_embedding: The CLIP text embedding to compute the self-attention guidance with. + + Returns: + The computed self-attention guidance. 
+ """ sag = self._find_sag_adapter() assert sag is not None assert self.mask_latents is not None diff --git a/src/refiners/foundationals/latent_diffusion/stable_diffusion_xl/model.py b/src/refiners/foundationals/latent_diffusion/stable_diffusion_xl/model.py index b3891ea..27fe55a 100644 --- a/src/refiners/foundationals/latent_diffusion/stable_diffusion_xl/model.py +++ b/src/refiners/foundationals/latent_diffusion/stable_diffusion_xl/model.py @@ -25,7 +25,7 @@ class StableDiffusion_XL(LatentDiffusionModel): Attributes: unet: The U-Net model. clip_text_encoder: The text encoder. - lda (SDXLAutoencoder): The image autoencoder. + lda: The image autoencoder. """ unet: SDXLUNet @@ -103,7 +103,7 @@ class StableDiffusion_XL(LatentDiffusionModel): time_ids: Tensor, **_: Tensor, ) -> None: - """Sets the various context parameters required by the U-Net model. + """Set the various context parameters required by the U-Net model. Args: timestep: The timestep to set.