# refiners/scripts/training/finetune-ldm-textual-inversion.py

import random
from typing import Any

from loguru import logger
from pydantic import BaseModel
from torch import Tensor, randn
from torch.utils.data import Dataset

from refiners.fluxion.utils import save_to_safetensors
from refiners.foundationals.clip.concepts import ConceptExtender, EmbeddingExtender
from refiners.foundationals.clip.text_encoder import CLIPTextEncoder, TokenEncoder
from refiners.foundationals.clip.tokenizer import CLIPTokenizer
from refiners.training_utils.callback import Callback
from refiners.training_utils.huggingface_datasets import HuggingfaceDatasetConfig
from refiners.training_utils.latent_diffusion import (
    FinetuneLatentDiffusionConfig,
    LatentDiffusionConfig,
    LatentDiffusionTrainer,
    TextEmbeddingLatentsBatch,
    TextEmbeddingLatentsDataset,
)

# Caption templates for learning an object concept. Each entry has a single
# `{}` slot that TextualInversionDataset.get_caption fills with the placeholder
# token. This list is selected when `textual_inversion.style_mode` is False.
IMAGENET_TEMPLATES_SMALL = [
    "a photo of a {}",
    "a rendering of a {}",
    "a cropped photo of the {}",
    "the photo of a {}",
    "a photo of a clean {}",
    "a photo of a dirty {}",
    "a dark photo of the {}",
    "a photo of my {}",
    "a photo of the cool {}",
    "a close-up photo of a {}",
    "a bright photo of the {}",
    "a cropped photo of a {}",
    "a photo of the {}",
    "a good photo of the {}",
    "a photo of one {}",
    "a close-up photo of the {}",
    "a rendition of the {}",
    "a photo of the clean {}",
    "a rendition of a {}",
    "a photo of a nice {}",
    "a good photo of a {}",
    "a photo of the nice {}",
    "a photo of the small {}",
    "a photo of the weird {}",
    "a photo of the large {}",
    "a photo of a cool {}",
    "a photo of a small {}",
]

# Caption templates for learning a style concept. Each entry has a single `{}`
# slot that TextualInversionDataset.get_caption fills with the placeholder
# token. This list is selected when `textual_inversion.style_mode` is True.
# NOTE(review): "a cropped painting ..." and "a close-up painting ..." each
# appear twice; because captions are drawn with random.choice, those two are
# sampled twice as often as the others — confirm this weighting is intended.
IMAGENET_STYLE_TEMPLATES_SMALL = [
    "a painting in the style of {}",
    "a rendering in the style of {}",
    "a cropped painting in the style of {}",
    "the painting in the style of {}",
    "a clean painting in the style of {}",
    "a dirty painting in the style of {}",
    "a dark painting in the style of {}",
    "a picture in the style of {}",
    "a cool painting in the style of {}",
    "a close-up painting in the style of {}",
    "a bright painting in the style of {}",
    "a cropped painting in the style of {}",
    "a good painting in the style of {}",
    "a close-up painting in the style of {}",
    "a rendition in the style of {}",
    "a nice painting in the style of {}",
    "a small painting in the style of {}",
    "a weird painting in the style of {}",
    "a large painting in the style of {}",
]


class TextualInversionDataset(TextEmbeddingLatentsDataset):
    """Dataset whose captions are prompt templates filled with the placeholder token.

    Captions are generated from a template list rather than taken from the
    underlying dataset.
    """

    # Class-level defaults; both are replaced on every instance by __init__.
    templates: list[str] = []
    placeholder_token: str = ""

    def __init__(self, trainer: "LatentDiffusionTrainer[Any]") -> None:
        """Select the template list and placeholder token from the config.

        Args:
            trainer: Passed through unchanged to the parent constructor.
        """
        super().__init__(trainer)
        settings = self.config.textual_inversion
        # Style mode uses the "in the style of" prompts; otherwise the
        # object-oriented "a photo of" prompts are used.
        if settings.style_mode:
            self.templates = IMAGENET_STYLE_TEMPLATES_SMALL
        else:
            self.templates = IMAGENET_TEMPLATES_SMALL
        self.placeholder_token = settings.placeholder_token

    def get_caption(self, index: int) -> str:
        """Return a randomly chosen template with the placeholder token filled in.

        Args:
            index: Unused; the dataset caption, if any, is ignored in favour
                of a template.
        """
        chosen_template = random.choice(self.templates)
        return chosen_template.format(self.placeholder_token)


class TextualInversionConfig(BaseModel):
    """Options controlling how the new concept token is added to the text encoder."""

    # The new token to be learned
    placeholder_token: str = "*"
    # The token to be used as initializer; if None, a random vector is used
    initializer_token: str | None = None
    # When True, TextualInversionDataset draws captions from the style templates
    style_mode: bool = False

    def apply_textual_inversion_to_target(self, text_encoder: CLIPTextEncoder) -> None:
        """Register ``placeholder_token`` as a new concept on ``text_encoder``.

        A ConceptExtender adapter is built around the text encoder, the
        placeholder token is added with an initial embedding, and the adapter
        is injected. The initial embedding is the token encoder's output for
        ``initializer_token`` when one is configured, otherwise a random
        normal vector with ``embedding_dim`` entries.

        Args:
            text_encoder: The CLIP text encoder to extend in place.

        Raises:
            AssertionError: If the byte-pair encoding of ``initializer_token``
                contains a space, i.e. it is not a single token.
        """
        adapter = ConceptExtender(target=text_encoder)
        tokenizer = text_encoder.ensure_find(CLIPTokenizer)
        # Looked up once: both initialisation strategies below need it.
        token_encoder = text_encoder.ensure_find(TokenEncoder)
        if self.initializer_token is not None:
            bpe = tokenizer.byte_pair_encoding(token=self.initializer_token)
            # NOTE(review): presumably a multi-token encoding is space-separated — confirm
            # against CLIPTokenizer.byte_pair_encoding.
            assert " " not in bpe, "This initializer_token is not a single token."
            token = Tensor([tokenizer.token_to_id_mapping[bpe]]).int().to(text_encoder.device)
            # Drop the leading dimension introduced by encoding a one-element id tensor.
            init_embedding = token_encoder(token).squeeze(0)
        else:
            init_embedding = randn(token_encoder.embedding_dim)
        adapter.add_concept(self.placeholder_token, init_embedding)
        adapter.inject()


class TextualInversionLatentDiffusionConfig(FinetuneLatentDiffusionConfig):
    """Fine-tuning configuration extended with a ``textual_inversion`` section."""

    dataset: HuggingfaceDatasetConfig
    latent_diffusion: LatentDiffusionConfig
    textual_inversion: TextualInversionConfig

    def model_post_init(self, __context: Any) -> None:
        """Mark the unet, text encoder and lda model entries as not trainable.

        Pydantic v2 does post init differently, so we need to override this method too.
        """
        logger.info("Freezing models to train only the new embedding.")
        # Same order as before: unet, then text_encoder, then lda.
        for model_name in ("unet", "text_encoder", "lda"):
            self.models[model_name].train = False


class TextualInversionLatentDiffusionTrainer(LatentDiffusionTrainer[TextualInversionLatentDiffusionConfig]):
    """Latent diffusion trainer wired up for textual inversion.

    Adds the callbacks that load and save the learned embedding, and serves a
    TextualInversionDataset as the training dataset.
    """

    def __init__(
        self,
        config: TextualInversionLatentDiffusionConfig,
        callbacks: "list[Callback[Any]] | None" = None,
    ) -> None:
        """Initialise the parent trainer, then append the textual inversion callbacks.

        Args:
            config: Training configuration, forwarded to the parent trainer.
            callbacks: Optional extra callbacks, forwarded to the parent trainer.
        """
        super().__init__(config=config, callbacks=callbacks)
        # Both callbacks are built first, then appended after whatever the
        # parent constructor registered.
        textual_inversion_callbacks = [LoadTextualInversion(), SaveTextualInversion()]
        self.callbacks.extend(textual_inversion_callbacks)

    def load_dataset(self) -> Dataset[TextEmbeddingLatentsBatch]:
        """Return a TextualInversionDataset bound to this trainer."""
        dataset = TextualInversionDataset(trainer=self)
        return dataset


class LoadTextualInversion(Callback[TextualInversionLatentDiffusionTrainer]):
    """Callback that adds the placeholder concept to the text encoder when training begins."""

    def on_train_begin(self, trainer: TextualInversionLatentDiffusionTrainer) -> None:
        """Apply the configured textual inversion to the trainer's text encoder."""
        settings = trainer.config.textual_inversion
        settings.apply_textual_inversion_to_target(text_encoder=trainer.text_encoder)


class SaveTextualInversion(Callback[TextualInversionLatentDiffusionTrainer]):
    """Callback that writes the learned embedding to a safetensors file at each checkpoint."""

    def on_checkpoint_save(self, trainer: TextualInversionLatentDiffusionTrainer) -> None:
        """Save the extender's new weight under the placeholder token's name.

        The file is written to the trainer's checkpoint folder as
        ``step<N>.safetensors``, where ``N`` is the current clock step.
        """
        extender = trainer.text_encoder.ensure_find(EmbeddingExtender)
        token_name = trainer.config.textual_inversion.placeholder_token
        # squeeze(0) removes the leading dimension when it has size 1.
        learned_embedding = extender.new_weight.squeeze(0)

        checkpoint_file = trainer.ensure_checkpoints_save_folder / f"step{trainer.clock.step}.safetensors"
        save_to_safetensors(path=checkpoint_file, tensors={token_name: learned_embedding})


if __name__ == "__main__":
    import sys

    # The script takes the TOML config path as its first positional argument.
    # Exit with a usage message instead of an IndexError traceback when it is
    # missing; sys.exit with a string prints it to stderr and exits with code 1.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <config.toml>")

    config_path = sys.argv[1]
    config = TextualInversionLatentDiffusionConfig.load_from_toml(toml_path=config_path)
    trainer = TextualInversionLatentDiffusionTrainer(config=config)
    trainer.train()
run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00			`import random`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`from typing import Any`
run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`from loguru import logger`
run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00			`from pydantic import BaseModel`
			`from torch import Tensor, randn`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`from torch.utils.data import Dataset`

run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00			`from refiners.fluxion.utils import save_to_safetensors`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`from refiners.foundationals.clip.concepts import ConceptExtender, EmbeddingExtender`
			`from refiners.foundationals.clip.text_encoder import CLIPTextEncoder, TokenEncoder`
			`from refiners.foundationals.clip.tokenizer import CLIPTokenizer`
			`from refiners.training_utils.callback import Callback`
remove huggingface datasets from default config 2023-12-15 16:12:54 +00:00			`from refiners.training_utils.huggingface_datasets import HuggingfaceDatasetConfig`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`from refiners.training_utils.latent_diffusion import (`
			`FinetuneLatentDiffusionConfig,`
			`LatentDiffusionConfig,`
run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00			`LatentDiffusionTrainer,`
			`TextEmbeddingLatentsBatch,`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`TextEmbeddingLatentsDataset,`
			`)`

			`IMAGENET_TEMPLATES_SMALL = [`
			`"a photo of a {}",`
			`"a rendering of a {}",`
			`"a cropped photo of the {}",`
			`"the photo of a {}",`
			`"a photo of a clean {}",`
			`"a photo of a dirty {}",`
			`"a dark photo of the {}",`
			`"a photo of my {}",`
			`"a photo of the cool {}",`
			`"a close-up photo of a {}",`
			`"a bright photo of the {}",`
			`"a cropped photo of a {}",`
			`"a photo of the {}",`
			`"a good photo of the {}",`
			`"a photo of one {}",`
			`"a close-up photo of the {}",`
			`"a rendition of the {}",`
			`"a photo of the clean {}",`
			`"a rendition of a {}",`
			`"a photo of a nice {}",`
			`"a good photo of a {}",`
			`"a photo of the nice {}",`
			`"a photo of the small {}",`
			`"a photo of the weird {}",`
			`"a photo of the large {}",`
			`"a photo of a cool {}",`
			`"a photo of a small {}",`
			`]`

			`IMAGENET_STYLE_TEMPLATES_SMALL = [`
			`"a painting in the style of {}",`
			`"a rendering in the style of {}",`
			`"a cropped painting in the style of {}",`
			`"the painting in the style of {}",`
			`"a clean painting in the style of {}",`
			`"a dirty painting in the style of {}",`
			`"a dark painting in the style of {}",`
			`"a picture in the style of {}",`
			`"a cool painting in the style of {}",`
			`"a close-up painting in the style of {}",`
			`"a bright painting in the style of {}",`
			`"a cropped painting in the style of {}",`
			`"a good painting in the style of {}",`
			`"a close-up painting in the style of {}",`
			`"a rendition in the style of {}",`
			`"a nice painting in the style of {}",`
			`"a small painting in the style of {}",`
			`"a weird painting in the style of {}",`
			`"a large painting in the style of {}",`
			`]`


			`class TextualInversionDataset(TextEmbeddingLatentsDataset):`
			`templates: list[str] = []`
			`placeholder_token: str = ""`

			`def __init__(self, trainer: "LatentDiffusionTrainer[Any]") -> None:`
			`super().__init__(trainer)`
			`self.templates = (`
			`IMAGENET_STYLE_TEMPLATES_SMALL if self.config.textual_inversion.style_mode else IMAGENET_TEMPLATES_SMALL`
			`)`
			`self.placeholder_token = self.config.textual_inversion.placeholder_token`

			`def get_caption(self, index: int) -> str:`
			`# Ignore the dataset caption, if any: use a template instead`
			`return random.choice(self.templates).format(self.placeholder_token)`


			`class TextualInversionConfig(BaseModel):`
			`# The new token to be learned`
			`placeholder_token: str = "*"`
			`# The token to be used as initializer; if None, a random vector is used`
			`initializer_token: str \| None = None`
			`style_mode: bool = False`

			`def apply_textual_inversion_to_target(self, text_encoder: CLIPTextEncoder) -> None:`
			`adapter = ConceptExtender(target=text_encoder)`
add ensure_find and ensure_find_parent helpers 2023-09-12 09:50:56 +00:00			`tokenizer = text_encoder.ensure_find(CLIPTokenizer)`
			`token_encoder = text_encoder.ensure_find(TokenEncoder)`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`if self.initializer_token is not None:`
			`bpe = tokenizer.byte_pair_encoding(token=self.initializer_token)`
			`assert " " not in bpe, "This initializer_token is not a single token."`
			`token = Tensor([tokenizer.token_to_id_mapping[bpe]]).int().to(text_encoder.device)`
			`init_embedding = token_encoder(token).squeeze(0)`
			`else:`
add ensure_find and ensure_find_parent helpers 2023-09-12 09:50:56 +00:00			`token_encoder = text_encoder.ensure_find(TokenEncoder)`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`init_embedding = randn(token_encoder.embedding_dim)`
			`adapter.add_concept(self.placeholder_token, init_embedding)`
			`adapter.inject()`


			`class TextualInversionLatentDiffusionConfig(FinetuneLatentDiffusionConfig):`
remove huggingface datasets from default config 2023-12-15 16:12:54 +00:00			`dataset: HuggingfaceDatasetConfig`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`latent_diffusion: LatentDiffusionConfig`
			`textual_inversion: TextualInversionConfig`

			`def model_post_init(self, __context: Any) -> None:`
			`# Pydantic v2 does post init differently, so we need to override this method too.`
			`logger.info("Freezing models to train only the new embedding.")`
			`self.models["unet"].train = False`
			`self.models["text_encoder"].train = False`
			`self.models["lda"].train = False`


			`class TextualInversionLatentDiffusionTrainer(LatentDiffusionTrainer[TextualInversionLatentDiffusionConfig]):`
			`def __init__(`
			`self,`
			`config: TextualInversionLatentDiffusionConfig,`
			`callbacks: "list[Callback[Any]] \| None" = None,`
			`) -> None:`
			`super().__init__(config=config, callbacks=callbacks)`
			`self.callbacks.extend((LoadTextualInversion(), SaveTextualInversion()))`

			`def load_dataset(self) -> Dataset[TextEmbeddingLatentsBatch]:`
			`return TextualInversionDataset(trainer=self)`


			`class LoadTextualInversion(Callback[TextualInversionLatentDiffusionTrainer]):`
			`def on_train_begin(self, trainer: TextualInversionLatentDiffusionTrainer) -> None:`
			`trainer.config.textual_inversion.apply_textual_inversion_to_target(text_encoder=trainer.text_encoder)`


			`class SaveTextualInversion(Callback[TextualInversionLatentDiffusionTrainer]):`
			`def on_checkpoint_save(self, trainer: TextualInversionLatentDiffusionTrainer) -> None:`
add ensure_find and ensure_find_parent helpers 2023-09-12 09:50:56 +00:00			`embedding_extender = trainer.text_encoder.ensure_find(EmbeddingExtender)`
Add concepts learning via textual inversion 2023-08-31 14:05:01 +00:00			`tensors = {trainer.config.textual_inversion.placeholder_token: embedding_extender.new_weight.squeeze(0)}`

			`save_to_safetensors(`
			`path=trainer.ensure_checkpoints_save_folder / f"step{trainer.clock.step}.safetensors", tensors=tensors`
			`)`


			`if __name__ == "__main__":`
			`import sys`

			`config_path = sys.argv[1]`
			`config = TextualInversionLatentDiffusionConfig.load_from_toml(toml_path=config_path)`
			`trainer = TextualInversionLatentDiffusionTrainer(config=config)`
			`trainer.train()`