refiners/scripts/conversion/convert_transformers_clip_text_model.py

import argparse
from pathlib import Path
from typing import NamedTuple, cast

from torch import nn
from transformers import CLIPTextModel, CLIPTextModelWithProjection  # type: ignore

import refiners.fluxion.layers as fl
from refiners.fluxion.model_converter import ModelConverter
from refiners.fluxion.utils import save_to_safetensors
from refiners.foundationals.clip.text_encoder import CLIPTextEncoder, CLIPTextEncoderG, CLIPTextEncoderL
from refiners.foundationals.clip.tokenizer import CLIPTokenizer
from refiners.foundationals.latent_diffusion.stable_diffusion_xl.text_encoder import DoubleTextEncoder


class Args(argparse.Namespace):
    source_path: str
    subfolder: str
    output_path: str | None
    half: bool
    verbose: bool


class CLIPTextEncoderConfig(NamedTuple):
    """Typing helper describing the fields read from the source model's `config`.

    `setup_converter` casts `source.config` to this type so that attribute
    accesses are typed; no instance is built by this script.
    """

    # Architecture names; the first entry selects whether a projection layer is added.
    architectures: list[str]
    # Not read by this script.
    vocab_size: int
    # Used as the encoder `embedding_dim` and as the projection input size.
    hidden_size: int
    # Used as the encoder `feedforward_dim`.
    intermediate_size: int
    # Used as the encoder `num_layers`.
    num_hidden_layers: int
    # Used as the encoder `num_attention_heads`.
    num_attention_heads: int
    # Compared against "quick_gelu" to set `use_quick_gelu`.
    hidden_act: str
    # Not read by this script.
    layer_norm_eps: float
    # Used as the projection output size.
    projection_dim: int


def setup_converter(args: Args, with_projection: bool = False) -> ModelConverter:
    """Load the source CLIP text model and run a `ModelConverter` against a refiners encoder.

    Args:
        args: Parsed command-line options; `source_path`, `subfolder` and `verbose` are read.
        with_projection: Load the source with `CLIPTextModelWithProjection` instead of `CLIPTextModel`.

    Returns:
        The converter, after a successful `run`.

    Raises:
        RuntimeError: If `converter.run` reports failure.
    """
    model_class = CLIPTextModelWithProjection if with_projection else CLIPTextModel
    # low_cpu_mem_usage=False silences some annoying console messages telling us to `pip install accelerate`
    source: nn.Module = model_class.from_pretrained(  # type: ignore
        pretrained_model_name_or_path=args.source_path,
        subfolder=args.subfolder,
        low_cpu_mem_usage=False,
    )
    assert isinstance(source, nn.Module), "Source model is not a nn.Module"

    hf_config = cast(CLIPTextEncoderConfig, source.config)  # pyright: ignore[reportArgumentType, reportUnknownMemberType]
    architecture: str = hf_config.architectures[0]
    hidden_size: int = hf_config.hidden_size
    projection_size: int = hf_config.projection_dim
    quick_gelu = hf_config.hidden_act == "quick_gelu"
    supported_architectures = ("CLIPTextModel", "CLIPTextModelWithProjection")
    assert architecture in supported_architectures, f"Unsupported architecture: {architecture}"

    target = CLIPTextEncoder(
        embedding_dim=hidden_size,
        num_layers=hf_config.num_hidden_layers,
        num_attention_heads=hf_config.num_attention_heads,
        feedforward_dim=hf_config.intermediate_size,
        use_quick_gelu=quick_gelu,
    )
    # The projection is added based on the architecture declared in the config, not on `with_projection`.
    if architecture == "CLIPTextModelWithProjection":
        projection_layer = fl.Linear(in_features=hidden_size, out_features=projection_size, bias=False)
        target.append(module=projection_layer)

    # The source model is fed token ids while the target receives the raw prompt.
    prompt = "What a nice cat you have there!"
    tokens = target.ensure_find(CLIPTokenizer)(prompt)

    converter = ModelConverter(source_model=source, target_model=target, skip_output_check=True, verbose=args.verbose)
    succeeded = converter.run(source_args=(tokens,), target_args=(prompt,))
    if not succeeded:
        raise RuntimeError("Model conversion failed")
    return converter


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this conversion script."""
    parser = argparse.ArgumentParser(
        description="Converts a CLIPTextEncoder from the library transformers from the HuggingFace Hub to refiners."
    )
    parser.add_argument(
        "--from",
        type=str,
        dest="source_path",
        default="runwayml/stable-diffusion-v1-5",
        help=(
            "Can be a path to a .bin file, a .safetensors file or a model name from the HuggingFace Hub. Default:"
            " runwayml/stable-diffusion-v1-5"
        ),
    )
    parser.add_argument(
        "--subfolder",
        type=str,
        dest="subfolder",
        default="text_encoder",
        help=(
            "Subfolder in the source path where the model is located inside the Hub. Default: text_encoder (for"
            " CLIPTextModel)"
        ),
    )
    parser.add_argument(
        "--subfolder2",
        type=str,
        dest="subfolder2",
        default=None,
        help="Additional subfolder for the 2nd text encoder (useful for SDXL). Default: None",
    )
    parser.add_argument(
        "--to",
        type=str,
        dest="output_path",
        default=None,
        help=(
            "Output path (.safetensors) for converted model. If not provided, defaults to"
            " <source name>-<subfolder>.safetensors in the current working directory."
        ),
    )
    parser.add_argument("--half", action="store_true", help="Convert to half precision.")
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Prints additional information during conversion. Default: False",
    )
    return parser


def _build_double_text_encoder(converter: ModelConverter, converter2: ModelConverter) -> DoubleTextEncoder:
    """Assemble a `DoubleTextEncoder` from the converted 1st and 2nd text encoders."""
    text_encoder_l = CLIPTextEncoderL()
    text_encoder_l.load_state_dict(state_dict=converter.get_state_dict())

    # The last layer of the 2nd converted model is expected to be the projection.
    projection = cast(CLIPTextEncoder, converter2.target_model)[-1]
    assert isinstance(projection, fl.Linear)

    # The projection is appended only for loading, presumably so that the encoder's
    # keys match the converted state dict; it is popped again right after.
    text_encoder_g_with_projection = CLIPTextEncoderG()
    text_encoder_g_with_projection.append(module=projection)
    text_encoder_g_with_projection.load_state_dict(state_dict=converter2.get_state_dict())

    projection = text_encoder_g_with_projection.pop(index=-1)
    assert isinstance(projection, fl.Linear)
    return DoubleTextEncoder(
        text_encoder_l=text_encoder_l, text_encoder_g=text_encoder_g_with_projection, projection=projection
    )


def main() -> None:
    """Parse the command line, convert the text encoder(s) and save them as safetensors.

    With only `--subfolder`, the single converted encoder is saved. When
    `--subfolder2` is also given, both encoders are converted and saved together
    as the state dict of a `DoubleTextEncoder`.
    """
    args = _build_parser().parse_args(namespace=Args())
    if args.output_path is None:
        # The default name is derived from the first subfolder, before it may be overwritten below.
        args.output_path = f"{Path(args.source_path).stem}-{args.subfolder}.safetensors"

    converter = setup_converter(args=args)
    if args.subfolder2 is None:
        converter.save_to_safetensors(path=args.output_path, half=args.half)
        return

    # Assume this is the second text encoder of Stable Diffusion XL
    args.subfolder = args.subfolder2
    converter2 = setup_converter(args=args, with_projection=True)
    double_text_encoder = _build_double_text_encoder(converter, converter2)

    state_dict = double_text_encoder.state_dict()
    if args.half:
        state_dict = {key: value.half() for key, value in state_dict.items()}
    save_to_safetensors(path=args.output_path, tensors=state_dict)


# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`import argparse`
			`from pathlib import Path`
fix typing issues coming from torch 2.4 version ; typing is not guaranteed for torch < 2.4 2024-08-02 09:56:37 +00:00			`from typing import NamedTuple, cast`
run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`from torch import nn`
pick the right class for CLIP text converter i.e. CLIPTextModel by default or CLIPTextModelWithProjection for SDXL so-called text_encoder_2 This silent false positive warnings like: Some weights of CLIPTextModelWithProjection were not initialized from the model checkpoint [...] 2024-01-17 16:15:46 +00:00			`from transformers import CLIPTextModel, CLIPTextModelWithProjection # type: ignore`
run lint rules using latest isort settings 2023-12-11 10:46:38 +00:00
			`import refiners.fluxion.layers as fl`
			`from refiners.fluxion.model_converter import ModelConverter`
			`from refiners.fluxion.utils import save_to_safetensors`
			`from refiners.foundationals.clip.text_encoder import CLIPTextEncoder, CLIPTextEncoderG, CLIPTextEncoderL`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`from refiners.foundationals.clip.tokenizer import CLIPTokenizer`
make clip text converter support SDXL i.e. convert the 2nd text encoder and save the final double text encoder 2023-09-11 14:02:22 +00:00			`from refiners.foundationals.latent_diffusion.stable_diffusion_xl.text_encoder import DoubleTextEncoder`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00

			`class Args(argparse.Namespace):`
			`source_path: str`
			`subfolder: str`
			`output_path: str \| None`
scripts: add converter for clip image encoder Tested with: python scripts/conversion/convert_transformers_clip_image_model.py \ \ --from /path/to/stabilityai/stable-diffusion-2-1-unclip 2023-08-30 08:05:31 +00:00			`half: bool`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`verbose: bool`


fix typing issues coming from torch 2.4 version ; typing is not guaranteed for torch < 2.4 2024-08-02 09:56:37 +00:00			`class CLIPTextEncoderConfig(NamedTuple):`
			`architectures: list[str]`
			`vocab_size: int`
			`hidden_size: int`
			`intermediate_size: int`
			`num_hidden_layers: int`
			`num_attention_heads: int`
			`hidden_act: str`
			`layer_norm_eps: float`
			`projection_dim: int`


pick the right class for CLIP text converter i.e. CLIPTextModel by default or CLIPTextModelWithProjection for SDXL so-called text_encoder_2 This silent false positive warnings like: Some weights of CLIPTextModelWithProjection were not initialized from the model checkpoint [...] 2024-01-17 16:15:46 +00:00			`def setup_converter(args: Args, with_projection: bool = False) -> ModelConverter:`
refactor: convert bash script to python Ran successfully to completion. But on a repeat run `convert_unclip` didn't pass the hash check for some reason. - fix inpainting model download urls - shows a progress bar for downloads - skips downloading existing files - uses a temporary file to prevent partial downloads - can do a dry run to check if url is valid `DRY_RUN=1 python scripts/prepare_test_weights.py` - displays the downloaded file hash 2023-12-14 03:53:06 +00:00			# low_cpu_mem_usage=False stops some annoying console messages us to `pip install accelerate`
pick the right class for CLIP text converter i.e. CLIPTextModel by default or CLIPTextModelWithProjection for SDXL so-called text_encoder_2 This silent false positive warnings like: Some weights of CLIPTextModelWithProjection were not initialized from the model checkpoint [...] 2024-01-17 16:15:46 +00:00			`cls = CLIPTextModelWithProjection if with_projection else CLIPTextModel`
			`source: nn.Module = cls.from_pretrained( # type: ignore`
refactor: convert bash script to python Ran successfully to completion. But on a repeat run `convert_unclip` didn't pass the hash check for some reason. - fix inpainting model download urls - shows a progress bar for downloads - skips downloading existing files - uses a temporary file to prevent partial downloads - can do a dry run to check if url is valid `DRY_RUN=1 python scripts/prepare_test_weights.py` - displays the downloaded file hash 2023-12-14 03:53:06 +00:00			`pretrained_model_name_or_path=args.source_path,`
			`subfolder=args.subfolder,`
			`low_cpu_mem_usage=False,`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`)`
			`assert isinstance(source, nn.Module), "Source model is not a nn.Module"`
fix typing issues coming from torch 2.4 version ; typing is not guaranteed for torch < 2.4 2024-08-02 09:56:37 +00:00			`config = cast(CLIPTextEncoderConfig, source.config) # pyright: ignore[reportArgumentType, reportUnknownMemberType]`
			`architecture: str = config.architectures[0]`
			`embedding_dim: int = config.hidden_size`
			`projection_dim: int = config.projection_dim`
			`use_quick_gelu = config.hidden_act == "quick_gelu"`
pick the right class for CLIP text converter i.e. CLIPTextModel by default or CLIPTextModelWithProjection for SDXL so-called text_encoder_2 This silent false positive warnings like: Some weights of CLIPTextModelWithProjection were not initialized from the model checkpoint [...] 2024-01-17 16:15:46 +00:00			`assert architecture in ("CLIPTextModel", "CLIPTextModelWithProjection"), f"Unsupported architecture: {architecture}"`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`target = CLIPTextEncoder(`
fix typing issues coming from torch 2.4 version ; typing is not guaranteed for torch < 2.4 2024-08-02 09:56:37 +00:00			`embedding_dim=config.hidden_size,`
			`num_layers=config.num_hidden_layers,`
			`num_attention_heads=config.num_attention_heads,`
			`feedforward_dim=config.intermediate_size,`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`use_quick_gelu=use_quick_gelu,`
			`)`
pick the right class for CLIP text converter i.e. CLIPTextModel by default or CLIPTextModelWithProjection for SDXL so-called text_encoder_2 This silent false positive warnings like: Some weights of CLIPTextModelWithProjection were not initialized from the model checkpoint [...] 2024-01-17 16:15:46 +00:00			`if architecture == "CLIPTextModelWithProjection":`
			`target.append(module=fl.Linear(in_features=embedding_dim, out_features=projection_dim, bias=False))`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`text = "What a nice cat you have there!"`
add ensure_find and ensure_find_parent helpers 2023-09-12 09:50:56 +00:00			`tokenizer = target.ensure_find(CLIPTokenizer)`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`tokens = tokenizer(text)`
			`converter = ModelConverter(source_model=source, target_model=target, skip_output_check=True, verbose=args.verbose)`
			`if not converter.run(source_args=(tokens,), target_args=(text,)):`
			`raise RuntimeError("Model conversion failed")`
			`return converter`


			`def main() -> None:`
			`parser = argparse.ArgumentParser(`
			`description="Converts a CLIPTextEncoder from the library transformers from the HuggingFace Hub to refiners."`
			`)`
			`parser.add_argument(`
			`"--from",`
			`type=str,`
			`dest="source_path",`
			`default="runwayml/stable-diffusion-v1-5",`
			`help=(`
			`"Can be a path to a .bin file, a .safetensors file or a model name from the HuggingFace Hub. Default:"`
			`" runwayml/stable-diffusion-v1-5"`
			`),`
			`)`
			`parser.add_argument(`
			`"--subfolder",`
			`type=str,`
			`dest="subfolder",`
			`default="text_encoder",`
			`help=(`
			`"Subfolder in the source path where the model is located inside the Hub. Default: text_encoder (for"`
			`" CLIPTextModel)"`
			`),`
			`)`
make clip text converter support SDXL i.e. convert the 2nd text encoder and save the final double text encoder 2023-09-11 14:02:22 +00:00			`parser.add_argument(`
			`"--subfolder2",`
			`type=str,`
			`dest="subfolder2",`
			`default=None,`
			`help="Additional subfolder for the 2nd text encoder (useful for SDXL). Default: None",`
			`)`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`parser.add_argument(`
			`"--to",`
			`type=str,`
			`dest="output_path",`
			`default=None,`
			`help=(`
			`"Output path (.safetensors) for converted model. If not provided, the output path will be the same as the"`
			`" source path."`
			`),`
			`)`
fix invalid default value for --half in help 2024-01-16 14:15:52 +00:00			`parser.add_argument("--half", action="store_true", help="Convert to half precision.")`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00			`parser.add_argument(`
			`"--verbose",`
			`action="store_true",`
			`default=False,`
			`help="Prints additional information during conversion. Default: False",`
			`)`
			`args = parser.parse_args(namespace=Args())`
			`if args.output_path is None:`
			`args.output_path = f"{Path(args.source_path).stem}-{args.subfolder}.safetensors"`
			`converter = setup_converter(args=args)`
make clip text converter support SDXL i.e. convert the 2nd text encoder and save the final double text encoder 2023-09-11 14:02:22 +00:00			`if args.subfolder2 is not None:`
			`# Assume this is the second text encoder of Stable Diffusion XL`
			`args.subfolder = args.subfolder2`
pick the right class for CLIP text converter i.e. CLIPTextModel by default or CLIPTextModelWithProjection for SDXL so-called text_encoder_2 This silent false positive warnings like: Some weights of CLIPTextModelWithProjection were not initialized from the model checkpoint [...] 2024-01-17 16:15:46 +00:00			`converter2 = setup_converter(args=args, with_projection=True)`
make clip text converter support SDXL i.e. convert the 2nd text encoder and save the final double text encoder 2023-09-11 14:02:22 +00:00
			`text_encoder_l = CLIPTextEncoderL()`
			`text_encoder_l.load_state_dict(state_dict=converter.get_state_dict())`

			`projection = cast(CLIPTextEncoder, converter2.target_model)[-1]`
			`assert isinstance(projection, fl.Linear)`
			`text_encoder_g_with_projection = CLIPTextEncoderG()`
			`text_encoder_g_with_projection.append(module=projection)`
			`text_encoder_g_with_projection.load_state_dict(state_dict=converter2.get_state_dict())`

			`projection = text_encoder_g_with_projection.pop(index=-1)`
			`assert isinstance(projection, fl.Linear)`
			`double_text_encoder = DoubleTextEncoder(`
			`text_encoder_l=text_encoder_l, text_encoder_g=text_encoder_g_with_projection, projection=projection`
			`)`

			`state_dict = double_text_encoder.state_dict()`
			`if args.half:`
			`state_dict = {key: value.half() for key, value in state_dict.items()}`
			`save_to_safetensors(path=args.output_path, tensors=state_dict)`
			`else:`
			`converter.save_to_safetensors(path=args.output_path, half=args.half)`
implement the ConvertModule class and refactor conversion scripts 2023-08-24 00:26:37 +00:00

			`if __name__ == "__main__":`
			`main()`