(doc/foundationals) add CLIP, related docstrings

Laurent 2024-02-02 13:27:47 +00:00 committed by Laureηt
parent 8befede3cf
commit 7bc5ce35d2
3 changed files with 155 additions and 21 deletions

View file

@@ -0,0 +1,21 @@
from refiners.foundationals.clip.image_encoder import (
    CLIPImageEncoder,
    CLIPImageEncoderG,
    CLIPImageEncoderH,
)
from refiners.foundationals.clip.text_encoder import (
    CLIPTextEncoder,
    CLIPTextEncoderG,
    CLIPTextEncoderH,
    CLIPTextEncoderL,
)

__all__ = [
    "CLIPTextEncoder",
    "CLIPTextEncoderL",
    "CLIPTextEncoderH",
    "CLIPTextEncoderG",
    "CLIPImageEncoder",
    "CLIPImageEncoderG",
    "CLIPImageEncoderH",
]
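For reference, a minimal usage sketch of the new package-level re-exports added above (weights are randomly initialized here; loading pretrained checkpoints is out of scope for this commit):

import torch

from refiners.foundationals.clip import CLIPImageEncoderH, CLIPTextEncoderL

# Instantiate the encoders via the new package-root re-exports;
# device and dtype are the constructor arguments documented in this commit.
text_encoder = CLIPTextEncoderL(device="cpu", dtype=torch.float32)
image_encoder = CLIPImageEncoderH(device="cpu", dtype=torch.float32)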

View file

@@ -108,6 +108,12 @@ class ViTEmbeddings(fl.Chain):
class CLIPImageEncoder(fl.Chain):
    """Contrastive Language-Image Pretraining (CLIP) image encoder.

    See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
    for more details.
    """

    def __init__(
        self,
        image_size: int = 224,
@@ -121,6 +127,20 @@ class CLIPImageEncoder(fl.Chain):
        device: Device | str | None = None,
        dtype: DType | None = None,
    ) -> None:
        """Initialize a CLIP image encoder.

        Args:
            image_size: The size of the input image.
            embedding_dim: The dimension of the embedding.
            output_dim: The dimension of the output.
            patch_size: The size of the patches.
            num_layers: The number of layers.
            num_attention_heads: The number of attention heads.
            feedforward_dim: The dimension of the feedforward layer.
            layer_norm_eps: The epsilon value for normalization.
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        self.image_size = image_size
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
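To illustrate the constructor arguments documented above, a hedged sketch of a custom configuration (the values are purely illustrative and match no released CLIP checkpoint):

from refiners.foundationals.clip import CLIPImageEncoder

# Illustrative, non-pretrained configuration exercising the documented arguments.
custom_image_encoder = CLIPImageEncoder(
    image_size=224,
    embedding_dim=768,
    output_dim=512,
    patch_size=16,
    num_layers=12,
    num_attention_heads=12,
    feedforward_dim=3072,
    device="cpu",
)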
@ -152,7 +172,27 @@ class CLIPImageEncoder(fl.Chain):
class CLIPImageEncoderH(CLIPImageEncoder): class CLIPImageEncoderH(CLIPImageEncoder):
"""CLIP huge image encoder.
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.
Attributes:
embedding_dim (int): 1280
output_dim (int): 1024
patch_size (int): 14
num_layers (int): 32
num_attention_heads (int): 16
feedforward_dim (int): 5120
"""
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None: def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP huge image encoder.
Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__( super().__init__(
embedding_dim=1280, embedding_dim=1280,
output_dim=1024, output_dim=1024,
@ -166,7 +206,27 @@ class CLIPImageEncoderH(CLIPImageEncoder):
class CLIPImageEncoderG(CLIPImageEncoder): class CLIPImageEncoderG(CLIPImageEncoder):
"""CLIP giant image encoder.
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.
Attributes:
embedding_dim (int): 1664
output_dim (int): 1280
patch_size (int): 14
num_layers (int): 48
num_attention_heads (int): 16
feedforward_dim (int): 8192
"""
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None: def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP giant image encoder.
Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__( super().__init__(
embedding_dim=1664, embedding_dim=1664,
output_dim=1280, output_dim=1280,
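A hedged end-to-end sketch for the image encoders documented in this file. The input layout (a preprocessed (batch, 3, image_size, image_size) float tensor) and the relation of the output to output_dim are assumptions inferred from the documented attributes, not guaranteed by this diff:

import torch

from refiners.foundationals.clip import CLIPImageEncoderH

image_encoder = CLIPImageEncoderH(device="cpu")
# Dummy, already-preprocessed image batch; real usage needs CLIP-style resizing and normalization.
dummy_images = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    image_embedding = image_encoder(dummy_images)
# Assumption: the embedding size relates to output_dim=1024 for the huge variant.
print(image_embedding.shape)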

View file

@@ -71,6 +71,12 @@ class TransformerLayer(fl.Chain):
class CLIPTextEncoder(fl.Chain):
    """Contrastive Language-Image Pretraining (CLIP) text encoder.

    See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
    for more details.
    """

    def __init__(
        self,
        embedding_dim: int = 768,
@@ -85,6 +91,21 @@ class CLIPTextEncoder(fl.Chain):
        device: Device | str | None = None,
        dtype: DType | None = None,
    ) -> None:
        """Initialize CLIP text encoder.

        Args:
            embedding_dim: The embedding dimension.
            max_sequence_length: The maximum sequence length.
            vocabulary_size: The vocabulary size.
            num_layers: The number of layers.
            num_attention_heads: The number of attention heads.
            feedforward_dim: The feedforward dimension.
            layer_norm_eps: The epsilon value for layer normalization.
            use_quick_gelu: Whether to use the quick GeLU activation function.
            tokenizer: The tokenizer.
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        self.vocabulary_size = vocabulary_size
@@ -129,19 +150,30 @@ class CLIPTextEncoder(fl.Chain):
class CLIPTextEncoderL(CLIPTextEncoder):
    """CLIP large text encoder.

    Note:
        We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
        of OpenAI (https://github.com/openai/CLIP/blob/main/clip/model.py#L166)

    See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
    for more details.

    Attributes:
        embedding_dim (int): 768
        num_layers (int): 12
        num_attention_heads (int): 12
        feedforward_dim (int): 3072
        use_quick_gelu (bool): True
    """

    def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
        """Initialize CLIP large text encoder.

        Args:
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        super().__init__(
            embedding_dim=768,
            num_layers=12,
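A hedged usage sketch for the large text encoder; treating the chain as accepting a raw prompt string is an assumption based on the tokenizer being part of the model, and the output shape note follows from the documented embedding_dim=768:

from refiners.foundationals.clip import CLIPTextEncoderL

text_encoder = CLIPTextEncoderL(device="cpu")
# Assumption: the chain tokenizes internally, so it takes the prompt string directly.
text_embedding = text_encoder("a photo of a cat")
# Assumption: the last dimension relates to embedding_dim=768.
print(text_embedding.shape)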
@@ -154,15 +186,25 @@ class CLIPTextEncoderL(CLIPTextEncoder):
class CLIPTextEncoderH(CLIPTextEncoder):
    """CLIP huge text encoder.

    See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
    for more details.

    Attributes:
        embedding_dim (int): 1024
        num_layers (int): 23
        num_attention_heads (int): 16
        feedforward_dim (int): 4096
    """

    def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
        """Initialize CLIP huge text encoder.

        Args:
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        super().__init__(
            embedding_dim=1024,
            num_layers=23,
@@ -174,15 +216,26 @@ class CLIPTextEncoderH(CLIPTextEncoder):
class CLIPTextEncoderG(CLIPTextEncoder):
    """CLIP giant text encoder.

    See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
    for more details.

    Attributes:
        embedding_dim (int): 1280
        num_layers (int): 32
        num_attention_heads (int): 20
        feedforward_dim (int): 5120
        tokenizer (CLIPTokenizer): CLIPTokenizer(pad_token_id=0)
    """

    def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
        """Initialize CLIP giant text encoder.

        Args:
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        tokenizer = CLIPTokenizer(pad_token_id=0)
        super().__init__(
            embedding_dim=1280,
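Finally, a hedged sketch for the giant variant, which (as shown above) wires in its own CLIPTokenizer with pad_token_id=0 instead of the default tokenizer used by the other variants:

from refiners.foundationals.clip import CLIPTextEncoderG

text_encoder_g = CLIPTextEncoderG(device="cpu")
# Assumption, as with the other variants: the prompt string is tokenized inside the chain,
# here with the pad_token_id=0 tokenizer configured above.
text_embedding = text_encoder_g("a photo of a cat")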