mirror of
https://github.com/finegrain-ai/refiners.git
synced 2024-11-24 07:08:45 +00:00
(doc/foundationals) add CLIP, related docstrings
This commit is contained in:
parent
3910845e29
commit
a926696141
|
@ -0,0 +1,21 @@
|
||||||
|
from refiners.foundationals.clip.image_encoder import (
|
||||||
|
CLIPImageEncoder,
|
||||||
|
CLIPImageEncoderG,
|
||||||
|
CLIPImageEncoderH,
|
||||||
|
)
|
||||||
|
from refiners.foundationals.clip.text_encoder import (
|
||||||
|
CLIPTextEncoder,
|
||||||
|
CLIPTextEncoderG,
|
||||||
|
CLIPTextEncoderH,
|
||||||
|
CLIPTextEncoderL,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"CLIPTextEncoder",
|
||||||
|
"CLIPTextEncoderL",
|
||||||
|
"CLIPTextEncoderH",
|
||||||
|
"CLIPTextEncoderG",
|
||||||
|
"CLIPImageEncoder",
|
||||||
|
"CLIPImageEncoderG",
|
||||||
|
"CLIPImageEncoderH",
|
||||||
|
]
|
|
@ -108,6 +108,12 @@ class ViTEmbeddings(fl.Chain):
|
||||||
|
|
||||||
|
|
||||||
class CLIPImageEncoder(fl.Chain):
|
class CLIPImageEncoder(fl.Chain):
|
||||||
|
"""Contrastive Language-Image Pretraining (CLIP) image encoder.
|
||||||
|
|
||||||
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
|
for more details.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
image_size: int = 224,
|
image_size: int = 224,
|
||||||
|
@ -121,6 +127,20 @@ class CLIPImageEncoder(fl.Chain):
|
||||||
device: Device | str | None = None,
|
device: Device | str | None = None,
|
||||||
dtype: DType | None = None,
|
dtype: DType | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Initialize a CLIP image encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_size: The size of the input image.
|
||||||
|
embedding_dim: The dimension of the embedding.
|
||||||
|
output_dim: The dimension of the output.
|
||||||
|
patch_size: The size of the patches.
|
||||||
|
num_layers: The number of layers.
|
||||||
|
num_attention_heads: The number of attention heads.
|
||||||
|
feedforward_dim: The dimension of the feedforward layer.
|
||||||
|
layer_norm_eps: The epsilon value for layer normalization.
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
self.image_size = image_size
|
self.image_size = image_size
|
||||||
self.embedding_dim = embedding_dim
|
self.embedding_dim = embedding_dim
|
||||||
self.output_dim = output_dim
|
self.output_dim = output_dim
|
||||||
|
@ -152,7 +172,27 @@ class CLIPImageEncoder(fl.Chain):
|
||||||
|
|
||||||
|
|
||||||
class CLIPImageEncoderH(CLIPImageEncoder):
|
class CLIPImageEncoderH(CLIPImageEncoder):
|
||||||
|
"""CLIP huge image encoder.
|
||||||
|
|
||||||
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
|
for more details.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
embedding_dim (int): 1280
|
||||||
|
output_dim (int): 1024
|
||||||
|
patch_size (int): 14
|
||||||
|
num_layers (int): 32
|
||||||
|
num_attention_heads (int): 16
|
||||||
|
feedforward_dim (int): 5120
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
||||||
|
"""Initialize CLIP huge image encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
super().__init__(
|
super().__init__(
|
||||||
embedding_dim=1280,
|
embedding_dim=1280,
|
||||||
output_dim=1024,
|
output_dim=1024,
|
||||||
|
@ -166,7 +206,27 @@ class CLIPImageEncoderH(CLIPImageEncoder):
|
||||||
|
|
||||||
|
|
||||||
class CLIPImageEncoderG(CLIPImageEncoder):
|
class CLIPImageEncoderG(CLIPImageEncoder):
|
||||||
|
"""CLIP giant image encoder.
|
||||||
|
|
||||||
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
|
for more details.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
embedding_dim (int): 1664
|
||||||
|
output_dim (int): 1280
|
||||||
|
patch_size (int): 14
|
||||||
|
num_layers (int): 48
|
||||||
|
num_attention_heads (int): 16
|
||||||
|
feedforward_dim (int): 8192
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
||||||
|
"""Initialize CLIP giant image encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
super().__init__(
|
super().__init__(
|
||||||
embedding_dim=1664,
|
embedding_dim=1664,
|
||||||
output_dim=1280,
|
output_dim=1280,
|
||||||
|
|
|
@ -71,6 +71,12 @@ class TransformerLayer(fl.Chain):
|
||||||
|
|
||||||
|
|
||||||
class CLIPTextEncoder(fl.Chain):
|
class CLIPTextEncoder(fl.Chain):
|
||||||
|
"""Contrastive Language-Image Pretraining (CLIP) text encoder.
|
||||||
|
|
||||||
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
|
for more details.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
embedding_dim: int = 768,
|
embedding_dim: int = 768,
|
||||||
|
@ -85,6 +91,21 @@ class CLIPTextEncoder(fl.Chain):
|
||||||
device: Device | str | None = None,
|
device: Device | str | None = None,
|
||||||
dtype: DType | None = None,
|
dtype: DType | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Initialize CLIP text encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding_dim: The embedding dimension.
|
||||||
|
max_sequence_length: The maximum sequence length.
|
||||||
|
vocabulary_size: The vocabulary size.
|
||||||
|
num_layers: The number of layers.
|
||||||
|
num_attention_heads: The number of attention heads.
|
||||||
|
feedforward_dim: The feedforward dimension.
|
||||||
|
layer_norm_eps: The epsilon value for layer normalization.
|
||||||
|
use_quick_gelu: Whether to use the quick GeLU activation function.
|
||||||
|
tokenizer: The tokenizer.
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
self.embedding_dim = embedding_dim
|
self.embedding_dim = embedding_dim
|
||||||
self.max_sequence_length = max_sequence_length
|
self.max_sequence_length = max_sequence_length
|
||||||
self.vocabulary_size = vocabulary_size
|
self.vocabulary_size = vocabulary_size
|
||||||
|
@ -129,19 +150,30 @@ class CLIPTextEncoder(fl.Chain):
|
||||||
|
|
||||||
|
|
||||||
class CLIPTextEncoderL(CLIPTextEncoder):
|
class CLIPTextEncoderL(CLIPTextEncoder):
|
||||||
"""
|
"""CLIP large text encoder.
|
||||||
CLIPTextEncoderL is the CLIP text encoder with the following parameters:
|
|
||||||
embedding_dim=768
|
|
||||||
num_layers=12
|
|
||||||
num_attention_heads=12
|
|
||||||
feedforward_dim=3072
|
|
||||||
use_quick_gelu=True
|
|
||||||
|
|
||||||
We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
|
Note:
|
||||||
of OpenAI (https://github.com/openai/CLIP/blob/main/clip/model.py#L166)
|
We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
|
||||||
|
of OpenAI (https://github.com/openai/CLIP/blob/main/clip/model.py#L166)
|
||||||
|
|
||||||
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
|
for more details.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
embedding_dim (int): 768
|
||||||
|
num_layers (int): 12
|
||||||
|
num_attention_heads (int): 12
|
||||||
|
feedforward_dim (int): 3072
|
||||||
|
use_quick_gelu (bool): True
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
||||||
|
"""Initialize CLIP large text encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
super().__init__(
|
super().__init__(
|
||||||
embedding_dim=768,
|
embedding_dim=768,
|
||||||
num_layers=12,
|
num_layers=12,
|
||||||
|
@ -154,15 +186,25 @@ class CLIPTextEncoderL(CLIPTextEncoder):
|
||||||
|
|
||||||
|
|
||||||
class CLIPTextEncoderH(CLIPTextEncoder):
|
class CLIPTextEncoderH(CLIPTextEncoder):
|
||||||
"""
|
"""CLIP huge text encoder.
|
||||||
CLIPTextEncoderH is the CLIP text encoder with the following parameters:
|
|
||||||
embedding_dim=1024
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
num_layers=23
|
for more details.
|
||||||
num_attention_heads=16
|
|
||||||
feedforward_dim=4096
|
Attributes:
|
||||||
|
embedding_dim (int): 1024
|
||||||
|
num_layers (int): 23
|
||||||
|
num_attention_heads (int): 16
|
||||||
|
feedforward_dim (int): 4096
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
||||||
|
"""Initialize CLIP huge text encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
super().__init__(
|
super().__init__(
|
||||||
embedding_dim=1024,
|
embedding_dim=1024,
|
||||||
num_layers=23,
|
num_layers=23,
|
||||||
|
@ -174,15 +216,26 @@ class CLIPTextEncoderH(CLIPTextEncoder):
|
||||||
|
|
||||||
|
|
||||||
class CLIPTextEncoderG(CLIPTextEncoder):
|
class CLIPTextEncoderG(CLIPTextEncoder):
|
||||||
"""
|
"""CLIP giant text encoder.
|
||||||
CLIPTextEncoderG is the CLIP text encoder with the following parameters:
|
|
||||||
embedding_dim=1280
|
See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
|
||||||
num_layers=32
|
for more details.
|
||||||
num_attention_heads=16
|
|
||||||
feedforward_dim=5120
|
Attributes:
|
||||||
|
embedding_dim (int): 1280
|
||||||
|
num_layers (int): 32
|
||||||
|
num_attention_heads (int): 20
|
||||||
|
feedforward_dim (int): 5120
|
||||||
|
tokenizer (CLIPTokenizer): CLIPTokenizer(pad_token_id=0)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
|
||||||
|
"""Initialize CLIP giant text encoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
device: The PyTorch device to use.
|
||||||
|
dtype: The PyTorch data type to use.
|
||||||
|
"""
|
||||||
tokenizer = CLIPTokenizer(pad_token_id=0)
|
tokenizer = CLIPTokenizer(pad_token_id=0)
|
||||||
super().__init__(
|
super().__init__(
|
||||||
embedding_dim=1280,
|
embedding_dim=1280,
|
||||||
|
|
Loading…
Reference in a new issue