From 3910845e29f2792aa8b0d8ae128473a1abe4d6a7 Mon Sep 17 00:00:00 2001
From: Laurent
Date: Fri, 2 Feb 2024 13:12:13 +0000
Subject: [PATCH] (doc/foundationals) add `DINOv2`, related docstrings

---
 src/refiners/foundationals/dinov2/dinov2.py | 120 ++++++++++++++++++++
 src/refiners/foundationals/dinov2/vit.py    |  19 +++-
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/src/refiners/foundationals/dinov2/dinov2.py b/src/refiners/foundationals/dinov2/dinov2.py
index a4fbdf7..7011cb0 100644
--- a/src/refiners/foundationals/dinov2/dinov2.py
+++ b/src/refiners/foundationals/dinov2/dinov2.py
@@ -7,11 +7,30 @@ from refiners.foundationals.dinov2.vit import ViT


 class DINOv2_small(ViT):
+    """DINOv2 small model.
+
+    See [[arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+    for more details.
+
+    Attributes:
+        embedding_dim (int): 384
+        patch_size (int): 14
+        image_size (int): 518
+        num_layers (int): 12
+        num_heads (int): 6
+    """
+
     def __init__(
         self,
         device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize DINOv2 small model.
+
+        Args:
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         super().__init__(
             embedding_dim=384,
             patch_size=14,
@@ -24,11 +43,30 @@


 class DINOv2_base(ViT):
+    """DINOv2 base model.
+
+    See [[arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+    for more details.
+
+    Attributes:
+        embedding_dim (int): 768
+        patch_size (int): 14
+        image_size (int): 518
+        num_layers (int): 12
+        num_heads (int): 12
+    """
+
     def __init__(
         self,
         device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize DINOv2 base model.
+
+        Args:
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         super().__init__(
             embedding_dim=768,
             patch_size=14,
@@ -41,11 +79,30 @@


 class DINOv2_large(ViT):
+    """DINOv2 large model.
+
+    See [[arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+    for more details.
+
+    Attributes:
+        embedding_dim (int): 1024
+        patch_size (int): 14
+        image_size (int): 518
+        num_layers (int): 24
+        num_heads (int): 16
+    """
+
     def __init__(
         self,
         device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize DINOv2 large model.
+
+        Args:
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         super().__init__(
             embedding_dim=1024,
             patch_size=14,
@@ -76,11 +133,32 @@


 class DINOv2_small_reg(ViT):
+    """DINOv2 small model with registers.
+
+    See [[arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+    and [[arXiv:2309.16588] Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588)
+    for more details.
+
+    Attributes:
+        embedding_dim (int): 384
+        patch_size (int): 14
+        image_size (int): 518
+        num_layers (int): 12
+        num_heads (int): 6
+        num_registers (int): 4
+    """
+
     def __init__(
         self,
        device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize DINOv2 small model with registers.
+
+        Args:
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         super().__init__(
             embedding_dim=384,
             patch_size=14,
@@ -94,11 +172,32 @@


 class DINOv2_base_reg(ViT):
+    """DINOv2 base model with registers.
+
+    See [[arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+    and [[arXiv:2309.16588] Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588)
+    for more details.
+
+    Attributes:
+        embedding_dim (int): 768
+        patch_size (int): 14
+        image_size (int): 518
+        num_layers (int): 12
+        num_heads (int): 12
+        num_registers (int): 4
+    """
+
     def __init__(
         self,
         device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize DINOv2 base model with registers.
+
+        Args:
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         super().__init__(
             embedding_dim=768,
             patch_size=14,
@@ -112,11 +211,32 @@


 class DINOv2_large_reg(ViT):
+    """DINOv2 large model with registers.
+
+    See [[arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+    and [[arXiv:2309.16588] Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588)
+    for more details.
+
+    Attributes:
+        embedding_dim (int): 1024
+        patch_size (int): 14
+        image_size (int): 518
+        num_layers (int): 24
+        num_heads (int): 16
+        num_registers (int): 4
+    """
+
     def __init__(
         self,
         device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize DINOv2 large model with registers.
+
+        Args:
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         super().__init__(
             embedding_dim=1024,
             patch_size=14,
diff --git a/src/refiners/foundationals/dinov2/vit.py b/src/refiners/foundationals/dinov2/vit.py
index eb08ee2..407faac 100644
--- a/src/refiners/foundationals/dinov2/vit.py
+++ b/src/refiners/foundationals/dinov2/vit.py
@@ -227,9 +227,10 @@ class Registers(fl.Concatenate):


 class ViT(fl.Chain):
-    """Vision Transformer (ViT).
+    """Vision Transformer (ViT) model.

-    see https://arxiv.org/abs/2010.11929v2
+    See [[arXiv:2010.11929] An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
+    for more details.
     """

     def __init__(
@@ -245,6 +246,20 @@
         device: torch.device | str | None = None,
         dtype: torch.dtype | None = None,
     ) -> None:
+        """Initialize a Vision Transformer (ViT) model.
+
+        Args:
+            embedding_dim: The dimension of the embedding.
+            patch_size: The size of the patches.
+            image_size: The size of the input image.
+            num_layers: The number of layers.
+            num_heads: The number of heads.
+            norm_eps: The epsilon value for normalization.
+            mlp_ratio: The ratio for the multi-layer perceptron (MLP).
+            num_registers: The number of registers.
+            device: The PyTorch device to use.
+            dtype: The PyTorch data type to use.
+        """
         num_patches = image_size // patch_size
         self.embedding_dim = embedding_dim
         self.patch_size = patch_size