DINOv2

DINOv2_base

DINOv2_base(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 base model.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 768 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 12 |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 base model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=768,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=12,
        device=device,
        dtype=dtype,
    )
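
Example: a minimal usage sketch. The weights path and image file below are hypothetical placeholders; they assume an official DINOv2 checkpoint already converted to a refiners-compatible safetensors file.

```python
import torch
from PIL import Image

from refiners.foundationals.dinov2 import DINOv2_base, preprocess

model = DINOv2_base()
# Hypothetical path: a checkpoint converted to the safetensors format.
model.load_from_safetensors("dinov2_vitb14_pretrain.safetensors")

image = Image.open("input.jpg")  # hypothetical image file
x = preprocess(image, dim=518).unsqueeze(0)  # preprocess returns (3, 518, 518)

with torch.no_grad():
    tokens = model(x)  # (1, 1370, 768): 1 class token + (518 // 14) ** 2 patch tokens
```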

DINOv2_base_reg

DINOv2_base_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 base model with registers.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision and [arXiv:2309.16588] Vision Transformers Need Registers for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 768 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 12 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 base model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=768,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=12,
        num_registers=4,
        interpolate_antialias=True,
        device=device,
        dtype=dtype,
    )
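
The register variants concatenate learned register tokens into the token sequence (the Registers layer is inserted before the Transformer, as shown in the source above). A sketch of the expected effect on the output shape; the exact count and ordering of register tokens in the output are an assumption read off that source, not verified here:

```python
import torch
from refiners.foundationals.dinov2 import DINOv2_base_reg

model = DINOv2_base_reg()  # randomly initialized, for illustration only
x = torch.randn(1, 3, 518, 518)
tokens = model(x)
# Assumed shape: (1, 1374, 768), i.e. 1 class token + 37 * 37 patch
# tokens + 4 register tokens added by the Registers layer.
print(tokens.shape)
```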

DINOv2_giant

DINOv2_giant(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 giant model.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 1536 |
| feedforward_dim | int | 4096 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 40 |
| num_heads | int | 24 |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 giant model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1536,
        feedforward_dim=4096,
        patch_size=14,
        image_size=518,
        num_layers=40,
        num_heads=24,
        activation=GLU(SiLU()),
        device=device,
        dtype=dtype,
    )
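
The device and dtype parameters matter most for this variant: ViT-g/14 has on the order of a billion parameters (a rough figure from the DINOv2 paper, not from this page), so half precision or a GPU is often warranted. A sketch:

```python
import torch
from refiners.foundationals.dinov2 import DINOv2_giant

# Instantiating directly on the target device in float16 roughly halves
# the memory footprint relative to float32 and avoids a CPU round-trip.
model = DINOv2_giant(device="cuda", dtype=torch.float16)
```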

DINOv2_giant_reg

DINOv2_giant_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 giant model with registers.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision and [arXiv:2309.16588] Vision Transformers Need Registers for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 1536 |
| feedforward_dim | int | 4096 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 40 |
| num_heads | int | 24 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 giant model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1536,
        feedforward_dim=4096,
        patch_size=14,
        image_size=518,
        num_layers=40,
        num_heads=24,
        num_registers=4,
        interpolate_antialias=True,
        activation=GLU(SiLU()),
        device=device,
        dtype=dtype,
    )

DINOv2_large

DINOv2_large(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 large model.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 1024 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 24 |
| num_heads | int | 16 |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 large model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1024,
        patch_size=14,
        image_size=518,
        num_layers=24,
        num_heads=16,
        device=device,
        dtype=dtype,
    )

DINOv2_large_reg

DINOv2_large_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 large model with registers.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision and [arXiv:2309.16588] Vision Transformers Need Registers for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 1024 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 24 |
| num_heads | int | 16 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 large model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=1024,
        patch_size=14,
        image_size=518,
        num_layers=24,
        num_heads=16,
        num_registers=4,
        interpolate_antialias=True,
        device=device,
        dtype=dtype,
    )

DINOv2_small

DINOv2_small(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 small model.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 384 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 6 |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 small model.

    Args:
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=384,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=6,
        device=device,
        dtype=dtype,
    )

DINOv2_small_reg

DINOv2_small_reg(
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: ViT

DINOv2 small model with registers.

See [arXiv:2304.07193] DINOv2: Learning Robust Visual Features without Supervision and [arXiv:2309.16588] Vision Transformers Need Registers for more details.

Attributes:

| Name | Type | Value |
|---|---|---|
| embedding_dim | int | 384 |
| patch_size | int | 14 |
| image_size | int | 518 |
| num_layers | int | 12 |
| num_heads | int | 6 |
| num_registers | int | 4 |
| interpolate_antialias | bool | True |

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/dinov2.py
def __init__(
    self,
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize DINOv2 small model with register.

    Args:
        device (torch.device | str | None): The PyTorch device to use.
        dtype (torch.dtype | None): The PyTorch data type to use.
    """
    super().__init__(
        embedding_dim=384,
        patch_size=14,
        image_size=518,
        num_layers=12,
        num_heads=6,
        num_registers=4,
        interpolate_antialias=True,
        device=device,
        dtype=dtype,
    )

ViT

ViT(
    embedding_dim: int = 768,
    patch_size: int = 16,
    image_size: int = 224,
    num_layers: int = 12,
    num_heads: int = 12,
    norm_eps: float = 1e-06,
    mlp_ratio: int = 4,
    num_registers: int = 0,
    activation: Activation = GeLU(),
    feedforward_dim: int | None = None,
    interpolate_antialias: bool = False,
    interpolate_mode: str = "bicubic",
    device: device | str | None = None,
    dtype: dtype | None = None,
)

Bases: Chain

Vision Transformer (ViT) model.

See [arXiv:2010.11929] An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale for more details.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| embedding_dim | int | The dimension of the embedding. | 768 |
| patch_size | int | The size of the patches. | 16 |
| image_size | int | The size of the input image. | 224 |
| num_layers | int | The number of layers. | 12 |
| num_heads | int | The number of heads. | 12 |
| norm_eps | float | The epsilon value for normalization. | 1e-06 |
| mlp_ratio | int | The ratio for the multi-layer perceptron (MLP). | 4 |
| num_registers | int | The number of registers. | 0 |
| activation | Activation | The activation function. | GeLU() |
| feedforward_dim | int \| None | The dimension of the feedforward layer. | None |
| interpolate_antialias | bool | Whether to use antialiasing for interpolation. | False |
| interpolate_mode | str | The interpolation mode. | 'bicubic' |
| device | device \| str \| None | The PyTorch device to use. | None |
| dtype | dtype \| None | The PyTorch data type to use. | None |
Source code in src/refiners/foundationals/dinov2/vit.py
def __init__(
    self,
    embedding_dim: int = 768,
    patch_size: int = 16,
    image_size: int = 224,
    num_layers: int = 12,
    num_heads: int = 12,
    norm_eps: float = 1e-6,
    mlp_ratio: int = 4,
    num_registers: int = 0,
    activation: Activation = fl.GeLU(),
    feedforward_dim: int | None = None,
    interpolate_antialias: bool = False,
    interpolate_mode: str = "bicubic",
    device: torch.device | str | None = None,
    dtype: torch.dtype | None = None,
) -> None:
    """Initialize a Vision Transformer (ViT) model.

    Args:
        embedding_dim: The dimension of the embedding.
        patch_size: The size of the patches.
        image_size: The size of the input image.
        num_layers: The number of layers.
        num_heads: The number of heads.
        norm_eps: The epsilon value for normalization.
        mlp_ratio: The ratio for the multi-layer perceptron (MLP).
        num_registers: The number of registers.
        activation: The activation function.
        feedforward_dim: The dimension of the feedforward layer.
        interpolate_antialias: Whether to use antialiasing for interpolation.
        interpolate_mode: The interpolation mode.
        device: The PyTorch device to use.
        dtype: The PyTorch data type to use.
    """
    num_patches = image_size // patch_size
    self.embedding_dim = embedding_dim
    self.patch_size = patch_size
    self.image_size = image_size
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.norm_eps = norm_eps
    self.mlp_ratio = mlp_ratio
    self.num_registers = num_registers
    self.feedforward_dim = feedforward_dim

    super().__init__(
        fl.Concatenate(
            ClassToken(
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            ),
            PatchEncoder(
                in_channels=3,
                out_channels=embedding_dim,
                patch_size=patch_size,
                device=device,
                dtype=dtype,
            ),
            dim=1,
        ),
        PositionalEncoder(
            PositionalEmbedding(
                sequence_length=num_patches**2 + 1,
                embedding_dim=embedding_dim,
                patch_size=patch_size,
                device=device,
                dtype=dtype,
            ),
            fl.Chain(
                fl.Parallel(
                    fl.Identity(),
                    fl.UseContext(context="dinov2_vit", key="input"),
                ),
                InterpolateEmbedding(
                    mode=interpolate_mode,
                    antialias=interpolate_antialias,
                    patch_size=patch_size,
                ),
            ),
        ),
        Transformer(
            TransformerLayer(
                embedding_dim=embedding_dim,
                feedforward_dim=feedforward_dim,
                activation=activation,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                norm_eps=norm_eps,
                device=device,
                dtype=dtype,
            )
            for _ in range(num_layers)
        ),
        fl.LayerNorm(
            normalized_shape=embedding_dim,
            eps=norm_eps,
            device=device,
            dtype=dtype,
        ),
    )

    if self.num_registers > 0:
        registers = Registers(
            num_registers=num_registers,
            embedding_dim=embedding_dim,
            device=device,
            dtype=dtype,
        )
        self.insert_before_type(Transformer, registers)
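
Since every DINOv2 variant above is a thin wrapper over these constructor arguments, a custom configuration is just another ViT instantiation. A minimal sketch with deliberately tiny, made-up sizes (not a published DINOv2 variant):

```python
import torch
from refiners.foundationals.dinov2.vit import ViT

tiny = ViT(
    embedding_dim=192,  # arbitrary illustrative sizes
    patch_size=16,
    image_size=224,
    num_layers=4,
    num_heads=3,
)
x = torch.randn(1, 3, 224, 224)
tokens = tiny(x)  # (1, 197, 192): 1 class token + (224 // 16) ** 2 patch tokens
```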

preprocess

preprocess(img: Image, dim: int = 224) -> Tensor

Preprocess an image for use with DINOv2. Uses ImageNet mean and standard deviation. Note that this only resizes and normalizes the image; there is no center crop.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| img | Image | The image. | required |
| dim | int | The square dimension to resize the image. Typically 224 or 518. | 224 |

Returns:

| Type | Description |
|---|---|
| Tensor | A float32 tensor with shape (3, dim, dim). |

Source code in src/refiners/foundationals/dinov2/dinov2.py
def preprocess(img: Image.Image, dim: int = 224) -> torch.Tensor:
    """
    Preprocess an image for use with DINOv2. Uses ImageNet mean and standard deviation.
    Note that this only resizes and normalizes the image, there is no center crop.

    Args:
        img: The image.
        dim: The square dimension to resize the image. Typically 224 or 518.

    Returns:
        A float32 tensor with shape (3, dim, dim).
    """
    img = img.convert("RGB").resize((dim, dim))  # type: ignore
    t = image_to_tensor(img).squeeze()
    return normalize(t, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
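
A short usage sketch (the file name is hypothetical). Note that the returned tensor is unbatched; add a batch dimension before feeding it to a model. Since the positional embedding is interpolated at inference time (see InterpolateEmbedding in the ViT source above), dimensions other than the model's native image_size should also work, provided dim is a multiple of the patch size.

```python
import torch
from PIL import Image

from refiners.foundationals.dinov2 import preprocess

img = Image.open("photo.jpg")  # hypothetical input file
x = preprocess(img, dim=518)   # float32 tensor of shape (3, 518, 518)
batch = x.unsqueeze(0)         # (1, 3, 518, 518), ready for a DINOv2 model
```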