refiners/tests/foundationals/dinov2/test_dinov2.py

from math import isclose
from pathlib import Path
from warnings import warn

import pytest
import torch
from transformers import AutoModel  # type: ignore
from transformers.models.dinov2.modeling_dinov2 import Dinov2Model  # type: ignore

from refiners.fluxion.utils import load_from_safetensors, manual_seed
from refiners.foundationals.dinov2 import (
    DINOv2_base,
    DINOv2_base_reg,
    DINOv2_large,
    DINOv2_large_reg,
    DINOv2_small,
    DINOv2_small_reg,
)
from refiners.foundationals.dinov2.vit import ViT

FLAVORS = [
    "dinov2_vits14",
    "dinov2_vitb14",
    "dinov2_vitl14",
    "dinov2_vits14_reg4",
    "dinov2_vitb14_reg4",
    "dinov2_vitl14_reg4",
]


@pytest.fixture(scope="module", params=FLAVORS)
def flavor(request: pytest.FixtureRequest) -> str:
    return request.param


# Temporary: see comments in `test_encoder_only`
@pytest.fixture(scope="module")
def seed_expected_norm(flavor: str) -> tuple[int, float]:
    match flavor:
        case "dinov2_vits14":
            return (42, 1977.9213867)
        case "dinov2_vitb14":
            return (42, 1902.6384277)
        case "dinov2_vitl14":
            return (42, 1763.9187011)
        case "dinov2_vits14_reg4":
            return (42, 989.2380981)
        case "dinov2_vitb14_reg4":
            return (42, 974.4362182)
        case "dinov2_vitl14_reg4":
            return (42, 924.8797607)
        case _:
            raise ValueError(f"Unexpected DINOv2 flavor: {flavor}")


@pytest.fixture(scope="module")
def our_backbone(test_weights_path: Path, flavor: str, test_device: torch.device) -> ViT:
    weights = test_weights_path / f"{flavor}_pretrain.safetensors"
    if not weights.is_file():
        warn(f"could not find weights at {weights}, skipping")
        pytest.skip(allow_module_level=True)
    match flavor:
        case "dinov2_vits14":
            backbone = DINOv2_small(device=test_device)
        case "dinov2_vitb14":
            backbone = DINOv2_base(device=test_device)
        case "dinov2_vitl14":
            backbone = DINOv2_large(device=test_device)
        case "dinov2_vits14_reg4":
            backbone = DINOv2_small_reg(device=test_device)
        case "dinov2_vitb14_reg4":
            backbone = DINOv2_base_reg(device=test_device)
        case "dinov2_vitl14_reg4":
            backbone = DINOv2_large_reg(device=test_device)
        case _:
            raise ValueError(f"Unexpected DINOv2 flavor: {flavor}")
    tensors = load_from_safetensors(weights)
    backbone.load_state_dict(tensors)
    return backbone


@pytest.fixture(scope="module")
def dinov2_weights_path(test_weights_path: Path, flavor: str):
    # TODO: At the time of writing, those are not yet supported in transformers
    # (https://github.com/huggingface/transformers/issues/27379). Alternatively, it is also possible to use
    # facebookresearch/dinov2 directly (https://github.com/finegrain-ai/refiners/pull/132).
    if flavor.endswith("_reg4"):
        warn(f"DINOv2 with registers are not yet supported in Hugging Face, skipping")
        pytest.skip(allow_module_level=True)
    match flavor:
        case "dinov2_vits14":
            name = "dinov2-small"
        case "dinov2_vitb14":
            name = "dinov2-base"
        case "dinov2_vitl14":
            name = "dinov2-large"
        case _:
            raise ValueError(f"Unexpected DINOv2 flavor: {flavor}")
    r = test_weights_path / "facebook" / name
    if not r.is_dir():
        warn(f"could not find DINOv2 weights at {r}, skipping")
        pytest.skip(allow_module_level=True)
    return r


@pytest.fixture(scope="module")
def ref_backbone(dinov2_weights_path: Path, test_device: torch.device) -> Dinov2Model:
    backbone = AutoModel.from_pretrained(dinov2_weights_path)  # type: ignore
    assert isinstance(backbone, Dinov2Model)
    return backbone.to(test_device)  # type: ignore


def test_encoder(
    ref_backbone: Dinov2Model,
    our_backbone: ViT,
    test_device: torch.device,
):
    manual_seed(42)

    # Position encoding interpolation [1] at runtime is not supported yet. So stick to the default image resolution
    # e.g. using (224, 224) pixels as input would give a runtime error (sequence size mismatch)
    # [1]: https://github.com/facebookresearch/dinov2/blob/2302b6b/dinov2/models/vision_transformer.py#L179
    assert our_backbone.image_size == 518

    x = torch.randn(1, 3, 518, 518).to(test_device)

    with torch.no_grad():
        ref_features = ref_backbone(x).last_hidden_state
        our_features = our_backbone(x)

    assert (our_features - ref_features).abs().max() < 1e-3


# Mainly for DINOv2 + registers coverage (this test can be removed once `test_encoder` supports all flavors)
def test_encoder_only(
    our_backbone: ViT,
    seed_expected_norm: tuple[int, float],
    test_device: torch.device,
):
    seed, expected_norm = seed_expected_norm
    manual_seed(seed)

    x = torch.randn(1, 3, 518, 518).to(test_device)

    our_features = our_backbone(x)

    assert isclose(our_features.norm().item(), expected_norm, rel_tol=1e-04)
dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00			`from math import isclose`
add minimal unit tests for DINOv2 To be completed with tests using image preprocessing, e.g. test cosine similarity on a relevant pair of images 2023-12-16 15:16:54 +00:00			`from pathlib import Path`
			`from warnings import warn`

			`import pytest`
			`import torch`
			`from transformers import AutoModel # type: ignore`
			`from transformers.models.dinov2.modeling_dinov2 import Dinov2Model # type: ignore`

			`from refiners.fluxion.utils import load_from_safetensors, manual_seed`
dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00			`from refiners.foundationals.dinov2 import (`
			`DINOv2_base,`
			`DINOv2_base_reg,`
			`DINOv2_large,`
			`DINOv2_large_reg,`
			`DINOv2_small,`
			`DINOv2_small_reg,`
			`)`
add minimal unit tests for DINOv2 To be completed with tests using image preprocessing, e.g. test cosine similarity on a relevant pair of images 2023-12-16 15:16:54 +00:00			`from refiners.foundationals.dinov2.vit import ViT`

			`FLAVORS = [`
			`"dinov2_vits14",`
			`"dinov2_vitb14",`
			`"dinov2_vitl14",`
dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00			`"dinov2_vits14_reg4",`
			`"dinov2_vitb14_reg4",`
			`"dinov2_vitl14_reg4",`
add minimal unit tests for DINOv2 To be completed with tests using image preprocessing, e.g. test cosine similarity on a relevant pair of images 2023-12-16 15:16:54 +00:00			`]`


			`@pytest.fixture(scope="module", params=FLAVORS)`
			`def flavor(request: pytest.FixtureRequest) -> str:`
			`return request.param`


dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00			# Temporary: see comments in `test_encoder_only`
			`@pytest.fixture(scope="module")`
			`def seed_expected_norm(flavor: str) -> tuple[int, float]:`
			`match flavor:`
			`case "dinov2_vits14":`
			`return (42, 1977.9213867)`
			`case "dinov2_vitb14":`
			`return (42, 1902.6384277)`
			`case "dinov2_vitl14":`
			`return (42, 1763.9187011)`
			`case "dinov2_vits14_reg4":`
			`return (42, 989.2380981)`
			`case "dinov2_vitb14_reg4":`
			`return (42, 974.4362182)`
			`case "dinov2_vitl14_reg4":`
			`return (42, 924.8797607)`
			`case _:`
			`raise ValueError(f"Unexpected DINOv2 flavor: {flavor}")`


add minimal unit tests for DINOv2 To be completed with tests using image preprocessing, e.g. test cosine similarity on a relevant pair of images 2023-12-16 15:16:54 +00:00			`@pytest.fixture(scope="module")`
			`def our_backbone(test_weights_path: Path, flavor: str, test_device: torch.device) -> ViT:`
			`weights = test_weights_path / f"{flavor}_pretrain.safetensors"`
			`if not weights.is_file():`
			`warn(f"could not find weights at {weights}, skipping")`
			`pytest.skip(allow_module_level=True)`
			`match flavor:`
			`case "dinov2_vits14":`
			`backbone = DINOv2_small(device=test_device)`
			`case "dinov2_vitb14":`
			`backbone = DINOv2_base(device=test_device)`
			`case "dinov2_vitl14":`
			`backbone = DINOv2_large(device=test_device)`
dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00			`case "dinov2_vits14_reg4":`
			`backbone = DINOv2_small_reg(device=test_device)`
			`case "dinov2_vitb14_reg4":`
			`backbone = DINOv2_base_reg(device=test_device)`
			`case "dinov2_vitl14_reg4":`
			`backbone = DINOv2_large_reg(device=test_device)`
add minimal unit tests for DINOv2 To be completed with tests using image preprocessing, e.g. test cosine similarity on a relevant pair of images 2023-12-16 15:16:54 +00:00			`case _:`
			`raise ValueError(f"Unexpected DINOv2 flavor: {flavor}")`
			`tensors = load_from_safetensors(weights)`
			`backbone.load_state_dict(tensors)`
			`return backbone`


			`@pytest.fixture(scope="module")`
			`def dinov2_weights_path(test_weights_path: Path, flavor: str):`
dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00			`# TODO: At the time of writing, those are not yet supported in transformers`
			`# (https://github.com/huggingface/transformers/issues/27379). Alternatively, it is also possible to use`
			`# facebookresearch/dinov2 directly (https://github.com/finegrain-ai/refiners/pull/132).`
			`if flavor.endswith("_reg4"):`
			`warn(f"DINOv2 with registers are not yet supported in Hugging Face, skipping")`
			`pytest.skip(allow_module_level=True)`
add minimal unit tests for DINOv2 To be completed with tests using image preprocessing, e.g. test cosine similarity on a relevant pair of images 2023-12-16 15:16:54 +00:00			`match flavor:`
			`case "dinov2_vits14":`
			`name = "dinov2-small"`
			`case "dinov2_vitb14":`
			`name = "dinov2-base"`
			`case "dinov2_vitl14":`
			`name = "dinov2-large"`
			`case _:`
			`raise ValueError(f"Unexpected DINOv2 flavor: {flavor}")`
			`r = test_weights_path / "facebook" / name`
			`if not r.is_dir():`
			`warn(f"could not find DINOv2 weights at {r}, skipping")`
			`pytest.skip(allow_module_level=True)`
			`return r`


			`@pytest.fixture(scope="module")`
			`def ref_backbone(dinov2_weights_path: Path, test_device: torch.device) -> Dinov2Model:`
			`backbone = AutoModel.from_pretrained(dinov2_weights_path) # type: ignore`
			`assert isinstance(backbone, Dinov2Model)`
			`return backbone.to(test_device) # type: ignore`


			`def test_encoder(`
			`ref_backbone: Dinov2Model,`
			`our_backbone: ViT,`
			`test_device: torch.device,`
			`):`
			`manual_seed(42)`

			`# Position encoding interpolation [1] at runtime is not supported yet. So stick to the default image resolution`
			`# e.g. using (224, 224) pixels as input would give a runtime error (sequence size mismatch)`
			`# [1]: https://github.com/facebookresearch/dinov2/blob/2302b6b/dinov2/models/vision_transformer.py#L179`
			`assert our_backbone.image_size == 518`

			`x = torch.randn(1, 3, 518, 518).to(test_device)`

			`with torch.no_grad():`
			`ref_features = ref_backbone(x).last_hidden_state`
			`our_features = our_backbone(x)`

			`assert (our_features - ref_features).abs().max() < 1e-3`
dinov2: add some coverage for registers Those are not supported yet in HF: so just compared with a precomputed norm. Note: in the initial PR [1] the Refiners' implementation has been tested against the official code using Torch Hub. [1]: https://github.com/finegrain-ai/refiners/pull/132#issuecomment-1852021656 2023-12-18 09:10:39 +00:00

			# Mainly for DINOv2 + registers coverage (this test can be removed once `test_encoder` supports all flavors)
			`def test_encoder_only(`
			`our_backbone: ViT,`
			`seed_expected_norm: tuple[int, float],`
			`test_device: torch.device,`
			`):`
			`seed, expected_norm = seed_expected_norm`
			`manual_seed(seed)`

			`x = torch.randn(1, 3, 518, 518).to(test_device)`

			`our_features = our_backbone(x)`

			`assert isclose(our_features.norm().item(), expected_norm, rel_tol=1e-04)`