(doc/foundationals) add SegmentAnything, related docstrings

Laurent 2024-02-02 13:51:43 +00:00 committed by Laureηt
parent a926696141
commit f62e71da1c
2 changed files with 95 additions and 0 deletions

refiners/foundationals/segment_anything/__init__.py

@@ -0,0 +1,3 @@
from refiners.foundationals.segment_anything.model import SegmentAnything, SegmentAnythingH
__all__ = ["SegmentAnything", "SegmentAnythingH"]
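With this re-export in place, both classes are importable straight from the package; a quick sketch:

# Both import paths now resolve to the same classes.
from refiners.foundationals.segment_anything import SegmentAnything, SegmentAnythingH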

refiners/foundationals/segment_anything/model.py

@@ -21,6 +21,14 @@ class ImageEmbedding:
class SegmentAnything(fl.Module):
    """SegmentAnything model.

    See [[arXiv:2304.02643] Segment Anything](https://arxiv.org/abs/2304.02643) for more details.

    Attributes:
        mask_threshold (float): The threshold used to binarize the predicted masks. Defaults to 0.0.
    """

    mask_threshold: float = 0.0

    def __init__(
@@ -32,6 +40,16 @@ class SegmentAnything(fl.Module):
        device: Device | str = "cpu",
        dtype: DType = torch.float32,
    ) -> None:
        """Initialize SegmentAnything model.

        Args:
            image_encoder: The image encoder to use.
            point_encoder: The point encoder to use.
            mask_encoder: The mask encoder to use.
            mask_decoder: The mask decoder to use.
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        super().__init__()
        self.device: Device = device if isinstance(device, Device) else Device(device=device)
        self.dtype = dtype
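A minimal construction sketch (using SegmentAnythingH from the second half of this diff; weights are randomly initialized here, since loading a converted SAM checkpoint is out of scope for this commit):

import torch

from refiners.foundationals.segment_anything import SegmentAnythingH

# Build the ViT-H variant with its default sub-modules, on CPU in float32.
sam = SegmentAnythingH(device="cpu", dtype=torch.float32)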
@@ -42,6 +60,14 @@ class SegmentAnything(fl.Module):
    @no_grad()
    def compute_image_embedding(self, image: Image.Image) -> ImageEmbedding:
        """Compute the embedding of an image.

        Args:
            image: The image to compute the embedding of.

        Returns:
            The computed image embedding.
        """
        original_size = (image.height, image.width)
        target_size = self.compute_target_size(original_size)
        return ImageEmbedding(
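Since the image encoder is the expensive part, the embedding can be computed once and reused across many prompts; a sketch (`sam` as built above; "photo.jpg" is a hypothetical input):

from PIL import Image

image = Image.open("photo.jpg")
embedding = sam.compute_image_embedding(image)  # runs the ViT once, under no_grad
# The embedding also records `original_image_size`, used later for rescaling.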
@@ -59,6 +85,21 @@ class SegmentAnything(fl.Module):
        low_res_mask: Float[Tensor, "1 1 256 256"] | None = None,
        binarize: bool = True,
    ) -> tuple[Tensor, Tensor, Tensor]:
        """Predict the masks of the input image.

        Args:
            input: The input image or its precomputed embedding.
            foreground_points: Points inside the object to segment.
            background_points: Points outside the object to segment.
            box_points: The corner points of a bounding box prompt.
            low_res_mask: A low-resolution mask prompt, e.g. from a previous prediction.
            binarize: Whether to binarize the masks using `mask_threshold`.

        Returns:
            The predicted masks.
            The IOU prediction.
            The low-resolution masks.
        """
        if isinstance(input, ImageEmbedding):
            original_size = input.original_image_size
            target_size = self.compute_target_size(original_size)
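A hedged prompting sketch; the method name `predict` and the (x, y) pixel-tuple point format are inferred from context, not shown in this hunk:

masks, iou_predictions, low_res_masks = sam.predict(
    embedding,  # or a PIL image; both are accepted per the docstring above
    foreground_points=[(320.0, 240.0)],  # hypothetical click inside the object
)
# With binarize=True (the default), masks are thresholded at `sam.mask_threshold`.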
@@ -107,11 +148,21 @@ class SegmentAnything(fl.Module):
    @property
    def image_size(self) -> int:
        """The size of the (square) input image expected by the image encoder."""
        w, h = self.image_encoder.image_size
        assert w == h
        return w

    def compute_target_size(self, size: tuple[int, int]) -> tuple[int, int]:
        """Compute the target size for a given size, preserving the aspect ratio.

        Args:
            size: The (height, width) of the image.

        Returns:
            The target (height, width), with the longest side scaled to `image_size`.
        """
        oldh, oldw = size
        scale = self.image_size * 1.0 / max(oldh, oldw)
        newh, neww = oldh * scale, oldw * scale
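Worked example: with `image_size == 1024` and a 1024×1536 (h×w) input, `scale = 1024 / 1536 = 2/3`, so the longest side maps exactly to 1024 and the aspect ratio is preserved (assuming the elided lines round to the nearest integer, as the `tuple[int, int]` return type suggests):

# Longest side (width, 1536) is scaled to image_size (1024); height follows.
assert sam.compute_target_size((1024, 1536)) == (683, 1024)  # 1024 * 2/3 ≈ 682.7 -> 683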
@@ -120,6 +171,15 @@ class SegmentAnything(fl.Module):
        return (newh, neww)

    def preprocess_image(self, image: Image.Image, target_size: tuple[int, int]) -> Tensor:
        """Preprocess an image.

        Args:
            image: The image to preprocess.
            target_size: The target (height, width) to resize the image to.

        Returns:
            The preprocessed image tensor.
        """
        h, w = target_size
        padh = self.image_size - h
        padw = self.image_size - w
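The resized image is then padded up to a square of side `image_size`; the padding is applied on the bottom/right, which is why `postprocess_masks` below crops with `[..., : target_size[0], : target_size[1]]`. An arithmetic sketch, continuing the example above:

tensor = sam.preprocess_image(image, target_size=(683, 1024))
# padh = 1024 - 683 = 341 rows of padding; padw = 1024 - 1024 = 0 columns.
assert tensor.shape[-2:] == (1024, 1024)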
@@ -133,11 +193,31 @@ class SegmentAnything(fl.Module):
        )

    def normalize(self, coordinates: Tensor, target_size: tuple[int, int], original_size: tuple[int, int]) -> Tensor:
        """Normalize the coordinates.

        Args:
            coordinates: The coordinates to normalize, in the original image space.
            target_size: The target (height, width).
            original_size: The original (height, width) of the image.

        Returns:
            The normalized coordinates.
        """
        coordinates[:, :, 0] = ((coordinates[:, :, 0] * (target_size[1] / original_size[1])) + 0.5) / self.image_size
        coordinates[:, :, 1] = ((coordinates[:, :, 1] * (target_size[0] / original_size[0])) + 0.5) / self.image_size
        return coordinates

    def postprocess_masks(self, masks: Tensor, target_size: tuple[int, int], original_size: tuple[int, int]) -> Tensor:
        """Postprocess the masks.

        Args:
            masks: The masks to postprocess.
            target_size: The target (height, width).
            original_size: The original (height, width) of the image.

        Returns:
            The postprocessed masks, at the original image size.
        """
        masks = interpolate(masks, factor=torch.Size((self.image_size, self.image_size)), mode="bilinear")
        masks = masks[..., : target_size[0], : target_size[1]]  # remove padding added at `preprocess_image` time
        masks = interpolate(masks, factor=torch.Size(original_size), mode="bilinear")
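Concretely, `normalize` maps pixel coordinates from the original image into (roughly) the unit square: rescale by `target_size / original_size`, add half a pixel, divide by `image_size`. A small sketch using the sizes from the running example (note the coordinates tensor is modified in place, with shape `[batch, points, 2]`):

import torch

coords = torch.tensor([[[1536.0, 1024.0]]])  # one (x, y) point: bottom-right corner
sam.normalize(coords, target_size=(683, 1024), original_size=(1024, 1536))
# x: (1536 * 1024/1536 + 0.5) / 1024 ≈ 1.0005 ; y: (683 + 0.5) / 1024 ≈ 0.6675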
@@ -145,6 +225,8 @@ class SegmentAnything(fl.Module):

class SegmentAnythingH(SegmentAnything):
    """SegmentAnything huge model."""

    def __init__(
        self,
        image_encoder: SAMViTH | None = None,
@@ -154,6 +236,16 @@ class SegmentAnythingH(SegmentAnything):
        device: Device | str = "cpu",
        dtype: DType = torch.float32,
    ) -> None:
        """Initialize SegmentAnything huge model.

        Args:
            image_encoder: The image encoder to use (defaults to `SAMViTH`).
            point_encoder: The point encoder to use (defaults to `PointEncoder`).
            mask_encoder: The mask encoder to use (defaults to `MaskEncoder`).
            mask_decoder: The mask decoder to use.
            device: The PyTorch device to use.
            dtype: The PyTorch data type to use.
        """
        image_encoder = image_encoder or SAMViTH()
        point_encoder = point_encoder or PointEncoder()
        mask_encoder = mask_encoder or MaskEncoder()
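Putting it together, these defaults make the huge variant a one-liner to build; an end-to-end sketch (hypothetical image path; `predict` is inferred as above, and checkpoint loading is omitted since it is not part of this diff):

from PIL import Image

from refiners.foundationals.segment_anything import SegmentAnythingH

sam = SegmentAnythingH()  # SAMViTH + PointEncoder + MaskEncoder defaults
image = Image.open("photo.jpg")
# Random weights here: outputs are meaningless until a converted checkpoint is loaded.
masks, iou_predictions, low_res_masks = sam.predict(
    image,
    foreground_points=[(320.0, 240.0)],
)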