diff --git a/src/refiners/fluxion/layers/activations.py b/src/refiners/fluxion/layers/activations.py
index 1c0df61..eded3ef 100644
--- a/src/refiners/fluxion/layers/activations.py
+++ b/src/refiners/fluxion/layers/activations.py
@@ -1,34 +1,102 @@
-from torch import Tensor, sigmoid
+from abc import ABC
+from enum import Enum
+
+from torch import Tensor
 from torch.nn.functional import (
-    gelu,  # type: ignore
+    gelu,
+    relu,
+    sigmoid,
     silu,
 )
 
 from refiners.fluxion.layers.module import Module
 
 
-class Activation(Module):
+class Activation(Module, ABC):
+    """Base class for activation layers.
+
+    Activation layers apply a (non-linear) function to their input.
+
+    Receives:
+        x (Tensor): the input tensor.
+
+    Returns:
+        (Tensor): the output tensor, with the activation applied.
+    """
+
     def __init__(self) -> None:
         super().__init__()
 
 
 class SiLU(Activation):
+    """Sigmoid Linear Unit activation function.
+
+    See [[arXiv:1702.03118] Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning](https://arxiv.org/abs/1702.03118) for more details.
+    """
+
     def __init__(self) -> None:
         super().__init__()
 
     def forward(self, x: Tensor) -> Tensor:
-        return silu(x)  # type: ignore
+        return silu(x)
 
 
 class ReLU(Activation):
+    """Rectified Linear Unit activation function.
+
+    See [Rectified Linear Units Improve Restricted Boltzmann Machines](https://www.cs.toronto.edu/%7Efritz/absps/reluICML.pdf)
+    and [Cognitron: A self-organizing multilayered neural network](https://link.springer.com/article/10.1007/BF00342633) for more details.
+
+    Example:
+        ```py
+        relu = fl.ReLU()
+
+        tensor = torch.tensor([[-1.0, 0.0, 1.0]])
+        output = relu(tensor)
+
+        expected_output = torch.tensor([[0.0, 0.0, 1.0]])
+        assert torch.allclose(output, expected_output)
+        ```
+    """
+
     def __init__(self) -> None:
         super().__init__()
 
     def forward(self, x: Tensor) -> Tensor:
-        return x.relu()
+        return relu(x)
+
+
+class GeLUApproximation(Enum):
+    """Approximation methods for the Gaussian Error Linear Unit activation function.
+
+    Attributes:
+        NONE: No approximation, use the original formula.
+        TANH: Use the tanh approximation.
+        SIGMOID: Use the sigmoid approximation.
+    """
+
+    NONE = "none"
+    TANH = "tanh"
+    SIGMOID = "sigmoid"
 
 
 class GeLU(Activation):
+    """Gaussian Error Linear Unit activation function.
+
+    This activation can be quite expensive to compute, so a few approximations are available;
+    see [`GeLUApproximation`][refiners.fluxion.layers.activations.GeLUApproximation].
+
+    See [[arXiv:1606.08415] Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415) for more details.
+
+    Example:
+        ```py
+        gelu = fl.GeLU()
+
+        tensor = torch.tensor([[-1.0, 0.0, 1.0]])
+        output = gelu(tensor)
+        ```
+    """
+
     def __init__(self) -> None:
         super().__init__()
 
@@ -50,18 +118,36 @@ class ApproximateGeLU(Activation):
 
 
 class Sigmoid(Activation):
+    """Sigmoid activation function.
+
+    Example:
+        ```py
+        sigmoid = fl.Sigmoid()
+
+        tensor = torch.tensor([[-1.0, 0.0, 1.0]])
+        output = sigmoid(tensor)
+        ```
+    """
+
     def __init__(self) -> None:
        super().__init__()
 
     def forward(self, x: Tensor) -> Tensor:
-        return x.sigmoid()
+        return sigmoid(x)
 
 
 class GLU(Activation):
-    """
-    Gated Linear Unit activation layer.
+    """Gated Linear Unit activation function.
 
-    See https://arxiv.org/abs/2002.05202v1 for details.
+    See [[arXiv:2002.05202] GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202) for more details.
+
+    Example:
+        ```py
+        glu = fl.GLU(fl.ReLU())
+
+        tensor = torch.tensor([[1.0, 0.0, -1.0, 1.0]])
+        output = glu(tensor)
+        ```
     """
 
     def __init__(self, activation: Activation) -> None:
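Quick smoke-test sketch for the layers touched by this patch (not part of the change itself). It imports directly from `refiners.fluxion.layers.activations`, the module edited above; the docstring examples instead assume the usual `fl` alias for the layers package. The even-sized GLU input, and the expectation that GLU gates one half of the last dimension with the other, are assumptions based on the standard GLU formulation, since `GLU.forward` is not shown in this diff.

```py
import torch

# Module path taken from the file edited in this diff.
from refiners.fluxion.layers.activations import GLU, GeLU, GeLUApproximation, ReLU, SiLU, Sigmoid

x = torch.tensor([[-1.0, 0.0, 1.0]])

# Element-wise activations: output shape matches the input shape.
for activation in (ReLU(), SiLU(), GeLU(), Sigmoid()):
    print(type(activation).__name__, activation(x))

# GLU wraps another activation (see its __init__ signature above). Assumption:
# a standard GLU gates one half of the last dimension with the other half, so
# the input below uses an even-sized last dimension.
glu = GLU(ReLU())
print(glu(torch.tensor([[1.0, 0.0, -1.0, 1.0]])))

# The new enum only names the available GeLU approximation modes.
print([mode.value for mode in GeLUApproximation])  # ['none', 'tanh', 'sigmoid']
```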