diff --git a/src/refiners/fluxion/layers/basics.py b/src/refiners/fluxion/layers/basics.py
index ada1979..03b48b8 100644
--- a/src/refiners/fluxion/layers/basics.py
+++ b/src/refiners/fluxion/layers/basics.py
@@ -19,10 +19,6 @@ class View(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.view(*self.shape)
 
-    def __repr__(self):
-        shape_repr = ", ".join([repr(s) for s in self.shape])
-        return f"{self.__class__.__name__}({shape_repr})"
-
 
 class Flatten(Module):
     def __init__(self, start_dim: int = 0, end_dim: int = -1) -> None:
@@ -33,9 +29,6 @@ class Flatten(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.flatten(self.start_dim, self.end_dim)
 
-    def __repr__(self):
-        return f"{self.__class__.__name__}(start_dim={repr(self.start_dim)}, end_dim={repr(self.end_dim)})"
-
 
 class Unflatten(Module):
     def __init__(self, dim: int) -> None:
@@ -45,9 +38,6 @@ class Unflatten(Module):
     def forward(self, x: Tensor, sizes: Size) -> Tensor:
         return x.unflatten(self.dim, sizes)  # type: ignore
 
-    def __repr__(self):
-        return f"{self.__class__.__name__}(dim={repr(self.dim)})"
-
 
 class Reshape(Module):
     """
@@ -62,10 +52,6 @@ class Reshape(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.reshape(x.shape[0], *self.shape)
 
-    def __repr__(self):
-        shape_repr = ", ".join([repr(s) for s in self.shape])
-        return f"{self.__class__.__name__}({shape_repr})"
-
 
 class Transpose(Module):
     def __init__(self, dim0: int, dim1: int) -> None:
@@ -76,9 +62,6 @@ class Transpose(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.transpose(self.dim0, self.dim1)
 
-    def __repr__(self):
-        return f"{self.__class__.__name__}(dim0={repr(self.dim0)}, dim1={repr(self.dim1)})"
-
 
 class Permute(Module):
     def __init__(self, *dims: int) -> None:
@@ -88,10 +71,6 @@ class Permute(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.permute(*self.dims)
 
-    def __repr__(self):
-        dims_repr = ", ".join([repr(d) for d in self.dims])
-        return f"{self.__class__.__name__}({dims_repr})"
-
 
 class Slicing(Module):
     def __init__(self, dim: int, start: int, length: int) -> None:
@@ -103,9 +82,6 @@ class Slicing(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.narrow(self.dim, self.start, self.length)
 
-    def __repr__(self):
-        return f"{self.__class__.__name__}(dim={repr(self.dim)}, start={repr(self.start)}, length={repr(self.length)})"
-
 
 class Squeeze(Module):
     def __init__(self, dim: int) -> None:
@@ -115,9 +91,6 @@ class Squeeze(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.squeeze(self.dim)
 
-    def __repr__(self):
-        return f"{self.__class__.__name__}(dim={repr(self.dim)})"
-
 
 class Unsqueeze(Module):
     def __init__(self, dim: int) -> None:
@@ -127,9 +100,6 @@ class Unsqueeze(Module):
     def forward(self, x: Tensor) -> Tensor:
         return x.unsqueeze(self.dim)
 
-    def __repr__(self):
-        return f"{self.__class__.__name__}(dim={repr(self.dim)})"
-
 
 class Parameter(WeightedModule):
     """
@@ -138,6 +108,7 @@ class Parameter(WeightedModule):
 
     def __init__(self, *dims: int, device: Device | str | None = None, dtype: DType | None = None) -> None:
         super().__init__()
+        self.dims = dims
         self.register_parameter("parameter", TorchParameter(randn(*dims, device=device, dtype=dtype)))
 
     @property
@@ -151,10 +122,6 @@ class Parameter(WeightedModule):
     def forward(self, _: Tensor) -> Tensor:
         return self.parameter
 
-    def __repr__(self):
-        dims_repr = ", ".join([repr(d) for d in list(self.parameter.shape)])
-        return f"{self.__class__.__name__}({dims_repr}, device={repr(self.device)})"
-
 
 class Buffer(WeightedModule):
     """
@@ -165,6 +132,7 @@ class Buffer(WeightedModule):
 
     def __init__(self, *dims: int, device: Device | str | None = None, dtype: DType | None = None) -> None:
         super().__init__()
+        self.dims = dims
         self.register_buffer("buffer", randn(*dims, device=device, dtype=dtype))
 
     @property
@@ -177,7 +145,3 @@ class Buffer(WeightedModule):
 
     def forward(self, _: Tensor) -> Tensor:
         return self.buffer
-
-    def __repr__(self):
-        dims_repr = ", ".join([repr(d) for d in list(self.buffer.shape)])
-        return f"{self.__class__.__name__}({dims_repr}, device={repr(self.device)})"
diff --git a/src/refiners/fluxion/layers/chain.py b/src/refiners/fluxion/layers/chain.py
index 6669cda..a6081f9 100644
--- a/src/refiners/fluxion/layers/chain.py
+++ b/src/refiners/fluxion/layers/chain.py
@@ -20,7 +20,7 @@ class Lambda(Module):
     def forward(self, *args: Any) -> Any:
         return self.func(*args)
 
-    def __repr__(self):
+    def __str__(self) -> str:
         func_name = getattr(self.func, "__name__", "partial_function")
         return f"Lambda({func_name}{str(inspect.signature(self.func))})"
 
@@ -115,6 +115,7 @@ def structural_copy(m: T) -> T:
 class Chain(ContextModule):
     _modules: dict[str, Module]
     _provider: ContextProvider
+    _tag = "CHAIN"
 
     def __init__(self, *args: Module | Iterable[Module]) -> None:
         super().__init__()
@@ -235,28 +236,6 @@ class Chain(ContextModule):
     def __iter__(self) -> Iterator[Module]:
         return iter(self._modules.values())
 
-    def _pretty_print(self, num_tab: int = 0, layer_name: str | None = None) -> str:
-        layer_name = self.__class__.__name__ if layer_name is None else layer_name
-        pretty_print = f"{layer_name}:\n"
-        tab = " " * (num_tab + 4)
-        module_strings: list[str] = []
-        for i, (name, module) in enumerate(self._modules.items()):
-            ident = ("└+" if isinstance(self, Sum) else "└─") if i == 0 else " "
-            module_str = (
-                module
-                if not isinstance(module, Chain)
-                else (module._pretty_print(len(tab), name) if num_tab < 12 else f"{name}(...)")
-            )
-            module_strings.append(f"{tab}{ident} {module_str}")
-        pretty_print += "\n".join(module_strings)
-        return pretty_print
-
-    def __repr__(self) -> str:
-        return self._pretty_print()
-
-    def __str__(self) -> str:
-        return f"<{self.__class__.__name__} at {hex(id(self))}>"
-
     def __len__(self) -> int:
         return len(self._modules)
 
@@ -418,25 +397,45 @@ class Chain(ContextModule):
         return clone
 
+    def _show_only_tag(self) -> bool:
+        return self.__class__ == Chain
+
 
 class Parallel(Chain):
+    _tag = "PAR"
+
     def forward(self, *args: Any) -> tuple[Tensor, ...]:
         return tuple([self.call_layer(module, name, *args) for name, module in self._modules.items()])
 
+    def _show_only_tag(self) -> bool:
+        return self.__class__ == Parallel
+
 
 class Distribute(Chain):
+    _tag = "DISTR"
+
     def forward(self, *args: Any) -> tuple[Tensor, ...]:
         assert len(args) == len(self._modules), "Number of positional arguments must match number of sub-modules."
         return tuple([self.call_layer(module, name, arg) for arg, (name, module) in zip(args, self._modules.items())])
 
+    def _show_only_tag(self) -> bool:
+        return self.__class__ == Distribute
+
 
 class Passthrough(Chain):
+    _tag = "PASS"
+
     def forward(self, *inputs: Any) -> Any:
         super().forward(*inputs)
         return inputs
 
+    def _show_only_tag(self) -> bool:
+        return self.__class__ == Passthrough
+
 
 class Sum(Chain):
+    _tag = "SUM"
+
     def forward(self, *inputs: Any) -> Any:
         output = None
         for layer in self:
@@ -446,6 +445,9 @@ class Sum(Chain):
             output = layer_output if output is None else output + layer_output
         return output
 
+    def _show_only_tag(self) -> bool:
+        return self.__class__ == Sum
+
 
 class Residual(Sum):
     def __init__(self, *modules: Module) -> None:
@@ -468,6 +470,7 @@ class Breakpoint(ContextModule):
 
 
 class Concatenate(Chain):
+    _tag = "CAT"
     structural_attrs = ["dim"]
 
     def __init__(self, *modules: Module, dim: int = 0) -> None:
@@ -477,3 +480,6 @@ class Concatenate(Chain):
     def forward(self, *args: Any) -> Tensor:
         outputs = [module(*args) for module in self]
         return cat([output for output in outputs if output is not None], dim=self.dim)
+
+    def _show_only_tag(self) -> bool:
+        return self.__class__ == Concatenate
diff --git a/src/refiners/fluxion/layers/conv.py b/src/refiners/fluxion/layers/conv.py
index fd24308..baab860 100644
--- a/src/refiners/fluxion/layers/conv.py
+++ b/src/refiners/fluxion/layers/conv.py
@@ -8,11 +8,11 @@ class Conv2d(nn.Conv2d, WeightedModule):
         in_channels: int,
         out_channels: int,
         kernel_size: int | tuple[int, int],
-        stride: int | tuple[int, int] = 1,
-        padding: int | tuple[int, int] | str = 0,
+        stride: int | tuple[int, int] = (1, 1),
+        padding: int | tuple[int, int] | str = (0, 0),
         groups: int = 1,
         use_bias: bool = True,
-        dilation: int | tuple[int, int] = 1,
+        dilation: int | tuple[int, int] = (1, 1),
         padding_mode: str = "zeros",
         device: Device | str | None = None,
         dtype: DType | None = None,
@@ -30,6 +30,7 @@ class Conv2d(nn.Conv2d, WeightedModule):
             device,
             dtype,
         )
+        self.use_bias = use_bias
 
 
 class Conv1d(nn.Conv1d, WeightedModule):
diff --git a/src/refiners/fluxion/layers/module.py b/src/refiners/fluxion/layers/module.py
index edf864b..5a4ea2d 100644
--- a/src/refiners/fluxion/layers/module.py
+++ b/src/refiners/fluxion/layers/module.py
@@ -1,5 +1,6 @@
+from inspect import signature, Parameter
 from pathlib import Path
-from typing import Any, Generator, TypeVar
+from typing import Any, Generator, TypeVar, TypedDict, cast
 
 from torch import device as Device, dtype as DType
 from torch.nn.modules.module import Module as TorchModule
@@ -7,18 +8,20 @@ from torch.nn.modules.module import Module as TorchModule
 from refiners.fluxion.utils import load_from_safetensors
 from refiners.fluxion.context import Context, ContextProvider
 
-from typing import Callable, TYPE_CHECKING
+from typing import Callable, TYPE_CHECKING, Sequence
 
 if TYPE_CHECKING:
     from refiners.fluxion.layers.chain import Chain
 
 T = TypeVar("T", bound="Module")
 TContextModule = TypeVar("TContextModule", bound="ContextModule")
+BasicType = str | float | int | bool
 
 
 class Module(TorchModule):
     _parameters: dict[str, Any]
     _buffers: dict[str, Any]
+    _tag: str = ""
 
     __getattr__: Callable[["Module", str], Any]  # type: ignore
     __setattr__: Callable[["Module", str, Any], None]  # type: ignore
@@ -37,6 +40,56 @@ class Module(TorchModule):
     def to(self: T, device: Device | str | None = None, dtype: DType | None = None) -> T:  # type: ignore
         return super().to(device=device, dtype=dtype)  # type: ignore
 
+    def __str__(self) -> str:
+        basic_attributes_str = ", ".join(
+            f"{key}={value}" for key, value in self.basic_attributes(init_attrs_only=True).items()
+        )
+        result = f"{self.__class__.__name__}({basic_attributes_str})"
+        return result
+
+    def __repr__(self) -> str:
+        tree = ModuleTree(module=self)
+        return repr(tree)
+
+    def pretty_print(self, depth: int = -1) -> None:
+        tree = ModuleTree(module=self)
+        print(tree.generate_tree_repr(tree.root, is_root=True, depth=depth))
+
+    def basic_attributes(self, init_attrs_only: bool = False) -> dict[str, BasicType]:
+        """Return a dictionary of basic attributes of the module.
+
+        Basic attributes are public attributes made of basic types (int, float, str, bool) or a sequence of basic types.
+        """
+        sig = signature(obj=self.__init__)
+        init_params = set(sig.parameters.keys()) - {"self"}
+        default_values = {k: v.default for k, v in sig.parameters.items() if v.default is not Parameter.empty}
+
+        def is_basic_attribute(key: str, value: Any) -> bool:
+            if key.startswith("_"):
+                return False
+
+            if isinstance(value, BasicType):
+                return True
+
+            if isinstance(value, Sequence) and all(isinstance(y, BasicType) for y in cast(Sequence[Any], value)):
+                return True
+
+            return False
+
+        return {
+            key: str(object=value)
+            for key, value in self.__dict__.items()
+            if is_basic_attribute(key=key, value=value)
+            and (not init_attrs_only or (key in init_params and value != default_values.get(key)))
+        }
+
+    def _show_only_tag(self) -> bool:
+        """Whether to show only the tag when printing the module.
+
+        This is useful to distinguish between Chain subclasses that override their forward from one another.
+        """
+        return False
+
 
 class ContextModule(Module):
     # we store parent into a one element list to avoid pytorch thinking it's a submodule
@@ -100,3 +153,73 @@ class WeightedModule(Module):
     @property
     def dtype(self) -> DType:
         return self.weight.dtype
+
+
+class TreeNode(TypedDict):
+    value: str
+    children: list["TreeNode"]
+
+
+class ModuleTree:
+    def __init__(self, module: Module) -> None:
+        self.root: TreeNode = self._module_to_tree(module=module)
+        self._fold_successive_identical(node=self.root)
+
+    def __str__(self) -> str:
+        return f"{self.__class__.__name__}(root={self.root['value']})"
+
+    def __repr__(self) -> str:
+        return self.generate_tree_repr(node=self.root, is_root=True, depth=7)
+
+    def generate_tree_repr(
+        self, node: TreeNode, prefix: str = "", is_last: bool = True, is_root: bool = True, depth: int = -1
+    ) -> str:
+        if depth == 0:
+            return ""
+
+        if depth > 0:
+            depth -= 1
+
+        tree_icon: str = "" if is_root else ("└── " if is_last else "├── ")
+        lines = [f"{prefix}{tree_icon}{node['value']}"]
+        new_prefix: str = "    " if is_last else "│   "
+
+        for i, child in enumerate(iterable=node["children"]):
+            lines.append(
+                self.generate_tree_repr(
+                    node=child,
+                    prefix=prefix + new_prefix,
+                    is_last=i == len(node["children"]) - 1,
+                    is_root=False,
+                    depth=depth,
+                )
+            )
+
+        return "\n".join(filter(bool, lines))
+
+    def _module_to_tree(self, module: Module) -> TreeNode:
+        match (module._tag, module._show_only_tag()):  # pyright: ignore[reportPrivateUsage]
+            case ("", False):
+                value = str(object=module)
+            case (_, True):
+                value = f"({module._tag})"  # pyright: ignore[reportPrivateUsage]
+            case (_, False):
+                value = f"({module._tag}) {module}"  # pyright: ignore[reportPrivateUsage]
+
+        node: TreeNode = {"value": value, "children": []}
+        for child in module.children():
+            node["children"].append(self._module_to_tree(module=child))  # type: ignore
+        return node
+
+    def _fold_successive_identical(self, node: TreeNode) -> None:
+        i = 0
+        while i < len(node["children"]):
+            j = i
+            while j < len(node["children"]) and node["children"][i] == node["children"][j]:
+                j += 1
+            count = j - i
+            if count > 1:
+                node["children"][i]["value"] += f" (x{count})"
+                del node["children"][i + 1 : j]
+            self._fold_successive_identical(node=node["children"][i])
+            i += 1
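
Usage sketch (not part of the patch): the snippet below illustrates the new tree-style repr, assuming a refiners build with this patch applied and that refiners.fluxion.layers re-exports Chain, Flatten and Transpose as upstream does. The exact rendering of each leaf depends on Module.basic_attributes, so the output shown is indicative.

# Hypothetical usage sketch, not part of the diff above.
import refiners.fluxion.layers as fl

chain = fl.Chain(
    fl.Flatten(start_dim=1),
    fl.Flatten(start_dim=1),
    fl.Transpose(dim0=1, dim1=2),
)

# repr() now builds a ModuleTree: a bare Chain shows only its tag, leaves show
# their non-default init attributes, and identical consecutive children are
# folded into a single "(xN)" entry by _fold_successive_identical.
print(repr(chain))
# Expected output (indicative):
# (CHAIN)
#     ├── Flatten(start_dim=1) (x2)
#     └── Transpose(dim0=1, dim1=2)

# pretty_print(depth=...) limits how deep the tree is rendered; depth=1 prints
# only the root node.
chain.pretty_print(depth=1)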
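
A second sketch of the _show_only_tag behaviour: a container whose class is exactly one of the tagged Chain subclasses prints only its tag, while a user-defined subclass keeps both the inherited tag and its own name, which is what the _show_only_tag docstring refers to. The ScaledSum name below is made up for illustration.

# Hypothetical illustration of _show_only_tag, not part of the diff above.
import refiners.fluxion.layers as fl

class ScaledSum(fl.Sum):  # made-up subclass; inherits _tag = "SUM"
    pass

print(repr(fl.Sum(fl.Flatten(), fl.Flatten())))
# (SUM)
#     └── Flatten() (x2)

print(repr(ScaledSum(fl.Flatten(), fl.Flatten())))
# (SUM) ScaledSum()
#     └── Flatten() (x2)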