chore: rename "datasets" internal module to "dataset"

Laurent FAINSIN 2023-04-11 13:50:00 +02:00
parent b24a4cc4e9
commit a97c2e87f9
12 changed files with 224 additions and 13 deletions

dataset/rotor37_data.py (new file, +194 lines)

@@ -0,0 +1,194 @@
from pathlib import Path
import h5py
import numpy as np
import pyvista as pv
import torch
from rich.progress import track
from torch_geometric.data import Data, Dataset  # PyG Dataset API: root dir, process()/get(), Data graph container
DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")
VTKFILE_NOMINAL = DATASET_DIR / "ncs" / "nominal_blade.vtk"
H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"
H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"
CARDINALITY_TRAIN = 1000
CARDINALITY_TEST = 200
def rotate_nominal_blade(blade: pv.PolyData) -> None:
"""Rotate nominal blade points.
The nominal blade must be rotated to match the orientation of the other blades.
Rotations applied (sequentially) are:
- -90° around z-axis
- -90° around y-axis
Args:
blade (pyvista.PolyData): blade to rotate
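    Example (illustrative sketch; assumes the nominal VTK file is readable):
        >>> blade = pv.read(str(VTKFILE_NOMINAL))
        >>> rotate_nominal_blade(blade)  # points and normals are rotated in place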
"""
THETA = -90
PHI = -90
RZ = np.array(
[
[np.cos(np.deg2rad(THETA)), -np.sin(np.deg2rad(THETA)), 0],
[np.sin(np.deg2rad(THETA)), np.cos(np.deg2rad(THETA)), 0],
[0, 0, 1],
]
)
RY = np.array(
[
[np.cos(np.deg2rad(PHI)), 0, np.sin(np.deg2rad(PHI))],
[0, 1, 0],
[-np.sin(np.deg2rad(PHI)), 0, np.cos(np.deg2rad(PHI))],
]
)
# rotation of θ° around z-axis
blade.points = np.asarray(blade.points) @ RZ
blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RZ
# rotation of φ° around y-axis
blade.points = np.asarray(blade.points) @ RY
blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RY
class Rotor37Dataset(Dataset):
"""Rotor37 dataset.
    This dataset is a collection of 1200 graphs, each representing a blade of the NASA Rotor 37 axial compressor.
    The dataset is split into two subsets, train and test, with 1000 and 200 graphs respectively.
    Each graph is a 3D surface mesh carrying per-node deformations from a nominal blade, normals, faces and physical output fields.
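    Example (illustrative; root and splits as in the __main__ block below):
        >>> ds = Rotor37Dataset(root="./datasets/Rotor37/", split="train")
        >>> len(ds)
        1000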
"""
    def __init__(
        self,
        root: str,
        split: str = "train",
        transform=None,
        pre_transform=None,
    ):
        """Initialize a new Rotor37 dataset instance.
        Args:
            root (str): root directory of the dataset
            split (str): split of the dataset, either "train" or "test"
            transform (callable, optional): transform applied to each graph on access
            pre_transform (callable, optional): transform applied once during processing
        """
# set split
assert split in ("train", "test")
self.split = split
# set cardinality and h5file according to split
self.cardinality = CARDINALITY_TRAIN if split == "train" else CARDINALITY_TEST
self.h5file = H5FILE_TRAIN if split == "train" else H5FILE_TEST
super().__init__(root, transform, pre_transform)
@property
def raw_file_names(self) -> list[str]:
"""No raw files."""
return []
@property
def processed_file_names(self) -> list[str]:
"""Processed files are named data_{split}_{idx:04d}.pt, where idx is the index of the graph."""
return [f"data_{self.split}_{idx:04d}.pt" for idx in range(self.cardinality)]
def download(self):
"""No need to download, data already in cluster."""
pass
def process(self) -> None:
"""Process the dataset.
The dataset is processed by loading the nominal blade, and then loading all deformed blades.
For each deformed blade, the following attributes are computed and stored in a `Data` object:
- delta: deformed blade - nominal blade
- fields: physical properties of the blade
- normals: normals of the blade
- edges: edges of the blade
- faces: faces of the blade
The `Data` object is then saved to disk.
"""
# load nominal blade
vtk_reader = pv.get_reader(VTKFILE_NOMINAL)
nominal = vtk_reader.read()
rotate_nominal_blade(nominal)
nominal_positions = torch.as_tensor(nominal.points, dtype=torch.float32)
# load all deformed blades
with h5py.File(self.h5file, "r") as h5file:
            # NB: torch.as_tensor(np.asarray(data)) is a bit ugly,
            # but torch.as_tensor(data) complains about data being an array of numpy arrays, and is also slower
# common edges and faces matrix for each graph
edges = torch.as_tensor(np.asarray(h5file["adj"]), dtype=torch.int64).transpose(0, 1)
faces = torch.as_tensor(np.asarray(h5file["faces"]), dtype=torch.int64).transpose(0, 1)
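            # transposed to the (2, num_edges) / (3, num_faces) layout that torch_geometric expects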
# attributes specific to each graph
attributes = zip(
h5file["points"], # type: ignore
h5file["normals"], # type: ignore
h5file["output_fields"], # type: ignore
)
# for each graph
for idx, (positions, normals, fields) in track(
enumerate(attributes),
total=self.cardinality,
):
# convert to torch tensors
positions = torch.as_tensor(np.asarray(positions), dtype=torch.float32)
fields = torch.as_tensor(np.asarray(fields), dtype=torch.float32)
normals = torch.as_tensor(np.asarray(normals), dtype=torch.float32)
delta = positions - nominal_positions
                # assemble the graph and save it to disk
                # (attribute names follow the process() docstring above)
                data = Data(delta=delta, fields=fields, normals=normals, edges=edges, faces=faces)
                torch.save(data, self.processed_dir / f"data_{self.split}_{idx:04d}.pt")
def len(self) -> int:
"""Return the cardinality of the dataset."""
return self.cardinality
def get(self, idx) -> Data:
"""Load and return the graph `Data`.
Args:
idx (int): index of the graph to return
Returns:
Data: graph at index `idx`
"""
return torch.load(self.processed_dir / f"data_{self.split}_{idx:04d}.pt")
def __repr__(self) -> str:
"""Return a string representation of the dataset."""
return f"{self.__class__.__name__}({self.split}, {len(self)})"
@property
def processed_dir(self) -> Path:
"""Wrap processed_dir to return a Path instead of a str."""
return Path(super().processed_dir)
if __name__ == "__main__":
from torch_geometric.loader import DataLoader
# load test split
ds_test = Rotor37Dataset(root="./datasets/Rotor37/", split="test")
print(ds_test)
print(ds_test[0])
# create test data loader
ld_test = DataLoader(ds_test, batch_size=8, shuffle=True)
print(ld_test)
print(next(iter(ld_test)))
# load train split
ds_train = Rotor37Dataset(root="./datasets/Rotor37/", split="train")
print(ds_train)
print(ds_train[0])
# create train data loader
ld_train = DataLoader(ds_train, batch_size=8, shuffle=True)
print(ld_train)
print(next(iter(ld_train)))


@@ -102,6 +102,7 @@ class Uniform15KPC(Dataset):
         self.all_cate_mids = []
         self.cate_idx_lst = []
         self.all_points = []
+
         for cate_idx, subd in enumerate(self.subdirs):
             # NOTE: [subd] here is synset id
             sub_path = os.path.join(root_dir, subd, self.split)
@@ -158,7 +159,6 @@
             self.all_points_std = self.all_points.max(axis=1).reshape(B, 1, input_dim) - self.all_points.min(
                 axis=1
             ).reshape(B, 1, input_dim)
-
         else:  # normalize across the dataset
             self.all_points_mean = self.all_points.reshape(-1, input_dim).mean(axis=0).reshape(1, 1, input_dim)
             if normalize_std_per_axis:
@@ -292,6 +292,3 @@ class ShapeNet15kPointClouds(Uniform15KPC):
             input_dim=3,
             use_mask=use_mask,
         )
-
-####################################################################################
-


@@ -18,8 +18,12 @@ dependencies:
   - trimesh
   - scipy
   - scikit-learn
-  #---# tooling (linting, typing...)
+  - h5py
+  - pyvista
+  - datasets
+  #---# tooling
   - ruff
   - mypy
   - black
   - isort
+  - bandit


@@ -4,7 +4,15 @@ import numpy as np
 import torch
 import torch.nn as nn
-from modules import Attention, PointNetAModule, PointNetFPModule, PointNetSAModule, PVConv, SharedMLP, Swish
+from modules import (
+    Attention,
+    PointNetAModule,
+    PointNetFPModule,
+    PointNetSAModule,
+    PVConv,
+    SharedMLP,
+    Swish,
+)
 def _linear_gn_relu(in_channels, out_channels):


@@ -4,7 +4,15 @@ import numpy as np
 import torch
 import torch.nn as nn
-from modules import Attention, PointNetAModule, PointNetFPModule, PointNetSAModule, PVConv, SharedMLP, Swish
+from modules import (
+    Attention,
+    PointNetAModule,
+    PointNetFPModule,
+    PointNetSAModule,
+    PVConv,
+    SharedMLP,
+    Swish,
+)
 def _linear_gn_relu(in_channels, out_channels):


@@ -5,8 +5,8 @@ import torch.nn as nn
 import torch.utils.data
 from torch.distributions import Normal
-from datasets.shapenet_data_pc import ShapeNet15kPointClouds
-from datasets.shapenet_data_sv import *
+from dataset.shapenet_data_pc import ShapeNet15kPointClouds
+from dataset.shapenet_data_sv import *
 from metrics.evaluation_metrics import EMD_CD, compute_all_metrics
 from model.pvcnn_completion import PVCNN2Base
 from utils.file_utils import *


@@ -7,7 +7,7 @@ import torch.utils.data
 from torch.distributions import Normal
 from tqdm import tqdm
-from datasets.shapenet_data_pc import ShapeNet15kPointClouds
+from dataset.shapenet_data_pc import ShapeNet15kPointClouds
 from metrics.evaluation_metrics import compute_all_metrics
 from metrics.evaluation_metrics import jsd_between_point_cloud_sets as JSD
 from model.pvcnn_generation import PVCNN2Base


@@ -7,8 +7,8 @@ import torch.optim as optim
 import torch.utils.data
 from torch.distributions import Normal
-from datasets.shapenet_data_pc import ShapeNet15kPointClouds
-from datasets.shapenet_data_sv import ShapeNet_Multiview_Points
+from dataset.shapenet_data_pc import ShapeNet15kPointClouds
+from dataset.shapenet_data_sv import ShapeNet_Multiview_Points
 from model.pvcnn_completion import PVCNN2Base
 from utils.file_utils import *
 from utils.visualize import *


@@ -8,7 +8,7 @@ import torch.optim as optim
 import torch.utils.data
 from torch.distributions import Normal
-from datasets.shapenet_data_pc import ShapeNet15kPointClouds
+from dataset.shapenet_data_pc import ShapeNet15kPointClouds
 from model.pvcnn_generation import PVCNN2Base
 from utils.file_utils import *
 from utils.visualize import *