PVD/dataset/rotor37_data.py

from pathlib import Path

import h5py
import numpy as np
import pyvista as pv
import torch
from rich.progress import track
from torch.utils.data import Dataset

DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")
VTKFILE_NOMINAL = DATASET_DIR / "ncs" / "nominal_blade.vtk"
H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"
H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"
CARDINALITY_TRAIN = 1000
CARDINALITY_TEST = 200


def rotate_nominal_blade(blade: pv.PolyData) -> None:
    """Rotate nominal blade points.

    The nominal blade must be rotated to match the orientation of the other blades.
    Rotations applied (sequentially) are:
        - -90° around z-axis
        - -90° around y-axis

    Args:
        blade (pyvista.PolyData): blade to rotate
    """
    THETA = -90
    PHI = -90

    RZ = np.array(
        [
            [np.cos(np.deg2rad(THETA)), -np.sin(np.deg2rad(THETA)), 0],
            [np.sin(np.deg2rad(THETA)), np.cos(np.deg2rad(THETA)), 0],
            [0, 0, 1],
        ]
    )

    RY = np.array(
        [
            [np.cos(np.deg2rad(PHI)), 0, np.sin(np.deg2rad(PHI))],
            [0, 1, 0],
            [-np.sin(np.deg2rad(PHI)), 0, np.cos(np.deg2rad(PHI))],
        ]
    )

    # rotation of θ° around z-axis
    blade.points = np.asarray(blade.points) @ RZ
    blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RZ

    # rotation of φ° around y-axis
    blade.points = np.asarray(blade.points) @ RY
    blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RY


class Rotor37Dataset(Dataset):
    """Rotor37 dataset.

    This dataset is a collection of 1200 graphs, each representing a blade of a wind turbine.
    The dataset is split into 2 subsets: train and test, with 1000 and 200 graphs respectively.
    Each graph is a 3D mesh, with 3D deformations from a nominal blade, 3D normals, 3D faces and physical properties.
    """

    def __init__(
        self,
        root: str,
        split: str = "train",
    ):
        """Initialize a new Rotor37 dataset instance.

        Args:
            root (str): root directory of the dataset
            split (str): split of the dataset, either "train" or "test"
        """
        # set split
        assert split in ("train", "test")
        self.split = split

        # set cardinality and h5file according to split
        self.cardinality = CARDINALITY_TRAIN if split == "train" else CARDINALITY_TEST
        self.h5file = H5FILE_TRAIN if split == "train" else H5FILE_TEST

        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self) -> list[str]:
        """No raw files."""
        return []

    @property
    def processed_file_names(self) -> list[str]:
        """Processed files are named data_{split}_{idx:04d}.pt, where idx is the index of the graph."""
        return [f"data_{self.split}_{idx:04d}.pt" for idx in range(self.cardinality)]

    def download(self):
        """No need to download, data already in cluster."""
        pass

    def process(self) -> None:
        """Process the dataset.

        The dataset is processed by loading the nominal blade, and then loading all deformed blades.
        For each deformed blade, the following attributes are computed and stored in a `Data` object:
            - delta: deformed blade - nominal blade
            - fields: physical properties of the blade
            - normals: normals of the blade
            - edges: edges of the blade
            - faces: faces of the blade

        The `Data` object is then saved to disk.
        """
        # load nominal blade
        vtk_reader = pv.get_reader(VTKFILE_NOMINAL)
        nominal = vtk_reader.read()
        rotate_nominal_blade(nominal)
        nominal_positions = torch.as_tensor(nominal.points, dtype=torch.float32)

        # load all deformed blades
        with h5py.File(self.h5file, "r") as h5file:
            # NB: torch.as_tensor(np.asarray(data)) is a bit ugly
            # but torch torch.as_tensor(data) complains about data being an array of numpy arrays, and is also slower

            # common edges and faces matrix for each graph
            edges = torch.as_tensor(np.asarray(h5file["adj"]), dtype=torch.int64).transpose(0, 1)
            faces = torch.as_tensor(np.asarray(h5file["faces"]), dtype=torch.int64).transpose(0, 1)

            # attributes specific to each graph
            attributes = zip(
                h5file["points"],  # type: ignore
                h5file["normals"],  # type: ignore
                h5file["output_fields"],  # type: ignore
            )

            # for each graph
            for idx, (positions, normals, fields) in track(
                enumerate(attributes),
                total=self.cardinality,
            ):
                # convert to torch tensors
                positions = torch.as_tensor(np.asarray(positions), dtype=torch.float32)
                fields = torch.as_tensor(np.asarray(fields), dtype=torch.float32)
                normals = torch.as_tensor(np.asarray(normals), dtype=torch.float32)
                delta = positions - nominal_positions

                # save data to disk

    def len(self) -> int:
        """Return the cardinality of the dataset."""
        return self.cardinality

    def get(self, idx) -> Data:
        """Load and return the graph `Data`.

        Args:
            idx (int): index of the graph to return

        Returns:
            Data: graph at index `idx`
        """
        return torch.load(self.processed_dir / f"data_{self.split}_{idx:04d}.pt")

    def __repr__(self) -> str:
        """Return a string representation of the dataset."""
        return f"{self.__class__.__name__}({self.split}, {len(self)})"

    @property
    def processed_dir(self) -> Path:
        """Wrap processed_dir to return a Path instead of a str."""
        return Path(super().processed_dir)


if __name__ == "__main__":
    from torch_geometric.loader import DataLoader

    # load test split
    ds_test = Rotor37Dataset(root="./datasets/Rotor37/", split="test")
    print(ds_test)
    print(ds_test[0])

    # create test data loader
    ld_test = DataLoader(ds_test, batch_size=8, shuffle=True)
    print(ld_test)
    print(next(iter(ld_test)))

    # load train split
    ds_train = Rotor37Dataset(root="./datasets/Rotor37/", split="train")
    print(ds_train)
    print(ds_train[0])

    # create train data loader
    ld_train = DataLoader(ds_train, batch_size=8, shuffle=True)
    print(ld_train)
    print(next(iter(ld_train)))
chore: rename "datasets" internal module to "dataset" 2023-04-11 11:50:00 +00:00			`from pathlib import Path`

			`import h5py`
			`import numpy as np`
			`import pyvista as pv`
			`import torch`
			`from rich.progress import track`
			`from torch.utils.data import Dataset`

			`DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")`
			`VTKFILE_NOMINAL = DATASET_DIR / "ncs" / "nominal_blade.vtk"`
			`H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"`
			`H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"`
			`CARDINALITY_TRAIN = 1000`
			`CARDINALITY_TEST = 200`


			`def rotate_nominal_blade(blade: pv.PolyData) -> None:`
			`"""Rotate nominal blade points.`

			`The nominal blade must be rotated to match the orientation of the other blades.`
			`Rotations applied (sequentially) are:`
			`- -90° around z-axis`
			`- -90° around y-axis`

			`Args:`
			`blade (pyvista.PolyData): blade to rotate`
			`"""`
			`THETA = -90`
			`PHI = -90`

			`RZ = np.array(`
			`[`
			`[np.cos(np.deg2rad(THETA)), -np.sin(np.deg2rad(THETA)), 0],`
			`[np.sin(np.deg2rad(THETA)), np.cos(np.deg2rad(THETA)), 0],`
			`[0, 0, 1],`
			`]`
			`)`

			`RY = np.array(`
			`[`
			`[np.cos(np.deg2rad(PHI)), 0, np.sin(np.deg2rad(PHI))],`
			`[0, 1, 0],`
			`[-np.sin(np.deg2rad(PHI)), 0, np.cos(np.deg2rad(PHI))],`
			`]`
			`)`

			`# rotation of θ° around z-axis`
			`blade.points = np.asarray(blade.points) @ RZ`
			`blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RZ`

			`# rotation of φ° around y-axis`
			`blade.points = np.asarray(blade.points) @ RY`
			`blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RY`


			`class Rotor37Dataset(Dataset):`
			`"""Rotor37 dataset.`

			`This dataset is a collection of 1200 graphs, each representing a blade of a wind turbine.`
			`The dataset is split into 2 subsets: train and test, with 1000 and 200 graphs respectively.`
			`Each graph is a 3D mesh, with 3D deformations from a nominal blade, 3D normals, 3D faces and physical properties.`
			`"""`

			`def __init__(`
			`self,`
			`root: str,`
			`split: str = "train",`
			`):`
			`"""Initialize a new Rotor37 dataset instance.`

			`Args:`
			`root (str): root directory of the dataset`
			`split (str): split of the dataset, either "train" or "test"`
			`"""`
			`# set split`
			`assert split in ("train", "test")`
			`self.split = split`

			`# set cardinality and h5file according to split`
			`self.cardinality = CARDINALITY_TRAIN if split == "train" else CARDINALITY_TEST`
			`self.h5file = H5FILE_TRAIN if split == "train" else H5FILE_TEST`

			`super().__init__(root, transform, pre_transform)`

			`@property`
			`def raw_file_names(self) -> list[str]:`
			`"""No raw files."""`
			`return []`

			`@property`
			`def processed_file_names(self) -> list[str]:`
			`"""Processed files are named data_{split}_{idx:04d}.pt, where idx is the index of the graph."""`
			`return [f"data_{self.split}_{idx:04d}.pt" for idx in range(self.cardinality)]`

			`def download(self):`
			`"""No need to download, data already in cluster."""`
			`pass`

			`def process(self) -> None:`
			`"""Process the dataset.`

			`The dataset is processed by loading the nominal blade, and then loading all deformed blades.`
			For each deformed blade, the following attributes are computed and stored in a `Data` object:
			`- delta: deformed blade - nominal blade`
			`- fields: physical properties of the blade`
			`- normals: normals of the blade`
			`- edges: edges of the blade`
			`- faces: faces of the blade`

			The `Data` object is then saved to disk.
			`"""`
			`# load nominal blade`
			`vtk_reader = pv.get_reader(VTKFILE_NOMINAL)`
			`nominal = vtk_reader.read()`
			`rotate_nominal_blade(nominal)`
			`nominal_positions = torch.as_tensor(nominal.points, dtype=torch.float32)`

			`# load all deformed blades`
			`with h5py.File(self.h5file, "r") as h5file:`
			`# NB: torch.as_tensor(np.asarray(data)) is a bit ugly`
			`# but torch torch.as_tensor(data) complains about data being an array of numpy arrays, and is also slower`

			`# common edges and faces matrix for each graph`
			`edges = torch.as_tensor(np.asarray(h5file["adj"]), dtype=torch.int64).transpose(0, 1)`
			`faces = torch.as_tensor(np.asarray(h5file["faces"]), dtype=torch.int64).transpose(0, 1)`

			`# attributes specific to each graph`
			`attributes = zip(`
			`h5file["points"], # type: ignore`
			`h5file["normals"], # type: ignore`
			`h5file["output_fields"], # type: ignore`
			`)`

			`# for each graph`
			`for idx, (positions, normals, fields) in track(`
			`enumerate(attributes),`
			`total=self.cardinality,`
			`):`
			`# convert to torch tensors`
			`positions = torch.as_tensor(np.asarray(positions), dtype=torch.float32)`
			`fields = torch.as_tensor(np.asarray(fields), dtype=torch.float32)`
			`normals = torch.as_tensor(np.asarray(normals), dtype=torch.float32)`
			`delta = positions - nominal_positions`

			`# save data to disk`

			`def len(self) -> int:`
			`"""Return the cardinality of the dataset."""`
			`return self.cardinality`

			`def get(self, idx) -> Data:`
			"""Load and return the graph `Data`.

			`Args:`
			`idx (int): index of the graph to return`

			`Returns:`
			Data: graph at index `idx`
			`"""`
			`return torch.load(self.processed_dir / f"data_{self.split}_{idx:04d}.pt")`

			`def __repr__(self) -> str:`
			`"""Return a string representation of the dataset."""`
			`return f"{self.__class__.__name__}({self.split}, {len(self)})"`

			`@property`
			`def processed_dir(self) -> Path:`
			`"""Wrap processed_dir to return a Path instead of a str."""`
			`return Path(super().processed_dir)`


			`if __name__ == "__main__":`
			`from torch_geometric.loader import DataLoader`

			`# load test split`
			`ds_test = Rotor37Dataset(root="./datasets/Rotor37/", split="test")`
			`print(ds_test)`
			`print(ds_test[0])`

			`# create test data loader`
			`ld_test = DataLoader(ds_test, batch_size=8, shuffle=True)`
			`print(ld_test)`
			`print(next(iter(ld_test)))`

			`# load train split`
			`ds_train = Rotor37Dataset(root="./datasets/Rotor37/", split="train")`
			`print(ds_train)`
			`print(ds_train[0])`

			`# create train data loader`
			`ld_train = DataLoader(ds_train, batch_size=8, shuffle=True)`
			`print(ld_train)`
			`print(next(iter(ld_train)))`