feat: add huggingface rotor37 dataset

2023-04-11 16:00:54 +02:00 · 2023-04-11 16:00:54 +02:00 · df48f8272a
parent a97c2e87f9
commit df48f8272a
4 changed files with 79 additions and 198 deletions
--- a/dataset/rotor37_data.py
+++ b/dataset/rotor37_data.py
@ -1,194 +1,65 @@
 from pathlib import Path

+import datasets
 import h5py
-import numpy as np
-import pyvista as pv
-import torch
-from rich.progress import track
-from torch.utils.data import Dataset

 DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")
-VTKFILE_NOMINAL = DATASET_DIR / "ncs" / "nominal_blade.vtk"
 H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"
 H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"
-CARDINALITY_TRAIN = 1000
-CARDINALITY_TEST = 200
+N_POINTS = 29773

+_VERSION = "1.0.0"

-def rotate_nominal_blade(blade: pv.PolyData) -> None:
-    """Rotate nominal blade points.
-
-    The nominal blade must be rotated to match the orientation of the other blades.
-    Rotations applied (sequentially) are:
-        - -90° around z-axis
-        - -90° around y-axis
-
-    Args:
-        blade (pyvista.PolyData): blade to rotate
+_DESCRIPTION = """
+This dataset is a collection of 1200 point clouds, each representing a blade of the NASA Rotor 37 compressor.
+The dataset is split into 2 subsets: train and test, with 1000 and 200 clouds respectively.
+Each point cloud has 29773 points; each point has 3D coordinates, a 3D normal and 4 physical properties.
 """
-    THETA = -90
-    PHI = -90

-    RZ = np.array(
-        [
-            [np.cos(np.deg2rad(THETA)), -np.sin(np.deg2rad(THETA)), 0],
-            [np.sin(np.deg2rad(THETA)), np.cos(np.deg2rad(THETA)), 0],
-            [0, 0, 1],
-        ]
+
+class Rotor37(datasets.GeneratorBasedBuilder):
+    """Rotor37 dataset."""
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            version=_VERSION,
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "positions": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
+                    "normals": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
+                    "features": datasets.Array2D(shape=(N_POINTS, 4), dtype="float32"),
+                }
+            ),
        )

-    RY = np.array(
-        [
-            [np.cos(np.deg2rad(PHI)), 0, np.sin(np.deg2rad(PHI))],
-            [0, 1, 0],
-            [-np.sin(np.deg2rad(PHI)), 0, np.cos(np.deg2rad(PHI))],
+    def _split_generators(self, dl_manager):
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,  # type: ignore
+                gen_kwargs={
+                    "h5file": H5FILE_TEST,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,  # type: ignore
+                gen_kwargs={
+                    "h5file": H5FILE_TRAIN,
+                },
+            ),
        ]
-    )

-    # rotation of θ° around z-axis
-    blade.points = np.asarray(blade.points) @ RZ
-    blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RZ
-
-    # rotation of φ° around y-axis
-    blade.points = np.asarray(blade.points) @ RY
-    blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RY
-
-
-class Rotor37Dataset(Dataset):
-    """Rotor37 dataset.
-
-    This dataset is a collection of 1200 graphs, each representing a blade of a wind turbine.
-    The dataset is split into 2 subsets: train and test, with 1000 and 200 graphs respectively.
-    Each graph is a 3D mesh, with 3D deformations from a nominal blade, 3D normals, 3D faces and physical properties.
-    """
-
-    def __init__(
-        self,
-        root: str,
-        split: str = "train",
-    ):
-        """Initialize a new Rotor37 dataset instance.
-
-        Args:
-            root (str): root directory of the dataset
-            split (str): split of the dataset, either "train" or "test"
-        """
-        # set split
-        assert split in ("train", "test")
-        self.split = split
-
-        # set cardinality and h5file according to split
-        self.cardinality = CARDINALITY_TRAIN if split == "train" else CARDINALITY_TEST
-        self.h5file = H5FILE_TRAIN if split == "train" else H5FILE_TEST
-
-        super().__init__(root, transform, pre_transform)
-
-    @property
-    def raw_file_names(self) -> list[str]:
-        """No raw files."""
-        return []
-
-    @property
-    def processed_file_names(self) -> list[str]:
-        """Processed files are named data_{split}_{idx:04d}.pt, where idx is the index of the graph."""
-        return [f"data_{self.split}_{idx:04d}.pt" for idx in range(self.cardinality)]
-
-    def download(self):
-        """No need to download, data already in cluster."""
-        pass
-
-    def process(self) -> None:
-        """Process the dataset.
-
-        The dataset is processed by loading the nominal blade, and then loading all deformed blades.
-        For each deformed blade, the following attributes are computed and stored in a `Data` object:
-            - delta: deformed blade - nominal blade
-            - fields: physical properties of the blade
-            - normals: normals of the blade
-            - edges: edges of the blade
-            - faces: faces of the blade
-
-        The `Data` object is then saved to disk.
-        """
-        # load nominal blade
-        vtk_reader = pv.get_reader(VTKFILE_NOMINAL)
-        nominal = vtk_reader.read()
-        rotate_nominal_blade(nominal)
-        nominal_positions = torch.as_tensor(nominal.points, dtype=torch.float32)
-
-        # load all deformed blades
-        with h5py.File(self.h5file, "r") as h5file:
-            # NB: torch.as_tensor(np.asarray(data)) is a bit ugly
-            # but torch torch.as_tensor(data) complains about data being an array of numpy arrays, and is also slower
-
-            # common edges and faces matrix for each graph
-            edges = torch.as_tensor(np.asarray(h5file["adj"]), dtype=torch.int64).transpose(0, 1)
-            faces = torch.as_tensor(np.asarray(h5file["faces"]), dtype=torch.int64).transpose(0, 1)
-
-            # attributes specific to each graph
+    def _generate_examples(self, h5file: Path):
+        with h5py.File(h5file, "r") as f:
            attributes = zip(
-                h5file["points"],  # type: ignore
-                h5file["normals"],  # type: ignore
-                h5file["output_fields"],  # type: ignore
+                f["points"],  # type: ignore
+                f["normals"],  # type: ignore
+                f["output_fields"],  # type: ignore
            )

-            # for each graph
-            for idx, (positions, normals, fields) in track(
-                enumerate(attributes),
-                total=self.cardinality,
-            ):
-                # convert to torch tensors
-                positions = torch.as_tensor(np.asarray(positions), dtype=torch.float32)
-                fields = torch.as_tensor(np.asarray(fields), dtype=torch.float32)
-                normals = torch.as_tensor(np.asarray(normals), dtype=torch.float32)
-                delta = positions - nominal_positions
-
-                # save data to disk
-
-    def len(self) -> int:
-        """Return the cardinality of the dataset."""
-        return self.cardinality
-
-    def get(self, idx) -> Data:
-        """Load and return the graph `Data`.
-
-        Args:
-            idx (int): index of the graph to return
-
-        Returns:
-            Data: graph at index `idx`
-        """
-        return torch.load(self.processed_dir / f"data_{self.split}_{idx:04d}.pt")
-
-    def __repr__(self) -> str:
-        """Return a string representation of the dataset."""
-        return f"{self.__class__.__name__}({self.split}, {len(self)})"
-
-    @property
-    def processed_dir(self) -> Path:
-        """Wrap processed_dir to return a Path instead of a str."""
-        return Path(super().processed_dir)
-
-
-if __name__ == "__main__":
-    from torch_geometric.loader import DataLoader
-
-    # load test split
-    ds_test = Rotor37Dataset(root="./datasets/Rotor37/", split="test")
-    print(ds_test)
-    print(ds_test[0])
-
-    # create test data loader
-    ld_test = DataLoader(ds_test, batch_size=8, shuffle=True)
-    print(ld_test)
-    print(next(iter(ld_test)))
-
-    # load train split
-    ds_train = Rotor37Dataset(root="./datasets/Rotor37/", split="train")
-    print(ds_train)
-    print(ds_train[0])
-
-    # create train data loader
-    ld_train = DataLoader(ds_train, batch_size=8, shuffle=True)
-    print(ld_train)
-    print(next(iter(ld_train)))
+            for index, (positions, normals, fields) in enumerate(attributes):
+                yield index, {
+                    "positions": positions,
+                    "normals": normals,
+                    "features": fields,
+                }
--- a/dataset/test_rotor37_data.py
+++ b/dataset/test_rotor37_data.py
@ -0,0 +1,11 @@
+import datasets
+
+train_ds = datasets.load_dataset("dataset/rotor37_data.py", split="train")
+train_ds = train_ds.with_format("torch")
+print(train_ds)
+
+test_ds = datasets.load_dataset("dataset/rotor37_data.py", split="test")
+test_ds = test_ds.with_format("torch")
+print(test_ds)
+
+print("yay")
--- a/environment.yml
+++ b/environment.yml
@ -18,7 +18,6 @@ dependencies:
  - trimesh
  - scipy
  - scikit-learn
-  - h5py
  - pyvista
  - datasets
 #---# toolings