From df48f8272a59490f5e3784344b7c240b92b96a8d Mon Sep 17 00:00:00 2001 From: Laurent FAINSIN Date: Tue, 11 Apr 2023 16:00:54 +0200 Subject: [PATCH] feat: add huggingface rotor37 dataset --- .vscode/launch.json | 38 +++--- dataset/rotor37_data.py | 227 ++++++++--------------------------- dataset/test_rotor37_data.py | 11 ++ environment.yml | 1 - 4 files changed, 79 insertions(+), 198 deletions(-) create mode 100644 dataset/test_rotor37_data.py diff --git a/.vscode/launch.json b/.vscode/launch.json index 3c70c0b..94118b8 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,20 +1,20 @@ { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python: Current File", - "type": "python", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "justMyCode": true, - "args": [ - "--category", - "car", - ] - } - ] -} \ No newline at end of file + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": true, + "args": [ + "--category", + "car", + ] + } + ] +} diff --git a/dataset/rotor37_data.py b/dataset/rotor37_data.py index b15b265..6e247cb 100644 --- a/dataset/rotor37_data.py +++ b/dataset/rotor37_data.py @@ -1,194 +1,65 @@ from pathlib import Path +import datasets import h5py -import numpy as np -import pyvista as pv -import torch -from rich.progress import track -from torch.utils.data import Dataset DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/") -VTKFILE_NOMINAL = DATASET_DIR / "ncs" / "nominal_blade.vtk" H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5" H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5" -CARDINALITY_TRAIN = 1000 -CARDINALITY_TEST = 200 +N_POINTS = 29773 + +_VERSION = "1.0.0" + +_DESCRIPTION = """ +This dataset is a collection of 1200 pointclouds, each representing a blade of a wind turbine. +The dataset is split into 2 subsets: train and test, with 1000 and 200 clouds respectively. +Each pointcloud has 29773 points, each point has 3D coordinates, 3D normals and physical properties. +""" -def rotate_nominal_blade(blade: pv.PolyData) -> None: - """Rotate nominal blade points. +class Rotor37(datasets.GeneratorBasedBuilder): + """Rotor37 dataset.""" - The nominal blade must be rotated to match the orientation of the other blades. - Rotations applied (sequentially) are: - - -90° around z-axis - - -90° around y-axis + def _info(self): + return datasets.DatasetInfo( + version=_VERSION, + description=_DESCRIPTION, + features=datasets.Features( + { + "positions": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"), + "normals": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"), + "features": datasets.Array2D(shape=(N_POINTS, 4), dtype="float32"), + } + ), + ) - Args: - blade (pyvista.PolyData): blade to rotate - """ - THETA = -90 - PHI = -90 - - RZ = np.array( - [ - [np.cos(np.deg2rad(THETA)), -np.sin(np.deg2rad(THETA)), 0], - [np.sin(np.deg2rad(THETA)), np.cos(np.deg2rad(THETA)), 0], - [0, 0, 1], + def _split_generators(self, dl_manager): + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, # type: ignore + gen_kwargs={ + "h5file": H5FILE_TEST, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TRAIN, # type: ignore + gen_kwargs={ + "h5file": H5FILE_TRAIN, + }, + ), ] - ) - RY = np.array( - [ - [np.cos(np.deg2rad(PHI)), 0, np.sin(np.deg2rad(PHI))], - [0, 1, 0], - [-np.sin(np.deg2rad(PHI)), 0, np.cos(np.deg2rad(PHI))], - ] - ) - - # rotation of θ° around z-axis - blade.points = np.asarray(blade.points) @ RZ - blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RZ - - # rotation of φ° around y-axis - blade.points = np.asarray(blade.points) @ RY - blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RY - - -class Rotor37Dataset(Dataset): - """Rotor37 dataset. - - This dataset is a collection of 1200 graphs, each representing a blade of a wind turbine. - The dataset is split into 2 subsets: train and test, with 1000 and 200 graphs respectively. - Each graph is a 3D mesh, with 3D deformations from a nominal blade, 3D normals, 3D faces and physical properties. - """ - - def __init__( - self, - root: str, - split: str = "train", - ): - """Initialize a new Rotor37 dataset instance. - - Args: - root (str): root directory of the dataset - split (str): split of the dataset, either "train" or "test" - """ - # set split - assert split in ("train", "test") - self.split = split - - # set cardinality and h5file according to split - self.cardinality = CARDINALITY_TRAIN if split == "train" else CARDINALITY_TEST - self.h5file = H5FILE_TRAIN if split == "train" else H5FILE_TEST - - super().__init__(root, transform, pre_transform) - - @property - def raw_file_names(self) -> list[str]: - """No raw files.""" - return [] - - @property - def processed_file_names(self) -> list[str]: - """Processed files are named data_{split}_{idx:04d}.pt, where idx is the index of the graph.""" - return [f"data_{self.split}_{idx:04d}.pt" for idx in range(self.cardinality)] - - def download(self): - """No need to download, data already in cluster.""" - pass - - def process(self) -> None: - """Process the dataset. - - The dataset is processed by loading the nominal blade, and then loading all deformed blades. - For each deformed blade, the following attributes are computed and stored in a `Data` object: - - delta: deformed blade - nominal blade - - fields: physical properties of the blade - - normals: normals of the blade - - edges: edges of the blade - - faces: faces of the blade - - The `Data` object is then saved to disk. - """ - # load nominal blade - vtk_reader = pv.get_reader(VTKFILE_NOMINAL) - nominal = vtk_reader.read() - rotate_nominal_blade(nominal) - nominal_positions = torch.as_tensor(nominal.points, dtype=torch.float32) - - # load all deformed blades - with h5py.File(self.h5file, "r") as h5file: - # NB: torch.as_tensor(np.asarray(data)) is a bit ugly - # but torch torch.as_tensor(data) complains about data being an array of numpy arrays, and is also slower - - # common edges and faces matrix for each graph - edges = torch.as_tensor(np.asarray(h5file["adj"]), dtype=torch.int64).transpose(0, 1) - faces = torch.as_tensor(np.asarray(h5file["faces"]), dtype=torch.int64).transpose(0, 1) - - # attributes specific to each graph + def _generate_examples(self, h5file: Path): + with h5py.File(h5file, "r") as f: attributes = zip( - h5file["points"], # type: ignore - h5file["normals"], # type: ignore - h5file["output_fields"], # type: ignore + f["points"], # type: ignore + f["normals"], # type: ignore + f["output_fields"], # type: ignore ) - # for each graph - for idx, (positions, normals, fields) in track( - enumerate(attributes), - total=self.cardinality, - ): - # convert to torch tensors - positions = torch.as_tensor(np.asarray(positions), dtype=torch.float32) - fields = torch.as_tensor(np.asarray(fields), dtype=torch.float32) - normals = torch.as_tensor(np.asarray(normals), dtype=torch.float32) - delta = positions - nominal_positions - - # save data to disk - - def len(self) -> int: - """Return the cardinality of the dataset.""" - return self.cardinality - - def get(self, idx) -> Data: - """Load and return the graph `Data`. - - Args: - idx (int): index of the graph to return - - Returns: - Data: graph at index `idx` - """ - return torch.load(self.processed_dir / f"data_{self.split}_{idx:04d}.pt") - - def __repr__(self) -> str: - """Return a string representation of the dataset.""" - return f"{self.__class__.__name__}({self.split}, {len(self)})" - - @property - def processed_dir(self) -> Path: - """Wrap processed_dir to return a Path instead of a str.""" - return Path(super().processed_dir) - - -if __name__ == "__main__": - from torch_geometric.loader import DataLoader - - # load test split - ds_test = Rotor37Dataset(root="./datasets/Rotor37/", split="test") - print(ds_test) - print(ds_test[0]) - - # create test data loader - ld_test = DataLoader(ds_test, batch_size=8, shuffle=True) - print(ld_test) - print(next(iter(ld_test))) - - # load train split - ds_train = Rotor37Dataset(root="./datasets/Rotor37/", split="train") - print(ds_train) - print(ds_train[0]) - - # create train data loader - ld_train = DataLoader(ds_train, batch_size=8, shuffle=True) - print(ld_train) - print(next(iter(ld_train))) + for index, (positions, normals, fields) in enumerate(attributes): + yield index, { + "positions": positions, + "normals": normals, + "features": fields, + } diff --git a/dataset/test_rotor37_data.py b/dataset/test_rotor37_data.py new file mode 100644 index 0000000..f25852b --- /dev/null +++ b/dataset/test_rotor37_data.py @@ -0,0 +1,11 @@ +import datasets + +train_ds = datasets.load_dataset("dataset/rotor37_data.py", split="train") +train_ds = train_ds.with_format("torch") +print(train_ds) + +test_ds = datasets.load_dataset("dataset/rotor37_data.py", split="test") +test_ds = test_ds.with_format("torch") +print(test_ds) + +print("yay") diff --git a/environment.yml b/environment.yml index 78a0b86..14f7796 100644 --- a/environment.yml +++ b/environment.yml @@ -18,7 +18,6 @@ dependencies: - trimesh - scipy - scikit-learn - - h5py - pyvista - datasets #---# toolings