from pathlib import Path

import datasets
import h5py
import numpy as np

DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")
H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"
H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"

# Every cloud in the dataset has exactly this many points (fixed-size Array2D below).
N_POINTS = 29773

_VERSION = "1.0.0"
_DESCRIPTION = """
This dataset is a collection of 1200 pointclouds, each representing a blade of a wind turbine.
The dataset is split into 2 subsets: train and test, with 1000 and 200 clouds respectively.
Each pointcloud has 29773 points, each point has 3D coordinates, 3D normals and physical properties.
"""


class Rotor37(datasets.GeneratorBasedBuilder):
    """Rotor37 dataset: per-point positions, normals, and output fields from HDF5."""

    def _info(self):
        """Declare the dataset schema: three fixed-shape float32 per-point arrays."""
        return datasets.DatasetInfo(
            version=_VERSION,
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "positions": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
                    "normals": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
                    "features": datasets.Array2D(shape=(N_POINTS, 4), dtype="float32"),
                }
            ),
        )

    def _split_generators(self, dl_manager):
        """Map each split to its HDF5 file; files are local so `dl_manager` is unused."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,  # type: ignore
                gen_kwargs={"h5file": H5FILE_TEST},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,  # type: ignore
                gen_kwargs={"h5file": H5FILE_TRAIN},
            ),
        ]

    def _generate_examples(self, h5file: Path):
        """Yield `(index, example)` pairs from one split's HDF5 file.

        Positions are standardized per coordinate axis (zero mean, unit std)
        using statistics computed over all clouds *of this file only*.

        NOTE(review): because this runs once per split, the test split is
        normalized with its own mean/std rather than the train statistics —
        confirm this is intended (it is a common source of train/test skew).
        NOTE(review): assumes no coordinate axis is constant; a zero std
        would make the division below produce inf/nan.
        """
        with h5py.File(h5file, "r") as f:
            # Load all positions into memory to compute per-axis statistics.
            positions = np.asarray(f["points"])
            positions_mean = positions.mean(axis=(0, 1))
            positions_std = positions.std(axis=(0, 1))
            positions = (positions - positions_mean) / positions_std

            # Iterate the three per-cloud arrays in lockstep. Distinct loop
            # variable names avoid shadowing the `positions` array being zipped
            # (the original reused the name, which only worked because `zip`
            # captures its iterators eagerly).
            examples = zip(
                positions,
                f["normals"],  # type: ignore
                f["output_fields"],  # type: ignore
            )
            for index, (cloud_positions, cloud_normals, cloud_fields) in enumerate(examples):
                yield index, {
                    "positions": cloud_positions,
                    "normals": cloud_normals,
                    "features": cloud_fields,
                }