from pathlib import Path

import datasets
import h5py
import numpy as np

DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")
H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"
H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"
N_POINTS = 29773

_VERSION = "1.0.0"

_DESCRIPTION = """
|
|
This dataset is a collection of 1200 pointclouds, each representing a blade of a wind turbine.
|
|
The dataset is split into 2 subsets: train and test, with 1000 and 200 clouds respectively.
|
|
Each pointcloud has 29773 points, each point has 3D coordinates, 3D normals and physical properties.
|
|
"""
|
|
|
|
|
|
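# Expected layout of each HDF5 file (inferred from the reads in
# `_generate_examples` below; an assumption, not an official schema):
#   points:         float array of shape (n_clouds, N_POINTS, 3) - point coordinates
#   normals:        float array of shape (n_clouds, N_POINTS, 3) - per-point normals
#   output_fields:  float array of shape (n_clouds, N_POINTS, 4) - per-point physical fields

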
class Rotor37(datasets.GeneratorBasedBuilder):
    """Rotor37 dataset."""

    def _info(self):
        return datasets.DatasetInfo(
            version=_VERSION,
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "positions": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
                    "normals": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
                    "features": datasets.Array2D(shape=(N_POINTS, 4), dtype="float32"),
                }
            ),
        )

    def _split_generators(self, dl_manager):
        # Both splits are read directly from local HDF5 files on GPFS,
        # so `dl_manager` is not used and nothing is downloaded.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,  # type: ignore
                gen_kwargs={
                    "h5file": H5FILE_TEST,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,  # type: ignore
                gen_kwargs={
                    "h5file": H5FILE_TRAIN,
                },
            ),
        ]

    def _generate_examples(self, h5file: Path):
        with h5py.File(h5file, "r") as f:
            # Compute the per-axis mean and std of the positions over the whole split.
            positions = np.asarray(f["points"])
            positions_mean = positions.mean(axis=(0, 1))
            positions_std = positions.std(axis=(0, 1))

            # Normalize positions to zero mean and unit variance per axis.
            positions = (positions - positions_mean) / positions_std

            # Iterate over the clouds of the split, pairing each normalized
            # position array with its normals and output fields.
            attributes = zip(
                positions,
                f["normals"],  # type: ignore
                f["output_fields"],  # type: ignore
            )

            for index, (cloud_positions, normals, fields) in enumerate(attributes):
                yield index, {
                    "positions": cloud_positions,
                    "normals": normals,
                    "features": fields,
                }
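

# A minimal usage sketch (not part of the builder): it assumes this file is
# used as a local `datasets` loading script and that the HDF5 files under
# DATASET_DIR are reachable from the current machine. Recent `datasets`
# releases may additionally require `trust_remote_code=True` for
# script-based datasets.
if __name__ == "__main__":
    rotor37_train = datasets.load_dataset(__file__, split="train")
    print(len(rotor37_train))                               # expected: 1000 clouds
    print(np.asarray(rotor37_train[0]["positions"]).shape)  # expected: (29773, 3)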