feat: add huggingface rotor37 dataset

Laurent FAINSIN 2023-04-11 16:00:54 +02:00
parent a97c2e87f9
commit df48f8272a
4 changed files with 79 additions and 198 deletions


@@ -1,194 +1,65 @@
 from pathlib import Path
 
+import datasets
 import h5py
-import numpy as np
-import pyvista as pv
-import torch
-from rich.progress import track
-from torch.utils.data import Dataset
 
 DATASET_DIR = Path("/gpfs_new/cold-data/InputData/public_datasets/rotor37/rotor37_1200/")
-VTKFILE_NOMINAL = DATASET_DIR / "ncs" / "nominal_blade.vtk"
 H5FILE_TRAIN = DATASET_DIR / "h5" / "blade_meshes_train.h5"
 H5FILE_TEST = DATASET_DIR / "h5" / "blade_meshes_test.h5"
 
-CARDINALITY_TRAIN = 1000
-CARDINALITY_TEST = 200
+N_POINTS = 29773
+
+_VERSION = "1.0.0"
+
+_DESCRIPTION = """
+This dataset is a collection of 1200 pointclouds, each representing a blade of a wind turbine.
+The dataset is split into 2 subsets: train and test, with 1000 and 200 clouds respectively.
+Each pointcloud has 29773 points, each point has 3D coordinates, 3D normals and physical properties.
+"""
 
-def rotate_nominal_blade(blade: pv.PolyData) -> None:
-    """Rotate nominal blade points.
-
-    The nominal blade must be rotated to match the orientation of the other blades.
-    Rotations applied (sequentially) are:
-    - -90° around z-axis
-    - -90° around y-axis
-
-    Args:
-        blade (pyvista.PolyData): blade to rotate
-    """
-    THETA = -90
-    PHI = -90
-
-    RZ = np.array(
-        [
-            [np.cos(np.deg2rad(THETA)), -np.sin(np.deg2rad(THETA)), 0],
-            [np.sin(np.deg2rad(THETA)), np.cos(np.deg2rad(THETA)), 0],
-            [0, 0, 1],
-        ]
-    )
-
-    RY = np.array(
-        [
-            [np.cos(np.deg2rad(PHI)), 0, np.sin(np.deg2rad(PHI))],
-            [0, 1, 0],
-            [-np.sin(np.deg2rad(PHI)), 0, np.cos(np.deg2rad(PHI))],
-        ]
-    )
-
-    # rotation of θ° around z-axis
-    blade.points = np.asarray(blade.points) @ RZ
-    blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RZ
-
-    # rotation of φ° around y-axis
-    blade.points = np.asarray(blade.points) @ RY
-    blade.point_data["Normals"] = np.asarray(blade.point_normals) @ RY
-
-
-class Rotor37Dataset(Dataset):
-    """Rotor37 dataset.
-
-    This dataset is a collection of 1200 graphs, each representing a blade of a wind turbine.
-    The dataset is split into 2 subsets: train and test, with 1000 and 200 graphs respectively.
-    Each graph is a 3D mesh, with 3D deformations from a nominal blade, 3D normals, 3D faces and physical properties.
-    """
-
-    def __init__(
-        self,
-        root: str,
-        split: str = "train",
-    ):
-        """Initialize a new Rotor37 dataset instance.
-
-        Args:
-            root (str): root directory of the dataset
-            split (str): split of the dataset, either "train" or "test"
-        """
-        # set split
-        assert split in ("train", "test")
-        self.split = split
-
-        # set cardinality and h5file according to split
-        self.cardinality = CARDINALITY_TRAIN if split == "train" else CARDINALITY_TEST
-        self.h5file = H5FILE_TRAIN if split == "train" else H5FILE_TEST
-
-        super().__init__(root, transform, pre_transform)
-
-    @property
-    def raw_file_names(self) -> list[str]:
-        """No raw files."""
-        return []
-
-    @property
-    def processed_file_names(self) -> list[str]:
-        """Processed files are named data_{split}_{idx:04d}.pt, where idx is the index of the graph."""
-        return [f"data_{self.split}_{idx:04d}.pt" for idx in range(self.cardinality)]
-
-    def download(self):
-        """No need to download, data already in cluster."""
-        pass
-
-    def process(self) -> None:
-        """Process the dataset.
-
-        The dataset is processed by loading the nominal blade, and then loading all deformed blades.
-        For each deformed blade, the following attributes are computed and stored in a `Data` object:
-        - delta: deformed blade - nominal blade
-        - fields: physical properties of the blade
-        - normals: normals of the blade
-        - edges: edges of the blade
-        - faces: faces of the blade
-        The `Data` object is then saved to disk.
-        """
-        # load nominal blade
-        vtk_reader = pv.get_reader(VTKFILE_NOMINAL)
-        nominal = vtk_reader.read()
-        rotate_nominal_blade(nominal)
-        nominal_positions = torch.as_tensor(nominal.points, dtype=torch.float32)
-
-        # load all deformed blades
-        with h5py.File(self.h5file, "r") as h5file:
-            # NB: torch.as_tensor(np.asarray(data)) is a bit ugly,
-            # but torch.as_tensor(data) complains about data being an array of numpy arrays, and is also slower
-
-            # common edges and faces matrix for each graph
-            edges = torch.as_tensor(np.asarray(h5file["adj"]), dtype=torch.int64).transpose(0, 1)
-            faces = torch.as_tensor(np.asarray(h5file["faces"]), dtype=torch.int64).transpose(0, 1)
-
-            # attributes specific to each graph
-            attributes = zip(
-                h5file["points"],  # type: ignore
-                h5file["normals"],  # type: ignore
-                h5file["output_fields"],  # type: ignore
-            )
-
-            # for each graph
-            for idx, (positions, normals, fields) in track(
-                enumerate(attributes),
-                total=self.cardinality,
-            ):
-                # convert to torch tensors
-                positions = torch.as_tensor(np.asarray(positions), dtype=torch.float32)
-                fields = torch.as_tensor(np.asarray(fields), dtype=torch.float32)
-                normals = torch.as_tensor(np.asarray(normals), dtype=torch.float32)
-                delta = positions - nominal_positions
-
-                # save data to disk
-
-    def len(self) -> int:
-        """Return the cardinality of the dataset."""
-        return self.cardinality
-
-    def get(self, idx) -> Data:
-        """Load and return the graph `Data`.
-
-        Args:
-            idx (int): index of the graph to return
-
-        Returns:
-            Data: graph at index `idx`
-        """
-        return torch.load(self.processed_dir / f"data_{self.split}_{idx:04d}.pt")
-
-    def __repr__(self) -> str:
-        """Return a string representation of the dataset."""
-        return f"{self.__class__.__name__}({self.split}, {len(self)})"
-
-    @property
-    def processed_dir(self) -> Path:
-        """Wrap processed_dir to return a Path instead of a str."""
-        return Path(super().processed_dir)
-
-
-if __name__ == "__main__":
-    from torch_geometric.loader import DataLoader
-
-    # load test split
-    ds_test = Rotor37Dataset(root="./datasets/Rotor37/", split="test")
-    print(ds_test)
-    print(ds_test[0])
-
-    # create test data loader
-    ld_test = DataLoader(ds_test, batch_size=8, shuffle=True)
-    print(ld_test)
-    print(next(iter(ld_test)))
-
-    # load train split
-    ds_train = Rotor37Dataset(root="./datasets/Rotor37/", split="train")
-    print(ds_train)
-    print(ds_train[0])
-
-    # create train data loader
-    ld_train = DataLoader(ds_train, batch_size=8, shuffle=True)
-    print(ld_train)
-    print(next(iter(ld_train)))
+
+class Rotor37(datasets.GeneratorBasedBuilder):
+    """Rotor37 dataset."""
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            version=_VERSION,
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "positions": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
+                    "normals": datasets.Array2D(shape=(N_POINTS, 3), dtype="float32"),
+                    "features": datasets.Array2D(shape=(N_POINTS, 4), dtype="float32"),
+                }
+            ),
+        )
+
+    def _split_generators(self, dl_manager):
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,  # type: ignore
+                gen_kwargs={
+                    "h5file": H5FILE_TEST,
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,  # type: ignore
+                gen_kwargs={
+                    "h5file": H5FILE_TRAIN,
+                },
+            ),
+        ]
+
+    def _generate_examples(self, h5file: Path):
+        with h5py.File(h5file, "r") as f:
+            attributes = zip(
+                f["points"],  # type: ignore
+                f["normals"],  # type: ignore
+                f["output_fields"],  # type: ignore
+            )
+            for index, (positions, normals, fields) in enumerate(attributes):
+                yield index, {
+                    "positions": positions,
+                    "normals": normals,
+                    "features": fields,
+                }
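Note that the deleted __main__ block smoke-tested the old class through a torch_geometric DataLoader. A rough equivalent against the new HuggingFace builder, sketched under the assumption that the builder script is checked in at dataset/rotor37_data.py (the same path the demo script below loads from):

    import datasets
    from torch.utils.data import DataLoader

    # Load the test split through the builder and return torch tensors.
    ds_test = datasets.load_dataset("dataset/rotor37_data.py", split="test")
    ds_test = ds_test.with_format("torch")

    # Default collation stacks each dict field into a batched tensor.
    ld_test = DataLoader(ds_test, batch_size=8, shuffle=True)
    batch = next(iter(ld_test))
    print(batch["positions"].shape)  # expected: torch.Size([8, 29773, 3])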


@@ -0,0 +1,11 @@
+import datasets
+
+train_ds = datasets.load_dataset("dataset/rotor37_data.py", split="train")
+train_ds = train_ds.with_format("torch")
+print(train_ds)
+
+test_ds = datasets.load_dataset("dataset/rotor37_data.py", split="test")
+test_ds = test_ds.with_format("torch")
+print(test_ds)
+
+print("yay")


@@ -18,7 +18,6 @@ dependencies:
   - trimesh
   - scipy
   - scikit-learn
-  - h5py
   - pyvista
   - datasets
   #---# toolings