chore: move dataset to submodule

Laureηt 2023-02-02 11:14:52 +01:00
parent 1d09b753ae
commit 89f9112fca
Signed by: Laurent
SSH key fingerprint: SHA256:kZEpW8cMJ54PDeCvOhzreNr4FSh6R13CMGH/POoO8DI
4 changed files with 13 additions and 89 deletions

.gitmodules vendored Normal file (+3)

@@ -0,0 +1,3 @@
[submodule "aiornot_datasets"]
path = aiornot_datasets
url = https://huggingface.co/datasets/tocard-inc/aiornot

aiornot_datasets Submodule (+1)

@@ -0,0 +1 @@
Subproject commit a90618df992a19c775b6b0fb7e0de0fd45a4d505
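Since the submodule tracks the tocard-inc/aiornot dataset repository on the Hugging Face Hub, a minimal sketch of loading the data directly from that repo is shown below. Whether the project actually switches to this loading path is not part of this commit, so treat the repo-id usage as an assumption.

import datasets

# Load straight from the Hub repo the submodule points at; this assumes the
# repo exposes loadable data files or a loading script.
dataset = datasets.load_dataset("tocard-inc/aiornot")
print(dataset)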


@@ -1,88 +0,0 @@
"""Dataset class AI or NOT HuggingFace competition."""
import csv
import pathlib
from typing import Optional
import datasets
_VERSION = "1.0.0"
_GIT_COMMIT_REVISION = "b843a82bd712648b2fe0dc043cf8a04475491d38"
_BASE_URLS = {
"train": f"https://huggingface.co/datasets/competitions/aiornot/resolve/{_GIT_COMMIT_REVISION}/train.zip",
"test": f"https://huggingface.co/datasets/competitions/aiornot/resolve/{_GIT_COMMIT_REVISION}/test.zip",
"csv": f"https://huggingface.co/datasets/competitions/aiornot/resolve/{_GIT_COMMIT_REVISION}/train.csv",
}
_HOMEPAGE = "https://huggingface.co/spaces/competitions/aiornot"
_DESCRIPTION = """
The dataset consists of approximately 31000 images, some of which have been generated by ai.
Your task is to build a model that can identify ai generated images.
Please use the community tab for discussion and questions.
"""
_NAMES = [
"NOT",
"AI",
]
class aiornot(datasets.GeneratorBasedBuilder):
"""Food-101 Images dataset."""
def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
version=_VERSION,
features=datasets.Features(
{
"image": datasets.Image(),
"label": datasets.ClassLabel(names=_NAMES),
}
),
supervised_keys=("image", "label"),
homepage=_HOMEPAGE,
task_templates=[datasets.tasks.ImageClassification(image_column="image", label_column="label")],
)
def _split_generators(self, dl_manager):
train_path = pathlib.Path(dl_manager.download_and_extract(_BASE_URLS["train"]))
test_path = pathlib.Path(dl_manager.download_and_extract(_BASE_URLS["test"]))
csv_path = pathlib.Path(dl_manager.download(_BASE_URLS["csv"]))
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"data_dir": train_path / "train",
"csv_file": csv_path,
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"data_dir": test_path / "test",
},
),
]
def _generate_examples(self, data_dir: pathlib.Path, csv_file: Optional[pathlib.Path] = None):
"""Generate images and labels for splits."""
if csv_file is not None:
with open(csv_file, "r") as f:
reader = csv.reader(f)
next(reader)
for index, row in enumerate(reader):
yield index, {
"image": str(data_dir / row[0]),
"label": row[1],
}
else:
rglob = pathlib.Path(data_dir).rglob("*.jpg")
for index, filepath in enumerate(rglob):
yield index, {
"image": str(filepath),
"label": -1,
}
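For context on the label handling above, a minimal sketch of how the datasets.ClassLabel feature maps between integer ids and the NOT/AI names; the 0/1 values follow the ordering of _NAMES, and the example calls are illustrative only.

import datasets

label_feature = datasets.ClassLabel(names=["NOT", "AI"])
print(label_feature.int2str(1))      # "AI"
print(label_feature.str2int("NOT"))  # 0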


@@ -3,9 +3,17 @@ import matplotlib.pyplot as plt
 dataset = datasets.load_dataset("src/dataset.py")
+labels = dataset["train"].features["label"].names
+print(labels)
+id2label = {k: v for k, v in enumerate(labels)}
+label2id = {v: k for k, v in enumerate(labels)}
+print(label2id)
+print(id2label)
 idx = 0
 plt.imshow(dataset["train"][idx]["image"])
-plt.title(dataset["train"].features["label"].names[dataset["train"][idx]["label"]])
+plt.title(id2label[dataset["train"][idx]["label"]])
 plt.show()
 plt.imshow(dataset["test"][idx]["image"])