chore: move dataset to submodule
This commit is contained in:
parent
1d09b753ae
commit
89f9112fca
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
|
@@ -0,0 +1,3 @@
|
||||||
|
[submodule "aiornot_datasets"]
|
||||||
|
path = aiornot_datasets
|
||||||
|
url = https://huggingface.co/datasets/tocard-inc/aiornot
|
1
aiornot_datasets
Submodule
1
aiornot_datasets
Submodule
|
@@ -0,0 +1 @@
|
||||||
|
Subproject commit a90618df992a19c775b6b0fb7e0de0fd45a4d505
|
|
@@ -1,88 +0,0 @@
|
||||||
"""Dataset class for the AI or NOT HuggingFace competition."""
|
|
||||||
|
|
||||||
import csv
|
|
||||||
import pathlib
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import datasets
|
|
||||||
|
|
||||||
# Version string passed to ``datasets.DatasetInfo(version=...)``.
_VERSION = "1.0.0"

# Commit of the upstream dataset repository that every download URL below is
# built against.
_GIT_COMMIT_REVISION = "b843a82bd712648b2fe0dc043cf8a04475491d38"

# One template for all pinned downloads, so the host/repository/revision part
# of the URL is written once instead of being repeated per file.
_RESOLVE_URL = "https://huggingface.co/datasets/competitions/aiornot/resolve/{revision}/{filename}"

# Download locations keyed by role: the two image archives and the CSV that
# carries the training labels.
_BASE_URLS = {
    key: _RESOLVE_URL.format(revision=_GIT_COMMIT_REVISION, filename=filename)
    for key, filename in (
        ("train", "train.zip"),
        ("test", "test.zip"),
        ("csv", "train.csv"),
    )
}

# Competition page, reported as the dataset homepage.
_HOMEPAGE = "https://huggingface.co/spaces/competitions/aiornot"

# Human-readable description reported in the dataset info.
_DESCRIPTION = """
The dataset consists of approximately 31000 images, some of which have been generated by ai.
Your task is to build a model that can identify ai generated images.
Please use the community tab for discussion and questions.
"""

# Class names handed to ``datasets.ClassLabel``; the position in this list is
# the integer class id (0 -> "NOT", 1 -> "AI").
_NAMES = [
    "NOT",
    "AI",
]
|
|
||||||
|
|
||||||
|
|
||||||
class aiornot(datasets.GeneratorBasedBuilder):
    """Dataset builder for the "AI or NOT" HuggingFace competition images.

    Each example has two features: ``image`` and ``label``, where ``label`` is
    a ``ClassLabel`` over ``_NAMES``. The train split takes its labels from the
    competition CSV; the test split has no CSV and every example carries the
    placeholder label ``-1``.
    """

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing features and metadata."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            version=_VERSION,
            features=datasets.Features(
                {
                    "image": datasets.Image(),
                    "label": datasets.ClassLabel(names=_NAMES),
                }
            ),
            supervised_keys=("image", "label"),
            homepage=_HOMEPAGE,
            # NOTE(review): task templates were deprecated in later releases of
            # the `datasets` library -- confirm against the pinned version.
            task_templates=[datasets.tasks.ImageClassification(image_column="image", label_column="label")],
        )

    def _split_generators(self, dl_manager):
        """Fetch the archives and the label CSV, then describe both splits.

        Args:
            dl_manager: Download manager supplied by the builder; used here
                through ``download_and_extract`` (archives) and ``download``
                (CSV).

        Returns:
            A list with two ``datasets.SplitGenerator`` objects: TRAIN (image
            directory plus label CSV) and TEST (image directory only).
        """
        train_path = pathlib.Path(dl_manager.download_and_extract(_BASE_URLS["train"]))
        test_path = pathlib.Path(dl_manager.download_and_extract(_BASE_URLS["test"]))
        csv_path = pathlib.Path(dl_manager.download(_BASE_URLS["csv"]))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    # The images are looked up under a "train" sub-directory of
                    # the extracted archive.
                    "data_dir": train_path / "train",
                    "csv_file": csv_path,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    # No "csv_file" here: _generate_examples then takes its
                    # unlabelled branch.
                    "data_dir": test_path / "test",
                },
            ),
        ]

    def _generate_examples(self, data_dir: pathlib.Path, csv_file: Optional[pathlib.Path] = None):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            data_dir: Directory that holds the split's images.
            csv_file: Optional CSV whose first row is a header and whose
                following rows hold the image file name in column 0 and the
                label in column 1. When omitted, the split is treated as
                unlabelled.

        Yields:
            ``(index, {"image": <path string>, "label": <label>})``. With a
            CSV, ``label`` is the raw CSV cell; without one it is ``-1``.
        """
        if csv_file is not None:
            # newline="" is what the csv module asks for so that it can handle
            # line endings itself; the encoding is explicit so parsing does not
            # depend on the platform's locale.
            with open(csv_file, "r", newline="", encoding="utf-8") as f:
                reader = csv.reader(f)
                # Skip the header row. The default keeps an empty file from
                # raising StopIteration inside this generator, which Python
                # would turn into a RuntimeError.
                next(reader, None)
                # Blank lines come back as empty rows; drop them rather than
                # failing on row[0].
                rows = (row for row in reader if row)
                for index, row in enumerate(rows):
                    yield index, {
                        "image": str(data_dir / row[0]),
                        "label": row[1],
                    }
        else:
            # rglob yields in filesystem order, which is not stable across
            # machines; sorting makes the key -> file mapping reproducible.
            image_paths = sorted(pathlib.Path(data_dir).rglob("*.jpg"))
            for index, filepath in enumerate(image_paths):
                yield index, {
                    "image": str(filepath),
                    "label": -1,
                }
|
|
|
@@ -3,9 +3,17 @@ import matplotlib.pyplot as plt
|
||||||
|
|
||||||
dataset = datasets.load_dataset("src/dataset.py")
|
dataset = datasets.load_dataset("src/dataset.py")
|
||||||
|
|
||||||
|
labels = dataset["train"].features["label"].names
|
||||||
|
print(labels)
|
||||||
|
|
||||||
|
id2label = {k: v for k, v in enumerate(labels)}
|
||||||
|
label2id = {v: k for k, v in enumerate(labels)}
|
||||||
|
print(label2id)
|
||||||
|
print(id2label)
|
||||||
|
|
||||||
idx = 0
|
idx = 0
|
||||||
plt.imshow(dataset["train"][idx]["image"])
|
plt.imshow(dataset["train"][idx]["image"])
|
||||||
plt.title(dataset["train"].features["label"].names[dataset["train"][idx]["label"]])
|
plt.title(id2label[dataset["train"][idx]["label"]])
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
plt.imshow(dataset["test"][idx]["image"])
|
plt.imshow(dataset["test"][idx]["image"])
|
||||||
|
|
Reference in a new issue