Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

split_dataset.py 1.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
  1. from shutil import copy2
  2. import numpy as np
  3. import pandas as pd
  4. from tqdm import tqdm
  5. from scripts.params import DATASET_DIR, DATASET_VAL_TEST_SPLIT, RAW_DATASET_DIR
  6. #%% Create dirs if necessary
  7. DATASET_DIR.mkdir(exist_ok=True)
  8. #%% Parse image names in raw dir
  9. dataset_df = pd.DataFrame([
  10. {
  11. "image_name": file.name,
  12. "label": file.parent.name,
  13. "raw_path": file,
  14. "raw_split": file.parents[1].name,
  15. }
  16. for file in RAW_DATASET_DIR.glob("**/*.jpg")
  17. ]).assign(
  18. split=lambda df: df.raw_split.map(lambda split: (
  19. "train" if split == "train"
  20. else np.random.choice(list(DATASET_VAL_TEST_SPLIT), p=list(DATASET_VAL_TEST_SPLIT.values()))
  21. ))
  22. )
  23. print(dataset_df.groupby(["split", "label"]).image_name.count())
  24. (
  25. dataset_df
  26. .drop(columns=["raw_path", "raw_split"])
  27. .to_csv(DATASET_DIR / "dataset.csv", index=False)
  28. )
  29. #%% Copy images to split subdirs
  30. for split in ["train", "val", "test"]:
  31. for label in set(dataset_df.label):
  32. (DATASET_DIR / split / label).mkdir(exist_ok=True, parents=True)
  33. tqdm.pandas(desc="Copying images to split folder")
  34. dataset_df.progress_apply(lambda row: copy2(
  35. src=row["raw_path"],
  36. dst=DATASET_DIR / row["split"] / row["label"] / row["image_name"],
  37. ), axis=1)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...