sycod committed
Commit · a03ee84 · 1 Parent(s): 6ebb6d1
data ok and EDA begun
- EDA.ipynb +0 -0
- README.md +24 -1
- app.py +5 -4
- config.yaml +10 -1
- src/clear_hf_cache.sh +1 -0
- src/eda.py +36 -0
- src/load_data.py +156 -51
EDA.ipynb
CHANGED
The diff for this file is too large to render.
README.md
CHANGED
@@ -2,4 +2,27 @@
 license: mit
 datasets:
 - frugal-ai-challenge/public-leaderboard-image
----
+---
+
+# Notices
+
+- **Stratification is not used for the data split** (splits are predefined in the project)
+
+# Installation
+
+✅ Checklist:
+- [ ] Check the **absolute path to the HF cache folder** is up-to-date in "clear_hf_cache.sh"
+- [ ] Check the **"clear_hf_cache.sh" script is executable** 👉 `chmod +x ./src/clear_hf_cache.sh`
+
+# 🚧 TODO
+
+- review the distribution of partners, cameras, timestamps and annotations
+- use **binary classification**
+- metrics: full **confusion matrix**
+- describe **error types and their consequences**
+- test several pretrained backbones
+- models:
+  - [ ] EfficientNet
+  - [ ] EfficientDet
+- test for improvement with and without equalization
+- review the spatial distribution of annotations within images
app.py
CHANGED
@@ -7,7 +7,7 @@ load_dotenv()
 
 app = FastAPI(
     title="Frugal AI Challenge API",
-    description="API for the Frugal AI Challenge evaluation endpoints"
+    description="API for the Frugal AI Challenge evaluation endpoints",
 )
 
 # Include all routers
@@ -15,6 +15,7 @@ app.include_router(text.router)
 app.include_router(image.router)
 app.include_router(audio.router)
 
+
 @app.get("/")
 async def root():
     return {
@@ -22,6 +23,6 @@ async def root():
         "endpoints": {
             "text": "/text - Text classification task",
             "image": "/image - Image classification task (coming soon)",
-            "audio": "/audio - Audio classification task (coming soon)"
-        }
-    }
+            "audio": "/audio - Audio classification task (coming soon)",
+        },
+    }
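For reference, a minimal sketch of probing the updated root endpoint locally; `uvicorn` and `httpx` are assumptions (any ASGI server and HTTP client would do), and the host/port are uvicorn defaults, not project settings:

```python
# Run the API first, e.g.: uvicorn app:app --reload
import httpx

resp = httpx.get("http://127.0.0.1:8000/")
resp.raise_for_status()
print(resp.json()["endpoints"])  # the task routes listed by root()
```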
config.yaml
CHANGED
@@ -1,4 +1,13 @@
-
+# From evaluation.py
+repo_id: "pyronear/pyro-sdis"
+split_size: 0.2
+rdm_seed: 42
+
+# Data
+data_root_dir: "data"
+raw_data_dir: "raw"
+clr_hf_cache_script_abs_path: './src/clear_hf_cache.sh'
+data_format: "keras" # "ultralytics" or "keras"
 db_info_uri: "data_info.csv"
 
 # log:
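A minimal sketch of how the new `data_format` switch could drive the pipeline; the dispatch itself is illustrative, while `load_raw_data`, `clean_df` and `format_data_keras` are the functions added to src/load_data.py below:

```python
import yaml

from src.load_data import load_raw_data, clean_df, format_data_keras

with open("config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

df = clean_df(load_raw_data())
if cfg["data_format"] == "keras":
    # Copy images into data/keras/{split}/{label}/ folders
    df = format_data_keras(df)
# "ultralytics" needs no extra step: load_raw_data() already writes
# data/raw/images/ and data/raw/labels/ in the Ultralytics layout
```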
src/clear_hf_cache.sh
ADDED
@@ -0,0 +1 @@
+rm -rvf ~/.cache/huggingface
src/eda.py
ADDED
@@ -0,0 +1,36 @@
+"""Exploratory data analysis helpers"""
+
+import logging
+import os
+import pandas as pd
+import yaml
+
+
+# Logging configuration (see all outputs, even DEBUG or INFO)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+# local config
+with open("config.yaml", "r") as f:
+    cfg = yaml.safe_load(f)
+
+
+def make_autopct(values):
+    """
+    ==> Obtained from StackOverflow <==
+    Upgrades plt.pie(autopct=""), displaying percentages and values.
+
+    Input: list of numeric values or Pandas.Series
+    Output: string with percentage and value
+    """
+
+    def my_autopct(pct):
+        total = sum(values)
+        val = int(round(pct * total / 100.0))
+        return "{p:.2f}% ({v:d})".format(p=pct, v=val)
+
+    return my_autopct
+
+
+if __name__ == "__main__":
+    help()
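A minimal usage sketch for `make_autopct`; the counts are hypothetical (in practice they could come from `df["label"].value_counts()`), and matplotlib is assumed available:

```python
import matplotlib.pyplot as plt

from src.eda import make_autopct

values = [14000, 4000]  # hypothetical class counts for smoke / no_smoke
plt.pie(values, labels=["smoke", "no_smoke"], autopct=make_autopct(values))
plt.title("Label distribution")
plt.show()
```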
src/load_data.py
CHANGED
@@ -1,9 +1,29 @@
-"""Load dataset and save locally in
+"""Load dataset and save locally in selected format"""
 
 from datasets import load_dataset
 import logging
 import os
 import pandas as pd
+import shutil
+import subprocess
+import yaml
+
+
+# Logging configuration (see all outputs, even DEBUG or INFO)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+# local config
+with open("config.yaml", "r") as f:
+    cfg = yaml.safe_load(f)
+REPO_ID = cfg["repo_id"]
+SPLIT_SIZE = cfg["split_size"]
+RDM_SEED = cfg["rdm_seed"]
+OUTPUT_DIR = cfg["data_root_dir"]
+RAW_DATA_DIR = os.path.join(OUTPUT_DIR, cfg["raw_data_dir"])
+CLR_CACHE_SCRIPT = cfg["clr_hf_cache_script_abs_path"]
+DATA_FORMAT = cfg["data_format"]
+DB_INFO_URI = os.path.join(OUTPUT_DIR, cfg["db_info_uri"])
 
 
 # Save in Ultralytics format
@@ -11,41 +31,44 @@ def save_ultralytics_format(dataset_split, split, IMAGE_DIR, LABEL_DIR):
     """Save a dataset split into the Ultralytics format.
 
     Args:
-        dataset_split: The dataset split (e.g
-        split: "train" or "val"
+        dataset_split: The dataset split (e.g. dataset["train"])
+        split: "train", "test" or "val"
     """
+
     image_split_dir = os.path.join(IMAGE_DIR, split)
     label_split_dir = os.path.join(LABEL_DIR, split)
+
+    for example in dataset_split:
+        # Save image to appropriate folder
+        image = example["image"]  # PIL.Image.Image
+        image_name = example["image_name"]  # Original file name
+        output_image_path = os.path.join(image_split_dir, image_name)
+        # Save image object to disk
+        image.save(output_image_path)
+
+        # Save label
+        annotations = example["annotations"]
+        label_name = image_name.replace(".jpg", ".txt").replace(".png", ".txt")
+        output_label_path = os.path.join(label_split_dir, label_name)
+        # Save label file
+        with open(output_label_path, "w") as label_file:
+            label_file.write(annotations)
+
+    logging.info(f"Dataset {split} split exported to Ultralytics format")
+
+
-def create_df(ds, split_name, OUTPUT_DIR):
+def create_df(ds, split_name, output_dir):
     """Create dataframe from dataset"""
+
     df = pd.DataFrame(
         [[i.size[0], i.size[1], i.format, i.mode] for i in ds["image"]],
-        columns=["width", "height", "format", "mode"]
+        columns=["width", "height", "format", "mode"],
     )
     df["name"] = ds["image_name"]
+    df["split"] = split_name
+    df["uri"] = df["name"].apply(
+        lambda x: os.path.join(output_dir, "images", split_name, x)
+    )
     df["annotations"] = ds["annotations"]
     df["partner"] = ds["partner"]
     df["camera"] = ds["camera"]
@@ -54,42 +77,124 @@ def create_df(ds, split_name, OUTPUT_DIR):
     return df
 
 
+def load_raw_data():
+    """Main function for downloading, splitting and formatting data"""
 
     # Check if data information already exists before eventually loading model
-    df = pd.read_csv(db_info_path, index_col=0)
+    if os.path.exists(DB_INFO_URI):
+        df = pd.read_csv(DB_INFO_URI, index_col=0)
        return df
 
+    # Load data
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    os.makedirs(RAW_DATA_DIR, exist_ok=True)
+    logging.info("⚙️ Dataset loading...")
+    dataset = load_dataset(REPO_ID)
+    train_test = dataset["train"].train_test_split(test_size=SPLIT_SIZE, seed=RDM_SEED)
+    ds_train = train_test["train"]
+    ds_val = dataset["val"]
+    ds_test = train_test["test"]
+    logging.info("✅ Dataset loaded in cache folder")
+
+    # Create directory structure
+    IMAGE_DIR = os.path.join(RAW_DATA_DIR, "images")
+    LABEL_DIR = os.path.join(RAW_DATA_DIR, "labels")
+    for split in ["train", "val", "test"]:
        os.makedirs(os.path.join(IMAGE_DIR, split), exist_ok=True)
        os.makedirs(os.path.join(LABEL_DIR, split), exist_ok=True)
 
-    # Save train and validation splits
-    save_ultralytics_format(dataset["train"], "train", IMAGE_DIR, LABEL_DIR)
-    save_ultralytics_format(dataset["val"], "val", IMAGE_DIR, LABEL_DIR)
+    # Save dataset splits
+    save_ultralytics_format(ds_train, "train", IMAGE_DIR, LABEL_DIR)
+    save_ultralytics_format(ds_val, "val", IMAGE_DIR, LABEL_DIR)
+    save_ultralytics_format(ds_test, "test", IMAGE_DIR, LABEL_DIR)
 
     # Create global dataframe from splits
-    df_val = create_df(dataset["val"], "val", OUTPUT_DIR)
     # Separate train to save memory
+    df_train_1 = create_df(ds_train[:6000], "train", RAW_DATA_DIR)
+    df_train_2 = create_df(ds_train[6000:12000], "train", RAW_DATA_DIR)
+    df_train_3 = create_df(ds_train[12000:18000], "train", RAW_DATA_DIR)
+    df_train_4 = create_df(ds_train[18000:], "train", RAW_DATA_DIR)
+    df_val = create_df(ds_val, "val", RAW_DATA_DIR)
+    df_test = create_df(ds_test, "test", RAW_DATA_DIR)
     # Save as one CSV
-    df = pd.concat(
+    df = pd.concat(
+        [df_train_1, df_train_2, df_train_3, df_train_4, df_val, df_test],
+        axis=0,
+        ignore_index=True,
+    )
+    # Create label column for classification
+    df["label"] = "smoke"
+    df.loc[df["annotations"].isna() | (df["annotations"] == ""), "label"] = "no_smoke"
+    # Reorder columns
+    df = df.loc[
+        :,
+        [
+            "name",
+            "label",
+            "split",
+            "format",
+            "mode",
+            "width",
+            "height",
+            "camera",
+            "partner",
+            "timestamp",
+            "annotations",
+            "uri",
+        ],
+    ]
+    # Save as CSV
+    with open(DB_INFO_URI, "wb") as f:
        df.to_csv(f)
 
+    # Clear HF default cache folder after it is done (6GB)
+    # 💡 Check first if path up-to-date in "clear_hf_cache.sh"
+    logging.info("🧹 Removing HF default cache folder...")
+    result = subprocess.run(["bash", CLR_CACHE_SCRIPT], capture_output=True, text=True)
+    # logging.info(result.stdout)
+    logging.info("✅ HF Cache folder removed")
+
+    return df
+
+
+def clean_df(df):
+    """Filter data to keep only necessary"""
+    # Filter columns
+    df = df[["name", "label", "split", "uri"]]
+    # Remove ".jpg" in name
+    df.loc[:, "name"] = df.name.apply(lambda x: x[:-4])
+
+    return df
+
+
+def format_data_keras(df):
+    """Format data for Keras models"""
+    if not os.path.exists(OUTPUT_DIR):
+        logging.warning(f"{OUTPUT_DIR} doesn't exist: (re)load data first")
+        return df
+
+    # Create Keras parent folder
+    keras_dir = os.path.join(OUTPUT_DIR, "keras")
+    os.makedirs(keras_dir, exist_ok=True)
+    # Create splits folders
+    for split in df.split.unique():
+        split_dir = os.path.join(keras_dir, split)
+        os.makedirs(split_dir, exist_ok=True)
+        # Create labels folders
+        for label in df.label.unique():
+            label_dir = os.path.join(split_dir, label)
+            os.makedirs(label_dir, exist_ok=True)
+
+    # Copy images to new URI and update in dataframe
+    df.loc[:, "uri_dest"] = df.apply(
+        lambda x: os.path.join(OUTPUT_DIR, "keras", x["split"], x["label"], x["name"])
+        + ".jpg",
+        axis=1,
+    )
+    df.apply(lambda x: shutil.copy2(x["uri"], x["uri_dest"]), axis=1)
+    df.drop(columns="uri", inplace=True)
+    df.rename(columns={"uri_dest": "uri"}, inplace=True)
 
     return df
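For a quick downstream check, the folder layout written by `format_data_keras` can be consumed directly by Keras. A minimal sketch, assuming TensorFlow is installed; the image size and batch size are arbitrary choices here, not project settings:

```python
import tensorflow as tf

# Reads data/keras/train/{smoke,no_smoke}/*.jpg as written by format_data_keras()
train_ds = tf.keras.utils.image_dataset_from_directory(
    "data/keras/train",
    labels="inferred",
    label_mode="binary",
    image_size=(224, 224),  # arbitrary target size
    batch_size=32,
    seed=42,
)
print(train_ds.class_names)  # ["no_smoke", "smoke"] (alphabetical order)
```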