sycod committed
Commit · a03ee84 · 1 Parent(s): 6ebb6d1
data ok and EDA begun
- EDA.ipynb +0 -0
- README.md +24 -1
- app.py +5 -4
- config.yaml +10 -1
- src/clear_hf_cache.sh +1 -0
- src/eda.py +36 -0
- src/load_data.py +156 -51
EDA.ipynb
CHANGED
The diff for this file is too large to render.
README.md
CHANGED
@@ -2,4 +2,27 @@
 license: mit
 datasets:
 - frugal-ai-challenge/public-leaderboard-image
----
+---
+
+# Notices
+
+- **Stratification is not used for the data split** (splits are predefined in the project)
+
+# Installation
+
+✅ Checklist:
+- [ ] Check the **absolute path to the HF cache folder** is up-to-date in "clear_hf_cache.sh"
+- [ ] Check the **"clear_hf_cache.sh" script is executable** 👉 `chmod +x ./src/clear_hf_cache.sh`
+
+# 🚧 TODO
+
+- review the distribution of partners, cameras, timestamps and annotations
+- use **binary classification**
+- metrics: full **confusion matrix**
+- describe **error types and their consequences**
+- test several pretrained backbones
+- models:
+  - [ ] EfficientNet
+  - [ ] EfficientDet
+- test for improvement with and without equalization
+- review the spatial distribution of annotations within images
app.py
CHANGED
@@ -7,7 +7,7 @@ load_dotenv()
 
 app = FastAPI(
     title="Frugal AI Challenge API",
-    description="API for the Frugal AI Challenge evaluation endpoints"
+    description="API for the Frugal AI Challenge evaluation endpoints",
 )
 
 # Include all routers
@@ -15,6 +15,7 @@ app.include_router(text.router)
 app.include_router(image.router)
 app.include_router(audio.router)
 
+
 @app.get("/")
 async def root():
     return {
@@ -22,6 +23,6 @@ async def root():
         "endpoints": {
             "text": "/text - Text classification task",
             "image": "/image - Image classification task (coming soon)",
-            "audio": "/audio - Audio classification task (coming soon)"
-        }
-    }
+            "audio": "/audio - Audio classification task (coming soon)",
+        },
+    }
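For reference, a minimal sketch of probing the updated root endpoint locally; `uvicorn` and `httpx` are assumptions (any ASGI server and HTTP client would do), and the host/port are uvicorn defaults, not project settings:

```python
# Run the API first, e.g.: uvicorn app:app --reload
import httpx

resp = httpx.get("http://127.0.0.1:8000/")
resp.raise_for_status()
print(resp.json()["endpoints"])  # the task routes listed by root()
```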
config.yaml
CHANGED
@@ -1,4 +1,13 @@
-
+# From evaluation.py
+repo_id: "pyronear/pyro-sdis"
+split_size: 0.2
+rdm_seed: 42
+
+# Data
+data_root_dir: "data"
+raw_data_dir: "raw"
+clr_hf_cache_script_abs_path: './src/clear_hf_cache.sh'
+data_format: "keras" # "ultralytics" or "keras"
 db_info_uri: "data_info.csv"
 
 # log:
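A minimal sketch of how the new `data_format` switch could drive the pipeline; the dispatch itself is illustrative, while `load_raw_data`, `clean_df` and `format_data_keras` are the functions added to src/load_data.py below:

```python
import yaml

from src.load_data import load_raw_data, clean_df, format_data_keras

with open("config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

df = clean_df(load_raw_data())
if cfg["data_format"] == "keras":
    # Copy images into data/keras/{split}/{label}/ folders
    df = format_data_keras(df)
# "ultralytics" needs no extra step: load_raw_data() already writes
# data/raw/images/ and data/raw/labels/ in the Ultralytics layout
```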
src/clear_hf_cache.sh
ADDED
@@ -0,0 +1 @@
+rm -rvf ~/.cache/huggingface
src/eda.py
ADDED
@@ -0,0 +1,36 @@
+"""Exploratory data analysis helpers"""
+
+import logging
+import os
+import pandas as pd
+import yaml
+
+
+# Logging configuration (see all outputs, even DEBUG or INFO)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+# local config
+with open("config.yaml", "r") as f:
+    cfg = yaml.safe_load(f)
+
+
+def make_autopct(values):
+    """
+    ==> Obtained from StackOverflow <==
+    Upgrades plt.pie(autopct=""), displaying percentages and values.
+
+    Input: list of numeric values or Pandas.Series
+    Output: string with percentage and value
+    """
+
+    def my_autopct(pct):
+        total = sum(values)
+        val = int(round(pct * total / 100.0))
+        return "{p:.2f}% ({v:d})".format(p=pct, v=val)
+
+    return my_autopct
+
+
+if __name__ == "__main__":
+    help()
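A minimal usage sketch for `make_autopct`; the counts are hypothetical (in practice they could come from `df["label"].value_counts()`), and matplotlib is assumed available:

```python
import matplotlib.pyplot as plt

from src.eda import make_autopct

values = [14000, 4000]  # hypothetical class counts for smoke / no_smoke
plt.pie(values, labels=["smoke", "no_smoke"], autopct=make_autopct(values))
plt.title("Label distribution")
plt.show()
```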
src/load_data.py
CHANGED
@@ -1,9 +1,29 @@
-"""Load dataset and save locally in
+"""Load dataset and save locally in selected format"""
 
 from datasets import load_dataset
 import logging
 import os
 import pandas as pd
+import shutil
+import subprocess
+import yaml
+
+
+# Logging configuration (see all outputs, even DEBUG or INFO)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+# local config
+with open("config.yaml", "r") as f:
+    cfg = yaml.safe_load(f)
+REPO_ID = cfg["repo_id"]
+SPLIT_SIZE = cfg["split_size"]
+RDM_SEED = cfg["rdm_seed"]
+OUTPUT_DIR = cfg["data_root_dir"]
+RAW_DATA_DIR = os.path.join(OUTPUT_DIR, cfg["raw_data_dir"])
+CLR_CACHE_SCRIPT = cfg["clr_hf_cache_script_abs_path"]
+DATA_FORMAT = cfg["data_format"]
+DB_INFO_URI = os.path.join(OUTPUT_DIR, cfg["db_info_uri"])
 
 
 # Save in Ultralytics format
@@ -11,41 +31,44 @@ def save_ultralytics_format(dataset_split, split, IMAGE_DIR, LABEL_DIR):
     """Save a dataset split into the Ultralytics format.
 
     Args:
-        dataset_split: The dataset split (e.g
-        split: "train" or "val"
+        dataset_split: The dataset split (e.g. dataset["train"])
+        split: "train", "test" or "val"
     """
+
     image_split_dir = os.path.join(IMAGE_DIR, split)
     label_split_dir = os.path.join(LABEL_DIR, split)
+
+    for example in dataset_split:
+        # Save image to appropriate folder
+        image = example["image"]  # PIL.Image.Image
+        image_name = example["image_name"]  # Original file name
+        output_image_path = os.path.join(image_split_dir, image_name)
+        # Save image object to disk
+        image.save(output_image_path)
+
+        # Save label
+        annotations = example["annotations"]
+        label_name = image_name.replace(".jpg", ".txt").replace(".png", ".txt")
+        output_label_path = os.path.join(label_split_dir, label_name)
+        # Save label file
+        with open(output_label_path, "w") as label_file:
+            label_file.write(annotations)
+
+    logging.info(f"Dataset {split} split exported to Ultralytics format")
+
+
-def create_df(ds, split_name, OUTPUT_DIR):
+def create_df(ds, split_name, output_dir):
     """Create dataframe from dataset"""
+
     df = pd.DataFrame(
         [[i.size[0], i.size[1], i.format, i.mode] for i in ds["image"]],
-        columns=["width", "height", "format", "mode"]
+        columns=["width", "height", "format", "mode"],
     )
     df["name"] = ds["image_name"]
+    df["split"] = split_name
+    df["uri"] = df["name"].apply(
+        lambda x: os.path.join(output_dir, "images", split_name, x)
+    )
     df["annotations"] = ds["annotations"]
     df["partner"] = ds["partner"]
     df["camera"] = ds["camera"]
@@ -54,42 +77,124 @@ def create_df(ds, split_name, OUTPUT_DIR):
     return df
 
 
+def load_raw_data():
+    """Main function for downloading, splitting and formatting data"""
 
     # Check if data information already exists before eventually loading model
-    df = pd.read_csv(db_info_path, index_col=0)
+    if os.path.exists(DB_INFO_URI):
+        df = pd.read_csv(DB_INFO_URI, index_col=0)
        return df
 
+    # Load data
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    os.makedirs(RAW_DATA_DIR, exist_ok=True)
+    logging.info("⚙️ Dataset loading...")
+    dataset = load_dataset(REPO_ID)
+    train_test = dataset["train"].train_test_split(test_size=SPLIT_SIZE, seed=RDM_SEED)
+    ds_train = train_test["train"]
+    ds_val = dataset["val"]
+    ds_test = train_test["test"]
+    logging.info("✅ Dataset loaded in cache folder")
+
+    # Create directory structure
+    IMAGE_DIR = os.path.join(RAW_DATA_DIR, "images")
+    LABEL_DIR = os.path.join(RAW_DATA_DIR, "labels")
+    for split in ["train", "val", "test"]:
        os.makedirs(os.path.join(IMAGE_DIR, split), exist_ok=True)
        os.makedirs(os.path.join(LABEL_DIR, split), exist_ok=True)
 
-    # Save train and validation splits
-    save_ultralytics_format(dataset["train"], "train", IMAGE_DIR, LABEL_DIR)
-    save_ultralytics_format(dataset["val"], "val", IMAGE_DIR, LABEL_DIR)
+    # Save dataset splits
+    save_ultralytics_format(ds_train, "train", IMAGE_DIR, LABEL_DIR)
+    save_ultralytics_format(ds_val, "val", IMAGE_DIR, LABEL_DIR)
+    save_ultralytics_format(ds_test, "test", IMAGE_DIR, LABEL_DIR)
 
     # Create global dataframe from splits
-    df_val = create_df(dataset["val"], "val", OUTPUT_DIR)
     # Separate train to save memory
+    df_train_1 = create_df(ds_train[:6000], "train", RAW_DATA_DIR)
+    df_train_2 = create_df(ds_train[6000:12000], "train", RAW_DATA_DIR)
+    df_train_3 = create_df(ds_train[12000:18000], "train", RAW_DATA_DIR)
+    df_train_4 = create_df(ds_train[18000:], "train", RAW_DATA_DIR)
+    df_val = create_df(ds_val, "val", RAW_DATA_DIR)
+    df_test = create_df(ds_test, "test", RAW_DATA_DIR)
     # Save as one CSV
-    df = pd.concat(
+    df = pd.concat(
+        [df_train_1, df_train_2, df_train_3, df_train_4, df_val, df_test],
+        axis=0,
+        ignore_index=True,
+    )
+    # Create label column for classification
+    df["label"] = "smoke"
+    df.loc[df["annotations"].isna() | (df["annotations"] == ""), "label"] = "no_smoke"
+    # Reorder columns
+    df = df.loc[
+        :,
+        [
+            "name",
+            "label",
+            "split",
+            "format",
+            "mode",
+            "width",
+            "height",
+            "camera",
+            "partner",
+            "timestamp",
+            "annotations",
+            "uri",
+        ],
+    ]
+    # Save as CSV
+    with open(DB_INFO_URI, "wb") as f:
        df.to_csv(f)
 
+    # Clear HF default cache folder after it is done (6GB)
+    # 💡 Check first if path up-to-date in "clear_hf_cache.sh"
+    logging.info("🧹 Removing HF default cache folder...")
+    result = subprocess.run(["bash", CLR_CACHE_SCRIPT], capture_output=True, text=True)
+    # logging.info(result.stdout)
+    logging.info("✅ HF Cache folder removed")
+
+    return df
+
+
+def clean_df(df):
+    """Filter data to keep only necessary"""
+    # Filter columns
+    df = df[["name", "label", "split", "uri"]]
+    # Remove ".jpg" in name
+    df.loc[:, "name"] = df.name.apply(lambda x: x[:-4])
+
+    return df
+
+
+def format_data_keras(df):
+    """Format data for Keras models"""
+    if not os.path.exists(OUTPUT_DIR):
+        logging.warning(f"{OUTPUT_DIR} doesn't exist: (re)load data first")
+        return df
+
+    # Create Keras parent folder
+    keras_dir = os.path.join(OUTPUT_DIR, "keras")
+    os.makedirs(keras_dir, exist_ok=True)
+    # Create splits folders
+    for split in df.split.unique():
+        split_dir = os.path.join(keras_dir, split)
+        os.makedirs(split_dir, exist_ok=True)
+        # Create labels folders
+        for label in df.label.unique():
+            label_dir = os.path.join(split_dir, label)
+            os.makedirs(label_dir, exist_ok=True)
+
+    # Copy images to new URI and update in dataframe
+    df.loc[:, "uri_dest"] = df.apply(
+        lambda x: os.path.join(OUTPUT_DIR, "keras", x["split"], x["label"], x["name"])
+        + ".jpg",
+        axis=1,
+    )
+    df.apply(lambda x: shutil.copy2(x["uri"], x["uri_dest"]), axis=1)
+    df.drop(columns="uri", inplace=True)
+    df.rename(columns={"uri_dest": "uri"}, inplace=True)
 
     return df
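For a quick downstream check, the folder layout written by `format_data_keras` can be consumed directly by Keras. A minimal sketch, assuming TensorFlow is installed; the image size and batch size are arbitrary choices here, not project settings:

```python
import tensorflow as tf

# Reads data/keras/train/{smoke,no_smoke}/*.jpg as written by format_data_keras()
train_ds = tf.keras.utils.image_dataset_from_directory(
    "data/keras/train",
    labels="inferred",
    label_mode="binary",
    image_size=(224, 224),  # arbitrary target size
    batch_size=32,
    seed=42,
)
print(train_ds.class_names)  # ["no_smoke", "smoke"] (alphabetical order)
```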