Spaces:
Build error
Build error
import argparse | |
import os | |
import yaml | |
import shutil | |
import datetime | |
import numpy as np | |
import pandas as pd | |
import yaml | |
from azure.storage.blob import BlobServiceClient | |
from pathlib import Path | |
from sklearn.model_selection import KFold | |
from collections import Counter | |
from ultralytics import YOLO | |
from utils.path_utils import * | |
# Azure Blob Storage settings.
#
# SECURITY: credentials are read from the environment and must never be
# embedded in source. Any account key that has ever been committed to this
# file has to be treated as leaked and rotated in the Azure portal.
#
#   AZURE_STORAGE_CONNECTION_STRING  full connection string (preferred), or
#   AZURE_STORAGE_ACCOUNT_KEY        account key, combined with the account
#   AZURE_STORAGE_ACCOUNT_NAME       name below to build the connection string
STORAGE_ACCOUNT_KEY = os.getenv("AZURE_STORAGE_ACCOUNT_KEY", "")
STORAGE_ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT_NAME", "camtagstoreaiem")
CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING", "")
if not CONNECTION_STRING and STORAGE_ACCOUNT_KEY:
    CONNECTION_STRING = (
        "DefaultEndpointsProtocol=https;"
        f"AccountName={STORAGE_ACCOUNT_NAME};"
        f"AccountKey={STORAGE_ACCOUNT_KEY};"
        "EndpointSuffix=core.windows.net"
    )
# CONNECTION_STRING stays empty when nothing is configured, so an upload
# attempt cannot silently fall back to a stale credential.
CONTAINER_NAME = "upload"

# Get YAML file containing the training hyperparameters.
# Both APP_HOME and APP_TRAIN_HP_YAML must be set; when either is missing the
# path is None instead of raising a TypeError while the module is imported.
HOME = os.getenv("APP_HOME")
_TRAIN_HP_YAML_NAME = os.getenv("APP_TRAIN_HP_YAML")
APP_TRAIN_HP_YAML = (
    os.path.join(HOME, _TRAIN_HP_YAML_NAME)
    if HOME and _TRAIN_HP_YAML_NAME
    else None
)
def azure_upload(local_fname, blob_fname, overwrite=True):
    """Upload a local file to the configured Azure Blob Storage container.

    Args:
        local_fname: Path of the local file to upload; it is opened in
            binary mode.
        blob_fname: Name of the destination blob inside ``CONTAINER_NAME``.
        overwrite: Forwarded to ``upload_blob``.
    """
    # Use the service client as a context manager so it is closed once the
    # upload is done instead of staying open until the interpreter exits.
    with BlobServiceClient.from_connection_string(CONNECTION_STRING) as service:
        blob_client = service.get_blob_client(
            container=CONTAINER_NAME,
            blob=blob_fname,
        )
        with open(local_fname, "rb") as data:
            blob_client.upload_blob(data, overwrite=overwrite)
def _count_labels(label_files, cls_idx):
    """Count annotated objects per class for every label file.

    Each non-blank line of a label file is expected to start with an integer
    class id. Returns a DataFrame indexed by label-file stem with one column
    per entry of ``cls_idx``; classes absent from a file are filled with 0.0.
    """
    stems = [label_file.stem for label_file in label_files]
    counts_df = pd.DataFrame([], columns=cls_idx, index=stems)
    for label_file in label_files:
        counter = Counter()
        with open(label_file, "r") as lf:
            for line in lf:
                fields = line.split()
                if not fields:
                    # Blank (e.g. trailing) lines carry no annotation.
                    continue
                counter[int(fields[0])] += 1
        counts_df.loc[label_file.stem] = counter
    return counts_df.fillna(0.0)


def _assign_folds(labels_df, ksplit):
    """Assign every file to 'train' or 'val' in each of ``ksplit`` folds.

    Returns ``(folds_df, fold_lbl_distrb)``: ``folds_df`` has one row per file
    and one ``split_<n>`` column per fold; ``fold_lbl_distrb`` holds, per fold
    and class, the ratio of validation to training object counts.
    """
    kf = KFold(n_splits=ksplit, shuffle=True, random_state=42)
    kfolds = list(kf.split(labels_df))

    folds = [f'split_{n}' for n in range(1, ksplit + 1)]
    folds_df = pd.DataFrame(index=labels_df.index, columns=folds)
    fold_lbl_distrb = pd.DataFrame(index=folds, columns=labels_df.columns)

    for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
        split = f'split_{n}'
        # A single .loc[rows, column] assignment: chained indexing
        # (df[col].loc[rows] = ...) is not guaranteed to write through to
        # the frame and is a no-op under pandas Copy-on-Write.
        folds_df.loc[labels_df.index[train_indices], split] = 'train'
        folds_df.loc[labels_df.index[val_indices], split] = 'val'

        # check distributions. balanced?
        train_totals = labels_df.iloc[train_indices].sum()
        val_totals = labels_df.iloc[val_indices].sum()
        # The epsilon avoids division by zero for classes with no train objects.
        fold_lbl_distrb.loc[split] = val_totals / (train_totals + 1E-7)

    return folds_df, fold_lbl_distrb


def _write_fold_datasets(save_path, folds_df, images, labels, classes):
    """Materialise one dataset directory per fold and return the YAML paths.

    For every fold a ``train``/``val`` tree with ``images`` and ``labels``
    sub-directories and a ``<split>_dataset.yaml`` file are created, then
    each image/label pair is copied into the subset it was assigned to.
    """
    ds_yamls = []
    for split in folds_df.columns:
        # create directories
        split_dir = save_path / split
        for subset in ('train', 'val'):
            for kind in ('images', 'labels'):
                (split_dir / subset / kind).mkdir(parents=True, exist_ok=True)

        # create yaml files
        dataset_yaml = split_dir / f'{split}_dataset.yaml'
        ds_yamls.append(dataset_yaml)
        with open(dataset_yaml, 'w') as ds_y:
            yaml.safe_dump({
                'path': split_dir.resolve().as_posix(),
                'train': 'train',
                'val': 'val',
                'names': classes,
            }, ds_y)

    for image, label in zip(images, labels):
        for split, k_split in folds_df.loc[image.stem].items():
            # destination directory ('train' or 'val' for this fold)
            img_to_path = save_path / split / k_split / 'images'
            lbl_to_path = save_path / split / k_split / 'labels'
            # copy image and label file to new directory
            shutil.copy(image, img_to_path / image.name)
            shutil.copy(label, lbl_to_path / label.name)

    return ds_yamls


if __name__ == "__main__":
    if not APP_TRAIN_HP_YAML:
        raise SystemExit(
            "APP_HOME and APP_TRAIN_HP_YAML must point to the training "
            "hyperparameter YAML file."
        )
    with open(APP_TRAIN_HP_YAML, "r") as f:
        y = yaml.safe_load(f)
    KSPLIT = y['ksplit']
    EPOCHS = y['epochs']
    MODEL = y['model']
    DATA_PATH = y['data_path']
    BATCH_SIZE = y['batch_size']

    # coco
    coco_dataset_path = Path(DATA_PATH)
    # read_coco_json is presumably provided by utils.path_utils (star import).
    coco_dict = read_coco_json(coco_dataset_path / "merged.json")
    # Category ids are shifted down by one to obtain the class indices.
    classes = {cat['id'] - 1: cat['name'] for cat in coco_dict['categories']}
    cls_idx = sorted(classes)

    labels = sorted((coco_dataset_path / "labels").rglob("*.txt"))
    if not labels:
        raise FileNotFoundError(
            f"No label files found under {coco_dataset_path / 'labels'}"
        )
    labels_df = _count_labels(labels, cls_idx)

    # KFOLD
    folds_df, fold_lbl_distrb = _assign_folds(labels_df, KSPLIT)

    # datasets for each fold
    save_path = coco_dataset_path / f'{datetime.date.today().isoformat()}_{KSPLIT}-Fold_Cross-val'
    save_path.mkdir(parents=True, exist_ok=True)

    image_files = sorted((coco_dataset_path / 'images').rglob("*.*"))
    if not image_files:
        raise FileNotFoundError(
            f"No image files found under {coco_dataset_path / 'images'}"
        )
    # All images are assumed to share the extension of the first one found.
    suffix = image_files[0].suffix
    images = [coco_dataset_path / "images" / l.with_suffix(suffix).name for l in labels]

    ds_yamls = _write_fold_datasets(save_path, folds_df, images, labels, classes)

    folds_df.to_csv(save_path / "kfold_datasplit.csv")
    fold_lbl_distrb.to_csv(save_path / "kfold_label_distributions.csv")

    # NOTE(review): the same model object is trained on every fold in turn,
    # so each fold starts from the weights left by the previous one. For an
    # independent cross-validation a fresh YOLO(MODEL) per fold would be
    # needed -- confirm which behaviour is intended.
    model = YOLO(MODEL)
    for dataset_yaml in ds_yamls:
        model.train(
            data=dataset_yaml,
            epochs=EPOCHS,
            batch=BATCH_SIZE,
            plots=False,
        )

    # azure upload
    # Ask the trainer where the last run stored its best checkpoint instead
    # of guessing the run directory name: Ultralytics numbers runs train,
    # train2, train3, ... and continues after any runs that already exist.
    local_fname = str(model.trainer.best)
    blob_fname = f"kohberg/host_train_{MODEL}"
    azure_upload(local_fname, blob_fname, overwrite=True)