import argparse
import datetime
import os
import shutil
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from azure.storage.blob import BlobServiceClient
from sklearn.model_selection import KFold
from ultralytics import YOLO

from utils.path_utils import *
# Azure Storage credentials are read from the environment so that no secret
# lives in source control. A key that has ever been committed to a repository
# must be treated as compromised and rotated in the Azure portal.
#
#   AZURE_STORAGE_ACCOUNT            storage account name (optional)
#   AZURE_STORAGE_KEY                storage account access key
#   AZURE_STORAGE_CONNECTION_STRING  full connection string; when set it takes
#                                    precedence over the name/key pair
STORAGE_ACCOUNT_KEY = os.getenv("AZURE_STORAGE_KEY", "")
STORAGE_ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT", "camtagstoreaiem")
CONNECTION_STRING = os.getenv(
    "AZURE_STORAGE_CONNECTION_STRING",
    "DefaultEndpointsProtocol=https;"
    f"AccountName={STORAGE_ACCOUNT_NAME};"
    f"AccountKey={STORAGE_ACCOUNT_KEY};",
)

# Get YAML file containing the training hyperparameters
HOME = os.getenv("APP_HOME")
# `os.path.join` raises TypeError on None, which would make the module
# unimportable whenever APP_HOME / APP_TRAIN_HP_YAML are unset. Fall back to
# empty components so the failure surfaces only when the file is opened.
APP_TRAIN_HP_YAML = os.path.join(HOME or "", os.getenv("APP_TRAIN_HP_YAML") or "")
def azure_upload(local_fname, blob_fname, overwrite=True):
    """Upload a local file to Azure Blob Storage.

    Args:
        local_fname: Path of the local file; it is opened in binary mode.
        blob_fname: Name (key) of the destination blob inside the container.
        overwrite: Forwarded to ``upload_blob``; when true an existing blob
            with the same name is replaced.

    Raises:
        OSError: If ``local_fname`` cannot be opened for reading.
    """
    # NOTE(review): CONTAINER_NAME is not defined in the visible part of this
    # file; presumably it is provided by `from utils.path_utils import *` --
    # confirm.
    # The service client is used as a context manager so that its underlying
    # HTTP transport is closed once the upload has finished.
    with BlobServiceClient.from_connection_string(CONNECTION_STRING) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(
            container=CONTAINER_NAME,
            blob=blob_fname,
        )
        with open(local_fname, "rb") as data:
            blob_client.upload_blob(data, overwrite=overwrite)
if __name__ == "__main__":
    # Script entry point: build K-fold train/val datasets from a labelled
    # image directory, train a YOLO model on every fold, then upload the
    # resulting checkpoint to Azure Blob Storage.

    # ------------------------------------------------------------------
    # Hyperparameters
    # ------------------------------------------------------------------
    with open(APP_TRAIN_HP_YAML, "r") as f:
        y = yaml.safe_load(f)
    KSPLIT = y['ksplit']
    EPOCHS = y['epochs']
    MODEL = y['model']
    DATA_PATH = y['data_path']
    BATCH_SIZE = y['batch_size']

    # ------------------------------------------------------------------
    # Class table (from the COCO json) and per-file label counts
    # ------------------------------------------------------------------
    coco_dataset_path = Path(DATA_PATH)
    coco_dict = read_coco_json(coco_dataset_path / "merged.json")
    # Category ids are shifted down by one to index the classes; presumably
    # the COCO ids are 1-based while the label files are 0-based -- confirm.
    classes = {cat['id'] - 1: cat['name'] for cat in coco_dict['categories']}
    cls_idx = sorted(classes.keys())

    labels = sorted((coco_dataset_path / "labels").rglob("*.txt"))
    indx = [label.stem for label in labels]

    # One row per label file, one column per class id; each cell holds how
    # many annotation lines of that class the file contains.
    labels_df = pd.DataFrame([], columns=cls_idx, index=indx)
    for label in labels:
        with open(label, 'r') as lf:
            # The first whitespace-separated token of each line is the class
            # id. Blank lines (e.g. a trailing newline) are skipped instead
            # of crashing int().
            label_counter = Counter(
                int(line.split()[0]) for line in lf if line.strip()
            )
        labels_df.loc[label.stem] = label_counter
    # Classes absent from a file were left as NaN above.
    labels_df = labels_df.fillna(0.0)

    # ------------------------------------------------------------------
    # K-fold assignment
    # ------------------------------------------------------------------
    kf = KFold(
        n_splits=KSPLIT,
        shuffle=True,
        random_state=42,  # fixed seed so the split is reproducible
    )
    kfolds = list(kf.split(labels_df))

    folds = [f'split_{n}' for n in range(1, KSPLIT + 1)]
    folds_df = pd.DataFrame(index=indx, columns=folds)
    for idx, (train, val) in enumerate(kfolds, start=1):
        # Single .loc[rows, col] assignment: the chained form
        # `folds_df[col].loc[rows] = ...` writes to a temporary and is a
        # no-op under pandas copy-on-write.
        folds_df.loc[labels_df.iloc[train].index, f'split_{idx}'] = 'train'
        folds_df.loc[labels_df.iloc[val].index, f'split_{idx}'] = 'val'

    # check distributions. balanced?
    # Per fold and class: ratio of validation label count to training label
    # count (the epsilon avoids division by zero for classes with no
    # training labels).
    fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)
    for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
        train_totals = labels_df.iloc[train_indices].sum()
        val_totals = labels_df.iloc[val_indices].sum()
        ratio = val_totals / (train_totals + 1E-7)
        fold_lbl_distrb.loc[f'split_{n}'] = ratio

    # ------------------------------------------------------------------
    # Materialise one dataset directory per fold
    # ------------------------------------------------------------------
    save_path = Path(
        coco_dataset_path
        / f'{datetime.date.today().isoformat()}_{KSPLIT}-Fold_Cross-val'
    )
    save_path.mkdir(parents=True, exist_ok=True)

    # All images are assumed to share the extension of the first file found
    # under images/; each image is located by its label file's stem.
    suffix = sorted((coco_dataset_path / 'images').rglob("*.*"))[0].suffix
    images = [
        coco_dataset_path / "images" / label.with_suffix(suffix).name
        for label in labels
    ]

    ds_yamls = []
    for split in folds_df.columns:
        # create directories
        split_dir = save_path / split
        split_dir.mkdir(parents=True, exist_ok=True)
        for subset in ('train', 'val'):
            for kind in ('images', 'labels'):
                (split_dir / subset / kind).mkdir(parents=True, exist_ok=True)

        # create yaml files
        dataset_yaml = split_dir / f'{split}_dataset.yaml'
        with open(dataset_yaml, 'w') as ds_y:
            yaml.safe_dump({
                'path': split_dir.resolve().as_posix(),
                'train': 'train',
                'val': 'val',
                'names': classes,
            }, ds_y)
        # Remember the yaml so the training loop below can index it by fold.
        ds_yamls.append(dataset_yaml)

    for image, label in zip(images, labels):
        for split, k_split in folds_df.loc[image.stem].items():
            # destination directory
            img_to_path = save_path / split / k_split / 'images'
            lbl_to_path = save_path / split / k_split / 'labels'

            # copy image and label file to new directory
            shutil.copy(image, img_to_path / image.name)
            shutil.copy(label, lbl_to_path / label.name)

    folds_df.to_csv(save_path / "kfold_datasplit.csv")
    fold_lbl_distrb.to_csv(save_path / "kfold_label_distributions.csv")

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    # NOTE(review): the same YOLO object is trained on every fold in turn;
    # if train() continues from the previous fold's weights the folds are
    # not independent -- confirm this is intended.
    model = YOLO(MODEL)
    for k in range(KSPLIT):
        dataset_yaml = ds_yamls[k]
        model.train(
            data=dataset_yaml,
            epochs=EPOCHS,
            batch=BATCH_SIZE,
            plots=False,
        )

    # ------------------------------------------------------------------
    # azure upload
    # ------------------------------------------------------------------
    # Take the checkpoint path from the trainer of the last run instead of
    # guessing the run directory name: 'train' + '2' * (KSPLIT - 1) only
    # matches the real directory for KSPLIT <= 2 on an empty runs/ folder.
    # NOTE(review): best.pt (falling back to last.pt) is an assumption about
    # which checkpoint should be published -- confirm.
    trainer = model.trainer
    weights = trainer.best if trainer.best.exists() else trainer.last
    local_fname = str(weights)
    blob_fname = f"kohberg/host_train_{MODEL}"
    azure_upload(local_fname, blob_fname, overwrite=True)