# AIEM / trainer / train_yolov8.py
# YOLOv8 k-fold cross-validation training script with Azure Blob Storage upload.
import argparse
import os
import yaml
import shutil
import datetime
import numpy as np
import pandas as pd
import yaml
from azure.storage.blob import BlobServiceClient
from pathlib import Path
from sklearn.model_selection import KFold
from collections import Counter
from ultralytics import YOLO
from utils.path_utils import *
# --- Azure Blob Storage configuration ---------------------------------------
# SECURITY: a live storage-account key is hard-coded below and committed to
# source control. It should be rotated and supplied via a secret store; the
# environment-variable overrides keep existing deployments working meanwhile.
STORAGE_ACCOUNT_KEY = os.getenv(
    "AZURE_STORAGE_ACCOUNT_KEY",
    "mhqTCNmdIgsnvyFnfv0r2JKfs8iG//5YVnphCq336XNxhyI72brMy6lP88I9XKVya/G9ZlAAMoNd+AStsXFe0Q==",
)
STORAGE_ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT_NAME", "camtagstoreaiem")
CONNECTION_STRING = os.getenv(
    "AZURE_STORAGE_CONNECTION_STRING",
    "DefaultEndpointsProtocol=https;"
    f"AccountName={STORAGE_ACCOUNT_NAME};"
    f"AccountKey={STORAGE_ACCOUNT_KEY};"
    "EndpointSuffix=core.windows.net",
)
CONTAINER_NAME = os.getenv("AZURE_STORAGE_CONTAINER", "upload")

# Absolute path of the YAML file holding the training hyperparameters.
# Both APP_HOME and APP_TRAIN_HP_YAML must be set; fail fast with a clear
# message instead of the opaque TypeError os.path.join raises on None.
HOME = os.getenv("APP_HOME")
_hp_yaml = os.getenv("APP_TRAIN_HP_YAML")
if HOME is None or _hp_yaml is None:
    raise RuntimeError(
        "APP_HOME and APP_TRAIN_HP_YAML environment variables must be set"
    )
APP_TRAIN_HP_YAML = os.path.join(HOME, _hp_yaml)
def azure_upload(local_fname, blob_fname, overwrite=True):
    """Upload a local file to the configured Azure Blob Storage container.

    local_fname: path of the file on disk to upload.
    blob_fname:  destination blob name inside CONTAINER_NAME.
    overwrite:   replace an existing blob of the same name (default True).
    """
    service = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    blob = service.get_blob_client(container=CONTAINER_NAME, blob=blob_fname)
    with open(local_fname, "rb") as payload:
        blob.upload_blob(payload, overwrite=overwrite)
if __name__ == "__main__":
    # Load the training hyperparameters from the configured YAML file.
    with open(APP_TRAIN_HP_YAML, "r") as f:
        y = yaml.safe_load(f)
    KSPLIT = y['ksplit']
    EPOCHS = y['epochs']
    MODEL = y['model']
    DATA_PATH = y['data_path']
    BATCH_SIZE = y['batch_size']

    # Parse the merged COCO annotation file. Category ids are shifted to
    # 0-based YOLO class ids; this assumes COCO ids are 1-based and
    # contiguous -- TODO confirm against the dataset.
    coco_dataset_path = Path(DATA_PATH)
    coco_dict = read_coco_json(coco_dataset_path / "merged.json")
    classes = {cat['id'] - 1: cat['name'] for cat in coco_dict['categories']}
    cls_idx = sorted(classes.keys())

    # Build a table with one row per label file and one column per class id,
    # counting how many instances of each class the label file contains.
    labels = sorted((coco_dataset_path / "labels").rglob("*.txt"))
    indx = [lbl.stem for lbl in labels]
    labels_df = pd.DataFrame([], columns=cls_idx, index=indx)
    for lbl in labels:
        # First whitespace-separated token of each YOLO label line is the class id.
        counts = Counter(
            int(line.split(' ')[0]) for line in lbl.read_text().splitlines()
        )
        labels_df.loc[lbl.stem] = counts
    labels_df = labels_df.fillna(0.0)
# KFOLD
kf = KFold(
n_splits = KSPLIT,
shuffle = True,
random_state = 42
)
kfolds = list(kf.split(labels_df))
folds = [f'split_{n}' for n in range(1, KSPLIT + 1)]
folds_df = pd.DataFrame(index=indx, columns=folds)
for idx, (train, val) in enumerate(kfolds, start=1):
folds_df[f'split_{idx}'].loc[labels_df.iloc[train].index] = 'train'
folds_df[f'split_{idx}'].loc[labels_df.iloc[val].index] = 'val'
# check distributions. balanced?
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)
for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
train_totals = labels_df.iloc[train_indices].sum()
val_totals = labels_df.iloc[val_indices].sum()
ratio = val_totals / (train_totals + 1E-7)
fold_lbl_distrb.loc[f'split_{n}'] = ratio
# datasets for each fold
save_path = Path(coco_dataset_path / f'{datetime.date.today().isoformat()}_{KSPLIT}-Fold_Cross-val')
save_path.mkdir(parents=True, exist_ok=True)
suffix = sorted((coco_dataset_path / 'images').rglob("*.*"))[0].suffix
images = [coco_dataset_path / "images" / l.with_suffix(suffix).name for l in labels]
ds_yamls = []
for split in folds_df.columns:
# create directories
split_dir = save_path / split
split_dir.mkdir(parents=True, exist_ok=True)
(split_dir / 'train' / 'images').mkdir(parents=True, exist_ok=True)
(split_dir / 'train' / 'labels').mkdir(parents=True, exist_ok=True)
(split_dir / 'val' / 'images').mkdir(parents=True, exist_ok=True)
(split_dir / 'val' / 'labels').mkdir(parents=True, exist_ok=True)
# create yaml files
dataset_yaml = split_dir / f'{split}_dataset.yaml'
ds_yamls.append(dataset_yaml)
with open(dataset_yaml, 'w') as ds_y:
yaml.safe_dump({
'path' : split_dir.resolve().as_posix(),
'train': 'train',
'val' : 'val',
'names': classes
}, ds_y)
for image, label in zip(images, labels):
for split, k_split in folds_df.loc[image.stem].items():
# destination directory
img_to_path = save_path / split / k_split / 'images'
lbl_to_path = save_path / split / k_split / 'labels'
# copy image and label file to new directory
shutil.copy(image, img_to_path / image.name)
shutil.copy(label, lbl_to_path / label.name)
folds_df.to_csv(save_path / "kfold_datasplit.csv")
fold_lbl_distrb.to_csv(save_path / "kfold_label_distributions.csv")
model = YOLO(MODEL)
for k in range(KSPLIT):
dataset_yaml = ds_yamls[k]
model.train(
data = dataset_yaml,
epochs = EPOCHS,
batch = BATCH_SIZE,
plots = False
)
# azure upload
flag = '2' * (KSPLIT - 1)
local_fname = f'runs/detect/train{flag}/weights/best.pt'
blob_fname = f"kohberg/host_train_{MODEL}"
azure_upload(local_fname, blob_fname, overwrite=True)