Upload training.py
training.py  ADDED  (+254 -0)
@@ -0,0 +1,254 @@
# ONE EPOCH = one forward pass and one backward pass over all of the training examples.
#
# BATCH SIZE = the number of training examples in one forward/backward pass. The
# higher the batch size, the more memory you will need.
#
# NUMBER OF ITERATIONS = number of passes, each pass using [batch size] training
# examples. To be clear, one pass = one forward pass + one backward pass.
#
# Example: if you have 1000 training examples and your batch size is 500, it
# takes 2 iterations to complete 1 epoch.

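# Quick sanity check of the arithmetic above (illustrative numbers only, nothing
# below depends on them): iterations per epoch is the dataset size divided by the
# batch size, rounded up when the last batch is partial.
#   import math
#   math.ceil(1000 / 500)   # -> 2 iterations per epoch
#   math.ceil(1000 / 300)   # -> 4 iterations (the last, partial batch holds 100 examples)
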
import os
import time
import math

import torch
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader
from numpy import finfo

from Tacotron2 import tacotron_2
from fp16_optimizer import FP16_Optimizer
# from distributed import apply_gradient_allreduce
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger


def batchnorm_to_float(module):
    """Converts batch norm modules to FP32."""
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        batchnorm_to_float(child)
    return module


def reduce_tensor(tensor, n_gpus):
    # The clone is recorded in the computation graph, so gradients propagating to the
    # cloned tensor will also propagate to the original tensor.
    rt = tensor.clone()
    # Each rank holds its own tensor; all_reduce sums the tensors from all ranks and makes
    # the sum available on every rank. Dividing by n_gpus then gives every rank the average
    # (here one rank corresponds to one GPU).
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= n_gpus
    return rt
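
# Illustrative example (hypothetical values): with n_gpus=2 and per-rank losses of
# 1.0 and 3.0, all_reduce sums them to 4.0 on both ranks, and the division by n_gpus
# leaves every rank holding the averaged value 2.0.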


def prepare_directories_and_logger(output_directory, log_directory, rank):
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        logger = Tacotron2Logger(os.path.join(output_directory, log_directory))
        # logger = None
    else:
        logger = None
    return logger


def warm_start_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    return model


def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    learning_rate = checkpoint_dict['learning_rate']
    iteration = checkpoint_dict['iteration']
    print("Loaded checkpoint '{}' from iteration {}".format(checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(iteration, filepath))
    torch.save({'iteration': iteration,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)


def init_distributed(hyper_params, n_gpus, rank, group_name):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA"
    print("Initializing distributed")
    # Set the CUDA device so everything is done on the right GPU
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Initialize distributed communication
    torch.distributed.init_process_group(backend=hyper_params['dist_backend'], rank=rank, world_size=n_gpus,
                                         init_method=hyper_params['dist_url'], group_name=group_name)

    print("Initializing distributed: Done")


def load_model(hyper_params):
    # According to the documentation, it is recommended to move a model to the GPU before constructing the optimizer.
    # model = tacotron_2(hyper_params).cuda()
    model = tacotron_2(hyper_params)
    if hyper_params['fp16_run']:  # convert the model parameters to half precision (16 bits)
        model = batchnorm_to_float(model.half())
        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)

    # if hyper_params['distributed_run']:
    #     model = apply_gradient_allreduce(model)

    return model


def validate(model, criterion, valset, iteration, batch_size, n_gpus, collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing."""

    # Switch to eval() mode because this is an evaluation stage, not training
    model.eval()
    # Temporarily disable gradient tracking
    with torch.no_grad():
        # Sampler that restricts data loading to a subset of the dataset, so that in a
        # distributed run each rank validates on its own shard
        val_sampler = DistributedSampler(valset) if distributed_run else None
        # DataLoader wrapper around the validation data (same as for the training data)
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1, shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            _, _, _, _, gst_scores = y_pred
            if i == 0:
                validation_gst_scores = gst_scores
            else:
                validation_gst_scores = torch.cat((validation_gst_scores, gst_scores), 0)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()  # item() extracts the plain float value
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)  # average val_loss over all batches

    model.train()
    if rank == 0:
        print("Validation loss {}: {:9f}".format(iteration, val_loss))
        # print("GST scores of the validation set: {}".format(validation_gst_scores.shape))
        logger.log_validation(val_loss, model, y, y_pred, validation_gst_scores, iteration)


# ------------------------------------------- MAIN TRAINING METHOD -------------------------------------------------- #

def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name,
          hyper_params, train_loader, valset, collate_fn):
    """Training and validation method, logging results to TensorBoard and stdout.

    :param output_directory (string): directory to save checkpoints
    :param log_directory (string): directory to save tensorboard logs
    :param checkpoint_path (string): checkpoint path to resume or warm-start from (or None)
    :param warm_start (bool): if True, load only the model weights from the checkpoint
    :param n_gpus (int): number of gpus
    :param rank (int): rank of the current gpu
    :param group_name (string): name of the distributed process group
    :param hyper_params (dict): dictionary with all hyper parameters
    :param train_loader (DataLoader): loader over the training set
    :param valset (Dataset): validation dataset
    :param collate_fn (callable): collate function used to batch the validation data
    """

    # Check whether this is a distributed run
    if hyper_params['distributed_run']:
        init_distributed(hyper_params, n_gpus, rank, group_name)

    # Fix the random seed so that training is reproducible across runs
    torch.manual_seed(hyper_params['seed'])
    torch.cuda.manual_seed(hyper_params['seed'])

    model = load_model(hyper_params)
    learning_rate = hyper_params['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hyper_params['weight_decay'])

    if hyper_params['fp16_run']:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=hyper_params['dynamic_loss_scaling'])

    # Define the loss criterion (the training objective)
    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)
    # logger = ''

    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            # Warm start: load only the saved model weights and start training from iteration 0
            model = warm_start_model(checkpoint_path, model)
        else:
            # Resume: restore model, optimizer, learning rate and iteration count from the checkpoint
            model, optimizer, _learning_rate, iteration = load_checkpoint(checkpoint_path, model, optimizer)
            if hyper_params['use_saved_learning_rate']:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    # Make all modules and regularization aware that this is the training stage
    model.train()

    # MAIN LOOP
    for epoch in range(epoch_offset, hyper_params['epochs']):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            # Keep the optimizer's learning rate in sync with the current value
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            input_data, output_target = model.parse_batch(batch)
            output_predicted = model(input_data)

            loss = criterion(output_predicted, output_target)

            if hyper_params['distributed_run']:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hyper_params['fp16_run']:
                optimizer.backward(loss)  # the FP16 optimizer wrapper handles the backward pass and loss scaling
                grad_norm = optimizer.clip_fp32_grads(hyper_params['grad_clip_thresh'])
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hyper_params['grad_clip_thresh'])

            # Perform a single optimization step (parameter update)
            optimizer.step()
            # This boolean reports gradient overflow when running with the fp16 optimizer
            overflow = optimizer.overflow if hyper_params['fp16_run'] else False

            # Skip logging if the step overflowed or the loss is NaN
            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(iteration, reduced_loss,
                                                                                grad_norm, duration))
                # Log training information for the current iteration
                logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration)

            # Every iters_per_checkpoint iterations, validate the model with its updated parameters
            # and save a checkpoint
            if not overflow and (iteration % hyper_params['iters_per_checkpoint'] == 0):
                validate(model, criterion, valset, iteration, hyper_params['batch_size'], n_gpus, collate_fn,
                         logger, hyper_params['distributed_run'], rank)
                if rank == 0:
                    checkpoint_path = os.path.join(output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)

            iteration += 1
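

# A minimal sketch of how this module might be wired up from a launcher script. The
# create_hparams() helper, the TextMelDataset/TextMelCollate classes and the
# 'training_files'/'validation_files' keys referenced below are assumptions made for
# illustration (they are not defined in this file), so the sketch is left commented out.
#
#   if __name__ == '__main__':
#       from hparams import create_hparams                       # hypothetical helper returning the hyper-parameter dict
#       from data_utils import TextMelDataset, TextMelCollate    # hypothetical dataset / collate pair
#
#       hyper_params = create_hparams()
#       collate_fn = TextMelCollate()
#       train_loader = DataLoader(TextMelDataset(hyper_params['training_files'], hyper_params),
#                                 batch_size=hyper_params['batch_size'], shuffle=True, collate_fn=collate_fn)
#       valset = TextMelDataset(hyper_params['validation_files'], hyper_params)
#
#       train(output_directory='outdir', log_directory='logdir', checkpoint_path=None, warm_start=False,
#             n_gpus=1, rank=0, group_name='group_name', hyper_params=hyper_params,
#             train_loader=train_loader, valset=valset, collate_fn=collate_fn)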