AlexK-PL committed
Commit 17ca749 · Parent: 48aeee0

Upload 6 files

Files changed (6)
  1. data_preparation.py +102 -0
  2. distributed.py +180 -0
  3. fp16_optimizer.py +385 -0
  4. loss_function.py +25 -0
  5. loss_scaler.py +79 -0
  6. multiproc.py +23 -0
data_preparation.py ADDED
@@ -0,0 +1,102 @@
import random

import numpy as np
import torch
import torch.utils.data

import nn_layers
from scipy.io.wavfile import read
from text import text_to_sequence
from hyper_parameters import tacotron_params


class DataPreparation(torch.utils.data.Dataset):

    def __init__(self, audiopaths_and_text, tacotron_hyperparams):
        self.audiopaths_and_text = audiopaths_and_text
        self.audio_text_parameters = tacotron_hyperparams
        self.stft = nn_layers.TacotronSTFT(tacotron_hyperparams['filter_length'], tacotron_hyperparams['hop_length'],
                                           tacotron_hyperparams['win_length'], tacotron_hyperparams['n_mel_channels'],
                                           self.audio_text_parameters['sampling_rate'],
                                           tacotron_hyperparams['mel_fmin'], tacotron_hyperparams['mel_fmax'])
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def load_audiowav_torch(self, audiopath, samp_rate):
        sr, data = read(audiopath)
        assert samp_rate == sr, "Sample rate does not match with the configuration"

        return torch.FloatTensor(data.astype(np.float32))

    def melspec_textSequence_pair(self, audiopath_and_text):
        wav_path, sentence = audiopath_and_text[0], audiopath_and_text[1]
        # wav to torch tensor
        wav_torch = self.load_audiowav_torch(wav_path, self.audio_text_parameters['sampling_rate'])
        wav_torch_norm = wav_torch / self.audio_text_parameters['max_wav_value']
        wav_torch_norm = wav_torch_norm.unsqueeze(0)
        wav_torch_norm = torch.autograd.Variable(wav_torch_norm, requires_grad=False)
        mel_spec = self.stft.mel_spectrogram(wav_torch_norm)
        mel_spec = torch.squeeze(mel_spec, 0)
        # text to torch integer tensor sequence
        sentence_sequence = torch.IntTensor(text_to_sequence(sentence, self.audio_text_parameters['text_cleaners']))

        return sentence_sequence, mel_spec

    def __getitem__(self, index):
        return self.melspec_textSequence_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class DataCollate:

    def __init__(self, number_frames_step):
        self.number_frames_step = number_frames_step

    def __call__(self, batch):
        inp_lengths, sorted_decreasing = torch.sort(torch.LongTensor([len(x[0]) for x in batch]),
                                                    dim=0, descending=True)
        max_length_in = inp_lengths[0]

        # padding sentences sequences for a fixed-length tensor size
        sentences_padded = torch.LongTensor(len(batch), max_length_in)
        sentences_padded.zero_()
        for i in range(len(sorted_decreasing)):
            int_seq_sentence = batch[sorted_decreasing[i]][0]
            # all slots of a line until the end of the sentence. The rest, 0's
            sentences_padded[i, :int_seq_sentence.size(0)] = int_seq_sentence

        # length of the mel filterbank used
        num_melfilters = batch[0][1].size(0)

        # longest recorded spectrogram representation + 1 space to mark the end
        max_length_target = max([x[1].size(1) for x in batch])  # THERE IS A CHANGE FROM THE ORIGINAL CODE!!!
        # add extra space if the number of frames per step is higher than 1
        if max_length_target % self.number_frames_step != 0:
            max_length_target += self.number_frames_step - max_length_target % self.number_frames_step
            assert max_length_target % self.number_frames_step == 0

        # padding mel spectrogram representations. The output is a 3D tensor
        melspec_padded = torch.FloatTensor(len(batch), num_melfilters, max_length_target)
        melspec_padded.zero_()

        # GST new prosody matrices definition with zero padding:
        prosody_padded = torch.FloatTensor(len(batch), num_melfilters, max_length_target)
        prosody_padded.zero_()

        gate_padded = torch.FloatTensor(len(batch), max_length_target)
        gate_padded.zero_()
        output_lengths = torch.LongTensor(len(batch))

        for j in range(len(sorted_decreasing)):
            melspec = batch[sorted_decreasing[j]][1]
            melspec_padded[j, :, :melspec.size(1)] = melspec

            # GST filling padded prosody matrix:
            prosody_padded[j, :, :melspec.size(1)] = melspec

            gate_padded[j, melspec.size(1) - 1:] = 1
            output_lengths[j] = melspec.size(1)

        return sentences_padded, inp_lengths, melspec_padded, gate_padded, output_lengths, prosody_padded
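
Usage sketch (illustrative, not part of the uploaded files): DataPreparation and DataCollate are meant to feed a torch.utils.data.DataLoader. The metadata layout (one "wav_path|sentence" pair per line), the file path, and the hyperparameter keys 'number_frames_step' and 'batch_size' are assumptions about this repository, not definitions from this commit.

from torch.utils.data import DataLoader

from data_preparation import DataPreparation, DataCollate
from hyper_parameters import tacotron_params


def load_filepaths_and_text(metadata_file):
    # Assumed metadata layout: one "<path to wav>|<transcription>" pair per line.
    with open(metadata_file, encoding='utf-8') as f:
        return [line.strip().split('|') for line in f]


audiopaths_and_text = load_filepaths_and_text('filelists/train_metadata.csv')  # hypothetical path
train_set = DataPreparation(audiopaths_and_text, tacotron_params)
collate_fn = DataCollate(tacotron_params['number_frames_step'])  # assumed key name

train_loader = DataLoader(train_set, batch_size=tacotron_params['batch_size'],
                          shuffle=True, drop_last=True, collate_fn=collate_fn)

for sentences, inp_lengths, mels, gates, out_lengths, prosody in train_loader:
    # Each batch arrives sorted by text length and zero-padded by DataCollate.
    break
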
distributed.py ADDED
@@ -0,0 +1,180 @@
import torch
import torch.distributed as dist
from torch.nn.modules import Module
from torch.autograd import Variable


def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.
    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.
    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat


def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.
    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.
    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)


'''
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
launcher included with this example. It assumes that your run is using multiprocess with 1
GPU/process, that the model is on the correct device, and that torch.set_device has been
used to set the device.
Parameters are broadcast to the other processes on initialization of DistributedDataParallel,
and will be allreduced at the finish of the backward pass.
'''


class DistributedDataParallel(Module):

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        # fallback for PyTorch 0.3
        if not hasattr(dist, '_backend'):
            self.warn_on_half = True
        else:
            self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module

        for p in self.module.state_dict().values():
            if not torch.is_tensor(p):
                continue
            dist.broadcast(p, 0)

        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case. This currently requires " +
                              "PyTorch built from top of tree master.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        for param in list(self.module.parameters()):
            def allreduce_hook(*unused):
                param._execution_engine.queue_callback(allreduce_params)
            if param.requires_grad:
                param.register_hook(allreduce_hook)

    def forward(self, *inputs, **kwargs):
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)
    def train(self, mode=True):
        # Clear NCCL communicator and CUDA event cache of the default group ID,
        # These cache will be recreated at the later call. This is currently a
        # work-around for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''


'''
Modifies existing model to do gradient allreduce, but doesn't change class
so you don't need "module"
'''


def apply_gradient_allreduce(module):
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires " +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False

            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
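
Usage sketch (illustrative, not part of the uploaded files): as the comment block above states, this module assumes one GPU per process as launched by multiproc.py. A typical wiring initializes the process group first and then wraps the model with apply_gradient_allreduce; the backend, URL and model name below are placeholders, not values defined by this commit.

import torch
import torch.distributed as dist

from distributed import apply_gradient_allreduce


def init_distributed(rank, n_gpus, group_name, backend='nccl', dist_url='tcp://localhost:54321'):
    assert torch.cuda.is_available(), "Distributed training requires CUDA."
    # Pin this process to its GPU before creating the process group.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    dist.init_process_group(backend=backend, init_method=dist_url,
                            world_size=n_gpus, rank=rank, group_name=group_name)


# init_distributed(rank, n_gpus, group_name)   # rank/n_gpus/group_name come from the multiproc.py flags
# model = Tacotron2(tacotron_params).cuda()    # hypothetical model, already on the right device
# model = apply_gradient_allreduce(model)      # gradients are all-reduced at the end of backward()
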
fp16_optimizer.py ADDED
@@ -0,0 +1,385 @@
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from loss_scaler import DynamicLossScaler, LossScaler

FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)


def conversion_helper(val, conversion):
    """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure."""
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    if isinstance(val, tuple):
        rtn = tuple(rtn)
    return rtn


def fp32_to_fp16(val):
    """Convert fp32 `val` to fp16"""
    def half_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, FLOAT_TYPES):
            val = val.half()
        return val
    return conversion_helper(val, half_conversion)


def fp16_to_fp32(val):
    """Convert fp16 `val` to fp32"""
    def float_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, HALF_TYPES):
            val = val.float()
        return val
    return conversion_helper(val, float_conversion)


class FP16_Module(nn.Module):
    def __init__(self, module):
        super(FP16_Module, self).__init__()
        self.add_module('module', module.half())

    def forward(self, *inputs, **kwargs):
        return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs))


class FP16_Optimizer(object):
    """
    FP16_Optimizer is designed to wrap an existing PyTorch optimizer,
    and enable an fp16 model to be trained using a master copy of fp32 weights.

    Args:
        optimizer (torch.optim.optimizer): Existing optimizer containing initialized fp16 parameters. Internally, FP16_Optimizer replaces the passed optimizer's fp16 parameters with new fp32 parameters copied from the original ones. FP16_Optimizer also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy after each step.
        static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale fp16 gradients computed by the model. Scaled gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so static_loss_scale should not affect learning rate.
        dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any static_loss_scale option.

    """

    def __init__(self, optimizer, static_loss_scale=1.0, dynamic_loss_scale=False):
        if not torch.cuda.is_available():
            raise SystemError('Cannot use fp16 without CUDA')

        self.fp16_param_groups = []
        self.fp32_param_groups = []
        self.fp32_flattened_groups = []
        for i, param_group in enumerate(optimizer.param_groups):
            print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            for param in param_group['params']:
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                              .format(param.size()))
                        fp16_params_this_group.append(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                              .format(param.size()))
                        fp32_params_this_group.append(param)
                    else:
                        raise TypeError("Wrapped parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                        "Received {}".format(param.type()))

            fp32_flattened_this_group = None
            if len(fp16_params_this_group) > 0:
                fp32_flattened_this_group = _flatten_dense_tensors(
                    [param.detach().data.clone().float() for param in fp16_params_this_group])

                fp32_flattened_this_group = Variable(fp32_flattened_this_group, requires_grad=True)

                fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
                    *fp32_flattened_this_group.size())

            # python's lovely list concatenation via +
            if fp32_flattened_this_group is not None:
                param_group['params'] = [fp32_flattened_this_group] + fp32_params_this_group
            else:
                param_group['params'] = fp32_params_this_group

            self.fp16_param_groups.append(fp16_params_this_group)
            self.fp32_param_groups.append(fp32_params_this_group)
            self.fp32_flattened_groups.append(fp32_flattened_this_group)

        # print("self.fp32_flattened_groups = ", self.fp32_flattened_groups)
        # print("self.fp16_param_groups = ", self.fp16_param_groups)

        self.optimizer = optimizer.__class__(optimizer.param_groups)

        # self.optimizer.load_state_dict(optimizer.state_dict())

        self.param_groups = self.optimizer.param_groups

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True

    def zero_grad(self):
        """
        Zero fp32 and fp16 parameter grads.
        """
        self.optimizer.zero_grad()
        for fp16_group in self.fp16_param_groups:
            for param in fp16_group:
                if param.grad is not None:
                    param.grad.detach_()  # This does appear in torch.optim.optimizer.zero_grad(),
                                          # but I'm not sure why it's needed.
                    param.grad.zero_()

    def _check_overflow(self):
        params = []
        for group in self.fp16_param_groups:
            for param in group:
                params.append(param)
        for group in self.fp32_param_groups:
            for param in group:
                params.append(param)
        self.overflow = self.loss_scaler.has_overflow(params)

    def _update_scale(self, has_overflow=False):
        self.loss_scaler.update_scale(has_overflow)

    def _copy_grads_fp16_to_fp32(self):
        for fp32_group, fp16_group in zip(self.fp32_flattened_groups, self.fp16_param_groups):
            if len(fp16_group) > 0:
                # This might incur one more deep copy than is necessary.
                fp32_group.grad.data.copy_(
                    _flatten_dense_tensors([fp16_param.grad.data for fp16_param in fp16_group]))

    def _downscale_fp32(self):
        if self.loss_scale != 1.0:
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
                    param.grad.data.mul_(1./self.loss_scale)

    def clip_fp32_grads(self, clip=-1):
        if not self.overflow:
            fp32_params = []
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
                    fp32_params.append(param)
            if clip > 0:
                return torch.nn.utils.clip_grad_norm_(fp32_params, clip)

    def _copy_params_fp32_to_fp16(self):
        for fp16_group, fp32_group in zip(self.fp16_param_groups, self.fp32_flattened_groups):
            if len(fp16_group) > 0:
                for fp16_param, fp32_data in zip(fp16_group, _unflatten_dense_tensors(fp32_group.data, fp16_group)):
                    fp16_param.data.copy_(fp32_data)

    def state_dict(self):
        """
        Returns a dict containing the current state of this FP16_Optimizer instance.
        This dict contains attributes of FP16_Optimizer, as well as the state_dict
        of the contained Pytorch optimizer.

        Untested.
        """
        state_dict = {}
        state_dict['loss_scaler'] = self.loss_scaler
        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
        state_dict['overflow'] = self.overflow
        state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step
        state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
        return state_dict

    def load_state_dict(self, state_dict):
        """
        Loads a state_dict created by an earlier call to state_dict.

        Untested.
        """
        self.loss_scaler = state_dict['loss_scaler']
        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
        self.overflow = state_dict['overflow']
        self.first_closure_call_this_step = state_dict['first_closure_call_this_step']
        self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    def step(self, closure=None):  # could add clip option.
        """
        If no closure is supplied, step should be called after fp16_optimizer_obj.backward(loss).
        step updates the fp32 master copy of parameters using the optimizer supplied to
        FP16_Optimizer's constructor, then copies the updated fp32 params into the fp16 params
        originally referenced by FP16_Optimizer's constructor, so the user may immediately run
        another forward pass using their model.

        If a closure is supplied, step may be called without a prior call to self.backward(loss).
        However, the user should take care that any loss.backward() call within the closure
        has been replaced by fp16_optimizer_obj.backward(loss).

        Args:
            closure (optional): Closure that will be supplied to the underlying optimizer originally passed to FP16_Optimizer's constructor. closure should call zero_grad on the FP16_Optimizer object, compute the loss, call .backward(loss), and return the loss.

        Closure example::

            # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
            # existing pytorch optimizer.
            for input, target in dataset:
                def closure():
                    optimizer.zero_grad()
                    output = model(input)
                    loss = loss_fn(output, target)
                    optimizer.backward(loss)
                    return loss
                optimizer.step(closure)

        .. note::
            The only changes that need to be made compared to
            `ordinary optimizer closures`_ are that "optimizer" itself should be an instance of
            FP16_Optimizer, and that the call to loss.backward should be replaced by
            optimizer.backward(loss).

        .. warning::
            Currently, calling step with a closure is not compatible with dynamic loss scaling.

        .. _`ordinary optimizer closures`:
            http://pytorch.org/docs/master/optim.html#optimizer-step-closure
        """
        if closure is not None and isinstance(self.loss_scaler, DynamicLossScaler):
            raise TypeError("Using step with a closure is currently not "
                            "compatible with dynamic loss scaling.")

        scale = self.loss_scaler.loss_scale
        self._update_scale(self.overflow)

        if self.overflow:
            print("OVERFLOW! Skipping step. Attempted loss scale: {}".format(scale))
            return

        if closure is not None:
            self._step_with_closure(closure)
        else:
            self.optimizer.step()

        self._copy_params_fp32_to_fp16()

        return

    def _step_with_closure(self, closure):
        def wrapped_closure():
            if self.first_closure_call_this_step:
                """
                We expect that the fp16 params are initially fresh on entering self.step(),
                so _copy_params_fp32_to_fp16() is unnecessary the first time wrapped_closure()
                is called within self.optimizer.step().
                """
                self.first_closure_call_this_step = False
            else:
                """
                If self.optimizer.step() internally calls wrapped_closure more than once,
                it may update the fp32 params after each call. However, self.optimizer
                doesn't know about the fp16 params at all. If the fp32 params get updated,
                we can't rely on self.optimizer to refresh the fp16 params. We need
                to handle that manually:
                """
                self._copy_params_fp32_to_fp16()

            """
            Our API expects the user to give us ownership of the backward() call by
            replacing all calls to loss.backward() with optimizer.backward(loss).
            This requirement holds whether or not the call to backward() is made within
            a closure.
            If the user is properly calling optimizer.backward(loss) within "closure,"
            calling closure() here will give the fp32 master params fresh gradients
            for the optimizer to play with,
            so all wrapped_closure needs to do is call closure() and return the loss.
            """
            temp_loss = closure()
            return temp_loss

        self.optimizer.step(wrapped_closure)

        self.first_closure_call_this_step = True

    def backward(self, loss, update_fp32_grads=True):
        """
        fp16_optimizer_obj.backward performs the following conceptual operations:

        fp32_loss = loss.float() (see first Note below)

        scaled_loss = fp32_loss*loss_scale

        scaled_loss.backward(), which accumulates scaled gradients into the .grad attributes of the
        fp16 model's leaves.

        fp16 grads are then copied to the stored fp32 params' .grad attributes (see second Note).

        Finally, fp32 grads are divided by loss_scale.

        In this way, after fp16_optimizer_obj.backward, the fp32 parameters have fresh gradients,
        and fp16_optimizer_obj.step may be called.

        .. note::
            Converting the loss to fp32 before applying the loss scale provides some
            additional safety against overflow if the user has supplied an fp16 value.
            However, for maximum overflow safety, the user should
            compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
            fp16_optimizer_obj.backward.

        .. note::
            The gradients found in an fp16 model's leaves after a call to
            fp16_optimizer_obj.backward should not be regarded as valid in general,
            because it's possible
            they have been scaled (and in the case of dynamic loss scaling,
            the scale factor may silently change over time).
            If the user wants to inspect gradients after a call to fp16_optimizer_obj.backward,
            he/she should query the .grad attribute of FP16_Optimizer's stored fp32 parameters.

        Args:
            loss: The loss output by the user's model. loss may be either float or half (but see first Note above).
            update_fp32_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay this copy, which is useful to eliminate redundant fp16->fp32 grad copies if fp16_optimizer_obj.backward is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling fp16_optimizer_obj.update_fp32_grads before calling fp16_optimizer_obj.step.

        Example::

            # Ordinary operation:
            optimizer.backward(loss)

            # Naive operation with multiple losses (technically valid, but less efficient):
            # fp32 grads will be correct after the second call, but
            # the first call incurs an unnecessary fp16->fp32 grad copy.
            optimizer.backward(loss1)
            optimizer.backward(loss2)

            # More efficient way to handle multiple losses:
            # The fp16->fp32 grad copy is delayed until fp16 grads from all
            # losses have been accumulated.
            optimizer.backward(loss1, update_fp32_grads=False)
            optimizer.backward(loss2, update_fp32_grads=False)
            optimizer.update_fp32_grads()
        """
        self.loss_scaler.backward(loss.float())
        if update_fp32_grads:
            self.update_fp32_grads()

    def update_fp32_grads(self):
        """
        Copy the .grad attribute from stored references to fp16 parameters to
        the .grad attribute of the master fp32 parameters that are directly
        updated by the optimizer. :attr:`update_fp32_grads` only needs to be called if
        fp16_optimizer_obj.backward was called with update_fp32_grads=False.
        """
        if self.dynamic_loss_scale:
            self._check_overflow()
            if self.overflow: return
        self._copy_grads_fp16_to_fp32()
        self._downscale_fp32()

    @property
    def loss_scale(self):
        return self.loss_scaler.loss_scale
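
Usage sketch (illustrative, not part of the uploaded files): the training-loop pattern the docstrings above describe, shown on a toy nn.Linear model so it stays self-contained. A CUDA device is required, and the layer sizes, learning rate, loss scale and clip value are arbitrary choices for the sketch.

import torch
from torch import nn

from fp16_optimizer import FP16_Module, FP16_Optimizer

model = FP16_Module(nn.Linear(80, 80).cuda())        # casts the wrapped module to half precision
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)  # or dynamic_loss_scale=True

for _ in range(3):
    inputs = torch.randn(16, 80, device='cuda')
    targets = torch.randn(16, 80, device='cuda')
    optimizer.zero_grad()
    loss = nn.MSELoss()(model(inputs), targets)      # FP16_Module returns fp32 outputs
    optimizer.backward(loss)                         # replaces loss.backward(); applies the loss scale
    optimizer.clip_fp32_grads(clip=1.0)
    optimizer.step()                                 # skipped internally if the scaled grads overflowed
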
loss_function.py ADDED
@@ -0,0 +1,25 @@
from torch import nn


class Tacotron2Loss(nn.Module):
    def __init__(self):
        super(Tacotron2Loss, self).__init__()

    def forward(self, model_output, targets):
        mel_target, gate_target = targets[0], targets[1]
        mel_target.requires_grad = False
        gate_target.requires_grad = False
        # Ensures dimension 1 will be size 1, the rest can be adapted. It is a column of length 189 with all zeroes
        # till the end of the current sequence, which is filled with 1's
        gate_target = gate_target.view(-1, 1)

        mel_out, mel_out_postnet, gate_out, _, _ = model_output
        gate_out = gate_out.view(-1, 1)
        # Mean Square Error (L2) loss function for decoder generation + post net generation
        mel_loss = nn.MSELoss()(mel_out, mel_target) + \
            nn.MSELoss()(mel_out_postnet, mel_target)
        # Binary Cross Entropy with a Sigmoid layer combined. It is more efficient than using a plain Sigmoid
        # followed by a BCELoss as, by combining the operations into one layer, we take advantage of the log-sum-exp
        # trick for numerical stability
        gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
        return mel_loss + gate_loss
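
Usage sketch (illustrative, not part of the uploaded files): Tacotron2Loss unpacks a 5-tuple of model outputs and a (mel, gate) target pair. The shapes below mirror what data_preparation.DataCollate produces, with random placeholder tensors standing in for real model outputs.

import torch

from loss_function import Tacotron2Loss

batch_size, n_mel_channels, n_frames = 4, 80, 200
mel_out = torch.randn(batch_size, n_mel_channels, n_frames)
mel_out_postnet = torch.randn(batch_size, n_mel_channels, n_frames)
gate_out = torch.randn(batch_size, n_frames)        # raw logits; BCEWithLogitsLoss applies the sigmoid
alignments = prosody = None                         # ignored by the loss, placeholders for the 5-tuple

mel_target = torch.randn(batch_size, n_mel_channels, n_frames)
gate_target = torch.zeros(batch_size, n_frames)
gate_target[:, -1] = 1.0                            # 1's from the last frame of each sequence onwards

criterion = Tacotron2Loss()
loss = criterion((mel_out, mel_out_postnet, gate_out, alignments, prosody),
                 (mel_target, gate_target))
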
loss_scaler.py ADDED
@@ -0,0 +1,79 @@
class LossScaler:

    def __init__(self, scale=1):
        self.cur_scale = scale

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        return False

    # `x` is a torch.Tensor
    @staticmethod
    def _has_inf_or_nan(x):
        return False

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
        pass

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss):
        scaled_loss = loss*self.loss_scale
        scaled_loss.backward()


class DynamicLossScaler:

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000):
        self.cur_scale = init_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        for p in params:
            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
                return True

        return False

    # `x` is a torch.Tensor
    @staticmethod
    def _has_inf_or_nan(x):
        cpu_sum = float(x.float().sum())
        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
            return True
        return False

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
        if overflow:
            # self.cur_scale /= self.scale_factor
            self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
                self.cur_scale *= self.scale_factor
                # self.cur_scale = 1
        self.cur_iter += 1

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss):
        scaled_loss = loss*self.loss_scale
        scaled_loss.backward()
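
Behaviour sketch (illustrative, not part of the uploaded files): DynamicLossScaler shrinks the scale whenever a gradient overflow is reported and doubles it again after scale_window consecutive clean iterations. The short driver below makes that visible without any model in the loop.

from loss_scaler import DynamicLossScaler

scaler = DynamicLossScaler(init_scale=2**15, scale_factor=2., scale_window=4)

overflow_pattern = [False, False, False, False, True, False, False, False, False]
for overflowed in overflow_pattern:
    scaler.update_scale(overflowed)
    print(scaler.cur_iter, scaler.loss_scale)
# The scale doubles to 2**16 after four clean steps, drops back to 2**15 on the
# overflow step, and doubles again once another full window has passed.
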
multiproc.py ADDED
@@ -0,0 +1,23 @@
import time
import torch
import sys
import subprocess

# Forward the training script and its flags (everything after "multiproc.py")
# to one worker process per visible GPU.
argslist = list(sys.argv)[1:]
num_gpus = torch.cuda.device_count()
argslist.append('--n_gpus={}'.format(num_gpus))
workers = []
job_id = time.strftime("%Y_%m_%d-%H%M%S")
argslist.append("--group_name=group_{}".format(job_id))

for i in range(num_gpus):
    argslist.append('--rank={}'.format(i))
    # Rank 0 keeps the console; the other ranks log to per-GPU files.
    stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
                                      "w")
    print(argslist)
    p = subprocess.Popen([str(sys.executable)] + argslist, stdout=stdout)
    workers.append(p)
    # Drop the --rank argument before building the next worker's command line.
    argslist = argslist[:-1]

for p in workers:
    p.wait()
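
Launch note (illustrative, not part of the uploaded files): multiproc.py re-launches whatever script and flags follow it once per visible GPU, appending --n_gpus, --group_name and a per-process --rank, so the training entry point is expected to accept those three flags. Assuming a training script named train.py, a run would look roughly like

    python multiproc.py train.py --output_directory=outdir --log_directory=logdir

where the flags after train.py are hypothetical. Worker 0 writes to the console; the remaining workers log to logs/<job_id>_GPU_<rank>.log, so the logs/ directory must exist before launching.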