Delete loss_scaler.py
loss_scaler.py +0 -79
loss_scaler.py
DELETED
@@ -1,79 +0,0 @@
-
-
-class LossScaler:
-
-    def __init__(self, scale=1):
-        self.cur_scale = scale
-
-    # `params` is a list / generator of torch.Variable
-    def has_overflow(self, params):
-        return False
-
-    # `x` is a torch.Tensor
-    def _has_inf_or_nan(x):
-        return False
-
-    # `overflow` is boolean indicating whether we overflowed in gradient
-    def update_scale(self, overflow):
-        pass
-
-    @property
-    def loss_scale(self):
-        return self.cur_scale
-
-    def scale_gradient(self, module, grad_in, grad_out):
-        return tuple(self.loss_scale * g for g in grad_in)
-
-    def backward(self, loss):
-        scaled_loss = loss*self.loss_scale
-        scaled_loss.backward()
-
-
-class DynamicLossScaler:
-
-    def __init__(self,
-                 init_scale=2**32,
-                 scale_factor=2.,
-                 scale_window=1000):
-        self.cur_scale = init_scale
-        self.cur_iter = 0
-        self.last_overflow_iter = -1
-        self.scale_factor = scale_factor
-        self.scale_window = scale_window
-
-    # `params` is a list / generator of torch.Variable
-    def has_overflow(self, params):
-        for p in params:
-            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
-                return True
-
-        return False
-
-    # `x` is a torch.Tensor
-    def _has_inf_or_nan(x):
-        cpu_sum = float(x.float().sum())
-        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
-            return True
-        return False
-
-    # `overflow` is boolean indicating whether we overflowed in gradient
-    def update_scale(self, overflow):
-        if overflow:
-            #self.cur_scale /= self.scale_factor
-            self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
-            self.last_overflow_iter = self.cur_iter
-        else:
-            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
-                self.cur_scale *= self.scale_factor
-            # self.cur_scale = 1
-        self.cur_iter += 1
-
-    @property
-    def loss_scale(self):
-        return self.cur_scale
-
-    def scale_gradient(self, module, grad_in, grad_out):
-        return tuple(self.loss_scale * g for g in grad_in)
-
-    def backward(self, loss):
-        scaled_loss = loss*self.loss_scale
-        scaled_loss.backward()
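For context, the deleted DynamicLossScaler implements the standard dynamic loss-scaling recipe for fp16 training: the loss is multiplied by cur_scale before backward so small gradients survive in half precision; if any gradient comes back as inf/NaN the step is skipped and the scale is divided by scale_factor (never below 1), and after scale_window consecutive overflow-free iterations the scale is multiplied by scale_factor again. Below is a minimal sketch of how such a scaler is typically driven from a training loop. The model, optimizer, and data are hypothetical stand-ins, the import path is assumed from the deleted file name, and gradients are unscaled in place rather than copied to fp32 master weights.

    import torch
    import torch.nn.functional as F

    from loss_scaler import DynamicLossScaler  # module path assumed from the deleted file

    # Hypothetical model, optimizer, and data; only the scaler interaction matters here.
    model = torch.nn.Linear(16, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    scaler = DynamicLossScaler(init_scale=2**16, scale_factor=2., scale_window=1000)

    for step in range(1000):
        inputs, targets = torch.randn(8, 16), torch.randn(8, 1)

        optimizer.zero_grad()
        loss = F.mse_loss(model(inputs), targets)

        # Backpropagate loss * loss_scale so fp16 gradients stay above underflow.
        scaler.backward(loss)

        # If any gradient is inf/NaN, skip the optimizer step for this iteration.
        overflow = scaler.has_overflow(model.parameters())
        if not overflow:
            # Undo the scaling before the optimizer consumes the gradients.
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.data.div_(scaler.loss_scale)
            optimizer.step()

        # Shrink the scale on overflow, grow it after scale_window clean steps.
        scaler.update_scale(overflow)

Note that update_scale is called only after the gradients have been unscaled and applied, so the division always uses the same loss_scale that was used in backward rather than a scale that was just adjusted.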