jinaai
/

xlm-roberta-flash-implementation

Transformers

xlm-roberta

🇪🇺 Region: EU

Model card Files Files and versions

xet

Community

jupyterjazz committed on Sep 2, 2024

Commit

90873c4

verified ·

1 Parent(s): d8cbc92

Update rotary.py

Browse files

Files changed (1) hide show

rotary.py +17 -4

rotary.py CHANGED Viewed

@@ -493,9 +493,15 @@ class RotaryEmbedding(torch.nn.Module):
     @base.setter
     def base(self, new_base):
-        if new_base > 0:
-            self._base = float(new_base)
-            self.inv_freq = self._compute_inv_freq(device=self.inv_freq.device)
         else:
             raise ValueError("Rotary base value must be positive")
@@ -508,21 +514,27 @@ class RotaryEmbedding(torch.nn.Module):
             )
         )
-    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
         # Reset the tables if the sequence length has changed,
         # if we're on a new device (possibly due to tracing for instance),
         # or if we're switching from inference mode to training
         if (
             seqlen > self._seq_len_cached
             or self._cos_cached is None
             or self._cos_cached.device != device
             or self._cos_cached.dtype != dtype
             or (self.training and self._cos_cached.is_inference())
         ):
             self._seq_len_cached = seqlen
             # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
             # And the output of arange can be quite large, so bf16 would lose a lot of precision.
             # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
             if self.pos_idx_in_fp32:
                 t = torch.arange(seqlen, device=device, dtype=torch.float32)
                 # We want fp32 here as well since inv_freq will be multiplied with t, and the output
@@ -536,6 +548,7 @@ class RotaryEmbedding(torch.nn.Module):
             else:
                 t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                 inv_freq = self.inv_freq
             # Don't do einsum, it converts fp32 to fp16 under AMP
             # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
             freqs = torch.outer(t, inv_freq)

     @base.setter
     def base(self, new_base):
         """Set the rotary base and refresh the cached tables when it changes.

         Args:
             new_base: New base value; converted with ``float()`` before use.
                 Must be strictly positive.

         Raises:
             ValueError: If ``new_base`` is not strictly positive (NaN included).
         """
         new_base = float(new_base)
         # `not (x > 0)` rather than `x <= 0` so that NaN is rejected as well,
         # exactly as the original `new_base > 0` guard did.
         if not new_base > 0:
             raise ValueError("Rotary base value must be positive")
         # Re-assigning the current value is a valid no-op. Previously this case
         # fell into the `else` branch and raised the "must be positive" error
         # even though the value was positive.
         if new_base == self._base:
             return
         self._base = new_base
         # rotary_base_changed=True forces the cache update to recompute
         # inv_freq from the new base and rebuild the tables.
         self._update_cos_sin_cache(
             self._seq_len_cached,
             device=self.inv_freq.device,
             dtype=self._cos_cached.dtype if self._cos_cached is not None else None,
             rotary_base_changed=True,
         )
             )
         )
+    def _update_cos_sin_cache(
+        self, seqlen, device=None, dtype=None, rotary_base_changed=False
+    ):
         # Reset the tables if the sequence length has changed,
         # if we're on a new device (possibly due to tracing for instance),
         # or if we're switching from inference mode to training
+        # or if the rotary base value was changed
         if (
             seqlen > self._seq_len_cached
             or self._cos_cached is None
             or self._cos_cached.device != device
             or self._cos_cached.dtype != dtype
             or (self.training and self._cos_cached.is_inference())
+            or rotary_base_changed
         ):
             self._seq_len_cached = seqlen
             # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
             # And the output of arange can be quite large, so bf16 would lose a lot of precision.
             # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
+            if rotary_base_changed:
+                self.inv_freq = self._compute_inv_freq(device=self.inv_freq.device)
             if self.pos_idx_in_fp32:
                 t = torch.arange(seqlen, device=device, dtype=torch.float32)
                 # We want fp32 here as well since inv_freq will be multiplied with t, and the output
             else:
                 t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                 inv_freq = self.inv_freq
             # Don't do einsum, it converts fp32 to fp16 under AMP
             # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
             freqs = torch.outer(t, inv_freq)