crumb committed on
Commit 5cb957c · verified · 1 Parent(s): 8224b23

Update modeling_gpt2l.py

Files changed (1)
  1. modeling_gpt2l.py +2 -2
modeling_gpt2l.py CHANGED

@@ -169,11 +169,11 @@ class Attention(nn.Module):
         query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
 
         query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
+        key = self.split_heads(key) # Dude what? @ the k=True
         value = self.split_heads(value)
         if layer_past is not None:
             past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below
-            key = torch.cat((past_key, key), dim=-1)
+            key = torch.cat((past_key, key), dim=-2) # this was dim=-1??? I'm trying to patch in flash attention and this is giving me TROUBLE
             value = torch.cat((past_value, value), dim=-2)
 
         if use_cache is True:
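
For context, a minimal sketch of why the two changes go together. It assumes the upstream GPT-2-style split_heads, where k=True permutes keys to (batch, head, head_dim, seq); the shapes below are illustrative only. Dropping k=True leaves keys in the same (batch, head, seq, head_dim) layout as queries and values, which is the layout flash-attention kernels expect, so the KV-cache concatenation has to move from the last dimension to the sequence dimension at dim=-2.

import torch

# Illustrative shapes: batch=1, n_head=2, seq=4, head_dim=8 (hypothetical values).

# Old layout: split_heads(key, k=True) permuted keys to (batch, head, head_dim, seq),
# so the KV cache grew along the last dimension, dim=-1.
old_key = torch.randn(1, 2, 8, 4)
old_cache = torch.cat((old_key, old_key), dim=-1)  # -> (1, 2, 8, 8), seq doubled

# New layout: split_heads(key) keeps keys as (batch, head, seq, head_dim),
# matching query/value, so the cache now grows along the seq dimension, dim=-2.
new_key = torch.randn(1, 2, 4, 8)
new_cache = torch.cat((new_key, new_key), dim=-2)  # -> (1, 2, 8, 8), seq doubled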