Guanzheng committed
Commit 5d8ca76 · 1 Parent(s): 75ec811

Update modeling_llama.py

Files changed (1)
  1. modeling_llama.py +28 -47
modeling_llama.py CHANGED
@@ -311,60 +311,41 @@ class LlamaAttention(nn.Module):
             query_states = query_states * log_n
 
 
-        if query_states.shape[-2] == 1 or query_states.shape[-2] != key_states.shape[-2] and not use_flashattn:
-            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
-                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                    f" {attn_weights.size()}"
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
+            attn_weights = attn_weights + attention_mask
+            attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
 
-            if attention_mask is not None:
-                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                    raise ValueError(
-                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                    )
-                attn_weights = attn_weights + attention_mask
-                attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
 
-            # upcast attention to fp32
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-            attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
 
-            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-                raise ValueError(
-                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                    f" {attn_output.size()}"
-                )
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
-            attn_output = attn_output.transpose(1, 2)
-            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-            attn_output = self.o_proj(attn_output)
-
-            if not output_attentions:
-                attn_weights = None
-
-            return attn_output, attn_weights, past_key_value
-        # use flash attention
-        elif past_key_value is not None:
-            from flash_attn.flash_attn_interface import flash_attn_with_kvcache
-            output = flash_attn_with_kvcache(
-                query_states.transpose(1, 2),
-                key_states.transpose(1, 2),
-                value_states.transpose(1, 2),
-                cache_seqlens=kv_seq_len,
-                causal=True,
-            )
-            attn_output = self.o_proj(rearrange(output, "b s h d -> b s (h d)"))
-        else:
-            qkv = torch.stack(
-                [query_states, key_states, value_states], dim=2
-            )  # [bsz, nh, 3, q_len, hd]
-            qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
-            attn_output = self.flash_attn_forward(qkv)
-        return attn_output, None, past_key_value
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
 
 
 class LlamaDecoderLayer(nn.Module):
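
For reference, below is a minimal, self-contained sketch of the eager attention path that the updated hunk keeps (the `+` lines), run on toy tensors. The dimension names (bsz, num_heads, q_len, kv_seq_len, head_dim) follow the diff, but the randomly initialized tensors and the hand-built causal mask are illustrative assumptions, not part of the module itself.

# Sketch of the eager attention path retained by this commit (toy shapes, not the actual module).
import math

import torch
import torch.nn as nn

bsz, num_heads, q_len, kv_seq_len, head_dim = 1, 4, 8, 8, 16
query_states = torch.randn(bsz, num_heads, q_len, head_dim)
key_states = torch.randn(bsz, num_heads, kv_seq_len, head_dim)
value_states = torch.randn(bsz, num_heads, kv_seq_len, head_dim)

# Illustrative causal mask: 0 on visible positions, a large negative value on future ones.
attention_mask = torch.triu(
    torch.full((q_len, kv_seq_len), torch.finfo(torch.float32).min), diagonal=1
)[None, None, :, :]

attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
# Upcast to fp32 for the softmax, then cast back, as in the diff.
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)  # [bsz, num_heads, q_len, head_dim]
attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, num_heads * head_dim)
print(attn_output.shape)  # torch.Size([1, 8, 64])

One consequence of dropping the flash-attention branches in favor of this single path is that the layer always materializes the full [bsz, num_heads, q_len, kv_seq_len] score matrix, so attention memory grows quadratically with sequence length.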