Update modeling_llama.py
modeling_llama.py  +77 -52
CHANGED
@@ -30,8 +30,8 @@ from transformers.activations import ACT2FN
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
-from configuration_clex import CLEXLlamaConfig
-from clex_layer import LlamaCLEXScalingRotaryEmbedding
+from .configuration_clex import CLEXLlamaConfig
+from .clex_layer import LlamaCLEXScalingRotaryEmbedding
 from einops import rearrange
 import importlib.metadata
 import importlib.util
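Note: switching to package-relative imports (`from .configuration_clex import ...`) is what lets Python resolve the sibling configuration_clex.py and clex_layer.py files when this module is loaded as remote code rather than run from its own directory. A minimal usage sketch; the repo id below is a placeholder, not taken from this commit:

from transformers import AutoConfig, AutoModelForCausalLM

# "ORG/clex-llama-checkpoint" is illustrative; substitute the repo that ships
# modeling_llama.py, configuration_clex.py and clex_layer.py together.
config = AutoConfig.from_pretrained("ORG/clex-llama-checkpoint", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "ORG/clex-llama-checkpoint",
    config=config,
    trust_remote_code=True,  # allows the custom CLEXLlamaConfig / CLEX layers to be imported
)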
@@ -60,14 +60,10 @@ def is_flash_attn_available():
         return False
 
     # Let's add an extra check to see if cuda is available
-    import torch
 
     return _is_package_available("flash_attn") and torch.cuda.is_available()
 
-
-from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func, flash_attn_qkvpacked_func, flash_attn_with_kvcache
-# from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
-from flash_attn.bert_padding import unpad_input, pad_input
+
 
 
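Note: for reference, a self-contained sketch of the availability check this hunk converges on: flash-attn is only reported as usable when the package is installed and a CUDA device is present, so CPU-only hosts fall back to the eager attention path. The helper below mirrors the names in the diff, but its body is an assumption where the original lines were not captured:

import importlib.util

import torch

def _is_package_available(name: str) -> bool:
    # The package counts as available if an import spec can be found for it.
    return importlib.util.find_spec(name) is not None

def is_flash_attn_available() -> bool:
    # flash-attn kernels run only on CUDA, so a missing GPU means "not available"
    # and callers should use the plain matmul/softmax attention instead.
    return _is_package_available("flash_attn") and torch.cuda.is_available()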
@@ -170,14 +166,17 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-def apply_rotary_pos_emb(q, k, cos, sin,
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, key_position_ids):
     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
-
-
-
-
+    cos_q = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin_q = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+
+    cos_k = cos[key_position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin_k = sin[key_position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q * cos_q) + (rotate_half(q) * sin_q)
+    k_embed = (k * cos_k) + (rotate_half(k) * sin_k)
     return q_embed, k_embed
 
 
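Note: the rewritten apply_rotary_pos_emb gathers cos/sin separately for queries and keys, which is what allows a single new query token (position_ids of length 1) to be rotated against the positions of every cached key. A standalone shape sketch with dummy tensors; sizes are illustrative, not from the diff:

import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

bs, n_heads, q_len, kv_len, head_dim = 2, 4, 1, 8, 16
cos = torch.randn(1, 1, kv_len, head_dim)  # rotary cache, [1, 1, seq_len, dim]
sin = torch.randn(1, 1, kv_len, head_dim)
q = torch.randn(bs, n_heads, q_len, head_dim)
k = torch.randn(bs, n_heads, kv_len, head_dim)
position_ids = torch.full((bs, q_len), kv_len - 1)    # position of the new query token
key_position_ids = torch.arange(kv_len).unsqueeze(0)  # positions of all cached keys

cos, sin = cos.squeeze(1).squeeze(0), sin.squeeze(1).squeeze(0)                         # [seq_len, dim]
cos_q, sin_q = cos[position_ids].unsqueeze(1), sin[position_ids].unsqueeze(1)           # [bs, 1, q_len, dim]
cos_k, sin_k = cos[key_position_ids].unsqueeze(1), sin[key_position_ids].unsqueeze(1)   # [1, 1, kv_len, dim]
q_embed = (q * cos_q) + (rotate_half(q) * sin_q)
k_embed = (k * cos_k) + (rotate_half(k) * sin_k)
assert q_embed.shape == q.shape and k_embed.shape == k.shape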
@@ -232,7 +231,10 @@ class LlamaAttention(nn.Module):
 
         attention_mask: [bsz, q_len]
         """
-
+        if is_flash_attn_available():
+            from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func, flash_attn_qkvpacked_func, flash_attn_with_kvcache
+            # from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+            from flash_attn.bert_padding import unpad_input, pad_input
         bsz, q_len, *_ = qkv.size()
 
         if key_padding_mask is None:
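Note: pulling the flash-attn imports inside the method (guarded by is_flash_attn_available()) means that merely importing modeling_llama.py no longer requires flash-attn to be installed; the dependency is only touched when that code path actually runs. A generic sketch of the same lazy-import pattern; the wrapper name is illustrative, not from this file:

def _packed_flash_attention(qkv):
    # Imported lazily so CPU-only environments can still import the module
    # and use the eager attention path.
    try:
        from flash_attn.flash_attn_interface import flash_attn_qkvpacked_func
    except ImportError as exc:
        raise RuntimeError("flash-attn must be installed for this code path") from exc
    # qkv: [bsz, seqlen, 3, n_heads, head_dim], fp16/bf16 tensors on a CUDA device
    return flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=True)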
@@ -283,63 +285,86 @@ class LlamaAttention(nn.Module):
 
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
-
+            cache_key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        else:
+            cache_key_states = key_states
 
         if pack_cos_sin is not None:
             cos, sin = pack_cos_sin.to(query_states.device)
         else:
             cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         key_position_ids = torch.arange(kv_seq_len, dtype=torch.long, device=position_ids.device).unsqueeze(0).view(-1, kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states,
+        query_states, key_states = apply_rotary_pos_emb(query_states, cache_key_states, cos, sin, position_ids, key_position_ids)
 
         if past_key_value is not None:
             # reuse k, v, self_attention
+            # key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
 
-        past_key_value = (
+        past_key_value = (cache_key_states, value_states) if use_cache else None
 
-
+        use_flashattn = self.config.use_flashattn and is_flash_attn_available()
 
         if self.log_scale:
             log_n = torch.log(torch.tensor(kv_seq_len*1.0)).to(query_states.device, dtype=query_states.dtype) / \
                 torch.log(torch.tensor(self.config.max_position_embeddings)).to(query_states.device, dtype=query_states.dtype)
             query_states = query_states * log_n
 
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
+        if query_states.shape[-2] == 1 or query_states.shape[-2] != key_states.shape[-2] and not use_flashattn:
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
                 raise ValueError(
-                    f"Attention
+                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                    f" {attn_weights.size()}"
                 )
-            attn_weights = attn_weights + attention_mask
-            attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
-
-        # upcast attention to fp32
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-        attn_output = torch.matmul(attn_weights, value_states)
 
-
-
-
-
-
+            if attention_mask is not None:
+                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                    raise ValueError(
+                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    )
+                attn_weights = attn_weights + attention_mask
+                attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
 
-
-
+            # upcast attention to fp32
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+            attn_output = torch.matmul(attn_weights, value_states)
 
-
-
-
-
+            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+                raise ValueError(
+                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                    f" {attn_output.size()}"
+                )
 
-
+            attn_output = attn_output.transpose(1, 2)
+            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+            attn_output = self.o_proj(attn_output)
+
+            if not output_attentions:
+                attn_weights = None
+
+            return attn_output, attn_weights, past_key_value
+        # use flash attention
+        elif past_key_value is not None:
+            from flash_attn.flash_attn_interface import flash_attn_with_kvcache
+            output = flash_attn_with_kvcache(
+                query_states.transpose(1, 2),
+                key_states.transpose(1, 2),
+                value_states.transpose(1, 2),
+                cache_seqlens=kv_seq_len,
+                causal=True,
+            )
+            attn_output = self.o_proj(rearrange(output, "b s h d -> b s (h d)"))
+        else:
+            qkv = torch.stack(
+                [query_states, key_states, value_states], dim=2
+            )  # [bsz, nh, 3, q_len, hd]
+            qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+            attn_output = self.flash_attn_forward(qkv)
+        return attn_output, None, past_key_value
 
 
 class LlamaDecoderLayer(nn.Module):
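Note: the rewritten forward now dispatches three ways: a plain matmul/softmax path (used for single-token queries or when flash-attn is unavailable), flash_attn_with_kvcache when decoding against an existing key/value cache, and the packed-qkv flash kernel otherwise. One detail worth spelling out in the new guard on query_states.shape[-2]: Python's `or` binds more loosely than `and`, so the condition is equivalent to the parenthesized form in this small sketch (not code from the file):

def takes_eager_path(q_len: int, kv_len: int, use_flashattn: bool) -> bool:
    # Same parse as:
    #   query_states.shape[-2] == 1 or query_states.shape[-2] != key_states.shape[-2] and not use_flashattn
    return q_len == 1 or (q_len != kv_len and not use_flashattn)

assert takes_eager_path(1, 9, True)      # single-token decode always uses the eager path
assert not takes_eager_path(9, 9, True)  # equal query/key lengths with flash-attn enabled fall through to the flash branches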
@@ -629,14 +654,14 @@ class LlamaModel(LlamaPreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
         # embed positions
-
-
-
-
-
-
-
-        attention_mask = None
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+        # attention_mask = None
 
 
         hidden_states = inputs_embeds
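Note: the model now builds and expands a real attention mask again instead of forcing attention_mask = None. For intuition, a simplified stand-in for what the prepared decoder mask looks like; this is not the implementation of _prepare_decoder_attention_mask, just an assumed illustration of its output shape and semantics:

import torch

def build_additive_causal_mask(attention_mask: torch.Tensor, q_len: int, past_len: int, dtype=torch.float32):
    # attention_mask: [bsz, past_len + q_len], 1 for real tokens, 0 for padding.
    bsz, src_len = attention_mask.shape
    min_val = torch.finfo(dtype).min
    # Causal part: query position i may look at cached positions and at j <= i.
    mask = torch.full((q_len, src_len), min_val, dtype=dtype)
    mask = torch.triu(mask, diagonal=past_len + 1)
    mask = mask[None, None, :, :].expand(bsz, 1, q_len, src_len)
    # Padding part: block columns that correspond to padded tokens.
    mask = mask.masked_fill(attention_mask[:, None, None, :] == 0, min_val)
    return mask  # [bsz, 1, q_len, past_len + q_len], added to the attention scores

mask = build_additive_causal_mask(torch.ones(2, 6), q_len=4, past_len=2)
assert mask.shape == (2, 1, 4, 6)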