Model save

Files changed:
- README.md +21 -21
- model.safetensors +1 -1
- modeling_parallel_gpt2.py +222 -1
README.md
CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 3.
-- Accuracy: 0.
-- Perplexity: 24.
-- Bleu: 0.
+- Loss: 3.1864
+- Accuracy: 0.4195
+- Perplexity: 24.2005
+- Bleu: 0.1476
 
 ## Model description
 
@@ -52,23 +52,23 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Accuracy | Perplexity | Bleu |
 |:-------------:|:------:|:----:|:---------------:|:--------:|:----------:|:------:|
-| 6.
-| 5.
-| 4.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
-| 3.
+| 6.0443        | 0.2806 | 500  | 5.9164          | 0.1901   | 371.0844   | 0.0350 |
+| 5.0429        | 0.5612 | 1000 | 4.8947          | 0.2638   | 133.5839   | 0.0647 |
+| 4.3531        | 0.8418 | 1500 | 4.2426          | 0.3176   | 69.5891    | 0.0829 |
+| 3.9503        | 1.1223 | 2000 | 3.8874          | 0.3517   | 48.7842    | 0.1050 |
+| 3.7613        | 1.4029 | 2500 | 3.7124          | 0.3672   | 40.9504    | 0.1211 |
+| 3.6548        | 1.6835 | 3000 | 3.5911          | 0.3780   | 36.2753    | 0.1308 |
+| 3.5531        | 1.9641 | 3500 | 3.5068          | 0.3860   | 33.3428    | 0.1340 |
+| 3.4344        | 2.2447 | 4000 | 3.4411          | 0.3920   | 31.2224    | 0.1356 |
+| 3.3743        | 2.5253 | 4500 | 3.3875          | 0.3972   | 29.5917    | 0.1389 |
+| 3.3443        | 2.8058 | 5000 | 3.3429          | 0.4016   | 28.3017    | 0.1373 |
+| 3.225         | 3.0864 | 5500 | 3.3080          | 0.4055   | 27.3310    | 0.1419 |
+| 3.2185        | 3.3670 | 6000 | 3.2781          | 0.4090   | 26.5258    | 0.1463 |
+| 3.1972        | 3.6476 | 6500 | 3.2500          | 0.4121   | 25.7899    | 0.1453 |
+| 3.1719        | 3.9282 | 7000 | 3.2268          | 0.4144   | 25.1990    | 0.1465 |
+| 3.1052        | 4.2088 | 7500 | 3.2109          | 0.4162   | 24.8018    | 0.1472 |
+| 3.0672        | 4.4893 | 8000 | 3.1978          | 0.4179   | 24.4788    | 0.1469 |
+| 3.0773        | 4.7699 | 8500 | 3.1864          | 0.4195   | 24.2005    | 0.1476 |
 
 
 ### Framework versions
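The reported Perplexity is consistent with the Loss above: perplexity is the exponential of the mean cross-entropy loss, and exp(3.1864) ≈ 24.20. A minimal sketch of that check (plain Python, assuming the reported perplexity is computed this way):

```python
import math

# Perplexity as the exponential of the evaluation (cross-entropy) loss.
validation_loss = 3.1864
perplexity = math.exp(validation_loss)
print(round(perplexity, 4))  # ~24.20, matching the reported 24.2005
```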
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:12bcf19c73feb91c89b081e737e677739d1c08d1066a1832f28d0d205e67e3f6
 size 1419322880
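model.safetensors is tracked with Git LFS, so the blob committed here is only this pointer file; the actual weights are addressed by the sha256 oid above. A minimal sketch for verifying a downloaded copy against that digest (the local path is illustrative):

```python
import hashlib

# Compare the downloaded weights against the oid recorded in the LFS pointer.
EXPECTED_OID = "12bcf19c73feb91c89b081e737e677739d1c08d1066a1832f28d0d205e67e3f6"

sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:  # path to the downloaded file (illustrative)
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch with the LFS pointer"
```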
modeling_parallel_gpt2.py
CHANGED
@@ -1,4 +1,3 @@
-
 """PyTorch OpenAI GPT-2 model modified to support parallel-gpt2, code copied from Huggingface"""
 
 
@@ -274,6 +273,7 @@ class ParallelGPT2Model(ParallelGPT2PretrainedModel):
                     use_cache,
                     output_attentions,
                 )
+                # outputs_right = outputs_left
             else:
                 outputs_left = block_left(
                     hidden_states,
@@ -295,6 +295,7 @@ class ParallelGPT2Model(ParallelGPT2PretrainedModel):
                     use_cache=use_cache,
                     output_attentions=output_attentions,
                 )
+            # outputs_right = outputs_left
             if self.config.bottleneck_method=="concat":
                 hidden_states = torch.cat((outputs_left[0], outputs_right[0]), dim=-1)
                 hidden_states = self.bottleneck(hidden_states)
@@ -341,6 +342,226 @@ class ParallelGPT2Model(ParallelGPT2PretrainedModel):
         )
 
 
+
+    def forward_test(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = tuple([None] * len(self.h))
+        else:
+            past_length = past_key_values[0][0].size(-2)
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        # Attention mask.
+        _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None
+        attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None
+        if self._attn_implementation == "flash_attention_2":
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif _use_sdpa:
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask=attention_mask,
+                input_shape=(batch_size, input_shape[-1]),
+                inputs_embeds=inputs_embeds,
+                past_key_values_length=past_length,
+            )
+        else:
+            if attention_mask is not None:
+                # We create a 3D attention mask from a 2D tensor mask.
+                # Sizes are [batch_size, 1, 1, to_seq_length]
+                # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+                # this attention mask is more simple than the triangular masking of causal attention
+                # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+                attention_mask = attention_mask[:, None, None, :]
+
+                # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+                # masked positions, this operation will create a tensor which is 0.0 for
+                # positions we want to attend and the dtype's smallest value for masked positions.
+                # Since we are adding it to the raw scores before the softmax, this is
+                # effectively the same as removing these entirely.
+                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+                attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            if _use_sdpa:
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    mask=encoder_attention_mask, dtype=inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+            elif not self._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        presents = () if use_cache else None
+        self_attentions = () if output_attentions else None
+        cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i in range(0, len(self.h), 2):
+            block_left, layer_past_left = self.h[i], past_key_values[i]
+            block_right, layer_past_right = self.h[i+1], past_key_values[i+1]
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure layer_past is on same device as hidden_states (might not be correct)
+                if layer_past_left is not None:
+                    layer_past_left = tuple(past_state.to(hidden_states.device) for past_state in layer_past_left)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            import copy
+            avg_block = copy.deepcopy(block_left)
+            state_left = block_left.state_dict()
+            state_right = block_right.state_dict()
+            new_state = {k: torch.min(state_left[k], state_right[k]) for k in state_left}
+            # new_state = {k: (state_left[k] + state_right[k]) for k in state_left}
+            avg_block.load_state_dict(new_state)
+
+            if self.gradient_checkpointing and self.training:
+                outputs = self._gradient_checkpointing_func(
+                    avg_block.__call__,
+                    hidden_states,
+                    None,
+                    attention_mask,
+                    head_mask[i],
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    use_cache,
+                    output_attentions,
+                )
+            else:
+                outputs = avg_block(
+                    hidden_states,
+                    layer_past=layer_past_left,
+                    attention_mask=attention_mask,
+                    head_mask=head_mask[i],
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+            # outputs_right = outputs_left
+            if self.config.bottleneck_method=="concat":
+                hidden_states = torch.cat((outputs[0], outputs[0]), dim=-1)
+                hidden_states = self.bottleneck(hidden_states)
+            elif self.config.bottleneck_method=="add":
+                hidden_states = (outputs[0] + outputs[0])  ## taking add
+            elif self.config.bottleneck_method=="mean":
+                hidden_states = (outputs[0] + outputs[0]) / 2  ## taking mean
+            if use_cache is True:
+                presents = presents + (outputs[1],)
+
+            if output_attentions:
+                self_attentions = self_attentions + (outputs[2 if use_cache else 1],)
+                if self.config.add_cross_attention:
+                    cross_attentions = cross_attentions + (outputs[3 if use_cache else 2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, presents, all_hidden_states, self_attentions, cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=self_attentions,
+            cross_attentions=cross_attentions,
+        )
+
+
+
 class ParallelGPT2LMHeadModel(ParallelGPT2PretrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
 
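The main change in this commit is the new ParallelGPT2Model.forward_test method: instead of running each left/right block pair side by side and combining the two outputs through the configured bottleneck, it builds a single merged block per pair by taking the elementwise torch.min of the two blocks' state dicts and runs only that block. A rough usage sketch under several stated assumptions: the repo id is hypothetical, the checkpoint is loaded with trust_remote_code=True so that modeling_parallel_gpt2.py is used, and the LM head model exposes the base model as .transformer (the usual GPT-2 layout):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-username/parallel-gpt2"  # hypothetical repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True).eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    # Standard forward: left/right blocks run in parallel and are combined
    # according to config.bottleneck_method ("concat", "add" or "mean").
    regular = model.transformer(**inputs).last_hidden_state
    # forward_test (added in this commit): each left/right pair is replaced by a
    # single block whose weights are the elementwise minimum of the pair.
    merged = model.transformer.forward_test(**inputs).last_hidden_state

print((regular - merged).abs().max())  # how far the merged blocks drift from the trained model
```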