shivanandmn committed
Commit e9478da · verified · 1 Parent(s): bcbb9e6

Model save

Files changed (3)
  1. README.md +21 -21
  2. model.safetensors +1 -1
  3. modeling_parallel_gpt2.py +222 -1
README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 3.1861
-- Accuracy: 0.4193
-- Perplexity: 24.1930
-- Bleu: 0.1440
+- Loss: 3.1864
+- Accuracy: 0.4195
+- Perplexity: 24.2005
+- Bleu: 0.1476
 
 ## Model description
 
@@ -52,23 +52,23 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Accuracy | Perplexity | Bleu |
 |:-------------:|:------:|:----:|:---------------:|:--------:|:----------:|:------:|
-| 6.0438 | 0.2806 | 500 | 5.9200 | 0.1897 | 372.4009 | 0.0359 |
-| 5.0422 | 0.5612 | 1000 | 4.8934 | 0.2636 | 133.4091 | 0.0610 |
-| 4.3494 | 0.8418 | 1500 | 4.2389 | 0.3183 | 69.3337 | 0.0833 |
-| 3.9486 | 1.1223 | 2000 | 3.8856 | 0.3521 | 48.6953 | 0.1037 |
-| 3.7605 | 1.4029 | 2500 | 3.7143 | 0.3671 | 41.0301 | 0.1206 |
-| 3.6544 | 1.6835 | 3000 | 3.5898 | 0.3781 | 36.2282 | 0.1332 |
-| 3.5527 | 1.9641 | 3500 | 3.5051 | 0.3862 | 33.2836 | 0.1349 |
-| 3.4346 | 2.2447 | 4000 | 3.4410 | 0.3919 | 31.2181 | 0.1335 |
-| 3.374 | 2.5253 | 4500 | 3.3867 | 0.3972 | 29.5672 | 0.1354 |
-| 3.3442 | 2.8058 | 5000 | 3.3410 | 0.4017 | 28.2468 | 0.1405 |
-| 3.2251 | 3.0864 | 5500 | 3.3072 | 0.4055 | 27.3093 | 0.1404 |
-| 3.2187 | 3.3670 | 6000 | 3.2781 | 0.4088 | 26.5242 | 0.1401 |
-| 3.1975 | 3.6476 | 6500 | 3.2494 | 0.4118 | 25.7753 | 0.1433 |
-| 3.172 | 3.9282 | 7000 | 3.2276 | 0.4142 | 25.2178 | 0.1445 |
-| 3.1055 | 4.2088 | 7500 | 3.2109 | 0.4163 | 24.8014 | 0.1447 |
-| 3.0676 | 4.4893 | 8000 | 3.1977 | 0.4178 | 24.4763 | 0.1453 |
-| 3.0779 | 4.7699 | 8500 | 3.1861 | 0.4193 | 24.1930 | 0.1440 |
+| 6.0443 | 0.2806 | 500 | 5.9164 | 0.1901 | 371.0844 | 0.0350 |
+| 5.0429 | 0.5612 | 1000 | 4.8947 | 0.2638 | 133.5839 | 0.0647 |
+| 4.3531 | 0.8418 | 1500 | 4.2426 | 0.3176 | 69.5891 | 0.0829 |
+| 3.9503 | 1.1223 | 2000 | 3.8874 | 0.3517 | 48.7842 | 0.1050 |
+| 3.7613 | 1.4029 | 2500 | 3.7124 | 0.3672 | 40.9504 | 0.1211 |
+| 3.6548 | 1.6835 | 3000 | 3.5911 | 0.3780 | 36.2753 | 0.1308 |
+| 3.5531 | 1.9641 | 3500 | 3.5068 | 0.3860 | 33.3428 | 0.1340 |
+| 3.4344 | 2.2447 | 4000 | 3.4411 | 0.3920 | 31.2224 | 0.1356 |
+| 3.3743 | 2.5253 | 4500 | 3.3875 | 0.3972 | 29.5917 | 0.1389 |
+| 3.3443 | 2.8058 | 5000 | 3.3429 | 0.4016 | 28.3017 | 0.1373 |
+| 3.225 | 3.0864 | 5500 | 3.3080 | 0.4055 | 27.3310 | 0.1419 |
+| 3.2185 | 3.3670 | 6000 | 3.2781 | 0.4090 | 26.5258 | 0.1463 |
+| 3.1972 | 3.6476 | 6500 | 3.2500 | 0.4121 | 25.7899 | 0.1453 |
+| 3.1719 | 3.9282 | 7000 | 3.2268 | 0.4144 | 25.1990 | 0.1465 |
+| 3.1052 | 4.2088 | 7500 | 3.2109 | 0.4162 | 24.8018 | 0.1472 |
+| 3.0672 | 4.4893 | 8000 | 3.1978 | 0.4179 | 24.4788 | 0.1469 |
+| 3.0773 | 4.7699 | 8500 | 3.1864 | 0.4195 | 24.2005 | 0.1476 |
 
 
 ### Framework versions
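
The updated metrics are self-consistent: the reported Perplexity is simply the exponential of the validation loss. A minimal check (not part of the repo), using only the final values from the table above:

```python
# Self-consistency check: perplexity = exp(validation loss).
import math

final_val_loss = 3.1864  # final validation loss from the table above
print(math.exp(final_val_loss))  # ~24.20, matching the reported perplexity of 24.2005 (loss is rounded)
```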
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b856df68c6b22ef948d2a23fc33da1b4184146dc6e62d5ecec96c9501811235a
+oid sha256:12bcf19c73feb91c89b081e737e677739d1c08d1066a1832f28d0d205e67e3f6
 size 1419322880
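
This entry is a Git LFS pointer: `oid sha256:` is the SHA-256 digest of the actual weights file and `size` is its byte count, so this commit changes the weights while the file size stays identical. A small sketch (assuming a locally downloaded `model.safetensors`, which is not part of this page) to verify a download against the new pointer:

```python
# Verify a downloaded weights file against the Git LFS pointer above.
import hashlib
import os

path = "model.safetensors"  # hypothetical local copy of the LFS object
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

print(digest.hexdigest() == "12bcf19c73feb91c89b081e737e677739d1c08d1066a1832f28d0d205e67e3f6")
print(os.path.getsize(path) == 1419322880)
```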
modeling_parallel_gpt2.py CHANGED
@@ -1,4 +1,3 @@
-
 """PyTorch OpenAI GPT-2 model modified to support parallel-gpt2, code copied from Huggingface"""
 
 
@@ -274,6 +273,7 @@ class ParallelGPT2Model(ParallelGPT2PretrainedModel):
                     use_cache,
                     output_attentions,
                 )
+                # outputs_right = outputs_left
             else:
                 outputs_left = block_left(
                     hidden_states,
@@ -295,6 +295,7 @@ class ParallelGPT2Model(ParallelGPT2PretrainedModel):
                     use_cache=use_cache,
                     output_attentions=output_attentions,
                 )
+            # outputs_right = outputs_left
             if self.config.bottleneck_method=="concat":
                 hidden_states = torch.cat((outputs_left[0], outputs_right[0]), dim=-1)
                 hidden_states = self.bottleneck(hidden_states)
@@ -341,6 +342,226 @@ class ParallelGPT2Model(ParallelGPT2PretrainedModel):
         )
 
 
+
+    def forward_test(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = tuple([None] * len(self.h))
+        else:
+            past_length = past_key_values[0][0].size(-2)
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        # Attention mask.
+        _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None
+        attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None
+        if self._attn_implementation == "flash_attention_2":
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif _use_sdpa:
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask=attention_mask,
+                input_shape=(batch_size, input_shape[-1]),
+                inputs_embeds=inputs_embeds,
+                past_key_values_length=past_length,
+            )
+        else:
+            if attention_mask is not None:
+                # We create a 3D attention mask from a 2D tensor mask.
+                # Sizes are [batch_size, 1, 1, to_seq_length]
+                # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+                # this attention mask is more simple than the triangular masking of causal attention
+                # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+                attention_mask = attention_mask[:, None, None, :]
+
+                # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+                # masked positions, this operation will create a tensor which is 0.0 for
+                # positions we want to attend and the dtype's smallest value for masked positions.
+                # Since we are adding it to the raw scores before the softmax, this is
+                # effectively the same as removing these entirely.
+                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+                attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            if _use_sdpa:
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    mask=encoder_attention_mask, dtype=inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+            elif not self._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        presents = () if use_cache else None
+        self_attentions = () if output_attentions else None
+        cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i in range(0, len(self.h), 2):
+            block_left, layer_past_left = self.h[i], past_key_values[i]
+            block_right, layer_past_right = self.h[i+1], past_key_values[i+1]
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure layer_past is on same device as hidden_states (might not be correct)
+                if layer_past_left is not None:
+                    layer_past_left = tuple(past_state.to(hidden_states.device) for past_state in layer_past_left)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            import copy
+            avg_block = copy.deepcopy(block_left)
+            state_left = block_left.state_dict()
+            state_right = block_right.state_dict()
+            new_state = {k: torch.min(state_left[k], state_right[k]) for k in state_left}
+            # new_state = {k: (state_left[k] + state_right[k]) for k in state_left}
+            avg_block.load_state_dict(new_state)
+
+            if self.gradient_checkpointing and self.training:
+                outputs = self._gradient_checkpointing_func(
+                    avg_block.__call__,
+                    hidden_states,
+                    None,
+                    attention_mask,
+                    head_mask[i],
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    use_cache,
+                    output_attentions,
+                )
+            else:
+                outputs = avg_block(
+                    hidden_states,
+                    layer_past=layer_past_left,
+                    attention_mask=attention_mask,
+                    head_mask=head_mask[i],
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+            # outputs_right = outputs_left
+            if self.config.bottleneck_method=="concat":
+                hidden_states = torch.cat((outputs[0], outputs[0]), dim=-1)
+                hidden_states = self.bottleneck(hidden_states)
+            elif self.config.bottleneck_method=="add":
+                hidden_states = (outputs[0] + outputs[0])  ## taking add
+            elif self.config.bottleneck_method=="mean":
+                hidden_states = (outputs[0] + outputs[0]) / 2  ## taking mean
+            if use_cache is True:
+                presents = presents + (outputs[1],)
+
+            if output_attentions:
+                self_attentions = self_attentions + (outputs[2 if use_cache else 1],)
+                if self.config.add_cross_attention:
+                    cross_attentions = cross_attentions + (outputs[3 if use_cache else 2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, presents, all_hidden_states, self_attentions, cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=self_attentions,
+            cross_attentions=cross_attentions,
+        )
+
+
+
 class ParallelGPT2LMHeadModel(ParallelGPT2PretrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
 
567