Crystalcareai committed
Commit cbafcfb · verified · 1 Parent(s): 8d44852

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +2 -11
modeling_quiet.py CHANGED
@@ -1693,8 +1693,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
            )

            prev_hidden_states = hidden_states
-           print(f"1696 Hidden states contains NaN: {torch.isnan(hidden_states).any().item()}")
-
            hidden_states = outputs[0]
            prev_rm_logits = rm_logits # for policy gradient
            prev_rm_tokens = cur_rm_tokens # for policy gradient
@@ -1818,12 +1816,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
            if not attempted or self.comparison_mode:
                rm_hidden_states = hidden_states
                # print("Magnitude of RM hidden states before RM head", rm_hidden_states.norm())
-               print(f"RM hidden states contains NaN: {torch.isnan(rm_hidden_states).any().item()}")
-
                rm_logits = apply_head(self.lm_head, rm_hidden_states, detach=self.optimize_lm_head_only_at_start)
-
-               print(f"RM logits contains NaN: {torch.isnan(rm_logits).any().item()}")
-

                # don't allow it to predict the thinking token
                if self.tokenizer_has_start_thought_token:
@@ -1885,11 +1878,7 @@ class QuietForCausalLM(QuietPreTrainedModel):

                if not contains_thought:
                    with torch.set_grad_enabled(not self.train_only_thinking_embedding):
-                       print(f"Probabilities_2d contains NaN: {torch.isnan(probabilities_2d).any().item()}")
-
                        inputs_embeds = probabilities_2d @ (self.model.embed_tokens.weight.to(probabilities.device).to(probabilities.dtype))
-                       print(f"Inputs_embeds contains NaN: {torch.isnan(inputs_embeds).any().item()}")
-
                else:
                    thought_id = self.start_token_id if contains_start else self.end_token_id
                    cur_thought_embedding = start_embedding if contains_start else end_embedding
@@ -1926,6 +1915,8 @@ class QuietForCausalLM(QuietPreTrainedModel):
            else:
                loss_logits = logits
            shift_idx = 1 + max(0, ahead_idx - (self.n_ahead - 1))
+           import pdb; pdb.set_trace()
+
            # print("initial_loss_logits contains NaN:", torch.isnan(initial_loss_logits).any().item())
            # print("logits contains NaN:", torch.isnan(logits).any().item())
            # print("loss_logits contains NaN:", torch.isnan(loss_logits).any().item())