Update modeling_quiet.py
modeling_quiet.py CHANGED: +2 -11
@@ -1693,8 +1693,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 )
 
                 prev_hidden_states = hidden_states
-                print(f"1696 Hidden states contains NaN: {torch.isnan(hidden_states).any().item()}")
-
                 hidden_states = outputs[0]
                 prev_rm_logits = rm_logits  # for policy gradient
                 prev_rm_tokens = cur_rm_tokens  # for policy gradient
@@ -1818,12 +1816,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 if not attempted or self.comparison_mode:
                     rm_hidden_states = hidden_states
                     # print("Magnitude of RM hidden states before RM head", rm_hidden_states.norm())
-                    print(f"RM hidden states contains NaN: {torch.isnan(rm_hidden_states).any().item()}")
-
                     rm_logits = apply_head(self.lm_head, rm_hidden_states, detach=self.optimize_lm_head_only_at_start)
-
-                    print(f"RM logits contains NaN: {torch.isnan(rm_logits).any().item()}")
-
 
                     # don't allow it to predict the thinking token
                     if self.tokenizer_has_start_thought_token:
@@ -1885,11 +1878,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
 
                 if not contains_thought:
                     with torch.set_grad_enabled(not self.train_only_thinking_embedding):
-                        print(f"Probabilities_2d contains NaN: {torch.isnan(probabilities_2d).any().item()}")
-
                         inputs_embeds = probabilities_2d @ (self.model.embed_tokens.weight.to(probabilities.device).to(probabilities.dtype))
-                        print(f"Inputs_embeds contains NaN: {torch.isnan(inputs_embeds).any().item()}")
-
                 else:
                     thought_id = self.start_token_id if contains_start else self.end_token_id
                     cur_thought_embedding = start_embedding if contains_start else end_embedding
@@ -1926,6 +1915,8 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 else:
                     loss_logits = logits
                     shift_idx = 1 + max(0, ahead_idx - (self.n_ahead - 1))
+                import pdb; pdb.set_trace()
+
                 # print("initial_loss_logits contains NaN:", torch.isnan(initial_loss_logits).any().item())
                 # print("logits contains NaN:", torch.isnan(logits).any().item())
                 # print("loss_logits contains NaN:", torch.isnan(loss_logits).any().item())
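The removed lines were ad-hoc print-based NaN probes on `hidden_states`, `rm_hidden_states`, `rm_logits`, `probabilities_2d`, and `inputs_embeds`, while the added lines drop into `pdb` unconditionally just before the loss logits are computed. A minimal sketch of a reusable alternative is below, assuming one would rather break only when a NaN actually appears; the helper name `break_on_nan` and its signature are illustrative assumptions, not part of this commit.

```python
import pdb

import torch


def break_on_nan(name: str, tensor: torch.Tensor, use_pdb: bool = True) -> bool:
    """Hypothetical debugging helper: report whether `tensor` contains NaN and
    optionally drop into pdb only in that case, instead of printing every step."""
    has_nan = bool(torch.isnan(tensor).any().item())
    if has_nan:
        print(f"{name} contains NaN (shape={tuple(tensor.shape)}, dtype={tensor.dtype})")
        if use_pdb:
            pdb.set_trace()  # inspect `tensor` and the surrounding locals interactively
    return has_nan


# Example usage at the spots this commit touches (tensor names taken from the diff):
# break_on_nan("hidden_states", hidden_states)
# break_on_nan("rm_logits", rm_logits)
# break_on_nan("inputs_embeds", inputs_embeds)
```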