Crystalcareai committed · verified
Commit bca4d85 · 1 Parent(s): a3dfbb3

Update modeling_quiet.py

Files changed (1): modeling_quiet.py +56 -46
modeling_quiet.py CHANGED
@@ -1423,61 +1423,71 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         logits = self.lm_head(mixed_hidden_states)
         return logits
 
-    def custom_generate(
+
+    def custom_generate(model, input_ids, attention_mask, max_length, streamer=None, **kwargs):
+        # Set up some variables
+        batch_size, seq_len = input_ids.shape
+        max_length = max_length if max_length is not None else model.config.max_length
+        max_new_tokens = max_length - seq_len
+        temperature = kwargs.get("temperature", 1.0)
+
+        with torch.no_grad():
+            for cur_token_idx in range(max_new_tokens):
+                # Run a forward pass to get the logits for the next token
+                outputs = model(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    use_cache=True,
+                )
+
+                logits = outputs.logits[:, -1, :]
+
+                # Sample the next token from the logits
+                next_token_logits = logits / temperature
+                next_token_id = torch.multinomial(torch.nn.functional.softmax(next_token_logits, dim=-1), num_samples=1)
+
+                # Append the new token to the input sequence
+                input_ids = torch.cat([input_ids, next_token_id], dim=-1)
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device)], dim=-1)
+
+                # Stream the new token if a streamer is provided
+                if streamer is not None:
+                    streamer.put(next_token_id)
+
+                # Check if the end token is generated for all sequences in the batch
+                if next_token_id.eq(model.config.eos_token_id).all():
+                    break
+
+        return input_ids
+
+
+    # Add this to QuietForCausalLM forward method to support custom generate
+
+    @torch.no_grad()
+    def generate(
         self,
         input_ids,
         attention_mask=None,
-        position_ids=None,
-        past_key_values=None,
-        inputs_embeds=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        max_new_tokens=512,
-        temperature=1.1,
+        max_length=None,
         streamer=None,
         **kwargs,
     ):
+        # Prepare inputs
         batch_size, seq_len = input_ids.shape
-
-        assert past_key_values is None, "past_key_values not supported yet"
-        assert position_ids is None, "position_ids not supported yet"
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
 
-        # Generate up to max_new_tokens
-        for _ in range(max_new_tokens):
-            model_inputs = self.prepare_inputs_for_generation(
-                input_ids,
-                attention_mask=attention_mask,
-                inputs_embeds=inputs_embeds,
-                use_cache=use_cache,
-            )
-
-            outputs = self.model(**model_inputs)
-            next_token_logits = self.infer(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_values=outputs.past_key_values,
-                inputs_embeds=inputs_embeds,
-                use_cache=use_cache,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-
-            next_token_logits = next_token_logits[:, -1, :]
-            next_tokens = torch.argmax(next_token_logits, dim=-1)
-
-            input_ids = torch.cat([input_ids, next_tokens.unsqueeze(-1)], dim=-1)
-
-            if streamer is not None:
-                streamer.put(next_tokens)
+        # Call the custom generate function
+        output_ids = custom_generate(
+            self,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_length=max_length,
+            streamer=streamer,
+            **kwargs,
+        )
 
-            if next_tokens == self.tokenizer.convert_tokens_to_ids("<|/assistant|>"):
-                break
-
-        return input_ids
+        return output_ids
 
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
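
For context, here is a minimal usage sketch of the new entry point. It is not part of the commit: the checkpoint id below is a placeholder, and it assumes the model is loaded with trust_remote_code=True so that this modeling_quiet.py supplies the overriding generate method. Note that temperature reaches custom_generate through **kwargs.

# Usage sketch (hypothetical checkpoint id), not part of the commit itself.
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_id = "Crystalcareai/quiet-star"  # placeholder; substitute the real repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("The capital of France is", return_tensors="pt")
streamer = TextStreamer(tokenizer)  # the loop puts only newly sampled tokens

# max_length bounds prompt + generated tokens; temperature is read inside
# custom_generate via kwargs.get("temperature", 1.0).
output_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=64,
    streamer=streamer,
    temperature=0.9,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Two behavioral notes on the rewritten loop: greedy argmax decoding is replaced by temperature sampling (softmax over logits / temperature, then torch.multinomial), and generation now stops when every sequence in the batch emits model.config.eos_token_id rather than on the hard-coded "<|/assistant|>" token. As committed, each step re-runs the model over the full sequence and never feeds outputs.past_key_values back in, so use_cache=True has no effect and per-step cost grows with sequence length.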