Spaces:

BaggerOfWords
/

MOSAIC

Sleeping

App Files Files Community

MOSAIC / mosaic.py

BaggerOfWords

Issues with ZeroGPU

4d0b859 20 days ago

raw

history blame contribute delete

15.9 kB

	from typing import List, Optional, Dict
	import numpy as np
	import torch
	import transformers
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch.nn.functional as F

	# Turn autograd off globally at import time; nothing in this module calls
	# backward(). NOTE(review): this affects every other module in the same
	# process as well - confirm that is intended.
	torch.set_grad_enabled(False)

	def apply_top_p_with_epsilon(logits: torch.Tensor, top_p: float, epsilon: float = 1e-10) -> torch.Tensor:
	    """
	    Apply top-p (nucleus) filtering to logits without removing any token.

	    Instead of setting the logits of non-selected tokens to -inf (which would
	    give them zero probability), they are set to log(epsilon), so the support
	    of the distribution stays the same.

	    Parameters:
	    logits: Tensor whose last dimension is the vocabulary, e.g.
	        (batch, seq_len, vocab_size).
	    top_p: The nucleus threshold (e.g. 0.7, 0.8, etc.). A token is kept while
	        the cumulative probability, in descending order, is <= top_p.
	    epsilon: The small value whose log replaces the logits of tokens that
	        are not selected.

	    Returns:
	    new_logits: A new tensor with the same shape and dtype as logits; the
	        input is not modified.
	    """
	    probs = F.softmax(logits, dim=-1)
	    # Rank the vocabulary by descending probability at every position.
	    sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)
	    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

	    # Mask expressed in *sorted* order: True for tokens inside the nucleus.
	    sorted_keep = cumulative_probs <= top_p
	    # Always keep the most likely token. If anything was kept, rank 0 is
	    # already True (it has the smallest cumulative sum), so this only changes
	    # rows where nothing passed the threshold.
	    sorted_keep[..., 0] = True

	    # Map the mask back to vocabulary order. Applying the sorted-order mask
	    # directly to the unsorted logits would floor the wrong tokens.
	    keep_mask = torch.zeros_like(sorted_keep).scatter(-1, sorted_indices, sorted_keep)

	    floor = torch.log(torch.tensor(epsilon, device=logits.device, dtype=logits.dtype))
	    return torch.where(keep_mask, logits, floor)

	class Mosaic(object):
	    """
	    Scores text with a per-position mixture of next-token distributions.

	    The distributions come either from several causal language models or, in
	    one-model mode, from one model warped with several top-p thresholds. The
	    mixture weights are computed by ``blahut_arimoto_torch``.
	    """

	    def __init__(
	        self,
	        model_name_or_paths: List[str],
	        use_bfloat16: bool = True,
	        max_token_observed: int = 512,
	        unigram: Optional[str] = None,
	        custom_config: Optional[List[bool]] = None,
	        stupid_mode: bool = False,
	        one_model_mode: bool = False
	    ) -> None:
	        """
	        Load every model and the shared tokenizer.

	        Parameters:
	        model_name_or_paths: Hub names or local paths of the causal LMs. The
	            tokenizer is loaded from the last entry.
	        use_bfloat16: Load weights in bfloat16 when True, float32 otherwise.
	        max_token_observed: Truncation length passed to the tokenizer.
	        unigram: Optional path to a tensor file read with ``torch.load``; when
	            set, its log-probabilities are appended as an extra "model".
	        custom_config: One flag per model; defaults to all False. It is only
	            stored here and not read anywhere else in this class.
	        stupid_mode: When True the Blahut-Arimoto iteration count is 0, so the
	            mixture weights stay uniform.
	        one_model_mode: When True only the first model is run and the other
	            distributions are simulated with top-p warping.

	        Raises:
	        ValueError: If model_name_or_paths is empty.
	        """
	        if not model_name_or_paths:
	            raise ValueError("model_name_or_paths must contain at least one model")

	        weights_dtype = torch.bfloat16 if use_bfloat16 else torch.float32
	        self.models = []
	        for model_name_or_path in model_name_or_paths:
	            # NOTE(security): trust_remote_code=True executes code shipped with
	            # the checkpoint; only load checkpoints you trust.
	            model = AutoModelForCausalLM.from_pretrained(
	                model_name_or_path,
	                device_map="auto",
	                trust_remote_code=True,
	                torch_dtype=weights_dtype,
	            )
	            model.eval()
	            self.models.append(model)
	            print(f"Loaded model: {model_name_or_path}")

	        self.one_model_mode = one_model_mode
	        # Zero iterations leaves the Blahut-Arimoto weights at their uniform start.
	        self.max_iters = 0 if stupid_mode else 1000

	        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_paths[-1])
	        if not self.tokenizer.pad_token:
	            # Batched padding needs a pad token; fall back to EOS.
	            self.tokenizer.pad_token = self.tokenizer.eos_token

	        self.max_token_observed = max_token_observed
	        self.nb_models = len(self.models)
	        self.unigram_path = unigram
	        # (path, log-probabilities) of the last unigram file loaded, if any.
	        self._unigram_cache = None

	        if custom_config is None:
	            custom_config = [False] * self.nb_models
	        self.custom_config = custom_config

	    def _tokenize(self, batch: list[str]) -> transformers.BatchEncoding:
	        """Tokenize a batch of strings, padded to the longest and truncated to max_token_observed."""
	        return self.tokenizer(
	            batch,
	            return_tensors="pt",
	            padding="longest",
	            truncation=True,
	            max_length=self.max_token_observed,
	            return_token_type_ids=False,
	        )

	    def trim_logits(self, logits, max_length=32000):
	        """
	        Keep at most the first ``max_length`` entries of the last (third) dimension.

	        This helper is not called by the other methods of this class.
	        """
	        if logits.shape[2] > max_length:
	            logits = logits[:, :, :max_length]
	        return logits

	    def _load_unigram_logits(self) -> torch.Tensor:
	        """Return log(unigram + 1e-10), reading the file only once per path."""
	        cache = getattr(self, "_unigram_cache", None)
	        if cache is None or cache[0] != self.unigram_path:
	            # NOTE(security): torch.load unpickles the file; only point
	            # unigram_path at trusted data.
	            unigram_proba = torch.load(self.unigram_path)
	            # The small offset keeps log() finite for zero-probability tokens.
	            cache = (self.unigram_path, torch.log(unigram_proba + 1e-10))
	            self._unigram_cache = cache
	        return cache[1]

	    @torch.inference_mode()
	    def _get_logits(self, encodings: transformers.BatchEncoding) -> List[torch.Tensor]:
	        """
	        Run the model(s) on the encodings and return one logits tensor per "model".

	        In one-model mode the first model is run once and its logits are warped
	        with top-p thresholds 0.7, 0.8, 0.9 and 0.95. Otherwise each model in
	        self.models is run on its own device. When a unigram path is set, the
	        unigram log-probabilities, expanded to the shape of the first logits
	        tensor, are appended as the last entry.
	        """
	        if self.one_model_mode:
	            model = self.models[0]
	            device = next(model.parameters()).device
	            base_logits = model(**encodings.to(device)).logits
	            top_p_values = [0.7, 0.8, 0.9, 0.95]
	            # Probability floor for tokens outside each nucleus.
	            epsilon = 1e-10
	            logits_list = [
	                apply_top_p_with_epsilon(base_logits, top_p, epsilon)
	                for top_p in top_p_values
	            ]
	        else:
	            logits_list = []
	            for model in self.models:
	                device = next(model.parameters()).device
	                logits_list.append(model(**encodings.to(device)).logits)
	                if device.type == "cuda":
	                    torch.cuda.synchronize(device)

	        if self.unigram_path:
	            batch_size, seq_len, voc_size = logits_list[0].shape
	            unigram_logits = self._load_unigram_logits()
	            # assumes the stored unigram tensor is 1-D of length voc_size - TODO confirm
	            logits_list.append(
	                unigram_logits.unsqueeze(0).unsqueeze(0).expand(batch_size, seq_len, voc_size)
	            )
	        return logits_list

	    def get_softmax_probabilities(self, input_text):
	        """Tokenize input_text and return (encodings, logits_list, probabilities_list)."""
	        encodings = self._tokenize(input_text)
	        logits_list = self._get_logits(encodings)
	        probabilities_list = softmax_probabilities_all_models(logits_list)
	        return encodings, logits_list, probabilities_list

	    def compute_arimoto_torch(self, input_text, max_iters=1000):
	        """
	        Mix the model distributions with Blahut-Arimoto weights.

	        Only the first sequence of the batch is passed to the Blahut-Arimoto
	        solver; the resulting per-position weights are broadcast over the whole
	        batch when the mixture is formed.

	        Parameters:
	        input_text: Text batch passed to the tokenizer.
	        max_iters: Maximum number of Blahut-Arimoto iterations.

	        Returns:
	        (encodings, weighted_sum_tensor, tensors_list, p, logits_list), where
	        tensors_list holds the per-model probabilities moved to one device and
	        p holds the weights with shape (seq_len, nb_models).
	        """
	        encodings, logits_list, tensors_list = self.get_softmax_probabilities(input_text)

	        # Models may live on different devices; gather everything on the first one.
	        device = tensors_list[0].device
	        tensors_list = [tensor.to(device) for tensor in tensors_list]

	        # (seq_len, nb_models, voc_size) built from the first batch element.
	        probabilities_tensor = torch.stack([t[0] for t in tensors_list], dim=1).to(device)
	        capacity, p = blahut_arimoto_torch(probabilities_tensor, max_iters=max_iters)

	        weighted_sum_tensor = torch.zeros_like(tensors_list[0])
	        for i, model_probabilities in enumerate(tensors_list):
	            # p[:, i:i+1] is (seq_len, 1) and broadcasts over batch and vocabulary.
	            weighted_sum_tensor += p[:, i:i+1] * model_probabilities

	        return encodings, weighted_sum_tensor, tensors_list, p, logits_list

	    def _mixture_scores(self, input_text):
	        """Shared pipeline: mixture, its average NLL, and cross-entropies against each model."""
	        encodings, weighted_sum_tensor, probabilities_list, arimoto_weights, logits_list = (
	            self.compute_arimoto_torch(input_text, max_iters=self.max_iters)
	        )
	        log_ppl, ppl, nll = perplexity(encodings, weighted_sum_tensor)
	        x_ppl_list = cross_entropy(weighted_sum_tensor, probabilities_list)
	        return encodings, logits_list, log_ppl, nll, x_ppl_list, arimoto_weights

	    def compute_scores(self, input_text):
	        """
	        Return (log_ppl, x_ppl_list, arimoto_weights, nll, ppl_list) for input_text.

	        log_ppl and nll are the first and third results of ``perplexity`` on the
	        mixture, x_ppl_list comes from ``cross_entropy``, and ppl_list from
	        ``perplexity_all_models`` on the raw logits.
	        """
	        encodings, logits_list, log_ppl, nll, x_ppl_list, arimoto_weights = self._mixture_scores(input_text)
	        ppl_list = perplexity_all_models(encodings, logits_list)
	        return log_ppl, x_ppl_list, arimoto_weights, nll, ppl_list

	    def compute_end_score(self, input_text):
	        """
	        Return the scalar score: the mixture's log_ppl minus the first x_ppl_list entry.

	        The first model is the reference, so it must be given first. The
	        instance's max_iters is used, so stupid_mode also applies here, exactly
	        as in compute_scores. ``.item()`` requires a single-sequence batch.
	        """
	        _, _, log_ppl, _, x_ppl_list, _ = self._mixture_scores(input_text)
	        log_ppl_value = log_ppl.item()
	        x_ppl_values = [x.item() for x in x_ppl_list]
	        return log_ppl_value - x_ppl_values[0]

	def perplexity(encodings, weighted_sum_tensor):
	    """
	    Score the observed tokens under a tensor of next-token probabilities.

	    Position t of weighted_sum_tensor is matched with token t+1 of
	    encodings.input_ids; positions whose shifted attention mask is 0 do not
	    contribute.

	    Returns:
	    (average_nll, perplexity, nll_masked): the per-sequence mean negative
	    log-likelihood, its exponential, and the masked per-token values.
	    """
	    probs = weighted_sum_tensor[..., :-1, :].contiguous()
	    target_device = probs.device

	    # Labels and mask follow the probabilities onto their device.
	    labels = encodings.input_ids[..., 1:].contiguous().to(target_device)
	    mask = encodings.attention_mask[..., 1:].contiguous().to(target_device)

	    # Probability assigned to the token that actually came next.
	    picked = probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)

	    # The 1e-12 offset guards against log(0).
	    nll_masked = -torch.log(picked + 1e-12) * mask

	    # Average over valid (non-padded) tokens only.
	    average_nll = nll_masked.sum(dim=1) / mask.sum(dim=1)
	    return average_nll, torch.exp(average_nll), nll_masked

	def cross_entropy(weighted_sum_tensor, probabilities_list):
	    """
	    Cross-entropy between each model's distribution and the mixture.

	    For every tensor m in probabilities_list this computes, per sequence,
	    the mean over positions of -sum_v m[v] * log(weighted_sum_tensor[v]).

	    Parameters:
	    weighted_sum_tensor: Mixture probabilities, (batch, seq_len, vocab).
	    probabilities_list: Probability tensors of the same shape, one per model.

	    Returns:
	    A list with one (batch,) tensor per entry of probabilities_list; an
	    empty list when probabilities_list is empty.
	    """
	    device = weighted_sum_tensor.device
	    # Independent of the model, so computed once outside the loop.
	    log_mixture = torch.log(weighted_sum_tensor)

	    x_ppl_list = []
	    for model_probabilities in probabilities_list:
	        model_probabilities = model_probabilities.to(device)
	        # A direct product-and-sum over the vocabulary gives the per-position
	        # dot product without building a (seq_len x seq_len) matrix just to
	        # read its diagonal, and it also tolerates mixed dtypes.
	        per_position = (log_mixture * model_probabilities).sum(dim=-1)
	        x_ppl_list.append(-per_position.mean(dim=1))
	    return x_ppl_list

	def softmax_probabilities_all_models(logits_list: List[torch.Tensor]) -> List[torch.Tensor]:
	    """
	    Turn each model's logits into probabilities over the vocabulary.

	    Parameters:
	    - logits_list: List[torch.Tensor]
	    One logits tensor per model; the last dimension is the vocabulary.

	    Returns:
	    - List[torch.Tensor]: The softmax over the last dimension of each input
	    tensor, in the same order.
	    """
	    return [torch.softmax(model_logits, dim=-1) for model_logits in logits_list]

	def perplexity_logits(encoding, logits):
	    """
	    Mean next-token cross-entropy per sequence, computed from raw logits.

	    Logits are clamped to [-20, 50] first. Position t is scored against
	    token t+1, and positions whose shifted attention mask is 0 are ignored.
	    Note that the value returned is the mean loss, not its exponential.

	    Returns:
	    A float32 NumPy array on the CPU with one value per sequence.
	    """
	    target = logits.device
	    clamped = torch.clamp(logits, min=-20, max=50)

	    # Token ids and mask follow the logits onto their device.
	    input_ids = encoding.input_ids.to(target)
	    attention = encoding.attention_mask.to(target)

	    predictions = clamped[..., :-1, :].contiguous()
	    labels = input_ids[..., 1:].contiguous()
	    mask = attention[..., 1:].contiguous()

	    # cross_entropy expects the class dimension second: (batch, vocab, seq).
	    token_loss = torch.nn.functional.cross_entropy(
	        predictions.transpose(1, 2), labels, reduction="none"
	    )
	    mean_loss = (token_loss * mask).sum(1) / mask.sum(1)
	    return mean_loss.to("cpu").float().numpy()

	def perplexity_all_models(encoding, logits_list):
	    """Apply ``perplexity_logits`` to every logits tensor and return the results as a list, in order."""
	    return [perplexity_logits(encoding, model_logits) for model_logits in logits_list]

	def blahut_arimoto_torch(W, epsilon=1e-6, max_iters=1000):
	    """
	    Batch-process Blahut-Arimoto using PyTorch for multiple sequence positions.

	    Each position is treated as its own channel whose inputs are the models
	    and whose outputs are the vocabulary entries.

	    Parameters:
	    W: Tensor of shape (seq_len, nb_models, voc_size) holding each model's
	        probability distribution at each position.
	    epsilon: Convergence threshold on the largest absolute change of any
	        weight between two iterations.
	    max_iters: Maximum number of iterations. With 0 the weights stay
	        uniform and the returned capacity is 0.

	    Returns:
	    (capacity, p): capacity has shape (seq_len,) and is expressed in bits;
	        p has shape (seq_len, nb_models) and holds the model weights.
	    """
	    seq_len, nb_models, _ = W.shape
	    p = torch.full((seq_len, nb_models), 1.0 / nb_models, device=W.device, dtype=W.dtype)
	    prod_exp = torch.ones((seq_len, nb_models), device=W.device, dtype=W.dtype)
	    tiny = torch.finfo(W.dtype).eps

	    for _ in range(max_iters):
	        # Marginal over the vocabulary under the current weights: (seq_len, voc_size).
	        sum_p_w = torch.bmm(p.unsqueeze(1), W).squeeze(1)

	        # Ratio of each model's distribution to the marginal.
	        W_normalized = W / sum_p_w.unsqueeze(1)

	        # Keep the logarithm finite: exact zeros become eps, and any
	        # remaining NaN/inf (e.g. from 0/0) contributes nothing.
	        W_normalized[W_normalized == 0] = tiny
	        log_term = torch.log(W_normalized)
	        log_term[torch.isnan(log_term) | torch.isinf(log_term)] = 0

	        # exp of each model's KL-style sum over the vocabulary, then reweight.
	        prod_exp = torch.exp(torch.sum(W * log_term, dim=2))
	        weighted = p * prod_exp
	        p_new = weighted / torch.sum(weighted, dim=1, keepdim=True)

	        # On convergence p is left at the value prod_exp was computed from.
	        if torch.max(torch.abs(p - p_new)) < epsilon:
	            break
	        p = p_new

	    # Channel capacity, converted from nats to bits.
	    capacity = torch.log(torch.sum(p * prod_exp, dim=1)) / torch.log(torch.tensor(2.0, device=W.device))
	    return capacity, p