import os
from typing import Optional

import torch
from torch import nn
from safetensors.torch import save_file
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
class SignalDetector(nn.Module):
    """Wraps a pretrained sequence-classification model used to detect causal signal phrases."""

    def __init__(self, model_and_tokenizer_path) -> None:
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_and_tokenizer_path)
        self.signal_detector = AutoModelForSequenceClassification.from_pretrained(model_and_tokenizer_path)
        self.signal_detector.eval()
        self.signal_detector.cuda()

    def predict(self, text: str) -> int:
        input_ids = self.tokenizer.encode(text)
        input_ids = torch.tensor([input_ids]).cuda()
        with torch.no_grad():  # inference only; no gradients needed
            outputs = self.signal_detector(input_ids)
        return outputs[0].argmax().item()
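

# Illustrative usage of SignalDetector (not part of the original file). The checkpoint path
# and the example sentence are assumptions; a CUDA device is required by the class above.
#
#     detector = SignalDetector("path/to/signal-detector-checkpoint")
#     has_signal = detector.predict("The game was cancelled because of the rain.")
#     # `has_signal` is the argmax class index of the sequence classifier.
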
class ST2ModelV2(nn.Module):
    def __init__(self, args):
        super(ST2ModelV2, self).__init__()
        self.args = args

        self.config = AutoConfig.from_pretrained("roberta-large")
        self.model = AutoModel.from_pretrained("roberta-large")
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-large")

        classifier_dropout = self.args.dropout
        self.dropout = nn.Dropout(classifier_dropout)
        # Token-level head: 6 logits per token (start/end for the cause, effect, and signal spans).
        self.classifier = nn.Linear(self.config.hidden_size, 6)

        if args.mlp:
            self.classifier = nn.Sequential(
                nn.Linear(self.config.hidden_size, self.config.hidden_size),
                nn.ReLU(),
                nn.Linear(self.config.hidden_size, 6),
                nn.Tanh(),
                nn.Linear(6, 6),
            )

        if args.add_signal_bias:
            # Learnable bias vector added to tokens that fall inside a signal phrase.
            self.signal_phrases_layer = nn.Parameter(
                torch.normal(
                    mean=self.model.embeddings.word_embeddings.weight.data.mean(),
                    std=self.model.embeddings.word_embeddings.weight.data.std(),
                    size=(1, self.config.hidden_size),
                )
            )

        if self.args.signal_classification and not self.args.pretrained_signal_detector:
            self.signal_classifier = nn.Linear(self.config.hidden_size, 2)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        signal_bias_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,  # [batch_size, 3]
        end_positions: Optional[torch.Tensor] = None,  # [batch_size, 3]
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size, 3)`, *optional*):
            Labels for the position (index) of the start of each labelled span, used to compute the
            token classification loss. Positions are clamped to the length of the sequence
            (`sequence_length`); positions outside the sequence are not taken into account.
        end_positions (`torch.LongTensor` of shape `(batch_size, 3)`, *optional*):
            Labels for the position (index) of the end of each labelled span, used to compute the
            token classification loss. Positions are clamped to the length of the sequence
            (`sequence_length`); positions outside the sequence are not taken into account.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if signal_bias_mask is not None and not self.args.signal_bias_on_top_of_lm:
            # Inject the signal bias at the embedding layer; pass inputs_embeds instead of input_ids.
            inputs_embeds = self.signal_phrases_bias(input_ids, signal_bias_mask)

            outputs = self.model(
                # input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        else:
            outputs = self.model(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        sequence_output = outputs[0]
        if signal_bias_mask is not None and self.args.signal_bias_on_top_of_lm:
            # Alternative: add the signal bias on top of the encoder output rather than the embeddings.
            sequence_output[signal_bias_mask == 1] += self.signal_phrases_layer

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)  # [batch_size, max_seq_length, 6]

        # One start/end logit per token for each of the three spans: cause (arg0), effect (arg1), signal.
        start_arg0_logits, end_arg0_logits, start_arg1_logits, end_arg1_logits, start_sig_logits, end_sig_logits = logits.split(1, dim=-1)
        start_arg0_logits = start_arg0_logits.squeeze(-1).contiguous()
        end_arg0_logits = end_arg0_logits.squeeze(-1).contiguous()
        start_arg1_logits = start_arg1_logits.squeeze(-1).contiguous()
        end_arg1_logits = end_arg1_logits.squeeze(-1).contiguous()
        start_sig_logits = start_sig_logits.squeeze(-1).contiguous()
        end_sig_logits = end_sig_logits.squeeze(-1).contiguous()

        # start_arg0_logits -= (1 - attention_mask) * 1e4
        # end_arg0_logits -= (1 - attention_mask) * 1e4
        # start_arg1_logits -= (1 - attention_mask) * 1e4
        # end_arg1_logits -= (1 - attention_mask) * 1e4

        # start_arg0_logits[:, 0] = -1e4
        # end_arg0_logits[:, 0] = -1e4
        # start_arg1_logits[:, 0] = -1e4
        # end_arg1_logits[:, 0] = -1e4

        signal_classification_logits = None
        if self.args.signal_classification and not self.args.pretrained_signal_detector:
            # Sentence-level signal classification from the first-token (<s>/[CLS]) representation.
            signal_classification_logits = self.signal_classifier(sequence_output[:, 0, :])
        # start_logits = start_logits.squeeze(-1).contiguous()
        # end_logits = end_logits.squeeze(-1).contiguous()

        arg0_loss = None
        arg1_loss = None
        sig_loss = None
        total_loss = None
        signal_classification_loss = None
        if start_positions is not None and end_positions is not None:
            loss_fct = nn.CrossEntropyLoss()

            start_arg0_loss = loss_fct(start_arg0_logits, start_positions[:, 0])
            end_arg0_loss = loss_fct(end_arg0_logits, end_positions[:, 0])
            arg0_loss = (start_arg0_loss + end_arg0_loss) / 2

            start_arg1_loss = loss_fct(start_arg1_logits, start_positions[:, 1])
            end_arg1_loss = loss_fct(end_arg1_logits, end_positions[:, 1])
            arg1_loss = (start_arg1_loss + end_arg1_loss) / 2

            # sig_loss = 0.
            start_sig_loss = loss_fct(start_sig_logits, start_positions[:, 2])
            end_sig_loss = loss_fct(end_sig_logits, end_positions[:, 2])
            sig_loss = (start_sig_loss + end_sig_loss) / 2

            # If every signal target in the batch is ignored (-100), the mean is NaN; zero it out.
            if sig_loss.isnan():
                sig_loss = 0.

            if self.args.signal_classification and not self.args.pretrained_signal_detector:
                signal_classification_labels = end_positions[:, 2] != -100
                signal_classification_loss = loss_fct(signal_classification_logits, signal_classification_labels.long())
                total_loss = (arg0_loss + arg1_loss + sig_loss + signal_classification_loss) / 4
            else:
                total_loss = (arg0_loss + arg1_loss + sig_loss) / 3

        return {
            'start_arg0_logits': start_arg0_logits,
            'end_arg0_logits': end_arg0_logits,
            'start_arg1_logits': start_arg1_logits,
            'end_arg1_logits': end_arg1_logits,
            'start_sig_logits': start_sig_logits,
            'end_sig_logits': end_sig_logits,
            'signal_classification_logits': signal_classification_logits,
            'arg0_loss': arg0_loss,
            'arg1_loss': arg1_loss,
            'sig_loss': sig_loss,
            'signal_classification_loss': signal_classification_loss,
            'loss': total_loss,
        }
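
    # Illustrative training step (not in the original file), assuming `batch`, `model`, and
    # `optimizer` exist and that the position labels are shaped [batch_size, 3]:
    #
    #     outputs = model(
    #         input_ids=batch["input_ids"],
    #         attention_mask=batch["attention_mask"],
    #         start_positions=batch["start_positions"],
    #         end_positions=batch["end_positions"],
    #     )
    #     outputs["loss"].backward()
    #     optimizer.step()
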
""" | |
def save_pretrained(self, save_directory): | |
#Save model state dict as safetensor, configuration, and tokenizer files. | |
# Ensure the directory exists | |
os.makedirs(save_directory, exist_ok=True) | |
# Save model state dict as safetensor (use torch.save for PyTorch model) | |
model_path = os.path.join(save_directory, "model.safetensor") | |
save_file(self.state_dict(), model_path) | |
# Save config if available | |
config_save_path = os.path.join(save_directory, 'config.json') | |
self.config.to_json_file(config_save_path) | |
# Save tokenizer | |
if hasattr(self, 'tokenizer') and self.tokenizer is not None: | |
tokenizer_save_path = os.path.join(save_directory, 'tokenizer') | |
self.tokenizer.save_pretrained(tokenizer_save_path) | |
""" | |
    def save_pretrained(self, save_directory):
        """
        Save the model state dict in safetensors and PyTorch .bin formats, along with the configuration.
        """
        # Ensure the directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save model state dict as safetensors
        model_path_safetensor = os.path.join(save_directory, "model.safetensors")
        save_file(self.state_dict(), model_path_safetensor)  # Save as .safetensors

        # Save model state dict as PyTorch .bin (traditional format)
        model_path_bin = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path_bin)  # Save as .bin using torch.save()

        # Save config if available
        config_save_path = os.path.join(save_directory, 'config.json')
        self.config.to_json_file(config_save_path)

        """
        # Save tokenizer if it exists
        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
            tokenizer_save_path = os.path.join(save_directory, 'tokenizer')
            self.tokenizer.save_pretrained(tokenizer_save_path)
        """
    def signal_phrases_bias(self, input_ids, signal_bias_mask):
        inputs_embeds = self.model.get_input_embeddings()(input_ids)
        # Add the learned bias vector to the embeddings of tokens flagged as part of a signal phrase.
        inputs_embeds[signal_bias_mask == 1] += self.signal_phrases_layer  # self.signal_phrases_layer(inputs_embeds[signal_bias_mask == 1])

        return inputs_embeds
    def position_selector(
        self,
        start_cause_logits,
        start_effect_logits,
        end_cause_logits,
        end_effect_logits,
        attention_mask,
        word_ids,
    ):
        # basic post processing (removing logits from [CLS], [SEP], [PAD])
        start_cause_logits -= (1 - attention_mask) * 1e4
        end_cause_logits -= (1 - attention_mask) * 1e4
        start_effect_logits -= (1 - attention_mask) * 1e4
        end_effect_logits -= (1 - attention_mask) * 1e4

        start_cause_logits[0] = -1e4
        end_cause_logits[0] = -1e4
        start_effect_logits[0] = -1e4
        end_effect_logits[0] = -1e4

        start_cause_logits[len(word_ids) - 1] = -1e4
        end_cause_logits[len(word_ids) - 1] = -1e4
        start_effect_logits[len(word_ids) - 1] = -1e4
        end_effect_logits[len(word_ids) - 1] = -1e4

        start_cause_logits = torch.log(torch.softmax(start_cause_logits, dim=-1))
        end_cause_logits = torch.log(torch.softmax(end_cause_logits, dim=-1))
        start_effect_logits = torch.log(torch.softmax(start_effect_logits, dim=-1))
        end_effect_logits = torch.log(torch.softmax(end_effect_logits, dim=-1))

        # Case 1: the cause span ends before the effect span starts.
        max_arg0_before_arg1 = None
        for i in range(len(end_cause_logits)):
            if attention_mask[i] == 0:
                break
            for j in range(i + 1, len(start_effect_logits)):
                if attention_mask[j] == 0:
                    break
                if max_arg0_before_arg1 is None:
                    max_arg0_before_arg1 = ((i, j), end_cause_logits[i] + start_effect_logits[j])
                else:
                    if end_cause_logits[i] + start_effect_logits[j] > max_arg0_before_arg1[1]:
                        max_arg0_before_arg1 = ((i, j), end_cause_logits[i] + start_effect_logits[j])

        # Case 2: the effect span ends before the cause span starts.
        max_arg0_after_arg1 = None
        for i in range(len(end_effect_logits)):
            if attention_mask[i] == 0:
                break
            for j in range(i + 1, len(start_cause_logits)):
                if attention_mask[j] == 0:
                    break
                if max_arg0_after_arg1 is None:
                    max_arg0_after_arg1 = ((i, j), start_cause_logits[j] + end_effect_logits[i])
                else:
                    if start_cause_logits[j] + end_effect_logits[i] > max_arg0_after_arg1[1]:
                        max_arg0_after_arg1 = ((i, j), start_cause_logits[j] + end_effect_logits[i])

        # Commit to the better ordering, then pick the two remaining boundaries greedily.
        if max_arg0_before_arg1[1].item() > max_arg0_after_arg1[1].item():
            end_cause, start_effect = max_arg0_before_arg1[0]
            start_cause_logits[end_cause + 1:] = -1e4
            start_cause = start_cause_logits.argmax().item()
            end_effect_logits[:start_effect] = -1e4
            end_effect = end_effect_logits.argmax().item()
        else:
            end_effect, start_cause = max_arg0_after_arg1[0]
            end_cause_logits[:start_cause] = -1e4
            end_cause = end_cause_logits.argmax().item()
            start_effect_logits[end_effect + 1:] = -1e4
            start_effect = start_effect_logits.argmax().item()

        return start_cause, end_cause, start_effect, end_effect
    def beam_search_position_selector(
        self,
        start_cause_logits,
        start_effect_logits,
        end_cause_logits,
        end_effect_logits,
        attention_mask,
        word_ids,
        topk=5
    ):
        # basic post processing (removing logits from [CLS], [SEP], [PAD])
        start_cause_logits -= (1 - attention_mask) * 1e4
        end_cause_logits -= (1 - attention_mask) * 1e4
        start_effect_logits -= (1 - attention_mask) * 1e4
        end_effect_logits -= (1 - attention_mask) * 1e4

        start_cause_logits[0] = -1e4
        end_cause_logits[0] = -1e4
        start_effect_logits[0] = -1e4
        end_effect_logits[0] = -1e4

        start_cause_logits[len(word_ids) - 1] = -1e4
        end_cause_logits[len(word_ids) - 1] = -1e4
        start_effect_logits[len(word_ids) - 1] = -1e4
        end_effect_logits[len(word_ids) - 1] = -1e4

        start_cause_logits = torch.log(torch.softmax(start_cause_logits, dim=-1))
        end_cause_logits = torch.log(torch.softmax(end_cause_logits, dim=-1))
        start_effect_logits = torch.log(torch.softmax(start_effect_logits, dim=-1))
        end_effect_logits = torch.log(torch.softmax(end_effect_logits, dim=-1))

        # Score every (end_cause, start_effect) pair for the "cause before effect" ordering
        # and every (end_effect, start_cause) pair for the "effect before cause" ordering.
        scores = dict()
        for i in range(len(end_cause_logits)):
            if attention_mask[i] == 0:
                break
            for j in range(i + 1, len(start_effect_logits)):
                if attention_mask[j] == 0:
                    break
                scores[str((i, j, "before"))] = end_cause_logits[i].item() + start_effect_logits[j].item()

        for i in range(len(end_effect_logits)):
            if attention_mask[i] == 0:
                break
            for j in range(i + 1, len(start_cause_logits)):
                if attention_mask[j] == 0:
                    break
                scores[str((i, j, "after"))] = start_cause_logits[j].item() + end_effect_logits[i].item()

        # Expand the top-k orderings with the top-k candidates for the two remaining boundaries.
        topk_scores = dict()
        for i, (index, score) in enumerate(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topk]):
            if eval(index)[2] == 'before':
                end_cause = eval(index)[0]
                start_effect = eval(index)[1]

                this_start_cause_logits = start_cause_logits.clone()
                this_start_cause_logits[end_cause + 1:] = -1e9
                start_cause_values, start_cause_indices = this_start_cause_logits.topk(topk)

                this_end_effect_logits = end_effect_logits.clone()
                this_end_effect_logits[:start_effect] = -1e9
                end_effect_values, end_effect_indices = this_end_effect_logits.topk(topk)

                for m in range(len(start_cause_values)):
                    for n in range(len(end_effect_values)):
                        topk_scores[str((start_cause_indices[m].item(), end_cause, start_effect, end_effect_indices[n].item()))] = score + start_cause_values[m].item() + end_effect_values[n].item()

            elif eval(index)[2] == 'after':
                start_cause = eval(index)[1]
                end_effect = eval(index)[0]

                this_end_cause_logits = end_cause_logits.clone()
                this_end_cause_logits[:start_cause] = -1e9
                end_cause_values, end_cause_indices = this_end_cause_logits.topk(topk)

                this_start_effect_logits = start_effect_logits.clone()
                this_start_effect_logits[end_effect + 1:] = -1e9
                start_effect_values, start_effect_indices = this_start_effect_logits.topk(topk)

                for m in range(len(end_cause_values)):
                    for n in range(len(start_effect_values)):
                        topk_scores[str((start_cause, end_cause_indices[m].item(), start_effect_indices[n].item(), end_effect))] = score + end_cause_values[m].item() + start_effect_values[n].item()

        # Return the best and second-best (start_cause, end_cause, start_effect, end_effect) tuples.
        first, second = sorted(topk_scores.items(), key=lambda x: x[1], reverse=True)[:2]
        return eval(first[0]), eval(second[0]), first[1], second[1], topk_scores
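

# Illustrative end-to-end sketch (not part of the original file). The attribute names below
# mirror what this module reads from `args` (dropout, mlp, add_signal_bias,
# signal_classification, pretrained_signal_detector, signal_bias_on_top_of_lm); the flag
# values and the example sentence are assumptions chosen for demonstration only.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(
        dropout=0.1,
        mlp=False,
        add_signal_bias=False,
        signal_classification=False,
        pretrained_signal_detector=False,
        signal_bias_on_top_of_lm=False,
    )
    model = ST2ModelV2(args)
    model.eval()

    text = "The flight was delayed because of the storm."
    encoded = model.tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        outputs = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )

    # Each logits entry is [batch_size, seq_len]; losses are None because no labels were passed.
    print(outputs["start_arg0_logits"].shape, outputs["loss"])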