artificial-styletts2 / audiocraft /conditioners.py

fx sounds batch inference

86b9ce4 30 days ago

2.37 kB

	import warnings
	from transformers import T5EncoderModel, T5Tokenizer # type: ignore
	import torch
	from torch import nn


	class T5Conditioner(nn.Module):
	    """Text conditioner built on a pretrained T5 encoder.

	    Descriptions are tokenized, encoded with the T5 encoder, projected to
	    ``output_dim`` by a linear layer and zeroed at padded positions.

	    Args:
	        name: Checkpoint name; must be one of ``MODELS``.
	        output_dim: Size of the last dimension of the returned embeddings.
	        device: Device the tokenized inputs are moved to. When ``finetune``
	            is false the encoder is moved there as well.
	        finetune: When true the encoder is registered as a regular submodule
	            (and therefore appears in ``state_dict()``). When false it is
	            stored outside the module registry. In both cases the encoder is
	            put in eval mode and ``forward`` runs it under ``torch.no_grad()``.
	    """

	    MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
	              "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
	              "google/flan-t5-xl", "google/flan-t5-xxl"]
	    # Encoder hidden size (``d_model``) for each supported checkpoint.
	    MODELS_DIMS = {
	        "t5-small": 512,
	        "t5-base": 768,
	        "t5-large": 1024,
	        "t5-3b": 1024,
	        "t5-11b": 1024,
	        "google/flan-t5-small": 512,
	        "google/flan-t5-base": 768,
	        "google/flan-t5-large": 1024,
	        # Every name accepted by MODELS needs an entry here, otherwise
	        # __init__ passes the name check and then fails with a KeyError.
	        "google/flan-t5-xl": 2048,
	        "google/flan-t5-xxl": 4096,
	        # Legacy keys, not listed in MODELS; kept so existing readers of
	        # this table keep working.
	        "google/flan-t5-3b": 1024,
	        "google/flan-t5-11b": 1024,
	    }

	    def __init__(self,
	                 name,
	                 output_dim,
	                 device,
	                 finetune=False):
	        print(f'{finetune=}')
	        assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
	        super().__init__()
	        self.dim = self.MODELS_DIMS[name]
	        self.output_dim = output_dim
	        self.output_proj = nn.Linear(self.dim, output_dim)
	        self.device = device
	        self.name = name

	        self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
	        # The encoder is always loaded in eval mode, whatever `finetune` is.
	        t5 = T5EncoderModel.from_pretrained(name).eval()
	        if finetune:
	            self.t5 = t5
	        else:
	            # Writing through __dict__ bypasses nn.Module.__setattr__, so the
	            # encoder is not registered as a submodule and is not part of
	            # the saved checkpoint.
	            self.__dict__['t5'] = t5.to(device)

	    def tokenize(self, x):
	        """Tokenize a batch of descriptions, padded to a common length.

	        Args:
	            x: Iterable of strings; ``None`` entries are treated as ``""``.

	        Returns:
	            The tokenizer output moved to ``self.device``; ``forward`` reads
	            its ``'input_ids'`` and ``'attention_mask'`` entries.
	        """
	        entries = ["" if xi is None else xi for xi in x]
	        return self.t5_tokenizer(entries,
	                                 return_tensors='pt',
	                                 padding=True).to(self.device)

	    def forward(self, descriptions):
	        """Embed ``descriptions`` and project them to ``output_dim``.

	        Args:
	            descriptions: Iterable of strings (or ``None``), one per batch item.

	        Returns:
	            Projected encoder states with padded positions set to zero.
	        """
	        tokens = self.tokenize(descriptions)
	        mask = tokens['attention_mask']

	        # No gradients flow through the encoder; no kv-cache is needed for
	        # text conditioning.
	        with torch.no_grad():
	            embeds = self.t5(input_ids=tokens['input_ids'],
	                             attention_mask=mask).last_hidden_state

	        # Match the projection's dtype/device before applying it.
	        embeds = self.output_proj(embeds.to(self.output_proj.weight))

	        # Zero the embeddings at padded positions.
	        return embeds * mask.unsqueeze(-1)