from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
import torchaudio
import torch
import torch.nn as nn

class VADPredictor(nn.Module):
    """Predict valence/arousal/dominance (VAD) scores from raw audio."""

    def __init__(self, pretrained_model_name="facebook/wav2vec2-base-960h",
                 freeze_feature_extractor=True):
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
        # Optionally freeze the convolutional feature extractor so only the
        # transformer layers and the regression heads receive gradients.
        if freeze_feature_extractor:
            for param in self.wav2vec2.feature_extractor.parameters():
                param.requires_grad = False
        hidden_size = self.wav2vec2.config.hidden_size
        # One small regression head per VAD dimension.
        self.valence_layers = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.Linear(64, 1),
        )
        self.arousal_layers = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.Linear(64, 1),
        )
        self.dominance_layers = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.Linear(64, 1),
        )

    def forward(self, input_values, attention_mask=None):
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        # Mean-pool the hidden states over time: one embedding per clip.
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        return {
            'valence': self.valence_layers(pooled_output).squeeze(-1),
            'arousal': self.arousal_layers(pooled_output).squeeze(-1),
            'dominance': self.dominance_layers(pooled_output).squeeze(-1),
        }
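
# Expected I/O contract of the model above, as a sketch (not executed on
# import; the shapes assume one 1-second clip at 16 kHz):
#     model = VADPredictor()              # downloads wav2vec2 weights
#     out = model(torch.zeros(1, 16000))  # batch of one 1-second clip
#     out['valence'].shape                # -> torch.Size([1])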

def get_valence_score(file_path):
    """Return the mean valence score over 1-second segments of an audio file."""
    model = VADPredictor()
    model.load_state_dict(torch.load(
        r"D:\Intern\shankh\DUMP\vad_predictor_model.pt",
        map_location=torch.device("cpu"),
    ))
    model.eval()
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base-960h")

    # Load and preprocess the audio.
    waveform, sr = torchaudio.load(file_path)
    # Downmix to mono.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate wav2vec2 expects.
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        sr = 16000
    # Peak-normalize to [-1, 1].
    waveform = waveform / waveform.abs().max()

    # Inference on non-overlapping 1-second segments.
    segment_sec = 1
    segment_samples = int(segment_sec * sr)
    valence_scores = []
    with torch.no_grad():
        for start in range(0, waveform.shape[1] - segment_samples + 1, segment_samples):
            segment = waveform[:, start:start + segment_samples]
            input_values = feature_extractor(
                segment.squeeze().numpy(),
                sampling_rate=16000,
                return_tensors="pt",
            ).input_values
            output = model(input_values)
            valence_scores.append(output['valence'].item())

    # Aggregate per-segment predictions by their mean (assumption: a single
    # summary score is wanted; files shorter than one segment yield no scores).
    if not valence_scores:
        raise ValueError("audio is shorter than one 1-second segment")
    return sum(valence_scores) / len(valence_scores)
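
# Minimal usage sketch ("sample.wav" is a stand-in path; assumes the
# checkpoint referenced above exists locally).
if __name__ == "__main__":
    score = get_valence_score("sample.wav")
    print(f"Mean valence over 1-second segments: {score:.3f}")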