Spaces:

jrmd
/

DIT_DL2_WAV2VEC2_BERT

Sleeping

App Files Files Community

jrmd committed on Jul 14

Commit

200202c

0 Parent(s):

initial commit

Browse files

Files changed (22) hide show

.gitattributes +5 -0
.gitignore +2 -0
SpeechSentimentModelConfusionMatrix.png +3 -0
audiospeechsentimentanalysis_jrmdiouf.py +650 -0
bert_tokenizer_local/special_tokens_map.json +7 -0
bert_tokenizer_local/tokenizer.json +0 -0
bert_tokenizer_local/tokenizer_config.json +56 -0
bert_tokenizer_local/vocab.txt +0 -0
categories.bin +3 -0
custom_bert_model.bin +3 -0
demo.py +46 -0
demo_api_client.py +16 -0
id10012_0AXjxNXiEzo_00001.flac +3 -0
max_len.pkl +3 -0
wandb_chart_eval.png +3 -0
wandb_chart_train.png +3 -0
wav2vec2_local/config.json +109 -0
wav2vec2_local/model.safetensors +3 -0
wav2vec2_local/preprocessor_config.json +10 -0
wav2vec2_local/special_tokens_map.json +6 -0
wav2vec2_local/tokenizer_config.json +51 -0
wav2vec2_local/vocab.json +34 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,5 @@

+*.png filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+ .gradio

SpeechSentimentModelConfusionMatrix.png ADDED Viewed

Git LFS Details

SHA256: b304b4a2287962ca64e7a68d29ad345667ca2b2fafb8712828a80780dba67a28
Pointer size: 130 Bytes
Size of remote file: 28.5 kB

audiospeechsentimentanalysis_jrmdiouf.py ADDED Viewed

	@@ -0,0 +1,650 @@

+# -*- coding: utf-8 -*-
+"""AudioSpeechSentimentAnalysis_JRMDIOUF.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1tizgeMs7DXaZPQO3V253paATKev0ra0m
+"""
+#!pip install transformers
+#!pip install wandb
+import os
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+import pickle
+import re
+from typing import DefaultDict
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchaudio
+import torchaudio.functional as F
+import wandb
+# from google.colab import userdata
+# from huggingface_hub import login
+from sklearn.metrics import (
+    accuracy_score,
+    confusion_matrix,
+    precision_score,
+    recall_score,
+)
+from torch.utils.data import DataLoader, Dataset, Subset
+from transformers import AutoTokenizer, BertModel, Wav2Vec2ForCTC, Wav2Vec2Processor
+"""hf_token = userdata.get("HF_TOKEN")
+wandb_token = userdata.get("WAND_TOKEN")"""
+# Commented out IPython magic to ensure Python compatibility.
+# %env HF_TOKEN_ENV=$hf_token
+"""!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/dev.tsv
+!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/fine-tune.tsv
+!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/test.tsv
+!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/audio/dev.zip
+!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/audio/fine-tune.zip
+!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/audio/test.zip
+if not os.path.exists("dev_raw"):
+    print("dev_raw folder not found. Unzipping dev.zip...")
+    !unzip -q dev.zip
+else:
+    print("dev_raw folder already exists. Skipping unzip.")
+if not os.path.exists("fine-tune_raw"):
+    print("fine-tune_raw folder not found. Unzipping fine-tune.zip...")
+    !unzip -q fine-tune.zip
+else:
+    print("fine-tune_raw folder already exists. Skipping unzip.")
+if not os.path.exists("test_raw"):
+    print("test_raw folder not found. Unzipping test.zip...")
+    !unzip -q test.zip
+else:
+    print("test_raw folder already exists. Skipping unzip.")"""
+# Run on the first CUDA device when one is available, otherwise on the CPU.
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Training hyper-parameters: number of passes over the training set and
+# mini-batch size used by the DataLoaders.
+NUM_EPOCHS = 5
+BATCH_SIZE = 16
+# Artifacts written during dataset construction / training and read back at
+# inference time by get_audio_sentiment.
+SAVED_CUSTOM_BERT_TOKEN_MAX_LEN_PATH = "max_len.pkl"
+SAVED_CUSTOM_BERT_TOKENIZER_DIR = "bert_tokenizer_local"
+SAVED_CUSTOM_BERT_MODEL_PATH = "custom_bert_model.bin"
+SAVED_TARGET_CAT_PATH = "categories.bin"
+# Annotation files (tab-separated) for training and evaluation.
+TRAIN_DS_PATH = "fine-tune.tsv"
+TEST_DS_PATH = "test.tsv"
+# Hub identifier of the text encoder / tokenizer.
+BERT_BASE_MODEL = "google-bert/bert-base-uncased"
+# Only referenced by the commented-out intermediate projection in CustomBertModel.
+INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE = 30
+# Local cache directory and Hub identifier of the speech-to-text model.
+SAVED_AUDIO_MODEL_DIR_PATH = "wav2vec2_local"
+AUDIO_BASE_MODEL = "facebook/wav2vec2-base-960h"
+# File names whose presence in the local directory is used to decide whether
+# the Wav2Vec2 processor and model can be loaded locally.
+PROCESSOR_NAME = "preprocessor_config.json"
+MODEL_NAME = "config.json"
+# Labels kept by CustomBertDataset; rows carrying any other label are dropped.
+SENTIMENT_MODALITIES = ["Neutral", "Positive", "Negative"]
+class CustomBertDataset(Dataset):
+    def __init__(
+        self,
+        file_path,
+        audio_folder,
+        model_path=BERT_BASE_MODEL,
+        saved_target_cats_path=SAVED_TARGET_CAT_PATH,
+        saved_max_len_path=SAVED_CUSTOM_BERT_TOKEN_MAX_LEN_PATH,
+    ):
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        self.lines = open(file_path).readlines()
+        self.lines = np.array(
+            [
+                [
+                    re.split(r"\t+", line.replace("\n", ""))[1],
+                    re.split(r"\t+", line.replace("\n", ""))[4],
+                    re.split(r"\t+", line.replace("\n", ""))[0],
+                ]
+                for i, line in enumerate(self.lines)
+                if line != "\n" and i != 0
+            ]
+        )
+        self.elem_cats = self.lines[:, 1]
+        self.corpus = self.lines[:, 0]
+        self.audio_files_id = self.lines[:, 2]
+        # We have to proceed in this order here
+        self.corpus = [
+            sent.lower()
+            for sent, cat in zip(self.corpus, self.elem_cats)
+            if cat in SENTIMENT_MODALITIES
+        ]
+        self.audio_files = np.array(
+            [
+                os.path.join(audio_folder, f"{file_name}.flac")
+                for file_name, cat in zip(self.audio_files_id, self.elem_cats)
+                if cat in SENTIMENT_MODALITIES
+            ]
+        )
+        self.elem_cats = [cat for cat in self.elem_cats if cat in SENTIMENT_MODALITIES]
+        self.unique_cats = sorted(list(set(self.elem_cats)))
+        self.num_class = len(self.unique_cats)
+        self.cats_dict = {cat: i for i, cat in enumerate(self.unique_cats)}
+        self.targets = np.array([self.cats_dict[cat] for cat in self.elem_cats])
+        torch.save(self.unique_cats, saved_target_cats_path)
+        self.tokenizer.save_pretrained(SAVED_CUSTOM_BERT_TOKENIZER_DIR)
+        """entry_dict = DefaultDict(list)
+      for i in range(len(self.corpus)):
+          entry_dict[self.targets[i]].append(self.corpus[i])
+      self.final_corpus = []
+      self.final_targets = []
+      n=0
+      while n < len(self.corpus):
+        for key in entry_dict.keys():
+          if len(entry_dict[key]) > 0:
+            self.final_corpus.append(entry_dict[key].pop(0))
+            self.final_targets.append(key)
+            n+=1
+      self.corpus = np.array(self.final_corpus)
+      self.targets = np.array(self.final_targets)"""
+        self.max_len = 0
+        for sent in self.corpus:
+            input_ids = self.tokenizer.encode(sent, add_special_tokens=True)
+            self.max_len = max(self.max_len, len(input_ids))
+        self.max_len = min(self.max_len, 512)
+        print(f"Max length : {self.max_len}")
+        print(f"Nombre de classes : {self.num_class}")
+        print(f"Exemples de targets : {np.unique(self.targets)}")
+        # Save max_len
+        with open(saved_max_len_path, "wb") as f:
+            pickle.dump(self.max_len, f)
+        print(f"max_len saved to {saved_max_len_path}")
+    def __len__(self):
+        return len(self.elem_cats)
+    def __getitem__(self, idx):
+        text = self.corpus[idx]
+        target = self.targets[idx]
+        # Vérification : target doit être entre 0 et num_class - 1
+        if target < 0 or target >= self.num_class:
+            raise ValueError(
+                f"Target out of bounds: {target} not in [0, {self.num_class - 1}]"
+            )
+        encoded_input = self.tokenizer.encode_plus(
+            text,
+            max_length=self.max_len,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        return (
+            encoded_input["input_ids"].squeeze(0),
+            encoded_input["attention_mask"].squeeze(0),
+            torch.tensor(target, dtype=torch.long),
+            self.audio_files[idx],
+        )
+        # return np.array(encoded_input), torch.tensor(target, dtype=torch.long)
+class CustomBertModel(nn.Module):
+    def __init__(self, num_class, model_path=BERT_BASE_MODEL):
+        super(CustomBertModel, self).__init__()
+        self.model_path = model_path
+        self.num_class = num_class
+        self.bert = BertModel.from_pretrained(self.model_path)
+        # Freeze of the parameters of this layer for the training process
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        # self.proj_intermediate = nn.Sequential(nn.Linear(self.bert.config.hidden_size, INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE),nn.Linear(INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE, INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE), INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE),nn.Linear(INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE, INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE))
+        self.proj_lin = nn.Linear(self.bert.config.hidden_size, self.num_class)
+    def forward(self, input_ids, attention_mask):
+        x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        x = x.last_hidden_state[:, 0, :]
+        # x = self.proj_intermediate(x)
+        x = self.proj_lin(x)
+        return x
+def train_step(model, train_dataloader, loss_fn, optimizer):
+    num_iterations = len(train_dataloader)
+    for i in range(NUM_EPOCHS):
+        print(f"Training Epoch n° {i}")
+        model.train()
+        for j, batch in enumerate(train_dataloader):
+            input = batch[:][0]
+            attention = batch[:][1]
+            target = batch[:][2]
+            output = model(input.to(device), attention.to(device))
+            loss = loss_fn(output, target.to(device))
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            run.log({"Training loss": loss})
+            print(f"Epoch {i+1} | step {j+1} / {num_iterations} | loss : {loss}")
+    # Save model
+    torch.save(model.state_dict(), SAVED_CUSTOM_BERT_MODEL_PATH)
+    print(f"Custom BERT Model saved at {SAVED_CUSTOM_BERT_MODEL_PATH}")
+def eval_step(
+    test_dataloader,
+    loss_fn,
+    num_class,
+    saved_model_path=SAVED_CUSTOM_BERT_MODEL_PATH,
+    saved_target_cats_path=SAVED_TARGET_CAT_PATH,
+):
+    y_pred = []
+    y_true = []
+    num_iterations = len(test_dataloader)
+    # Load the saved model
+    saved_model = CustomBertModel(num_class)
+    saved_model.load_state_dict(
+        torch.load(saved_model_path, weights_only=False)
+    )  # Explicitly set weights_only to False
+    saved_model = saved_model.to(device)
+    saved_model.eval()  # Set the model to evaluation mode
+    print(f"Model loaded from path :{saved_model_path}")
+    with torch.no_grad():
+        for j, batch in enumerate(test_dataloader):
+            input = batch[:][0]
+            attention = batch[:][1]
+            target = batch[:][2]
+            output = saved_model(input.to(device), attention.to(device))
+            loss = loss_fn(output, target.to(device))
+            run.log({"Eval loss": loss})
+            print(f"Step {j+1} / {num_iterations} | Eval loss : {loss}")
+            y_pred.extend(output.cpu().numpy().argmax(axis=1))
+            y_true.extend(target.cpu().numpy())
+    class_labels = torch.load(saved_target_cats_path, weights_only=False)
+    true_labels = [class_labels[i] for i in y_true]
+    pred_labels = [class_labels[i] for i in y_pred]
+    print(f"Accuracy : {accuracy_score(true_labels, pred_labels)}")
+    cm = confusion_matrix(true_labels, pred_labels, labels=class_labels)
+    df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
+    sns.heatmap(df_cm, annot=True, fmt="d")
+    plt.title("Confusion Matrix for Sentiment analysis dataset")
+    plt.xlabel("Predicted Label")
+    plt.ylabel("True Label")
+    plt.show()
+def eval_pipeline_step(
+    test_dataloader,
+    loss_fn,
+    num_class,
+    audio_model_dir=SAVED_AUDIO_MODEL_DIR_PATH,
+    audio_model_name=MODEL_NAME,
+    audio_processor_name=PROCESSOR_NAME,
+    saved_model_path=SAVED_CUSTOM_BERT_MODEL_PATH,
+    saved_target_cats_path=SAVED_TARGET_CAT_PATH,
+):
+    y_pred = []
+    y_true = []
+    num_iterations = len(test_dataloader)
+    # Load the saved model
+    saved_model = CustomBertModel(num_class)
+    saved_model.load_state_dict(
+        torch.load(saved_model_path, weights_only=False)
+    )  # Explicitly set weights_only to False
+    saved_model = saved_model.to(device)
+    saved_model.eval()  # Set the model to evaluation mode
+    print(f"Model loaded from path :{saved_model_path}")
+    audio_processor = None
+    audio_model = None
+    processor_path = os.path.join(
+        audio_model_dir, audio_processor_name
+    )  # Check for a key file, like the preprocessor config
+    model_path = os.path.join(
+        audio_model_dir, audio_model_name
+    )  # Check for a key file, like the model config
+    if (
+        os.path.exists(audio_model_dir)
+        and os.path.exists(processor_path)
+        and os.path.exists(model_path)
+    ):
+        print("Local Wav2Vec2 processor and model found. Loading from local directory.")
+        audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
+        audio_model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
+    else:
+        print(
+            "Local Wav2Vec2 processor and model not found. Downloading from Hugging Face Hub."
+        )
+        audio_processor = Wav2Vec2Processor.from_pretrained(AUDIO_BASE_MODEL)
+        audio_model = Wav2Vec2ForCTC.from_pretrained(AUDIO_BASE_MODEL)
+        # Optionally save the downloaded model and processor for future use
+        audio_processor.save_pretrained(audio_model_dir)
+        audio_model.save_pretrained(audio_model_dir)
+        print(f"Wav2Vec2 processor and model downloaded and saved to {audio_model_dir}")
+    # Move audio model to GPU
+    audio_model = audio_model.to(device)
+    audio_model.eval()
+    with torch.no_grad():
+        for j, batch in enumerate(test_dataloader):
+            target = batch[:][2]
+            audio_file_path = batch[:][3]
+            encoded_inputs = []
+            attention_masks = []
+            bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
+            sample_rate = bundle.sample_rate
+            for audio_file in audio_file_path:
+                waveform, sr = torchaudio.load(audio_file)
+                if sr != sample_rate:
+                    print("Resampling")
+                    resampler = torchaudio.transforms.Resample(
+                        orig_freq=sr, new_freq=sample_rate
+                    )
+                    waveform = resampler(waveform)
+                # Move waveform to GPU before processing
+                input_values = audio_processor(
+                    waveform.squeeze().numpy(),
+                    sampling_rate=sample_rate,
+                    return_tensors="pt",
+                ).input_values.to(device)
+                with torch.no_grad():
+                    logits = audio_model(input_values).logits
+                predicted_ids_hf = torch.argmax(logits, dim=-1)
+                transcript_hf = audio_processor.decode(
+                    predicted_ids_hf[0].cpu().numpy()
+                )  # Move predicted_ids_hf back to CPU for decoding
+                transcript_hf = (
+                    transcript_hf.lower() if transcript_hf is not None else None
+                )
+                encoded_input = test_dataloader.dataset.tokenizer.encode_plus(
+                    transcript_hf,
+                    max_length=test_dataloader.dataset.max_len,
+                    padding="max_length",
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                encoded_inputs.append(encoded_input["input_ids"].squeeze(0))
+                attention_masks.append(encoded_input["attention_mask"].squeeze(0))
+            text_input = torch.stack(encoded_inputs)
+            attention = torch.stack(attention_masks)
+            output = saved_model(text_input.to(device), attention.to(device))
+            loss = loss_fn(output, target.to(device))
+            run.log({"Pipeline Eval loss": loss})
+            print(f"Step {j+1} / {num_iterations} | Pipeline Eval loss : {loss}")
+            y_pred.extend(output.cpu().numpy().argmax(axis=1))
+            y_true.extend(target.cpu().numpy())
+    class_labels = torch.load(saved_target_cats_path, weights_only=False)
+    true_labels = [class_labels[i] for i in y_true]
+    pred_labels = [class_labels[i] for i in y_pred]
+    print(f"Pipeline Accuracy : {accuracy_score(true_labels, pred_labels)}")
+    cm = confusion_matrix(true_labels, pred_labels, labels=class_labels)
+    df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
+    sns.heatmap(df_cm, annot=True, fmt="d")
+    plt.title("Confusion Matrix for Sentiment analysis Pipeline")
+    plt.xlabel("Predicted Label")
+    plt.ylabel("True Label")
+    plt.show()
+def get_audio_sentiment(
+    input_audio_path,
+    num_class=len(SENTIMENT_MODALITIES),
+    audio_model_dir=SAVED_AUDIO_MODEL_DIR_PATH,
+    audio_model_name=MODEL_NAME,
+    audio_processor_name=PROCESSOR_NAME,
+    saved_model_path=SAVED_CUSTOM_BERT_MODEL_PATH,
+    saved_target_cats_path=SAVED_TARGET_CAT_PATH,
+    tokenizer_save_directory=SAVED_CUSTOM_BERT_TOKENIZER_DIR,
+    saved_max_len_path=SAVED_CUSTOM_BERT_TOKEN_MAX_LEN_PATH,
+):
+    # Load the saved model
+    saved_model = CustomBertModel(num_class)
+    saved_model.load_state_dict(
+        torch.load(
+            saved_model_path, weights_only=False, map_location=torch.device(device)
+        )
+    )  # Explicitly set weights_only to False
+    saved_model = saved_model.to(device)
+    saved_model.eval()  # Set the model to evaluation mode
+    print(f"Model loaded from path :{saved_model_path}")
+    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_directory)
+    max_len = 0
+    with open(saved_max_len_path, "rb") as f:
+        max_len = pickle.load(f)
+    audio_processor = None
+    audio_model = None
+    processor_path = os.path.join(
+        audio_model_dir, audio_processor_name
+    )  # Check for a key file, like the preprocessor config
+    model_path = os.path.join(
+        audio_model_dir, audio_model_name
+    )  # Check for a key file, like the model config
+    if (
+        os.path.exists(audio_model_dir)
+        and os.path.exists(processor_path)
+        and os.path.exists(model_path)
+    ):
+        print("Local Wav2Vec2 processor and model found. Loading from local directory.")
+        audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
+        audio_model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
+    else:
+        print(
+            "Local Wav2Vec2 processor and model not found. Downloading from Hugging Face Hub."
+        )
+        audio_processor = Wav2Vec2Processor.from_pretrained(AUDIO_BASE_MODEL)
+        audio_model = Wav2Vec2ForCTC.from_pretrained(AUDIO_BASE_MODEL)
+        # Optionally save the downloaded model and processor for future use
+        audio_processor.save_pretrained(audio_model_dir)
+        audio_model.save_pretrained(audio_model_dir)
+        print(f"Wav2Vec2 processor and model downloaded and saved to {audio_model_dir}")
+    # Move audio model to GPU
+    audio_model = audio_model.to(device)
+    audio_model.eval()
+    with torch.no_grad():
+        audio_file_path = input_audio_path
+        encoded_inputs = []
+        attention_masks = []
+        bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
+        sample_rate = bundle.sample_rate
+        waveform, sr = torchaudio.load(audio_file_path)
+        if sr != sample_rate:
+            print("Resampling")
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=sr, new_freq=sample_rate
+            )
+            waveform = resampler(waveform)
+        # Move waveform to GPU before processing
+        input_values = audio_processor(
+            waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt"
+        ).input_values.to(device)
+        with torch.no_grad():
+            logits = audio_model(input_values).logits
+        predicted_ids_hf = torch.argmax(logits, dim=-1)
+        transcript_hf = audio_processor.decode(
+            predicted_ids_hf[0].cpu().numpy()
+        )  # Move predicted_ids_hf back to CPU for decoding
+        transcript_hf = transcript_hf.lower() if transcript_hf is not None else None
+        encoded_input = loaded_tokenizer.encode_plus(
+            transcript_hf,
+            max_length=max_len,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )
+        encoded_inputs.append(encoded_input["input_ids"].squeeze(0))
+        attention_masks.append(encoded_input["attention_mask"].squeeze(0))
+        # Stack the lists of tensors before moving to device
+        text_input = torch.stack(encoded_inputs)
+        attention = torch.stack(attention_masks)
+        output = saved_model(text_input.to(device), attention.to(device))
+        class_labels = torch.load(saved_target_cats_path, weights_only=False)
+        return class_labels[output.cpu().numpy().argmax(axis=1)[0]]
+# Login using e.g. `huggingface-cli login` to access this dataset
+# global_train_ds = load_dataset("asapp/slue-voxceleb", streaming=True, token='jrmd_hf_token')
+# global_train_ds = load_dataset('asapp/slue',token='jrmd_hf_token')
+# global_train_ds = load_dataset('voxceleb',token='jrmd_hf_token')
+# global_test_ds = load_dataset("asapp/slue", "voxceleb", split="test", token='jrmd_hf_token')
+# Get torchaudio pipeline components
+"""bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
+#model = bundle.get_model()
+#labels = bundle.get_labels()
+sample_rate = bundle.sample_rate"""
+"""waveform, sr = torchaudio.load("/content/dev_raw/id10012_0AXjxNXiEzo_00001.flac")
+# Resample if sr != sample_rate (or model_hf.config.sampling_rate)
+if sr != sample_rate:
+  print("Resampling")
+  resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
+  waveform = resampler(waveform)"""
+# Using torchaudio pipeline - Manual Greedy Decoding
+"""with torch.no_grad():
+    emission = model(waveform)"""
+# Assuming emission is log-probabilities or logits
+# Perform greedy decoding: get the index of the max probability at each time step
+# predicted_ids_torchaudio = torch.argmax(emission[0], dim=-1)
+# Process the predicted IDs: remove consecutive duplicates and blank tokens
+# Assuming the blank token is at index 0 (which is common for CTC, check labels if unsure)
+"""processed_ids_torchaudio = []
+for id in predicted_ids_torchaudio[0]: # emission has shape (batch_size, num_frames, num_labels)
+    if id.item() != 0 and (len(processed_ids_torchaudio) == 0 or id.item() != processed_ids_torchaudio[-1]):
+        processed_ids_torchaudio.append(id.item())"""
+"""# Convert token IDs to transcript using labels
+#transcript = "".join([labels[id] for id in processed_ids_torchaudio])
+# Using Hugging Face transformers
+# Note: processor and model_hf are defined in cell DnJDG6P3BTjZ
+# To make this cell fully self-contained, you might want to include their definitions here as well.
+# For now, assuming they are defined in a previously executed cell.
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+model_hf = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+# Load and resample waveform
+waveform, sr = torchaudio.load("/content/dev_raw/id10012_0AXjxNXiEzo_00001.flac")
+if sr != sample_rate:
+    print("Resampling")
+    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
+    waveform = resampler(waveform)
+input_values = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt").input_values
+with torch.no_grad():
+    logits = model_hf(input_values).logits
+predicted_ids_hf = torch.argmax(logits, dim=-1)
+transcript_hf = processor.decode(predicted_ids_hf[0])
+#print("Torchaudio Transcript:", transcript)
+print("Hugging Face Transcript:", transcript_hf)"""
+if __name__ == "__main__":
+    wandb.login(key=wandb_token)
+    run = wandb.init(project="DIT-Wav2Vec-Bert-Sentiment-Analysis-project")
+    bert_train_dataset = CustomBertDataset(TRAIN_DS_PATH, "fine-tune_raw")
+    bert_test_dataset = CustomBertDataset(TEST_DS_PATH, "test_raw")
+    print(f"Size of bert dataset : {len(bert_train_dataset)}")
+    """train_dataset = Subset(our_bert_dataset, range(int(len(our_bert_dataset)*0.8)))
+  test_dataset = Subset(our_bert_dataset, range(int(len(our_bert_dataset)*0.8), len(our_bert_dataset)))"""
+    train_dataloader = DataLoader(
+        bert_train_dataset, batch_size=BATCH_SIZE, shuffle=True
+    )
+    test_dataloader = DataLoader(
+        bert_test_dataset, batch_size=BATCH_SIZE, shuffle=False
+    )
+    our_bert_model = CustomBertModel(bert_train_dataset.num_class)
+    our_bert_model = our_bert_model.to(device)
+    loss_fn = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(
+        filter(lambda p: p.requires_grad, our_bert_model.parameters()), lr=0.01
+    )
+    train_step(our_bert_model, train_dataloader, loss_fn, optimizer)
+    eval_step(test_dataloader, loss_fn, bert_train_dataset.num_class)
+    eval_pipeline_step(test_dataloader, loss_fn, bert_train_dataset.num_class)
+    test_inference_audio_path = "/content/dev_raw/id10012_0AXjxNXiEzo_00001.flac"
+    print(get_audio_sentiment(test_inference_audio_path))

bert_tokenizer_local/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

bert_tokenizer_local/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

bert_tokenizer_local/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

bert_tokenizer_local/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

categories.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce4f35be86b2eecde01dac17af9f2885aa5dde5c90ab4770871d4e7f6d7fe92d
+size 1196

custom_bert_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b962a92b9dcb34ba0659d0fda0f5a312bbe6f5e7d13060413dd3abde366c517c
+size 438021794

demo.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Gradio front-end for the speech sentiment pipeline: an audio input wired to
+# get_audio_sentiment, followed by static training/evaluation figures.
+import gradio as gr
+import audiospeechsentimentanalysis_jrmdiouf as assaj
+# NOTE(review): the parameter name `input` shadows the builtin, but the API
+# client calls this endpoint with the keyword `input=`, so it is kept as is.
+def find_sentiment(input):
+    """Return the sentiment label predicted for the audio file at ``input``."""
+    return assaj.get_audio_sentiment(input)
+with gr.Blocks() as demo:
+    # Page title.
+    gr.Markdown(
+        "<h1 style='text-align: center;'>CUSTOM MODEL BASED ON WAV2VEC2 AND BERT BASE TO ANALYZE SPEECH SENTIMENT</h1>"
+    )
+    # Prediction widget: the audio is handed over as a file path and the
+    # predicted label is shown as text.
+    gr.Interface(
+        fn=find_sentiment,
+        inputs=[gr.Audio(type="filepath")],
+        outputs=["text"],
+        live=False,
+    )
+    # Static loss charts shipped with the repository.
+    gr.Markdown(
+        "<h2 style='text-align: center;'>Speech sentiment analysis model loss during training and eval time</h2>"
+    )
+    with gr.Row():
+        gr.Image(value="wandb_chart_train.png", label="Training Loss", width=300)
+        gr.Image(value="wandb_chart_eval.png", label="Pipeline eval Loss", width=300)
+    # Static confusion matrix shipped with the repository.
+    gr.Markdown(
+        "<h2 style='text-align: center;'>Confusion matrix obtained from model evaluation on VoxCeleb dataset</h2>"
+    )
+    with gr.Row():
+        gr.Image(
+            value="SpeechSentimentModelConfusionMatrix.png",
+            label="Confusion Matrix from model evaluation",
+        )
+    # Accuracy figure hard-coded in the page.
+    with gr.Row():
+        gr.Markdown(
+            "<h3><span style='text-decoration:underline;'>Pipeline Accuracy</span> : <span style='font-style:italic;'>0.758</span></h3>"
+        )
+# Start the app as soon as the module is executed.
+demo.launch(share=True)

demo_api_client.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import os
+from gradio_client import Client, handle_file
+client = Client("http://localhost:7860/")
+# Use a raw string for the file path
+audio_file_path = r"E:\00.Divers\DIT\04.Cours\M2\06.DS-DeepLearning2\Examen\Dev\id10012_0AXjxNXiEzo_00001.flac"
+# Verify the file exists (good practice!)
+if not os.path.exists(audio_file_path):
+    print(f"Error: The file '{audio_file_path}' does not exist. Please check the path.")
+else:
+    print(f"File found: {audio_file_path}")
+    result = client.predict(input=handle_file(audio_file_path), api_name="/predict")
+    print(result)

id10012_0AXjxNXiEzo_00001.flac ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f1be9a6c5fa7421364e026e4294bf4976d15d7a61dc397c9385b796c619299f
+size 78322

max_len.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a2b0264bcc30013ba2d474c3e149ba4401daaa47d88d874ccaba45d3c1518fb
+size 5

wandb_chart_eval.png ADDED Viewed

Git LFS Details

SHA256: 12523283a96b2953885832dcf250e7816579836fa90b3c01de56e8dbbeab0c0c
Pointer size: 131 Bytes
Size of remote file: 548 kB

wandb_chart_train.png ADDED Viewed

Git LFS Details

SHA256: f6156215bba9266c6edb00c2a3d46a53b2f538fc9aeb43cb3bdedcda218fbde8
Pointer size: 131 Bytes
Size of remote file: 439 kB

wav2vec2_local/config.json ADDED Viewed

	@@ -0,0 +1,109 @@

+{
+  "activation_dropout": 0.1,
+  "adapter_attn_dim": null,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 256,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": false,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": false,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "group",
+  "feat_proj_dropout": 0.1,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 12,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 12,
+  "num_negatives": 100,
+  "output_hidden_size": 768,
+  "pad_token_id": 0,
+  "proj_codevector_dim": 256,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.1",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 32,
+  "xvector_output_dim": 512
+}

wav2vec2_local/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b516d7bf54ca328ba24c507c2d11ba2fd2be54991e2a7cd965aadba947cc532c
+size 377611120

wav2vec2_local/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}

wav2vec2_local/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

wav2vec2_local/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "do_normalize": true,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "processor_class": "Wav2Vec2Processor",
+  "replace_word_delimiter_char": " ",
+  "return_attention_mask": false,
+  "target_lang": null,
+  "tokenizer_class": "Wav2Vec2CTCTokenizer",
+  "unk_token": "<unk>",
+  "word_delimiter_token": "|"
+}

wav2vec2_local/vocab.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "'": 27,
+  "</s>": 2,
+  "<pad>": 0,
+  "<s>": 1,
+  "<unk>": 3,
+  "A": 7,
+  "B": 24,
+  "C": 19,
+  "D": 14,
+  "E": 5,
+  "F": 20,
+  "G": 21,
+  "H": 11,
+  "I": 10,
+  "J": 29,
+  "K": 26,
+  "L": 15,
+  "M": 17,
+  "N": 9,
+  "O": 8,
+  "P": 23,
+  "Q": 30,
+  "R": 13,
+  "S": 12,
+  "T": 6,
+  "U": 16,
+  "V": 25,
+  "W": 18,
+  "X": 28,
+  "Y": 22,
+  "Z": 31,
+  "|": 4
+}