jrmd committed
Commit 200202c · 0 Parent(s)

initial commit
.gitattributes ADDED
@@ -0,0 +1,5 @@
1
+ *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.flac filter=lfs diff=lfs merge=lfs -text
5
+ *.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ __pycache__
2
+ .gradio
SpeechSentimentModelConfusionMatrix.png ADDED

Git LFS Details

  • SHA256: b304b4a2287962ca64e7a68d29ad345667ca2b2fafb8712828a80780dba67a28
  • Pointer size: 130 Bytes
  • Size of remote file: 28.5 kB
audiospeechsentimentanalysis_jrmdiouf.py ADDED
@@ -0,0 +1,650 @@
1
+ # -*- coding: utf-8 -*-
2
+ """AudioSpeechSentimentAnalysis_JRMDIOUF.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1tizgeMs7DXaZPQO3V253paATKev0ra0m
8
+ """
9
+
10
+ #!pip install transformers
11
+ #!pip install wandb
12
+
13
+ import os
14
+
15
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
16
+
17
+ import pickle
18
+ import re
19
+ from typing import DefaultDict
20
+
21
+ import matplotlib.pyplot as plt
22
+ import numpy as np
23
+ import pandas as pd
24
+ import seaborn as sns
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.optim as optim
28
+ import torchaudio
29
+ import torchaudio.functional as F
30
+ import wandb
31
+
32
+ # from google.colab import userdata
33
+ # from huggingface_hub import login
34
+ from sklearn.metrics import (
35
+ accuracy_score,
36
+ confusion_matrix,
37
+ precision_score,
38
+ recall_score,
39
+ )
40
+ from torch.utils.data import DataLoader, Dataset, Subset
41
+ from transformers import AutoTokenizer, BertModel, Wav2Vec2ForCTC, Wav2Vec2Processor
42
+
43
+ """hf_token = userdata.get("HF_TOKEN")
44
+ wandb_token = userdata.get("WAND_TOKEN")"""
45
+
46
+ # Commented out IPython magic to ensure Python compatibility.
47
+ # %env HF_TOKEN_ENV=$hf_token
48
+ """!wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/dev.tsv
49
+ !wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/fine-tune.tsv
50
+ !wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/test.tsv
51
+
52
+ !wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/audio/dev.zip
53
+ !wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/audio/fine-tune.zip
54
+ !wget -nc --header "Authorization: Bearer ${HF_TOKEN_ENV}" https://huggingface.co/datasets/asapp/slue/resolve/main/data/voxceleb/audio/test.zip
55
+
56
+ if not os.path.exists("dev_raw"):
57
+ print("dev_raw folder not found. Unzipping dev.zip...")
58
+ !unzip -q dev.zip
59
+ else:
60
+ print("dev_raw folder already exists. Skipping unzip.")
61
+
62
+ if not os.path.exists("fine-tune_raw"):
63
+ print("fine-tune_raw folder not found. Unzipping fine-tune.zip...")
64
+ !unzip -q fine-tune.zip
65
+ else:
66
+ print("fine-tune_raw folder already exists. Skipping unzip.")
67
+
68
+ if not os.path.exists("test_raw"):
69
+ print("test_raw folder not found. Unzipping test.zip...")
70
+ !unzip -q test.zip
71
+ else:
72
+ print("test_raw folder already exists. Skipping unzip.")"""
73
+
74
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
75
+ NUM_EPOCHS = 5
76
+ BATCH_SIZE = 16
77
+
78
+ SAVED_CUSTOM_BERT_TOKEN_MAX_LEN_PATH = "max_len.pkl"
79
+ SAVED_CUSTOM_BERT_TOKENIZER_DIR = "bert_tokenizer_local"
80
+ SAVED_CUSTOM_BERT_MODEL_PATH = "custom_bert_model.bin"
81
+ SAVED_TARGET_CAT_PATH = "categories.bin"
82
+ TRAIN_DS_PATH = "fine-tune.tsv"
83
+ TEST_DS_PATH = "test.tsv"
84
+ BERT_BASE_MODEL = "google-bert/bert-base-uncased"
85
+ INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE = 30
86
+
87
+ SAVED_AUDIO_MODEL_DIR_PATH = "wav2vec2_local"
88
+ AUDIO_BASE_MODEL = "facebook/wav2vec2-base-960h"
89
+ PROCESSOR_NAME = "preprocessor_config.json"
90
+ MODEL_NAME = "config.json"
91
+
92
+ SENTIMENT_MODALITIES = ["Neutral", "Positive", "Negative"]
93
+
94
+
95
+ class CustomBertDataset(Dataset):
96
+ def __init__(
97
+ self,
98
+ file_path,
99
+ audio_folder,
100
+ model_path=BERT_BASE_MODEL,
101
+ saved_target_cats_path=SAVED_TARGET_CAT_PATH,
102
+ saved_max_len_path=SAVED_CUSTOM_BERT_TOKEN_MAX_LEN_PATH,
103
+ ):
104
+ self.model_path = model_path
105
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
106
+ # Each TSV row is tab-separated: column 0 holds the utterance id, column 1 the normalized text, column 4 the sentiment label.
+ self.lines = open(file_path).readlines()
107
+ self.lines = np.array(
108
+ [
109
+ [
110
+ re.split(r"\t+", line.replace("\n", ""))[1],
111
+ re.split(r"\t+", line.replace("\n", ""))[4],
112
+ re.split(r"\t+", line.replace("\n", ""))[0],
113
+ ]
114
+ for i, line in enumerate(self.lines)
115
+ if line != "\n" and i != 0
116
+ ]
117
+ )
118
+
119
+ self.elem_cats = self.lines[:, 1]
120
+ self.corpus = self.lines[:, 0]
121
+ self.audio_files_id = self.lines[:, 2]
122
+
123
+ # Filter in this order: corpus and audio paths first (both depend on the original elem_cats), then elem_cats itself
124
+ self.corpus = [
125
+ sent.lower()
126
+ for sent, cat in zip(self.corpus, self.elem_cats)
127
+ if cat in SENTIMENT_MODALITIES
128
+ ]
129
+ self.audio_files = np.array(
130
+ [
131
+ os.path.join(audio_folder, f"{file_name}.flac")
132
+ for file_name, cat in zip(self.audio_files_id, self.elem_cats)
133
+ if cat in SENTIMENT_MODALITIES
134
+ ]
135
+ )
136
+ self.elem_cats = [cat for cat in self.elem_cats if cat in SENTIMENT_MODALITIES]
137
+
138
+ self.unique_cats = sorted(list(set(self.elem_cats)))
139
+ self.num_class = len(self.unique_cats)
140
+ self.cats_dict = {cat: i for i, cat in enumerate(self.unique_cats)}
141
+ self.targets = np.array([self.cats_dict[cat] for cat in self.elem_cats])
142
+
143
+ torch.save(self.unique_cats, saved_target_cats_path)
144
+ self.tokenizer.save_pretrained(SAVED_CUSTOM_BERT_TOKENIZER_DIR)
145
+
146
+ """entry_dict = DefaultDict(list)
147
+ for i in range(len(self.corpus)):
148
+ entry_dict[self.targets[i]].append(self.corpus[i])
149
+
150
+ self.final_corpus = []
151
+ self.final_targets = []
152
+ n=0
153
+ while n < len(self.corpus):
154
+ for key in entry_dict.keys():
155
+ if len(entry_dict[key]) > 0:
156
+ self.final_corpus.append(entry_dict[key].pop(0))
157
+ self.final_targets.append(key)
158
+ n+=1
159
+
160
+ self.corpus = np.array(self.final_corpus)
161
+ self.targets = np.array(self.final_targets)"""
162
+
163
+ self.max_len = 0
164
+ for sent in self.corpus:
165
+ input_ids = self.tokenizer.encode(sent, add_special_tokens=True)
166
+ self.max_len = max(self.max_len, len(input_ids))
167
+
168
+ self.max_len = min(self.max_len, 512)
169
+ print(f"Max length: {self.max_len}")
170
+ print(f"Number of classes: {self.num_class}")
171
+ print(f"Target examples: {np.unique(self.targets)}")
172
+
173
+ # Save max_len
174
+ with open(saved_max_len_path, "wb") as f:
175
+ pickle.dump(self.max_len, f)
176
+ print(f"max_len saved to {saved_max_len_path}")
177
+
178
+ def __len__(self):
179
+ return len(self.elem_cats)
180
+
181
+ def __getitem__(self, idx):
182
+ text = self.corpus[idx]
183
+ target = self.targets[idx]
184
+
185
+ # Sanity check: target must lie in [0, num_class - 1]
186
+ if target < 0 or target >= self.num_class:
187
+ raise ValueError(
188
+ f"Target out of bounds: {target} not in [0, {self.num_class - 1}]"
189
+ )
190
+
191
+ encoded_input = self.tokenizer.encode_plus(
192
+ text,
193
+ max_length=self.max_len,
194
+ padding="max_length",
195
+ truncation=True,
196
+ return_tensors="pt",
197
+ )
198
+ return (
199
+ encoded_input["input_ids"].squeeze(0),
200
+ encoded_input["attention_mask"].squeeze(0),
201
+ torch.tensor(target, dtype=torch.long),
202
+ self.audio_files[idx],
203
+ )
204
+ # return np.array(encoded_input), torch.tensor(target, dtype=torch.long)
205
+
206
+
207
+ class CustomBertModel(nn.Module):
208
+ def __init__(self, num_class, model_path=BERT_BASE_MODEL):
209
+ super(CustomBertModel, self).__init__()
210
+ self.model_path = model_path
211
+ self.num_class = num_class
212
+
213
+ self.bert = BertModel.from_pretrained(self.model_path)
214
+ # Freeze of the parameters of this layer for the training process
215
+ for param in self.bert.parameters():
216
+ param.requires_grad = False
217
+ # self.proj_intermediate = nn.Sequential(nn.Linear(self.bert.config.hidden_size, INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE), nn.Linear(INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE, INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE), nn.Linear(INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE, INTERMEDIATE_CUSTOM_BERT_LAYER_SIZE))
218
+ self.proj_lin = nn.Linear(self.bert.config.hidden_size, self.num_class)
219
+
220
+ def forward(self, input_ids, attention_mask):
221
+ x = self.bert(input_ids=input_ids, attention_mask=attention_mask)
222
+
223
+ x = x.last_hidden_state[:, 0, :]
224
+ # x = self.proj_intermediate(x)
225
+ x = self.proj_lin(x)
226
+ return x
227
+
228
+
229
+ def train_step(model, train_dataloader, loss_fn, optimizer):
230
+
231
+ num_iterations = len(train_dataloader)
232
+
233
+ for i in range(NUM_EPOCHS):
234
+ print(f"Training epoch {i + 1} of {NUM_EPOCHS}")
235
+ model.train()
236
+
237
+ for j, batch in enumerate(train_dataloader):
238
+
239
+ input = batch[:][0]
240
+ attention = batch[:][1]
241
+ target = batch[:][2]
242
+
243
+ output = model(input.to(device), attention.to(device))
244
+
245
+ loss = loss_fn(output, target.to(device))
246
+
247
+ optimizer.zero_grad()
248
+ loss.backward()
249
+ optimizer.step()
250
+
251
+ run.log({"Training loss": loss})
252
+
253
+ print(f"Epoch {i+1} | step {j+1} / {num_iterations} | loss: {loss}")
254
+
255
+ # Save model
256
+ torch.save(model.state_dict(), SAVED_CUSTOM_BERT_MODEL_PATH)
257
+ print(f"Custom BERT Model saved at {SAVED_CUSTOM_BERT_MODEL_PATH}")
258
+
259
+
260
+ def eval_step(
261
+ test_dataloader,
262
+ loss_fn,
263
+ num_class,
264
+ saved_model_path=SAVED_CUSTOM_BERT_MODEL_PATH,
265
+ saved_target_cats_path=SAVED_TARGET_CAT_PATH,
266
+ ):
267
+
268
+ y_pred = []
269
+ y_true = []
270
+
271
+ num_iterations = len(test_dataloader)
272
+ # Load the saved model
273
+ saved_model = CustomBertModel(num_class)
274
+ saved_model.load_state_dict(
275
+ torch.load(saved_model_path, weights_only=False)
276
+ ) # Explicitly set weights_only to False
277
+ saved_model = saved_model.to(device)
278
+ saved_model.eval() # Set the model to evaluation mode
279
+ print(f"Model loaded from path: {saved_model_path}")
280
+
281
+ with torch.no_grad():
282
+ for j, batch in enumerate(test_dataloader):
283
+
284
+ input = batch[:][0]
285
+ attention = batch[:][1]
286
+ target = batch[:][2]
287
+
288
+ output = saved_model(input.to(device), attention.to(device))
289
+
290
+ loss = loss_fn(output, target.to(device))
291
+
292
+ run.log({"Eval loss": loss})
293
+ print(f"Step {j+1} / {num_iterations} | Eval loss: {loss}")
294
+ y_pred.extend(output.cpu().numpy().argmax(axis=1))
295
+ y_true.extend(target.cpu().numpy())
296
+
297
+ class_labels = torch.load(saved_target_cats_path, weights_only=False)
298
+
299
+ true_labels = [class_labels[i] for i in y_true]
300
+ pred_labels = [class_labels[i] for i in y_pred]
301
+
302
+ print(f"Accuracy: {accuracy_score(true_labels, pred_labels)}")
303
+
304
+ cm = confusion_matrix(true_labels, pred_labels, labels=class_labels)
305
+ df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
306
+ sns.heatmap(df_cm, annot=True, fmt="d")
307
+ plt.title("Confusion Matrix for Sentiment analysis dataset")
308
+ plt.xlabel("Predicted Label")
309
+ plt.ylabel("True Label")
310
+ plt.show()
311
+
312
+
313
+ def eval_pipeline_step(
314
+ test_dataloader,
315
+ loss_fn,
316
+ num_class,
317
+ audio_model_dir=SAVED_AUDIO_MODEL_DIR_PATH,
318
+ audio_model_name=MODEL_NAME,
319
+ audio_processor_name=PROCESSOR_NAME,
320
+ saved_model_path=SAVED_CUSTOM_BERT_MODEL_PATH,
321
+ saved_target_cats_path=SAVED_TARGET_CAT_PATH,
322
+ ):
323
+
324
+ y_pred = []
325
+ y_true = []
326
+
327
+ num_iterations = len(test_dataloader)
328
+ # Load the saved model
329
+ saved_model = CustomBertModel(num_class)
330
+ saved_model.load_state_dict(
331
+ torch.load(saved_model_path, weights_only=False)
332
+ ) # Explicitly set weights_only to False
333
+ saved_model = saved_model.to(device)
334
+ saved_model.eval() # Set the model to evaluation mode
335
+ print(f"Model loaded from path: {saved_model_path}")
336
+
337
+ audio_processor = None
338
+ audio_model = None
339
+
340
+ processor_path = os.path.join(
341
+ audio_model_dir, audio_processor_name
342
+ ) # Check for a key file, like the preprocessor config
343
+ model_path = os.path.join(
344
+ audio_model_dir, audio_model_name
345
+ ) # Check for a key file, like the model config
346
+
347
+ if (
348
+ os.path.exists(audio_model_dir)
349
+ and os.path.exists(processor_path)
350
+ and os.path.exists(model_path)
351
+ ):
352
+ print("Local Wav2Vec2 processor and model found. Loading from local directory.")
353
+ audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
354
+ audio_model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
355
+ else:
356
+ print(
357
+ "Local Wav2Vec2 processor and model not found. Downloading from Hugging Face Hub."
358
+ )
359
+ audio_processor = Wav2Vec2Processor.from_pretrained(AUDIO_BASE_MODEL)
360
+ audio_model = Wav2Vec2ForCTC.from_pretrained(AUDIO_BASE_MODEL)
361
+
362
+ # Optionally save the downloaded model and processor for future use
363
+ audio_processor.save_pretrained(audio_model_dir)
364
+ audio_model.save_pretrained(audio_model_dir)
365
+ print(f"Wav2Vec2 processor and model downloaded and saved to {audio_model_dir}")
366
+
367
+ # Move audio model to GPU
368
+ audio_model = audio_model.to(device)
369
+ audio_model.eval()
370
+
371
+ with torch.no_grad():
372
+ for j, batch in enumerate(test_dataloader):
373
+
374
+ target = batch[:][2]
375
+ audio_file_path = batch[:][3]
376
+
377
+ encoded_inputs = []
378
+ attention_masks = []
379
+
380
+ bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
381
+ sample_rate = bundle.sample_rate
382
+
383
+ for audio_file in audio_file_path:
384
+ waveform, sr = torchaudio.load(audio_file)
385
+ if sr != sample_rate:
386
+ print("Resampling")
387
+ resampler = torchaudio.transforms.Resample(
388
+ orig_freq=sr, new_freq=sample_rate
389
+ )
390
+ waveform = resampler(waveform)
391
+
392
+ # Move waveform to GPU before processing
393
+ input_values = audio_processor(
394
+ waveform.squeeze().numpy(),
395
+ sampling_rate=sample_rate,
396
+ return_tensors="pt",
397
+ ).input_values.to(device)
398
+
399
+ with torch.no_grad():
400
+ logits = audio_model(input_values).logits
401
+ predicted_ids_hf = torch.argmax(logits, dim=-1)
402
+ transcript_hf = audio_processor.decode(
403
+ predicted_ids_hf[0].cpu().numpy()
404
+ ) # Move predicted_ids_hf back to CPU for decoding
405
+ transcript_hf = (
406
+ transcript_hf.lower() if transcript_hf is not None else None
407
+ )
408
+
409
+ encoded_input = test_dataloader.dataset.tokenizer.encode_plus(
410
+ transcript_hf,
411
+ max_length=test_dataloader.dataset.max_len,
412
+ padding="max_length",
413
+ truncation=True,
414
+ return_tensors="pt",
415
+ )
416
+ encoded_inputs.append(encoded_input["input_ids"].squeeze(0))
417
+ attention_masks.append(encoded_input["attention_mask"].squeeze(0))
418
+
419
+ text_input = torch.stack(encoded_inputs)
420
+ attention = torch.stack(attention_masks)
421
+
422
+ output = saved_model(text_input.to(device), attention.to(device))
423
+
424
+ loss = loss_fn(output, target.to(device))
425
+
426
+ run.log({"Pipeline Eval loss": loss})
427
+ print(f"Step {j+1} / {num_iterations} | Pipeline Eval loss: {loss}")
428
+
429
+ y_pred.extend(output.cpu().numpy().argmax(axis=1))
430
+ y_true.extend(target.cpu().numpy())
431
+
432
+ class_labels = torch.load(saved_target_cats_path, weights_only=False)
433
+
434
+ true_labels = [class_labels[i] for i in y_true]
435
+ pred_labels = [class_labels[i] for i in y_pred]
436
+
437
+ print(f"Pipeline Accuracy: {accuracy_score(true_labels, pred_labels)}")
438
+
439
+ cm = confusion_matrix(true_labels, pred_labels, labels=class_labels)
440
+ df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
441
+ sns.heatmap(df_cm, annot=True, fmt="d")
442
+ plt.title("Confusion Matrix for Sentiment analysis Pipeline")
443
+ plt.xlabel("Predicted Label")
444
+ plt.ylabel("True Label")
445
+ plt.show()
446
+
447
+
448
+ def get_audio_sentiment(
449
+ input_audio_path,
450
+ num_class=len(SENTIMENT_MODALITIES),
451
+ audio_model_dir=SAVED_AUDIO_MODEL_DIR_PATH,
452
+ audio_model_name=MODEL_NAME,
453
+ audio_processor_name=PROCESSOR_NAME,
454
+ saved_model_path=SAVED_CUSTOM_BERT_MODEL_PATH,
455
+ saved_target_cats_path=SAVED_TARGET_CAT_PATH,
456
+ tokenizer_save_directory=SAVED_CUSTOM_BERT_TOKENIZER_DIR,
457
+ saved_max_len_path=SAVED_CUSTOM_BERT_TOKEN_MAX_LEN_PATH,
458
+ ):
459
+ # Load the saved model
460
+ saved_model = CustomBertModel(num_class)
461
+ saved_model.load_state_dict(
462
+ torch.load(
463
+ saved_model_path, weights_only=False, map_location=torch.device(device)
464
+ )
465
+ ) # Explicitly set weights_only to False
466
+ saved_model = saved_model.to(device)
467
+ saved_model.eval() # Set the model to evaluation mode
468
+ print(f"Model loaded from path: {saved_model_path}")
469
+ loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_directory)
470
+ max_len = 0
471
+ with open(saved_max_len_path, "rb") as f:
472
+ max_len = pickle.load(f)
473
+
474
+ audio_processor = None
475
+ audio_model = None
476
+
477
+ processor_path = os.path.join(
478
+ audio_model_dir, audio_processor_name
479
+ ) # Check for a key file, like the preprocessor config
480
+ model_path = os.path.join(
481
+ audio_model_dir, audio_model_name
482
+ ) # Check for a key file, like the model config
483
+
484
+ if (
485
+ os.path.exists(audio_model_dir)
486
+ and os.path.exists(processor_path)
487
+ and os.path.exists(model_path)
488
+ ):
489
+ print("Local Wav2Vec2 processor and model found. Loading from local directory.")
490
+ audio_processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
491
+ audio_model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
492
+ else:
493
+ print(
494
+ "Local Wav2Vec2 processor and model not found. Downloading from Hugging Face Hub."
495
+ )
496
+ audio_processor = Wav2Vec2Processor.from_pretrained(AUDIO_BASE_MODEL)
497
+ audio_model = Wav2Vec2ForCTC.from_pretrained(AUDIO_BASE_MODEL)
498
+
499
+ # Optionally save the downloaded model and processor for future use
500
+ audio_processor.save_pretrained(audio_model_dir)
501
+ audio_model.save_pretrained(audio_model_dir)
502
+ print(f"Wav2Vec2 processor and model downloaded and saved to {audio_model_dir}")
503
+
504
+ # Move audio model to GPU
505
+ audio_model = audio_model.to(device)
506
+ audio_model.eval()
507
+
508
+ with torch.no_grad():
509
+ audio_file_path = input_audio_path
510
+
511
+ encoded_inputs = []
512
+ attention_masks = []
513
+
514
+ bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
515
+ sample_rate = bundle.sample_rate
516
+
517
+ waveform, sr = torchaudio.load(audio_file_path)
518
+ if sr != sample_rate:
519
+ print("Resampling")
520
+ resampler = torchaudio.transforms.Resample(
521
+ orig_freq=sr, new_freq=sample_rate
522
+ )
523
+ waveform = resampler(waveform)
524
+
525
+ # Move waveform to GPU before processing
526
+ input_values = audio_processor(
527
+ waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt"
528
+ ).input_values.to(device)
529
+
530
+ with torch.no_grad():
531
+ logits = audio_model(input_values).logits
532
+ predicted_ids_hf = torch.argmax(logits, dim=-1)
533
+ transcript_hf = audio_processor.decode(
534
+ predicted_ids_hf[0].cpu().numpy()
535
+ ) # Move predicted_ids_hf back to CPU for decoding
536
+ transcript_hf = transcript_hf.lower() if transcript_hf is not None else None
537
+
538
+ encoded_input = loaded_tokenizer.encode_plus(
539
+ transcript_hf,
540
+ max_length=max_len,
541
+ padding="max_length",
542
+ truncation=True,
543
+ return_tensors="pt",
544
+ )
545
+ encoded_inputs.append(encoded_input["input_ids"].squeeze(0))
546
+ attention_masks.append(encoded_input["attention_mask"].squeeze(0))
547
+
548
+ # Stack the lists of tensors before moving to device
549
+ text_input = torch.stack(encoded_inputs)
550
+ attention = torch.stack(attention_masks)
551
+
552
+ output = saved_model(text_input.to(device), attention.to(device))
553
+ class_labels = torch.load(saved_target_cats_path, weights_only=False)
554
+
555
+ return class_labels[output.cpu().numpy().argmax(axis=1)[0]]
556
+
557
+
558
+ # Login using e.g. `huggingface-cli login` to access this dataset
559
+ # global_train_ds = load_dataset("asapp/slue-voxceleb", streaming=True, token='jrmd_hf_token')
560
+ # global_train_ds = load_dataset('asapp/slue',token='jrmd_hf_token')
561
+ # global_train_ds = load_dataset('voxceleb',token='jrmd_hf_token')
562
+
563
+ # global_test_ds = load_dataset("asapp/slue", "voxceleb", split="test", token='jrmd_hf_token')
564
+
565
+ # Get torchaudio pipeline components
566
+ """bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
567
+ #model = bundle.get_model()
568
+ #labels = bundle.get_labels()
569
+ sample_rate = bundle.sample_rate"""
570
+
571
+ """waveform, sr = torchaudio.load("/content/dev_raw/id10012_0AXjxNXiEzo_00001.flac")
572
+ # Resample if sr != sample_rate (or model_hf.config.sampling_rate)
573
+ if sr != sample_rate:
574
+ print("Resampling")
575
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
576
+ waveform = resampler(waveform)"""
577
+
578
+ # Using torchaudio pipeline - Manual Greedy Decoding
579
+ """with torch.no_grad():
580
+ emission = model(waveform)"""
581
+
582
+ # Assuming emission is log-probabilities or logits
583
+ # Perform greedy decoding: get the index of the max probability at each time step
584
+
585
+ # predicted_ids_torchaudio = torch.argmax(emission[0], dim=-1)
586
+
587
+ # Process the predicted IDs: remove consecutive duplicates and blank tokens
588
+ # Assuming the blank token is at index 0 (which is common for CTC, check labels if unsure)
589
+ """processed_ids_torchaudio = []
590
+ for id in predicted_ids_torchaudio[0]: # emission has shape (batch_size, num_frames, num_labels)
591
+ if id.item() != 0 and (len(processed_ids_torchaudio) == 0 or id.item() != processed_ids_torchaudio[-1]):
592
+ processed_ids_torchaudio.append(id.item())"""
593
+
594
+ """# Convert token IDs to transcript using labels
595
+ #transcript = "".join([labels[id] for id in processed_ids_torchaudio])
596
+
597
+ # Using Hugging Face transformers
598
+ # Note: processor and model_hf are defined in cell DnJDG6P3BTjZ
599
+ # To make this cell fully self-contained, you might want to include their definitions here as well.
600
+ # For now, assuming they are defined in a previously executed cell.
601
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
602
+ model_hf = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
603
+
604
+ # Load and resample waveform
605
+ waveform, sr = torchaudio.load("/content/dev_raw/id10012_0AXjxNXiEzo_00001.flac")
606
+ if sr != sample_rate:
607
+ print("Resampling")
608
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
609
+ waveform = resampler(waveform)
610
+
611
+ input_values = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt").input_values
612
+ with torch.no_grad():
613
+ logits = model_hf(input_values).logits
614
+ predicted_ids_hf = torch.argmax(logits, dim=-1)
615
+ transcript_hf = processor.decode(predicted_ids_hf[0])
616
+
617
+ #print("Torchaudio Transcript:", transcript)
618
+ print("Hugging Face Transcript:", transcript_hf)"""
619
+
620
+ if __name__ == "__main__":
621
+
622
+ wandb.login()  # relies on the WANDB_API_KEY environment variable; the Colab userdata lookup for the key is commented out above
623
+ run = wandb.init(project="DIT-Wav2Vec-Bert-Sentiment-Analysis-project")
624
+ bert_train_dataset = CustomBertDataset(TRAIN_DS_PATH, "fine-tune_raw")
625
+ bert_test_dataset = CustomBertDataset(TEST_DS_PATH, "test_raw")
626
+ print(f"Size of BERT train dataset: {len(bert_train_dataset)}")
627
+ """train_dataset = Subset(our_bert_dataset, range(int(len(our_bert_dataset)*0.8)))
628
+ test_dataset = Subset(our_bert_dataset, range(int(len(our_bert_dataset)*0.8), len(our_bert_dataset)))"""
629
+
630
+ train_dataloader = DataLoader(
631
+ bert_train_dataset, batch_size=BATCH_SIZE, shuffle=True
632
+ )
633
+ test_dataloader = DataLoader(
634
+ bert_test_dataset, batch_size=BATCH_SIZE, shuffle=False
635
+ )
636
+
637
+ our_bert_model = CustomBertModel(bert_train_dataset.num_class)
638
+ our_bert_model = our_bert_model.to(device)
639
+
640
+ loss_fn = nn.CrossEntropyLoss()
641
+ optimizer = optim.SGD(
642
+ filter(lambda p: p.requires_grad, our_bert_model.parameters()), lr=0.01
643
+ )
644
+
645
+ train_step(our_bert_model, train_dataloader, loss_fn, optimizer)
646
+ eval_step(test_dataloader, loss_fn, bert_train_dataset.num_class)
647
+ eval_pipeline_step(test_dataloader, loss_fn, bert_train_dataset.num_class)
648
+
649
+ test_inference_audio_path = "/content/dev_raw/id10012_0AXjxNXiEzo_00001.flac"
650
+ print(get_audio_sentiment(test_inference_audio_path))
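Note: the dataset download in the script above exists only as commented-out `!wget` / `!unzip` notebook magics. A minimal plain-Python alternative using `huggingface_hub` could look like the sketch below; the repo id and file paths are taken from the URLs in the script, and reading the access token from an HF_TOKEN environment variable is an assumption, not part of the committed code.

import os
import shutil
import zipfile

from huggingface_hub import hf_hub_download

token = os.environ.get("HF_TOKEN")  # assumption: a Hugging Face access token exported in the environment

for split in ("dev", "fine-tune", "test"):
    # Annotation TSV (fine-tune.tsv / test.tsv are read above via TRAIN_DS_PATH / TEST_DS_PATH)
    tsv_path = hf_hub_download(
        repo_id="asapp/slue",
        repo_type="dataset",
        filename=f"data/voxceleb/{split}.tsv",
        token=token,
    )
    shutil.copy(tsv_path, f"{split}.tsv")

    # Audio archive, extracted to <split>_raw/ as CustomBertDataset expects
    if not os.path.exists(f"{split}_raw"):
        zip_path = hf_hub_download(
            repo_id="asapp/slue",
            repo_type="dataset",
            filename=f"data/voxceleb/audio/{split}.zip",
            token=token,
        )
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(".")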
bert_tokenizer_local/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
bert_tokenizer_local/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert_tokenizer_local/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
bert_tokenizer_local/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
categories.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce4f35be86b2eecde01dac17af9f2885aa5dde5c90ab4770871d4e7f6d7fe92d
3
+ size 1196
custom_bert_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b962a92b9dcb34ba0659d0fda0f5a312bbe6f5e7d13060413dd3abde366c517c
3
+ size 438021794
demo.py ADDED
@@ -0,0 +1,46 @@
1
+ import gradio as gr
2
+
3
+ import audiospeechsentimentanalysis_jrmdiouf as assaj
4
+
5
+
6
+ def find_sentiment(input):
7
+ return assaj.get_audio_sentiment(input)
8
+
9
+
10
+ with gr.Blocks() as demo:
11
+ gr.Markdown(
12
+ "<h1 style='text-align: center;'>CUSTOM MODEL BASED ON WAV2VEC2 AND BERT BASE TO ANALYZE SPEECH SENTIMENT</h1>"
13
+ )
14
+
15
+ gr.Interface(
16
+ fn=find_sentiment,
17
+ inputs=[gr.Audio(type="filepath")],
18
+ outputs=["text"],
19
+ live=False,
20
+ )
21
+
22
+ gr.Markdown(
23
+ "<h2 style='text-align: center;'>Speech sentiment analysis model loss during training and eval time</h2>"
24
+ )
25
+
26
+ with gr.Row():
27
+ gr.Image(value="wandb_chart_train.png", label="Training Loss", width=300)
28
+ gr.Image(value="wandb_chart_eval.png", label="Pipeline eval Loss", width=300)
29
+
30
+ gr.Markdown(
31
+ "<h2 style='text-align: center;'>Confusion matrix obtained from model evaluation on VoxCeleb dataset</h2>"
32
+ )
33
+
34
+ with gr.Row():
35
+ gr.Image(
36
+ value="SpeechSentimentModelConfusionMatrix.png",
37
+ label="Confusion Matrix from model evaluation",
38
+ )
39
+
40
+ with gr.Row():
41
+ gr.Markdown(
42
+ "<h3><span style='text-decoration:underline;'>Pipeline Accuracy</span> : <span style='font-style:italic;'>0.758</span></h3>"
43
+ )
44
+
45
+
46
+ demo.launch(share=True)
demo_api_client.py ADDED
@@ -0,0 +1,16 @@
1
+ import os
2
+
3
+ from gradio_client import Client, handle_file
4
+
5
+ client = Client("http://localhost:7860/")
6
+
7
+ # Use a raw string for the file path
8
+ audio_file_path = r"E:\00.Divers\DIT\04.Cours\M2\06.DS-DeepLearning2\Examen\Dev\id10012_0AXjxNXiEzo_00001.flac"
9
+
10
+ # Verify the file exists (good practice!)
11
+ if not os.path.exists(audio_file_path):
12
+ print(f"Error: The file '{audio_file_path}' does not exist. Please check the path.")
13
+ else:
14
+ print(f"File found: {audio_file_path}")
15
+ result = client.predict(input=handle_file(audio_file_path), api_name="/predict")
16
+ print(result)
id10012_0AXjxNXiEzo_00001.flac ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f1be9a6c5fa7421364e026e4294bf4976d15d7a61dc397c9385b796c619299f
3
+ size 78322
max_len.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2b0264bcc30013ba2d474c3e149ba4401daaa47d88d874ccaba45d3c1518fb
3
+ size 5
wandb_chart_eval.png ADDED

Git LFS Details

  • SHA256: 12523283a96b2953885832dcf250e7816579836fa90b3c01de56e8dbbeab0c0c
  • Pointer size: 131 Bytes
  • Size of remote file: 548 kB
wandb_chart_train.png ADDED

Git LFS Details

  • SHA256: f6156215bba9266c6edb00c2a3d46a53b2f538fc9aeb43cb3bdedcda218fbde8
  • Pointer size: 131 Bytes
  • Size of remote file: 439 kB
wav2vec2_local/config.json ADDED
@@ -0,0 +1,109 @@
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_attn_dim": null,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.1,
55
+ "gradient_checkpointing": false,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_dropout_prob": 0.1,
59
+ "hidden_size": 768,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 3072,
62
+ "layer_norm_eps": 1e-05,
63
+ "layerdrop": 0.1,
64
+ "mask_feature_length": 10,
65
+ "mask_feature_min_masks": 0,
66
+ "mask_feature_prob": 0.0,
67
+ "mask_time_length": 10,
68
+ "mask_time_min_masks": 2,
69
+ "mask_time_prob": 0.05,
70
+ "model_type": "wav2vec2",
71
+ "num_adapter_layers": 3,
72
+ "num_attention_heads": 12,
73
+ "num_codevector_groups": 2,
74
+ "num_codevectors_per_group": 320,
75
+ "num_conv_pos_embedding_groups": 16,
76
+ "num_conv_pos_embeddings": 128,
77
+ "num_feat_extract_layers": 7,
78
+ "num_hidden_layers": 12,
79
+ "num_negatives": 100,
80
+ "output_hidden_size": 768,
81
+ "pad_token_id": 0,
82
+ "proj_codevector_dim": 256,
83
+ "tdnn_dilation": [
84
+ 1,
85
+ 2,
86
+ 3,
87
+ 1,
88
+ 1
89
+ ],
90
+ "tdnn_dim": [
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 1500
96
+ ],
97
+ "tdnn_kernel": [
98
+ 5,
99
+ 3,
100
+ 3,
101
+ 1,
102
+ 1
103
+ ],
104
+ "torch_dtype": "float32",
105
+ "transformers_version": "4.53.1",
106
+ "use_weighted_layer_sum": false,
107
+ "vocab_size": 32,
108
+ "xvector_output_dim": 512
109
+ }
wav2vec2_local/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b516d7bf54ca328ba24c507c2d11ba2fd2be54991e2a7cd965aadba947cc532c
3
+ size 377611120
wav2vec2_local/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
wav2vec2_local/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
wav2vec2_local/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "do_lower_case": false,
39
+ "do_normalize": true,
40
+ "eos_token": "</s>",
41
+ "extra_special_tokens": {},
42
+ "model_max_length": 1000000000000000019884624838656,
43
+ "pad_token": "<pad>",
44
+ "processor_class": "Wav2Vec2Processor",
45
+ "replace_word_delimiter_char": " ",
46
+ "return_attention_mask": false,
47
+ "target_lang": null,
48
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
49
+ "unk_token": "<unk>",
50
+ "word_delimiter_token": "|"
51
+ }
wav2vec2_local/vocab.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "'": 27,
3
+ "</s>": 2,
4
+ "<pad>": 0,
5
+ "<s>": 1,
6
+ "<unk>": 3,
7
+ "A": 7,
8
+ "B": 24,
9
+ "C": 19,
10
+ "D": 14,
11
+ "E": 5,
12
+ "F": 20,
13
+ "G": 21,
14
+ "H": 11,
15
+ "I": 10,
16
+ "J": 29,
17
+ "K": 26,
18
+ "L": 15,
19
+ "M": 17,
20
+ "N": 9,
21
+ "O": 8,
22
+ "P": 23,
23
+ "Q": 30,
24
+ "R": 13,
25
+ "S": 12,
26
+ "T": 6,
27
+ "U": 16,
28
+ "V": 25,
29
+ "W": 18,
30
+ "X": 28,
31
+ "Y": 22,
32
+ "Z": 31,
33
+ "|": 4
34
+ }