ImenMourali committed
Commit 6327a1b · verified · 1 Parent(s): 98adc46

Update tasks/audio.py: load YAMNet and the trained classifier from local files instead of training in the request handler

Files changed (1): tasks/audio.py +43 -150
tasks/audio.py CHANGED
@@ -1,196 +1,89 @@
+import tensorflow as tf
+import tensorflow_hub as hub
+import numpy as np
+import librosa
+import os
+import tarfile
+from tensorflow.keras.models import load_model
 from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
-import random
-import os
-
 from .utils.evaluation import AudioEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
-
 from dotenv import load_dotenv
+
 load_dotenv()
 
 router = APIRouter()
-
 DESCRIPTION = "Random Baseline"
 ROUTE = "/audio"
 
-@router.post(ROUTE, tags=["Audio Task"],
-             description=DESCRIPTION)
+# Define paths for local model files
+YAMNET_TAR_PATH = "./yamnet-tensorflow2-yamnet-v1.tar.gz"  # Ensure this is in the correct directory
+EXTRACT_PATH = "./yamnet_model"
+CLASSIFIER_PATH = "./audio_model.h5"
+
+# Extract YAMNet if it is not already extracted
+if not os.path.exists(EXTRACT_PATH):
+    with tarfile.open(YAMNET_TAR_PATH, "r:gz") as tar:
+        tar.extractall(EXTRACT_PATH)
+
+# Load YAMNet
+yamnet = hub.load(EXTRACT_PATH)
+
+# Load trained classifier
+audio_model = load_model(CLASSIFIER_PATH)
+
+
+@router.post(ROUTE, tags=["Audio Task"], description=DESCRIPTION)
 async def evaluate_audio(request: AudioEvaluationRequest):
-    # Load and prepare the dataset
-    # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
+    """Inference function to classify audio samples using a pre-trained model."""
+    # Load dataset
     dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
-
-    # Split dataset
-    train_test = dataset["train"]
     test_dataset = dataset["test"]
 
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
 
-    #--------------------------------------------------------------------------------------------
-    # YOUR MODEL INFERENCE CODE HERE
-    # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    #--------------------------------------------------------------------------------------------
-    import tensorflow as tf
-    import tensorflow_hub as hub
-    import librosa
-    import numpy as np
-    from sklearn.model_selection import train_test_split
-    from tensorflow.keras.utils import to_categorical
-
-    # Load YAMNet Model
-    yamnet_model_url = "https://tfhub.dev/google/yamnet/1"
-    yamnet_model = hub.load(yamnet_model_url)
-
-    # Function to extract embeddings from audio
-    def extract_embedding(audio_example):
-        '''Extract YAMNet embeddings from a waveform'''
-        waveform = audio_example["audio"]["array"]  # Ensure correct key reference
-        waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)
-        scores, embeddings, spectrogram = yamnet_model(waveform)
-        return {"embedding": embeddings.numpy()}
-
-    # Apply embedding extraction to training data
-    train_embeddings = dataset["train"].map(extract_embedding)
-
-    # Apply embedding extraction to testing data
-    test_embeddings = dataset["test"].map(extract_embedding)
-
-    X_train, y_train = [], []
-    X_test, y_test = [], []
-
-    # Process Training Data
-    for example in train_embeddings:
-        for embedding in example["embedding"]:
-            X_train.append(embedding)
-            y_train.append(example["label"])
-
-    # Process Testing Data
-    for example in test_embeddings:
-        for embedding in example["embedding"]:
-            X_test.append(embedding)
-            y_test.append(example["label"])
-
-    # Convert to NumPy arrays
-    X_train = np.array(X_train)
-    y_train = np.array(y_train)
-    X_test = np.array(X_test)
-    y_test = np.array(y_test)
-
-    # Convert labels to categorical (one-hot encoding)
-    y_train_cat = to_categorical(y_train, num_classes=2)
-    y_test_cat = to_categorical(y_test, num_classes=2)
-
-    print(f"Training samples: {X_train.shape}, Test samples: {X_test.shape}")
-
-    from tensorflow.keras.models import Sequential
-    from tensorflow.keras.layers import Dense, Dropout
-
-    # Define the model
-    model = Sequential([
-        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
-        Dropout(0.3),
-        Dense(64, activation='relu'),
-        Dropout(0.3),
-        Dense(2, activation='softmax')  # 2 classes: chainsaw (0) vs. environment (1)
-    ])
-
-    model.summary()
-
-    # Compile the model
-    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
-
-    # Train the model on YAMNet embeddings
-    model.fit(X_train, y_train_cat, epochs=20, batch_size=16, validation_data=(X_test, y_test_cat))
-
-    # Evaluate the model
-    y_pred = model.predict(X_test)
-    y_pred_labels = np.argmax(y_pred, axis=1)
-
-    from sklearn.metrics import accuracy_score
-    accuracy = accuracy_score(y_test, y_pred_labels)
-    print("Transfer Learning Model Accuracy:", accuracy)
-
-    # Predict labels for the test dataset
-    # Run YAMNet inference on the raw audio data
     predictions = []
-
     for audio_data in test_dataset["audio"]:
         # Extract waveform and sampling rate
         waveform = audio_data["array"]
         sample_rate = audio_data["sampling_rate"]
 
-        # Resample the waveform to 16kHz (YAMNet's expected sample rate) if necessary
+        # Resample if needed
        if sample_rate != 16000:
             waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
 
-        # Convert waveform to tensor
+        # Convert to tensor
         waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)
+        waveform = tf.squeeze(waveform)  # Ensure waveform is 1D
 
-        # Ensure waveform is 1D
-        waveform = tf.squeeze(waveform)
-
-        # Predict with YAMNet--->model
-        # Get YAMNet embeddings
-        _, embeddings, _ = yamnet_model(waveform)  # Using the original yamnet_model for embedding extraction
-
-        # Calculate the mean of the embeddings across the time dimension
-        embeddings = tf.reduce_mean(embeddings, axis=0)  # Average across time frames
+        # Extract embeddings from YAMNet
+        _, embeddings, _ = yamnet(waveform)
+        embeddings = tf.reduce_mean(embeddings, axis=0).numpy()  # Average over time
 
-        # Reshape embeddings for prediction
-        embeddings = embeddings.numpy()  # Convert to NumPy array
-        embeddings = embeddings.reshape(1, -1)  # Reshape to (1, embedding_dimension)
+        # Reshape embeddings for classifier input
+        embeddings = embeddings.reshape(1, -1)
 
-        # Now predict using your trained model
-        scores = model.predict(embeddings)
-
-        # Get predicted class
+        # Predict using the trained classifier
+        scores = audio_model.predict(embeddings)
         predicted_class_index = np.argmax(scores)
-        predicted_class_label = predicted_class_index  # Assuming 0 for 'chainsaw', 1 for 'environment'
-
-        # Get the top class name using the predicted label
-        top_class = "chainsaw" if predicted_class_label == 0 else "environment"
-        predictions.append(top_class)
-
-    print("Predictions:", predictions)
-
-    def map_predictions_to_labels(predictions):
-        """
-        Maps string predictions to numeric labels:
-        - "chainsaw" -> 0
-        - any other class -> 1
-        Args:
-            predictions (list of str): List of class name predictions.
-        Returns:
-            list of int: Mapped numeric labels.
-        """
-        return [0 if pred == "chainsaw" else 1 for pred in predictions]
+        predicted_class_label = "chainsaw" if predicted_class_index == 0 else "environment"
+        predictions.append(predicted_class_label)
 
     # Map string predictions to numeric labels
-    numeric_predictions = map_predictions_to_labels(predictions)
-
-    # Extract true labels (already numeric)
+    numeric_predictions = [0 if pred == "chainsaw" else 1 for pred in predictions]
     true_labels = test_dataset["label"]
-
-    # Calculate accuracy
     accuracy = accuracy_score(true_labels, numeric_predictions)
-    print("Accuracy:", accuracy)
-
-    #--------------------------------------------------------------------------------------------
-    # YOUR MODEL INFERENCE STOPS HERE
-    #--------------------------------------------------------------------------------------------
 
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
 
-    # Prepare results dictionary
+    # Prepare results
     results = {
-        "username": username,
-        "space_url": space_url,
         "submission_timestamp": datetime.now().isoformat(),
         "model_description": DESCRIPTION,
         "accuracy": float(accuracy),
@@ -205,4 +98,4 @@ async def evaluate_audio(request: AudioEvaluationRequest):
         }
     }
 
-    print(results)
+    return results
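Note on the removed training code: this commit replaces the per-request training of the classifier head with a pre-trained ./audio_model.h5 loaded at startup, but it does not show how that file is produced. The script below is not part of the commit; it is a minimal offline sketch reconstructed from the deleted code (same Dense 128/64 head with 0.3 dropout, softmax over two classes, Adam optimizer, categorical cross-entropy, 20 epochs, batch size 16). One deliberate difference, flagged in the comments: it averages YAMNet embeddings per clip so that training matches the new inference path, whereas the deleted code trained on per-frame embeddings. The dataset name is a placeholder.

# Hypothetical offline training script (not part of this commit): produces the
# ./audio_model.h5 that tasks/audio.py loads at startup.
import os

import librosa
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from datasets import load_dataset
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

DATASET_NAME = "your-org/your-dataset"  # placeholder; the endpoint receives this per request

# The deleted code loaded YAMNet from this URL; the commit switches to a local archive.
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

dataset = load_dataset(DATASET_NAME, token=os.getenv("HF_TOKEN"))

X_train, y_train = [], []
for example in dataset["train"]:
    waveform = example["audio"]["array"]
    sample_rate = example["audio"]["sampling_rate"]
    if sample_rate != 16000:  # YAMNet expects 16 kHz, as in the inference loop
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
    waveform = tf.squeeze(tf.convert_to_tensor(waveform, dtype=tf.float32))
    _, embeddings, _ = yamnet(waveform)  # embeddings: (frames, 1024)
    # Average over time so each clip yields one vector, matching the inference path
    # (the deleted code trained on per-frame embeddings instead).
    X_train.append(tf.reduce_mean(embeddings, axis=0).numpy())
    y_train.append(example["label"])

X_train = np.array(X_train)
y_train_cat = to_categorical(np.array(y_train), num_classes=2)

# Same classifier head as the deleted in-request training code.
model = Sequential([
    Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(2, activation="softmax"),  # chainsaw (0) vs. environment (1)
])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train_cat, epochs=20, batch_size=16)
model.save("./audio_model.h5")

The local archive named in YAMNET_TAR_PATH appears to be a compressed package of the same model published at https://tfhub.dev/google/yamnet/1; extracting it once and pointing hub.load at the directory avoids downloading the model on every startup.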