deveix committed on
Commit
8faa556
·
1 Parent(s): e0568c1

update prediction

Browse files
app/1713630229.4965415_trained_model.joblib ADDED
Binary file (16.6 kB). View file
 
app/main.py CHANGED
@@ -20,6 +20,48 @@ import soundfile as sf
20
  import opensmile
21
 
22
  import ffmpeg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  load_dotenv()
25
 
@@ -140,103 +182,103 @@ async def get_answer(item: Item, token: str = Depends(verify_token)):
140
  # If there's an error, return a 500 error with the error's details
141
  raise HTTPException(status_code=500, detail=str(e))
142
 
143
- # mlp
144
- mlp_model = joblib.load('app/mlp_model.pkl')
145
- mlp_pca = joblib.load('app/pca.pkl')
146
- mlp_scaler = joblib.load('app/scaler.pkl')
147
- mlp_label_encoder = joblib.load('app/label_encoder.pkl')
148
-
149
- def preprocess_audio(path, save_dir):
150
- y, sr = librosa.load(path)
151
-
152
- # remove silence
153
- intervals = librosa.effects.split(y, top_db=20)
154
- # Concatenate non-silent intervals
155
- y_no_gaps = np.concatenate([y[start:end] for start, end in intervals])
156
-
157
- file_name_without_extension = os.path.basename(path).split('.')[0]
158
- extension = os.path.basename(path).split('.')[1]
159
- y_trimmed, _ = librosa.effects.trim(y_no_gaps, top_db = 20)
160
- D = librosa.stft(y)
161
- S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
162
- S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128*2,)
163
- S_db_mel = librosa.amplitude_to_db(np.abs(S), ref=np.max)
164
 
165
- # Apply noise reduction (example using spectral subtraction)
166
- y_denoised = librosa.effects.preemphasis(y_trimmed)
167
 
168
- # Apply dynamic range compression
169
- y_compressed = librosa.effects.preemphasis(y_denoised)
170
 
171
- # Augmentation (example of time stretching)
172
- # y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
173
 
174
- # Silence Removal
175
- y_silence_removed, _ = librosa.effects.trim(y_compressed)
176
 
177
- # Equalization (example: apply high-pass filter)
178
- y_equalized = librosa.effects.preemphasis(y_silence_removed)
179
 
180
- # Define target sample rate
181
- target_sr = sr
182
 
183
- # # Data Augmentation (example: pitch shifting)
184
- # y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
185
 
186
 
187
- # Split audio into non-silent intervals
188
 
189
 
190
- # Normalize the audio signal
191
- y_normalized = librosa.util.normalize(y_equalized)
192
 
193
- # Feature Extraction (example: MFCCs)
194
- # mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
195
 
196
- # output_file_path = os.path.join(save_dir, f"{file_name_without_extension}.{extension}")
197
 
198
- # Write the audio data to the output file in .wav format
199
- sf.write(path, y_normalized, target_sr)
200
 
201
- return 'success'
202
 
203
- smile = opensmile.Smile(
204
- feature_set=opensmile.FeatureSet.ComParE_2016,
205
- feature_level=opensmile.FeatureLevel.Functionals,
206
- )
207
 
208
- def extract_features(file_path):
209
- # # Load the audio file
210
- # y, sr = librosa.load(file_path, sr=None, dtype=np.float32)
211
 
212
- # # Extract MFCCs
213
- # mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
214
- # mfccs_mean = pd.Series(mfccs.mean(axis=1), index=[f'mfcc_{i}' for i in range(mfccs.shape[0])])
215
 
216
- # # Extract Spectral Features
217
- # spectral_centroids = pd.Series(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)), index=['spectral_centroid'])
218
- # spectral_rolloff = pd.Series(np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)), index=['spectral_rolloff'])
219
- # spectral_flux = pd.Series(np.mean(librosa.onset.onset_strength(y=y, sr=sr)), index=['spectral_flux'])
220
- # spectral_contrast = pd.Series(np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr), axis=1), index=[f'spectral_contrast_{i}' for i in range(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr).shape[0])])
221
 
222
- # # Extract Pitch
223
- # pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
224
- # pitch_mean = pd.Series(np.mean(pitches[pitches != 0]), index=['pitch_mean']) # Average only non-zero values
225
 
226
- # # Extract Zero Crossings
227
- # zero_crossings = pd.Series(np.mean(librosa.feature.zero_crossing_rate(y)), index=['zero_crossings'])
228
 
229
- # # Combine all features into a single Series
230
- # features = pd.concat([mfccs_mean, spectral_centroids, spectral_rolloff, spectral_flux, spectral_contrast, pitch_mean, zero_crossings])
231
 
232
- features = smile.process_file(file_path)
233
- features_reshaped = features.squeeze()
234
 
235
- # Ensure it's now a 2D structure suitable for DataFrame
236
- print("New shape of features:", features_reshaped.shape)
237
 
238
- all_data = pd.DataFrame([features_reshaped])
239
- return all_data
240
 
241
  def repair_mp3_with_ffmpeg_python(input_path, output_path):
242
  """Attempt to repair an MP3 file using FFmpeg."""
@@ -277,27 +319,36 @@ async def handle_audio(file: UploadFile = File(...)):
277
  with open(temp_filename, "wb") as f:
278
  f.write(contents)
279
 
280
- preprocess_audio(temp_filename, 'app')
281
- repair_mp3_with_ffmpeg_python(temp_filename, temp_filename)
282
- # Here you would add the feature extraction logic
283
- features = extract_features(temp_filename)
 
 
 
 
 
 
 
 
284
  print("Extracted Features:", features)
285
 
286
- features = mlp_scaler.transform(features)
287
- features = mlp_pca.transform(features)
 
288
 
289
  # proceed with an inference
290
- results = mlp_model.predict(features)
291
- decoded_predictions = [mlp_label_encoder.classes_[i] for i in results]
292
 
293
  # # Decode the predictions using the label encoder
294
- # decoded_predictions = mlp_label_encoder.inverse_transform(results)
295
  # .tolist()
296
  # Clean up the temporary file
297
  os.remove(temp_filename)
298
 
299
  # Return a successful response with decoded predictions
300
- return {"message": "File processed successfully", "prediction": decoded_predictions}
301
  except Exception as e:
302
  print(e)
303
  # Handle possible exceptions
 
20
  import opensmile
21
 
22
  import ffmpeg
23
+ import noisereduce as nr
24
+ import numpy as np
25
+
26
+ default_sample_rate=22050
27
+
28
def load(file_name, skip_seconds=0):
    """Read an audio file with librosa and return ``(samples, sample_rate)``.

    Args:
        file_name: Path of the audio file to decode.
        skip_seconds: Accepted for call-site compatibility but currently
            NOT applied -- the whole file is always loaded.
            NOTE(review): confirm whether the training pipeline skipped a
            leading segment before wiring this through.

    Returns:
        The tuple produced by ``librosa.load``. ``sr=None`` is passed so
        librosa keeps the file's own sample rate instead of resampling.
    """
    samples, native_rate = librosa.load(
        file_name,
        sr=None,
        res_type='kaiser_fast',
    )
    return samples, native_rate
30
+
31
def preprocess_audio(audio_data, rate):
    """Clean up a raw waveform and bring it to ``default_sample_rate``.

    The stages run in this fixed order: noise reduction, normalization,
    trimming via ``librosa.effects.trim``, then resampling from ``rate`` to
    the module-level ``default_sample_rate``.

    Args:
        audio_data: Waveform samples as returned by ``load``.
        rate: Sample rate of ``audio_data``.

    Returns:
        ``(processed_audio, default_sample_rate)``.
    """
    # Noise reduction runs at the original rate, before any resampling.
    denoised = nr.reduce_noise(y=audio_data, sr=rate)
    normalized = librosa.util.normalize(denoised)
    trimmed, _ = librosa.effects.trim(normalized)
    resampled = librosa.resample(
        trimmed,
        orig_sr=rate,
        target_sr=default_sample_rate,
    )
    # The output is always reported at the target rate, whatever came in.
    return resampled, default_sample_rate
41
+
42
def extract_features(X, sample_rate):
    """Build one flat feature vector describing a waveform.

    Five librosa feature matrices are each averaged into a 1D summary and
    concatenated in this order: MFCC (40 coefficients), chroma, mel
    spectrogram, spectral contrast, tonnetz.

    Args:
        X: Waveform samples.
        sample_rate: Sample rate of ``X``.

    Returns:
        A 1D NumPy array holding the concatenated averaged features.
    """
    def _average(feature_matrix):
        # Same reduction for every feature: transpose, then mean over axis 0.
        return np.mean(feature_matrix.T, axis=0)

    # The magnitude STFT is shared by the chroma and spectral-contrast features.
    magnitude = np.abs(librosa.stft(X))
    # Tonnetz is computed on the harmonic component of the signal only.
    harmonic_part = librosa.effects.harmonic(X)

    feature_groups = [
        _average(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)),
        _average(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)),
        _average(librosa.feature.melspectrogram(y=X, sr=sample_rate)),
        _average(librosa.feature.spectral_contrast(S=magnitude, sr=sample_rate)),
        _average(librosa.feature.tonnetz(y=harmonic_part, sr=sample_rate)),
    ]
    return np.hstack(feature_groups)
64
+
65
 
66
  load_dotenv()
67
 
 
182
  # If there's an error, return a 500 error with the error's details
183
  raise HTTPException(status_code=500, detail=str(e))
184
 
185
# Naive Bayes classifier and the preprocessing artifacts stored next to it.
# Every artifact lives under app/, so the model path needs the same prefix as
# the others; without it the load fails when the service starts from the
# directory the remaining paths assume.
nb_model = joblib.load('app/1713630229.4965415_trained_model.joblib')
nb_pca = joblib.load('app/pca.pkl')
nb_scaler = joblib.load('app/scaler.pkl')
nb_label_encoder = joblib.load('app/label_encoder.pkl')
190
+
191
+ # def preprocess_audio(path, save_dir):
192
+ # y, sr = librosa.load(path)
193
+
194
+ # # remove silence
195
+ # intervals = librosa.effects.split(y, top_db=20)
196
+ # # Concatenate non-silent intervals
197
+ # y_no_gaps = np.concatenate([y[start:end] for start, end in intervals])
198
+
199
+ # file_name_without_extension = os.path.basename(path).split('.')[0]
200
+ # extension = os.path.basename(path).split('.')[1]
201
+ # y_trimmed, _ = librosa.effects.trim(y_no_gaps, top_db = 20)
202
+ # D = librosa.stft(y)
203
+ # S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
204
+ # S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128*2,)
205
+ # S_db_mel = librosa.amplitude_to_db(np.abs(S), ref=np.max)
206
 
207
+ # # Apply noise reduction (example using spectral subtraction)
208
+ # y_denoised = librosa.effects.preemphasis(y_trimmed)
209
 
210
+ # # Apply dynamic range compression
211
+ # y_compressed = librosa.effects.preemphasis(y_denoised)
212
 
213
+ # # Augmentation (example of time stretching)
214
+ # # y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
215
 
216
+ # # Silence Removal
217
+ # y_silence_removed, _ = librosa.effects.trim(y_compressed)
218
 
219
+ # # Equalization (example: apply high-pass filter)
220
+ # y_equalized = librosa.effects.preemphasis(y_silence_removed)
221
 
222
+ # # Define target sample rate
223
+ # target_sr = sr
224
 
225
+ # # # Data Augmentation (example: pitch shifting)
226
+ # # y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
227
 
228
 
229
+ # # Split audio into non-silent intervals
230
 
231
 
232
+ # # Normalize the audio signal
233
+ # y_normalized = librosa.util.normalize(y_equalized)
234
 
235
+ # # Feature Extraction (example: MFCCs)
236
+ # # mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
237
 
238
+ # # output_file_path = os.path.join(save_dir, f"{file_name_without_extension}.{extension}")
239
 
240
+ # # Write the audio data to the output file in .wav format
241
+ # sf.write(path, y_normalized, target_sr)
242
 
243
+ # return 'success'
244
 
245
+ # smile = opensmile.Smile(
246
+ # feature_set=opensmile.FeatureSet.ComParE_2016,
247
+ # feature_level=opensmile.FeatureLevel.Functionals,
248
+ # )
249
 
250
+ # def extract_features(file_path):
251
+ # # # Load the audio file
252
+ # # y, sr = librosa.load(file_path, sr=None, dtype=np.float32)
253
 
254
+ # # # Extract MFCCs
255
+ # # mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
256
+ # # mfccs_mean = pd.Series(mfccs.mean(axis=1), index=[f'mfcc_{i}' for i in range(mfccs.shape[0])])
257
 
258
+ # # # Extract Spectral Features
259
+ # # spectral_centroids = pd.Series(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)), index=['spectral_centroid'])
260
+ # # spectral_rolloff = pd.Series(np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)), index=['spectral_rolloff'])
261
+ # # spectral_flux = pd.Series(np.mean(librosa.onset.onset_strength(y=y, sr=sr)), index=['spectral_flux'])
262
+ # # spectral_contrast = pd.Series(np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr), axis=1), index=[f'spectral_contrast_{i}' for i in range(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr).shape[0])])
263
 
264
+ # # # Extract Pitch
265
+ # # pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
266
+ # # pitch_mean = pd.Series(np.mean(pitches[pitches != 0]), index=['pitch_mean']) # Average only non-zero values
267
 
268
+ # # # Extract Zero Crossings
269
+ # # zero_crossings = pd.Series(np.mean(librosa.feature.zero_crossing_rate(y)), index=['zero_crossings'])
270
 
271
+ # # # Combine all features into a single Series
272
+ # # features = pd.concat([mfccs_mean, spectral_centroids, spectral_rolloff, spectral_flux, spectral_contrast, pitch_mean, zero_crossings])
273
 
274
+ # features = smile.process_file(file_path)
275
+ # features_reshaped = features.squeeze()
276
 
277
+ # # Ensure it's now a 2D structure suitable for DataFrame
278
+ # print("New shape of features:", features_reshaped.shape)
279
 
280
+ # all_data = pd.DataFrame([features_reshaped])
281
+ # return all_data
282
 
283
  def repair_mp3_with_ffmpeg_python(input_path, output_path):
284
  """Attempt to repair an MP3 file using FFmpeg."""
 
319
  with open(temp_filename, "wb") as f:
320
  f.write(contents)
321
 
322
+ audio_data, sr = load(temp_filename, skip_seconds=5)
323
+ print("finished loading ", temp_filename)
324
+ # Preprocess data
325
+ audio_data, sr = preprocess_audio(audio_data, sr)
326
+ print("finished processing ", temp_filename)
327
+ # Extract features
328
+ features = extract_features(audio_data, sr)
329
+
330
+ # preprocess_audio(temp_filename, 'app')
331
+ # repair_mp3_with_ffmpeg_python(temp_filename, temp_filename)
332
+ # # Here you would add the feature extraction logic
333
+ # features = extract_features(temp_filename)
334
  print("Extracted Features:", features)
335
 
336
+ # features = nb_scaler.transform(features)
337
+ # features = nb_pca.transform(features)
338
+ features = np.array(features).reshape(1, -1)
339
 
340
  # proceed with an inference
341
+ results = nb_model.predict(features)
342
+ # decoded_predictions = [nb_label_encoder.classes_[i] for i in results]
343
 
344
  # # Decode the predictions using the label encoder
345
+ # decoded_predictions = nb_label_encoder.inverse_transform(results)
346
  # .tolist()
347
  # Clean up the temporary file
348
  os.remove(temp_filename)
349
 
350
  # Return a successful response with decoded predictions
351
+ return {"message": "File processed successfully", "prediction": results}
352
  except Exception as e:
353
  print(e)
354
  # Handle possible exceptions
requirements.txt CHANGED
@@ -17,4 +17,5 @@ opensmile
17
  eyeD3
18
  matplotlib
19
  python-multipart
20
- ffmpeg-python
 
 
17
  eyeD3
18
  matplotlib
19
  python-multipart
20
+ ffmpeg-python
21
+ noisereduce