deveix committed on
Commit 9c01213 · 1 Parent(s): b1d468e

return to old model

Files changed (1)
  1. app/main.py +54 -97
app/main.py CHANGED
@@ -27,32 +27,16 @@ default_sample_rate=22050
 def load(file_name, skip_seconds=0):
     return librosa.load(file_name, sr=None, res_type='kaiser_fast')
 
-def split_audio(audio_data, sample_rate, segment_length_sec=20):
-    # Calculate the number of samples in each segment
-    num_samples_per_segment = segment_length_sec * sample_rate
-
-    # Calculate total number of segments
-    total_segments = int(np.ceil(len(audio_data) / num_samples_per_segment))
-
-    # Split the audio data into segments
-    segments = []
-    for i in range(total_segments):
-        start = i * num_samples_per_segment
-        end = start + num_samples_per_segment
-        segment = audio_data[start:end]
-        segments.append(segment)
-    return segments
-
-def preprocess_audio(audio_data, rate):
-    # Apply preprocessing steps
-    audio_data = nr.reduce_noise(y=audio_data, sr=rate)
-    audio_data = librosa.util.normalize(audio_data)
-    audio_data, _ = librosa.effects.trim(audio_data)
-    audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
-    # audio_data = fix_length(audio_data)
-    rate = default_sample_rate
-
-    return audio_data, rate
+# def preprocess_audio(audio_data, rate):
+#     # Apply preprocessing steps
+#     audio_data = nr.reduce_noise(y=audio_data, sr=rate)
+#     audio_data = librosa.util.normalize(audio_data)
+#     audio_data, _ = librosa.effects.trim(audio_data)
+#     audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
+#     # audio_data = fix_length(audio_data)
+#     rate = default_sample_rate
+
+#     return audio_data, rate
 
 def extract_features(X, sample_rate):
     # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series
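For reference, the preprocessing order this commit settles on (and re-activates as a live function further down in the file) is: denoise, peak-normalize, trim silence, then resample to 22050 Hz. Below is a minimal runnable sketch of that chain, assuming the same librosa/noisereduce APIs used in app/main.py; the 440 Hz test tone and its noise level are invented for illustration.

```python
# A minimal sketch (not from the repo) of the preprocessing order this commit
# keeps: denoise -> normalize -> trim -> resample. Test signal is invented.
import numpy as np
import librosa
import noisereduce as nr

default_sample_rate = 22050  # same constant as in app/main.py

def preprocess_audio(audio_data, rate):
    audio_data = nr.reduce_noise(y=audio_data, sr=rate)  # denoise at the native rate
    audio_data = librosa.util.normalize(audio_data)      # peak-normalize to [-1, 1]
    audio_data, _ = librosa.effects.trim(audio_data)     # strip leading/trailing silence
    audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
    return audio_data, default_sample_rate

sr = 44100
t = np.linspace(0, 1.0, sr, endpoint=False)
noisy_tone = (0.5 * np.sin(2 * np.pi * 440 * t) + 0.01 * np.random.randn(sr)).astype(np.float32)
y, out_sr = preprocess_audio(noisy_tone, sr)
print(y.shape, out_sr)  # roughly (22050,) 22050 once trimmed and resampled
```

Resampling last means the noise reduction and trim run at the native rate; the earlier (now deleted) commented variant resampled first to cut the cost of those steps.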
@@ -198,83 +182,66 @@ async def get_answer(item: Item, token: str = Depends(verify_token)):
     raise HTTPException(status_code=500, detail=str(e))
 
 # random forest
-model = joblib.load('app/1713696933.326759_trained_model.joblib')
+model = joblib.load('app/1713661391.0946255_trained_model.joblib')
 pca = joblib.load('app/pca.pkl')
-scaler = joblib.load('app/1713696947.894978_scaler.joblib')
-label_encoder = joblib.load('app/1713696954.9487948_label_encoder.joblib')
+scaler = joblib.load('app/1713661464.8205004_scaler.joblib')
+label_encoder = joblib.load('app/1713661470.6730225_label_encoder.joblib')
-
-# def preprocess_audio(audio_data, rate):
-#     # Resample first if the target rate is lower to reduce data size for subsequent operations
-#     if rate > default_sample_rate:
-#         audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
-#         rate = default_sample_rate
-
-#     # Trim silence before applying computationally expensive noise reduction
-#     audio_data, _ = librosa.effects.trim(audio_data)
-
-#     # Normalize the audio data
-#     audio_data = librosa.util.normalize(audio_data)
-
-#     # Apply noise reduction
-#     audio_data = nr.reduce_noise(y=audio_data, sr=rate)
-
-#     return audio_data, rate
-
-# def preprocess_audio(audio_data, rate):
-#     audio_data = nr.reduce_noise(y=audio_data, sr=rate)
-#     # remove silence
-#     # intervals = librosa.effects.split(audio_data, top_db=20)
-#     # # Concatenate non-silent intervals
-#     # audio_data = np.concatenate([audio_data[start:end] for start, end in intervals])
+
+def preprocess_audio(audio_data, rate):
+    audio_data = nr.reduce_noise(y=audio_data, sr=rate)
+    # remove silence
+    # intervals = librosa.effects.split(audio_data, top_db=20)
+    # # Concatenate non-silent intervals
+    # audio_data = np.concatenate([audio_data[start:end] for start, end in intervals])
 
-#     audio_data = librosa.util.normalize(audio_data)
-#     audio_data, _ = librosa.effects.trim(audio_data)
-#     audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
-#     rate = default_sample_rate
+    audio_data = librosa.util.normalize(audio_data)
+    audio_data, _ = librosa.effects.trim(audio_data)
+    audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
+    rate = default_sample_rate
 
-#     # y_trimmed, _ = librosa.effects.trim(y_no_gaps, top_db = 20)
-#     # D = librosa.stft(y)
-#     # S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
-#     # S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128*2,)
-#     # S_db_mel = librosa.amplitude_to_db(np.abs(S), ref=np.max)
+    # y_trimmed, _ = librosa.effects.trim(y_no_gaps, top_db = 20)
+    # D = librosa.stft(y)
+    # S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
+    # S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128*2,)
+    # S_db_mel = librosa.amplitude_to_db(np.abs(S), ref=np.max)
 
-#     # Apply noise reduction (example using spectral subtraction)
-#     # y_denoised = librosa.effects.preemphasis(y_trimmed)
+    # Apply noise reduction (example using spectral subtraction)
+    # y_denoised = librosa.effects.preemphasis(y_trimmed)
 
-#     # # Apply dynamic range compression
-#     # y_compressed = librosa.effects.preemphasis(y_denoised)
+    # # Apply dynamic range compression
+    # y_compressed = librosa.effects.preemphasis(y_denoised)
 
-#     # # Augmentation (example of time stretching)
-#     # # y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
+    # # Augmentation (example of time stretching)
+    # # y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
 
-#     # # Silence Removal
-#     # y_silence_removed, _ = librosa.effects.trim(y_compressed)
+    # # Silence Removal
+    # y_silence_removed, _ = librosa.effects.trim(y_compressed)
 
-#     # # Equalization (example: apply high-pass filter)
-#     # y_equalized = librosa.effects.preemphasis(y_silence_removed)
+    # # Equalization (example: apply high-pass filter)
+    # y_equalized = librosa.effects.preemphasis(y_silence_removed)
 
-#     # # Define target sample rate
-#     # target_sr = sr
+    # # Define target sample rate
+    # target_sr = sr
 
-#     # # Data Augmentation (example: pitch shifting)
-#     # y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
+    # # Data Augmentation (example: pitch shifting)
+    # y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
 
 
-#     # Split audio into non-silent intervals
+    # Split audio into non-silent intervals
 
 
-#     # Normalize the audio signal
-#     # y_normalized = librosa.util.normalize(y_equalized)
+    # Normalize the audio signal
+    # y_normalized = librosa.util.normalize(y_equalized)
 
-#     # Feature Extraction (example: MFCCs)
-#     # mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
+    # Feature Extraction (example: MFCCs)
+    # mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
 
-#     # output_file_path = os.path.join(save_dir, f"{file_name_without_extension}.{extension}")
+    # output_file_path = os.path.join(save_dir, f"{file_name_without_extension}.{extension}")
 
-#     # Write the audio data to the output file in .wav format
-#     # sf.write(path, y_normalized, target_sr)
+    # Write the audio data to the output file in .wav format
+    # sf.write(path, y_normalized, target_sr)
 
-#     return audio_data, rate
+    return audio_data, rate
 
 # smile = opensmile.Smile(
 #     feature_set=opensmile.FeatureSet.ComParE_2016,
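The revert points model, scaler, and label_encoder back at joblib artifacts from the same 1713661… training run. All three have to agree on the feature layout, so a startup sanity check along the lines of the sketch below can catch a mismatched pair early. This check is not part of the commit: n_features_in_ and classes_ are standard attributes on fitted scikit-learn estimators, and everything else is an assumption for illustration.

```python
# A hedged sanity check (not in the commit): verify the reverted model and
# scaler were fitted on the same feature dimension before serving requests.
import joblib

model = joblib.load('app/1713661391.0946255_trained_model.joblib')
scaler = joblib.load('app/1713661464.8205004_scaler.joblib')
label_encoder = joblib.load('app/1713661470.6730225_label_encoder.joblib')

assert scaler.n_features_in_ == model.n_features_in_, \
    "scaler and model were fitted on different feature dimensions"
print("expected feature dim:", model.n_features_in_)
print("known labels:", list(label_encoder.classes_))
```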
@@ -359,16 +326,7 @@ async def handle_audio(file: UploadFile = File(...)):
     audio_data, sr = preprocess_audio(audio_data, sr)
     print("finished processing ", temp_filename)
     # Extract features
-    features_list = []
     features = extract_features(audio_data, sr)
-    features_list.append(features)
-
-    segments = split_audio(audio_data, sr)
-    for i, segment in enumerate(segments):
-        # Extract features from the processed audio segment (you need to define this function)
-        features = extract_features(segment, sr)
-        print(f"Features extracted for segment {i+1}")
-        features_list.append(features)
 
     # preprocess_audio(temp_filename, 'app')
     # repair_mp3_with_ffmpeg_python(temp_filename, temp_filename)
@@ -377,18 +335,17 @@ async def handle_audio(file: UploadFile = File(...)):
     # print("Extracted Features:", features)
     # features = pca.transform(features)
     # features = np.array(features).reshape(1, -1)
-    # features = features.reshape(1, -1)
+    features = features.reshape(1, -1)
 
-    features_list = scaler.transform(features_list)
+    features = scaler.transform(features)
 
     # proceed with an inference
-    results = model.predict(features_list)
+    results = model.predict(features)
     # decoded_predictions = [label_encoder.classes_[i] for i in results]
-    print('decoded', results)
 
     # # Decode the predictions using the label encoder
     decoded_predictions = label_encoder.inverse_transform(results)
-    print('decoded', decoded_predictions)
+    print('decoded', decoded_predictions[0])
     # .tolist()
     # Clean up the temporary file
     os.remove(temp_filename)
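With split_audio gone, the handler classifies one feature vector per upload, and scikit-learn transformers and estimators require 2-D input; that is why the restored reshape(1, -1) matters before scaler.transform. Below is a self-contained sketch of the same scale → predict → inverse_transform sequence: the toy training data, the 8-feature dimension, and the class names are invented, and only the call order mirrors the diff.

```python
# A self-contained sketch of the restored inference path: one 1-D feature
# vector must become a (1, n_features) matrix, hence reshape(1, -1).
# Toy data and class names below are invented for illustration.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

rng = np.random.default_rng(0)
X_train = rng.normal(size=(40, 8))                   # 40 samples, 8 features
y_train = rng.choice(["calm", "stressed"], size=40)  # hypothetical labels

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_train)
scaler = StandardScaler().fit(X_train)
model = RandomForestClassifier(random_state=0).fit(scaler.transform(X_train), y_encoded)

features = rng.normal(size=8)        # one extracted feature vector, shape (8,)
features = features.reshape(1, -1)   # -> (1, 8); 1-D input would raise an error
features = scaler.transform(features)
results = model.predict(features)
decoded_predictions = label_encoder.inverse_transform(results)
print('decoded', decoded_predictions[0])
```

decoded_predictions[0] is printed rather than the whole array because a single reshaped vector always yields exactly one prediction.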
 