deveix committed
Commit 9c01213 · Parent: b1d468e

return to old model

app/main.py CHANGED (+54 −97)
@@ -27,32 +27,16 @@ default_sample_rate=22050
 def load(file_name, skip_seconds=0):
     return librosa.load(file_name, sr=None, res_type='kaiser_fast')
 
-def
-    [seven removed lines truncated in this view]
-    segments = []
-    for i in range(total_segments):
-        start = i * num_samples_per_segment
-        end = start + num_samples_per_segment
-        segment = audio_data[start:end]
-        segments.append(segment)
-    return segments
-
-def preprocess_audio(audio_data, rate):
-    # Apply preprocessing steps
-    audio_data = nr.reduce_noise(y=audio_data, sr=rate)
-    audio_data = librosa.util.normalize(audio_data)
-    audio_data, _ = librosa.effects.trim(audio_data)
-    audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
-    # audio_data = fix_length(audio_data)
-    rate = default_sample_rate
+# def preprocess_audio(audio_data, rate):
+#     # Apply preprocessing steps
+#     audio_data = nr.reduce_noise(y=audio_data, sr=rate)
+#     audio_data = librosa.util.normalize(audio_data)
+#     audio_data, _ = librosa.effects.trim(audio_data)
+#     audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
+#     # audio_data = fix_length(audio_data)
+#     rate = default_sample_rate
 
+#     return audio_data, rate
 
 def extract_features(X, sample_rate):
     # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series
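Note: the body of the removed segmenting helper is truncated in this view; only the slicing loop survives. A minimal sketch of a fixed-length splitter consistent with those surviving lines — the name split_audio comes from the call site removed in the third hunk, but the signature and the segment-length computation here are assumptions:

def split_audio(audio_data, sr, segment_seconds=1.0):
    # Hypothetical reconstruction: segment_seconds and the two helper
    # variables are assumptions; only the loop below appears in the diff.
    num_samples_per_segment = int(segment_seconds * sr)
    total_segments = len(audio_data) // num_samples_per_segment
    segments = []
    for i in range(total_segments):
        start = i * num_samples_per_segment
        end = start + num_samples_per_segment
        segments.append(audio_data[start:end])
    return segments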
@@ -198,83 +182,66 @@ async def get_answer(item: Item, token: str = Depends(verify_token)):
     raise HTTPException(status_code=500, detail=str(e))
 
 # random forest
-model = joblib.load('app/
+model = joblib.load('app/1713661391.0946255_trained_model.joblib')
 pca = joblib.load('app/pca.pkl')
-scaler = joblib.load('app/
-label_encoder = joblib.load('app/
+scaler = joblib.load('app/1713661464.8205004_scaler.joblib')
+label_encoder = joblib.load('app/1713661470.6730225_label_encoder.joblib')
 
-# def preprocess_audio(audio_data, rate):
-#     # Resample first if the target rate is lower to reduce data size for subsequent operations
-#     if rate > default_sample_rate:
-#         audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
-#         rate = default_sample_rate
-
-#     # Trim silence before applying computationally expensive noise reduction
-#     audio_data, _ = librosa.effects.trim(audio_data)
-
-#     [two comment lines truncated in this view]
-
-#     return audio_data, rate
-
-# def preprocess_audio(audio_data, rate):
-#     audio_data = nr.reduce_noise(y=audio_data, sr=rate)
-#     # remove silence
-#     # intervals = librosa.effects.split(audio_data, top_db=20)
-#     # # Concatenate non-silent intervals
-#     # audio_data = np.concatenate([audio_data[start:end] for start, end in intervals])
-
-#     [~40 commented-out preprocessing lines truncated in this view; matching notes reappear inside the new preprocess_audio below]
-
+def preprocess_audio(audio_data, rate):
+    audio_data = nr.reduce_noise(y=audio_data, sr=rate)
+    # remove silence
+    # intervals = librosa.effects.split(audio_data, top_db=20)
+    # # Concatenate non-silent intervals
+    # audio_data = np.concatenate([audio_data[start:end] for start, end in intervals])
+
+    audio_data = librosa.util.normalize(audio_data)
+    audio_data, _ = librosa.effects.trim(audio_data)
+    audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
+    rate = default_sample_rate
+
+    # y_trimmed, _ = librosa.effects.trim(y_no_gaps, top_db = 20)
+    # D = librosa.stft(y)
+    # S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
+    # S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128*2,)
+    # S_db_mel = librosa.amplitude_to_db(np.abs(S), ref=np.max)
+
+    # Apply noise reduction (example using spectral subtraction)
+    # y_denoised = librosa.effects.preemphasis(y_trimmed)
+
+    # # Apply dynamic range compression
+    # y_compressed = librosa.effects.preemphasis(y_denoised)
+
+    # # Augmentation (example of time stretching)
+    # # y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
+
+    # # Silence Removal
+    # y_silence_removed, _ = librosa.effects.trim(y_compressed)
+
+    # # Equalization (example: apply high-pass filter)
+    # y_equalized = librosa.effects.preemphasis(y_silence_removed)
+
+    # # Define target sample rate
+    # target_sr = sr
+
+    # # Data Augmentation (example: pitch shifting)
+    # y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
+
+    # Split audio into non-silent intervals
+
+    # Normalize the audio signal
+    # y_normalized = librosa.util.normalize(y_equalized)
+
+    # Feature Extraction (example: MFCCs)
+    # mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
+
+    # output_file_path = os.path.join(save_dir, f"{file_name_without_extension}.{extension}")
+
+    # Write the audio data to the output file in .wav format
+    # sf.write(path, y_normalized, target_sr)
+
+    return audio_data, rate
 
 # smile = opensmile.Smile(
 #     feature_set=opensmile.FeatureSet.ComParE_2016,
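For reference, the preprocessing chain this commit reinstates can be exercised on its own. A minimal, self-contained sketch — the synthetic tone is a stand-in input; the app applies the same chain to uploaded audio:

import numpy as np
import librosa
import noisereduce as nr

default_sample_rate = 22050

def preprocess_audio(audio_data, rate):
    # Suppress stationary noise, normalize peak amplitude,
    # trim leading/trailing silence, then resample to the model rate.
    audio_data = nr.reduce_noise(y=audio_data, sr=rate)
    audio_data = librosa.util.normalize(audio_data)
    audio_data, _ = librosa.effects.trim(audio_data)
    audio_data = librosa.resample(audio_data, orig_sr=rate, target_sr=default_sample_rate)
    return audio_data, default_sample_rate

rate = 44100  # one second of a 440 Hz tone as stand-in input
tone = np.sin(2 * np.pi * 440 * np.arange(rate) / rate).astype(np.float32)
processed, new_rate = preprocess_audio(tone, rate)
print(processed.shape, new_rate)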
@@ -359,16 +326,7 @@ async def handle_audio(file: UploadFile = File(...)):
     audio_data, sr = preprocess_audio(audio_data, sr)
     print("finished processing ", temp_filename)
     # Extract features
-    features_list = []
     features = extract_features(audio_data, sr)
-    features_list.append(features)
-
-    segments = split_audio(audio_data, sr)
-    for i, segment in enumerate(segments):
-        # Extract features from the processed audio segment (you need to define this function)
-        features = extract_features(segment, sr)
-        print(f"Features extracted for segment {i+1}")
-        features_list.append(features)
 
     # preprocess_audio(temp_filename, 'app')
     # repair_mp3_with_ffmpeg_python(temp_filename, temp_filename)
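Net effect of this hunk: the handler extracts one feature vector for the whole clip instead of accumulating per-segment features, which is what lets the next hunk reshape to a single row before scaling and prediction.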
@@ -377,18 +335,17 @@ async def handle_audio(file: UploadFile = File(...)):
     # print("Extracted Features:", features)
     # features = pca.transform(features)
     # features = np.array(features).reshape(1, -1)
-
+    features = features.reshape(1, -1)
 
-
+    features = scaler.transform(features)
 
     # proceed with an inference
-    results = model.predict(
+    results = model.predict(features)
     # decoded_predictions = [label_encoder.classes_[i] for i in results]
-    print('decoded', results)
 
     # # Decode the predictions using the label encoder
     decoded_predictions = label_encoder.inverse_transform(results)
-    print('decoded', decoded_predictions)
+    print('decoded', decoded_predictions[0])
     # .tolist()
     # Clean up the temporary file
     os.remove(temp_filename)
|