Spaces:
Running
Running
deveix
committed on
Commit
·
8faa556
1
Parent(s):
e0568c1
update prediction
Browse files- app/1713630229.4965415_trained_model.joblib +0 -0
- app/main.py +133 -82
- requirements.txt +2 -1
app/1713630229.4965415_trained_model.joblib
ADDED
Binary file (16.6 kB). View file
|
|
app/main.py
CHANGED
@@ -20,6 +20,48 @@ import soundfile as sf
|
|
20 |
import opensmile
|
21 |
|
22 |
import ffmpeg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
load_dotenv()
|
25 |
|
@@ -140,103 +182,103 @@ async def get_answer(item: Item, token: str = Depends(verify_token)):
|
|
140 |
# If there's an error, return a 500 error with the error's details
|
141 |
raise HTTPException(status_code=500, detail=str(e))
|
142 |
|
143 |
-
#
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
def preprocess_audio(path, save_dir):
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
|
165 |
-
|
166 |
-
|
167 |
|
168 |
-
|
169 |
-
|
170 |
|
171 |
-
|
172 |
-
# y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
|
173 |
|
174 |
-
|
175 |
-
|
176 |
|
177 |
-
|
178 |
-
|
179 |
|
180 |
-
|
181 |
-
|
182 |
|
183 |
-
# # Data Augmentation (example: pitch shifting)
|
184 |
-
# y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
|
185 |
|
186 |
|
187 |
-
|
188 |
|
189 |
|
190 |
-
|
191 |
-
|
192 |
|
193 |
-
|
194 |
-
# mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
|
195 |
|
196 |
-
|
197 |
|
198 |
-
|
199 |
-
|
200 |
|
201 |
-
|
202 |
|
203 |
-
smile = opensmile.Smile(
|
204 |
-
|
205 |
-
|
206 |
-
)
|
207 |
|
208 |
-
def extract_features(file_path):
|
209 |
-
|
210 |
-
|
211 |
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
|
226 |
-
|
227 |
-
|
228 |
|
229 |
-
|
230 |
-
|
231 |
|
232 |
-
|
233 |
-
|
234 |
|
235 |
-
|
236 |
-
|
237 |
|
238 |
-
|
239 |
-
|
240 |
|
241 |
def repair_mp3_with_ffmpeg_python(input_path, output_path):
|
242 |
"""Attempt to repair an MP3 file using FFmpeg."""
|
@@ -277,27 +319,36 @@ async def handle_audio(file: UploadFile = File(...)):
|
|
277 |
with open(temp_filename, "wb") as f:
|
278 |
f.write(contents)
|
279 |
|
280 |
-
|
281 |
-
|
282 |
-
#
|
283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
print("Extracted Features:", features)
|
285 |
|
286 |
-
features =
|
287 |
-
features =
|
|
|
288 |
|
289 |
# proceed with an inference
|
290 |
-
results =
|
291 |
-
decoded_predictions = [
|
292 |
|
293 |
# # Decode the predictions using the label encoder
|
294 |
-
# decoded_predictions =
|
295 |
# .tolist()
|
296 |
# Clean up the temporary file
|
297 |
os.remove(temp_filename)
|
298 |
|
299 |
# Return a successful response with decoded predictions
|
300 |
-
return {"message": "File processed successfully", "prediction":
|
301 |
except Exception as e:
|
302 |
print(e)
|
303 |
# Handle possible exceptions
|
|
|
20 |
import opensmile
|
21 |
|
22 |
import ffmpeg
|
23 |
+
import noisereduce as nr
|
24 |
+
import numpy as np
|
25 |
+
|
26 |
+
# Common sample rate (Hz) that preprocessed audio is resampled to before
# feature extraction.
default_sample_rate = 22050
|
27 |
+
|
28 |
+
def load(file_name, skip_seconds=0):
    """Load an audio file at its native sample rate.

    Args:
        file_name: Path of the audio file to read.
        skip_seconds: Number of seconds to skip at the start of the file
            before reading. Defaults to 0 (read from the beginning).

    Returns:
        The ``(samples, sample_rate)`` tuple produced by ``librosa.load``.
    """
    # sr=None keeps the file's own sample rate, so no resampling happens here.
    # skip_seconds was previously accepted but silently ignored; it is now
    # forwarded as librosa's ``offset`` so callers that ask to skip the
    # beginning of a recording actually get that.
    # NOTE(review): behaviour for files shorter than skip_seconds is decided by
    # librosa -- confirm callers can cope with a very short or empty signal.
    return librosa.load(
        file_name,
        sr=None,
        offset=skip_seconds,
        res_type='kaiser_fast',
    )
|
30 |
+
|
31 |
+
def preprocess_audio(audio_data, rate):
    """Clean up a raw signal and bring it to the common sample rate.

    The steps run in a fixed order: noise reduction, amplitude
    normalization, trimming via ``librosa.effects.trim``, and finally
    resampling from ``rate`` to ``default_sample_rate``.

    Args:
        audio_data: Audio samples to process.
        rate: Sample rate of ``audio_data`` in Hz.

    Returns:
        A ``(processed_samples, default_sample_rate)`` tuple.
    """
    denoised = nr.reduce_noise(y=audio_data, sr=rate)
    normalized = librosa.util.normalize(denoised)
    # trim() returns (trimmed_signal, interval_index); only the signal is used.
    trimmed = librosa.effects.trim(normalized)[0]
    resampled = librosa.resample(
        trimmed,
        orig_sr=rate,
        target_sr=default_sample_rate,
    )
    return resampled, default_sample_rate
|
41 |
+
|
42 |
+
def extract_features(X, sample_rate):
    """Summarize an audio signal as a single flat feature vector.

    Five librosa descriptors are computed (MFCC with 40 coefficients,
    chroma, mel spectrogram, spectral contrast and tonnetz). Each one is
    averaged down to a 1D array and the results are concatenated in that
    order.

    Args:
        X: Audio samples (a time series accepted by librosa).
        sample_rate: Sample rate of ``X`` in Hz.

    Returns:
        A 1D ``numpy`` array holding all averaged features back to back.
    """
    def _frame_mean(feature_matrix):
        # Transpose, then average along axis 0 -- i.e. one mean per feature
        # row (presumably averaging over time frames; librosa lays features
        # out as (feature, frame)).
        return np.mean(feature_matrix.T, axis=0)

    mfcc_part = _frame_mean(
        librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
    )

    # Magnitude STFT, shared by the chroma and spectral-contrast features.
    magnitude = np.abs(librosa.stft(X))

    chroma_part = _frame_mean(
        librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
    )
    mel_part = _frame_mean(
        librosa.feature.melspectrogram(y=X, sr=sample_rate)
    )
    contrast_part = _frame_mean(
        librosa.feature.spectral_contrast(S=magnitude, sr=sample_rate)
    )

    # Tonnetz is computed on the harmonic component of the signal only.
    harmonic_signal = librosa.effects.harmonic(X)
    tonnetz_part = _frame_mean(
        librosa.feature.tonnetz(y=harmonic_signal, sr=sample_rate)
    )

    return np.hstack(
        [mfcc_part, chroma_part, mel_part, contrast_part, tonnetz_part]
    )
|
64 |
+
|
65 |
|
66 |
load_dotenv()
|
67 |
|
|
|
182 |
# If there's an error, return a 500 error with the error's details
|
183 |
raise HTTPException(status_code=500, detail=str(e))
|
184 |
|
185 |
+
# Naive Bayes inference artifacts, loaded once when the module is imported.
# Every path is given relative to the repository root. The trained model file
# is committed under app/ like the other artifacts, so it needs the same
# 'app/' prefix -- without it the load fails whenever the sibling paths resolve.
nb_model = joblib.load('app/1713630229.4965415_trained_model.joblib')
nb_pca = joblib.load('app/pca.pkl')
nb_scaler = joblib.load('app/scaler.pkl')
nb_label_encoder = joblib.load('app/label_encoder.pkl')
|
190 |
+
|
191 |
+
# def preprocess_audio(path, save_dir):
|
192 |
+
# y, sr = librosa.load(path)
|
193 |
+
|
194 |
+
# # remove silence
|
195 |
+
# intervals = librosa.effects.split(y, top_db=20)
|
196 |
+
# # Concatenate non-silent intervals
|
197 |
+
# y_no_gaps = np.concatenate([y[start:end] for start, end in intervals])
|
198 |
+
|
199 |
+
# file_name_without_extension = os.path.basename(path).split('.')[0]
|
200 |
+
# extension = os.path.basename(path).split('.')[1]
|
201 |
+
# y_trimmed, _ = librosa.effects.trim(y_no_gaps, top_db = 20)
|
202 |
+
# D = librosa.stft(y)
|
203 |
+
# S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
|
204 |
+
# S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128*2,)
|
205 |
+
# S_db_mel = librosa.amplitude_to_db(np.abs(S), ref=np.max)
|
206 |
|
207 |
+
# # Apply noise reduction (example using spectral subtraction)
|
208 |
+
# y_denoised = librosa.effects.preemphasis(y_trimmed)
|
209 |
|
210 |
+
# # Apply dynamic range compression
|
211 |
+
# y_compressed = librosa.effects.preemphasis(y_denoised)
|
212 |
|
213 |
+
# # Augmentation (example of time stretching)
|
214 |
+
# # y_stretched = librosa.effects.time_stretch(y_compressed, rate=1.2)
|
215 |
|
216 |
+
# # Silence Removal
|
217 |
+
# y_silence_removed, _ = librosa.effects.trim(y_compressed)
|
218 |
|
219 |
+
# # Equalization (example: apply high-pass filter)
|
220 |
+
# y_equalized = librosa.effects.preemphasis(y_silence_removed)
|
221 |
|
222 |
+
# # Define target sample rate
|
223 |
+
# target_sr = sr
|
224 |
|
225 |
+
# # # Data Augmentation (example: pitch shifting)
|
226 |
+
# # y_pitch_shifted = librosa.effects.pitch_shift(y_normalized, sr=target_sr, n_steps=2)
|
227 |
|
228 |
|
229 |
+
# # Split audio into non-silent intervals
|
230 |
|
231 |
|
232 |
+
# # Normalize the audio signal
|
233 |
+
# y_normalized = librosa.util.normalize(y_equalized)
|
234 |
|
235 |
+
# # Feature Extraction (example: MFCCs)
|
236 |
+
# # mfccs = librosa.feature.mfcc(y=y_normalized, sr=target_sr, n_mfcc=20)
|
237 |
|
238 |
+
# # output_file_path = os.path.join(save_dir, f"{file_name_without_extension}.{extension}")
|
239 |
|
240 |
+
# # Write the audio data to the output file in .wav format
|
241 |
+
# sf.write(path, y_normalized, target_sr)
|
242 |
|
243 |
+
# return 'success'
|
244 |
|
245 |
+
# smile = opensmile.Smile(
|
246 |
+
# feature_set=opensmile.FeatureSet.ComParE_2016,
|
247 |
+
# feature_level=opensmile.FeatureLevel.Functionals,
|
248 |
+
# )
|
249 |
|
250 |
+
# def extract_features(file_path):
|
251 |
+
# # # Load the audio file
|
252 |
+
# # y, sr = librosa.load(file_path, sr=None, dtype=np.float32)
|
253 |
|
254 |
+
# # # Extract MFCCs
|
255 |
+
# # mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
|
256 |
+
# # mfccs_mean = pd.Series(mfccs.mean(axis=1), index=[f'mfcc_{i}' for i in range(mfccs.shape[0])])
|
257 |
|
258 |
+
# # # Extract Spectral Features
|
259 |
+
# # spectral_centroids = pd.Series(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)), index=['spectral_centroid'])
|
260 |
+
# # spectral_rolloff = pd.Series(np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)), index=['spectral_rolloff'])
|
261 |
+
# # spectral_flux = pd.Series(np.mean(librosa.onset.onset_strength(y=y, sr=sr)), index=['spectral_flux'])
|
262 |
+
# # spectral_contrast = pd.Series(np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr), axis=1), index=[f'spectral_contrast_{i}' for i in range(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr).shape[0])])
|
263 |
|
264 |
+
# # # Extract Pitch
|
265 |
+
# # pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
|
266 |
+
# # pitch_mean = pd.Series(np.mean(pitches[pitches != 0]), index=['pitch_mean']) # Average only non-zero values
|
267 |
|
268 |
+
# # # Extract Zero Crossings
|
269 |
+
# # zero_crossings = pd.Series(np.mean(librosa.feature.zero_crossing_rate(y)), index=['zero_crossings'])
|
270 |
|
271 |
+
# # # Combine all features into a single Series
|
272 |
+
# # features = pd.concat([mfccs_mean, spectral_centroids, spectral_rolloff, spectral_flux, spectral_contrast, pitch_mean, zero_crossings])
|
273 |
|
274 |
+
# features = smile.process_file(file_path)
|
275 |
+
# features_reshaped = features.squeeze()
|
276 |
|
277 |
+
# # Ensure it's now a 2D structure suitable for DataFrame
|
278 |
+
# print("New shape of features:", features_reshaped.shape)
|
279 |
|
280 |
+
# all_data = pd.DataFrame([features_reshaped])
|
281 |
+
# return all_data
|
282 |
|
283 |
def repair_mp3_with_ffmpeg_python(input_path, output_path):
|
284 |
"""Attempt to repair an MP3 file using FFmpeg."""
|
|
|
319 |
with open(temp_filename, "wb") as f:
|
320 |
f.write(contents)
|
321 |
|
322 |
+
audio_data, sr = load(temp_filename, skip_seconds=5)
|
323 |
+
print("finished loading ", temp_filename)
|
324 |
+
# Preprocess data
|
325 |
+
audio_data, sr = preprocess_audio(audio_data, sr)
|
326 |
+
print("finished processing ", temp_filename)
|
327 |
+
# Extract features
|
328 |
+
features = extract_features(audio_data, sr)
|
329 |
+
|
330 |
+
# preprocess_audio(temp_filename, 'app')
|
331 |
+
# repair_mp3_with_ffmpeg_python(temp_filename, temp_filename)
|
332 |
+
# # Here you would add the feature extraction logic
|
333 |
+
# features = extract_features(temp_filename)
|
334 |
print("Extracted Features:", features)
|
335 |
|
336 |
+
# features = nb_scaler.transform(features)
|
337 |
+
# features = nb_pca.transform(features)
|
338 |
+
features = np.array(features).reshape(1, -1)
|
339 |
|
340 |
# proceed with an inference
|
341 |
+
results = nb_model.predict(features)
|
342 |
+
# decoded_predictions = [nb_label_encoder.classes_[i] for i in results]
|
343 |
|
344 |
# # Decode the predictions using the label encoder
|
345 |
+
# decoded_predictions = nb_label_encoder.inverse_transform(results)
|
346 |
# .tolist()
|
347 |
# Clean up the temporary file
|
348 |
os.remove(temp_filename)
|
349 |
|
350 |
# Return a successful response with decoded predictions
|
351 |
+
return {"message": "File processed successfully", "prediction": results}
|
352 |
except Exception as e:
|
353 |
print(e)
|
354 |
# Handle possible exceptions
|
requirements.txt
CHANGED
@@ -17,4 +17,5 @@ opensmile
|
|
17 |
eyeD3
|
18 |
matplotlib
|
19 |
python-multipart
|
20 |
-
ffmpeg-python
|
|
|
|
17 |
eyeD3
|
18 |
matplotlib
|
19 |
python-multipart
|
20 |
+
ffmpeg-python
|
21 |
+
noisereduce
|