Spaces:
Running
Running
File size: 3,798 Bytes
1053c8b 0a5cfb8 d19f3e0 1053c8b 0cf693f 557b689 d19f3e0 1053c8b d19f3e0 e5c4db0 0cf693f 1053c8b d19f3e0 54472e1 1053c8b d19f3e0 6f061b9 d19f3e0 54472e1 0cf693f 2ee2f68 0cf693f 2ee2f68 d19f3e0 54472e1 d19f3e0 54472e1 0cf693f d19f3e0 6163755 0a5cfb8 2ee2f68 54472e1 0a5cfb8 2ee2f68 0a5cfb8 d19f3e0 0a5cfb8 54472e1 0a5cfb8 54472e1 0a5cfb8 d19f3e0 1053c8b d19f3e0 1053c8b d19f3e0 1053c8b d19f3e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os

import gradio as gr
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from transformers import pipeline
# Load the Moroccan Darija Wav2Vec2 ASR pipeline once at startup so every
# request reuses the same model instance.
print("Chargement du modèle Wav2Vec2...")
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model="boumehdi/wav2vec2-large-xlsr-moroccan-darija",
)
print("Modèle chargé avec succès !")
def find_optimal_clusters(mfccs_scaled, max_clusters=2, threshold=0.0):
    """Estimate the number of speakers using the silhouette method.

    The silhouette score is undefined for a single cluster, so k=1 is the
    fallback; a multi-speaker solution is only selected when its silhouette
    score is convincingly positive. (The original implementation initialized
    ``best_score = -1`` and also fit KMeans for k=1 without ever scoring it:
    since silhouette lies in [-1, 1], k=2 won essentially unconditionally and
    the k=1 fit was wasted work.)

    Args:
        mfccs_scaled: 2-D array of shape (n_frames, n_features) of
            standardized MFCC frames (one row per frame).
        max_clusters: Largest speaker count to consider. Defaults to 2,
            matching the original 1-or-2-speaker assumption.
        threshold: Minimum silhouette score required to prefer k > 1 over the
            single-speaker fallback. 0.0 means "clusters must be better than
            overlapping".

    Returns:
        int: Estimated number of speakers, at least 1.
    """
    best_n_clusters = 1  # silhouette cannot rank k=1, so it is the fallback
    best_score = threshold
    for n_clusters in range(2, max_clusters + 1):
        # silhouette_score needs more samples than clusters; stop if the
        # input is too small to evaluate this k.
        if len(mfccs_scaled) <= n_clusters:
            break
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(mfccs_scaled)
        score = silhouette_score(mfccs_scaled, labels)
        if score > best_score:
            best_score = score
            best_n_clusters = n_clusters
    return best_n_clusters
def process_audio(audio_path):
    """Diarize and transcribe an uploaded audio file.

    Loads up to the first 30 seconds of *audio_path*, denoises it, clusters
    MFCC frames into speakers, merges each speaker's audio, and transcribes
    each merged segment with the Darija Wav2Vec2 pipeline.

    Args:
        audio_path: Path to the uploaded audio file (any format librosa
            can read).

    Returns:
        str: One "Speaker N: <text>" line per detected speaker, or a generic
        error message if any step fails.
    """
    print(f"Fichier reçu : {audio_path}")
    try:
        # Load only the first 30 seconds to bound processing time.
        audio, sr = librosa.load(audio_path, sr=None, duration=30)
        print(f"Audio chargé : {len(audio)} échantillons à {sr} Hz")

        # Noise reduction (improves SNR before feature extraction).
        audio_denoised = nr.reduce_noise(y=audio, sr=sr)
        print("Bruit réduit.")

        # MFCC features on the denoised signal.
        mfccs = librosa.feature.mfcc(y=audio_denoised, sr=sr, n_mfcc=13)
        print(f"MFCC extrait, shape: {mfccs.shape}")

        # Standardize per feature; transpose so rows are frames.
        scaler = StandardScaler()
        mfccs_scaled = scaler.fit_transform(mfccs.T)
        print("MFCC normalisé.")

        # Estimate the speaker count, then cluster frames accordingly.
        optimal_clusters = find_optimal_clusters(mfccs_scaled)
        print(f"Nombre optimal de locuteurs détecté : {optimal_clusters}")
        kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)

        # Map each MFCC frame onto an equal-length audio slice and group the
        # slices per speaker. Collecting numpy slices and concatenating once
        # replaces the original per-sample Python-list extends, and the last
        # slice absorbs the integer-division remainder so no tail samples are
        # silently dropped.
        segment_duration = len(audio_denoised) // len(speaker_labels)
        last_index = len(speaker_labels) - 1
        speaker_segments = {speaker: [] for speaker in set(speaker_labels)}
        for i, speaker_id in enumerate(speaker_labels):
            start = i * segment_duration
            end = len(audio_denoised) if i == last_index else start + segment_duration
            speaker_segments[speaker_id].append(audio_denoised[start:end])

        # Transcribe each speaker's merged audio via a temporary WAV file.
        # The original leaked these files; remove each one after use so
        # repeated requests do not accumulate temp WAVs.
        result = []
        for speaker, segments in speaker_segments.items():
            if not segments:
                continue
            temp_filename = f"temp_speaker_{speaker}.wav"
            sf.write(temp_filename, np.concatenate(segments), sr)
            try:
                transcription = stt_pipeline(temp_filename)
            finally:
                os.remove(temp_filename)
            result.append(f"Speaker {speaker}: {transcription['text']}")
            print(f"Transcription Speaker {speaker} terminée.")
        return "\n".join(result)
    except Exception as e:
        print(f"Erreur : {e}")
        return "Une erreur s'est produite."
# --- Gradio web interface ---
print("Démarrage de Gradio...")

iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Speaker Diarization & Transcription",
    description="Upload an audio file to detect speakers and transcribe speech for each segment.",
)

# launch() blocks while the server is running, so the message below only
# prints after the interface is shut down.
iface.launch()
print("Interface lancée avec succès !")
|