"""Gradio app: rough two-speaker separation followed by Darija transcription.

The audio is denoised, MFCC frames are clustered into two groups with KMeans,
the samples of each group are concatenated, and each concatenated signal is
transcribed with a Wav2Vec2 speech-recognition pipeline.
"""

import os
import tempfile
from typing import Optional

import gradio as gr
import librosa
import noisereduce as nr  # Noise-reduction library
import numpy as np
import soundfile as sf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from transformers import pipeline

# The model is loaded once at import time so every request reuses it.
print("Chargement du modèle Wav2Vec2...")
stt_pipeline = pipeline(
    "automatic-speech-recognition",
    model="boumehdi/wav2vec2-large-xlsr-moroccan-darija",
)
print("Modèle chargé avec succès !")


def _split_by_speaker(audio, frame_labels):
    """Group audio samples by the cluster label of the frame they fall in.

    Each frame label is assumed to cover ``len(audio) // len(frame_labels)``
    consecutive samples; trailing samples that do not fill a whole frame are
    dropped.

    Args:
        audio: 1-D array of audio samples.
        frame_labels: 1-D array holding one cluster label per MFCC frame.

    Returns:
        Dict mapping each label (as ``int``) to a 1-D array containing, in
        original order, every sample assigned to that label.
    """
    samples_per_frame = len(audio) // len(frame_labels)
    # Expand the per-frame labels to per-sample labels so the grouping is a
    # single boolean mask instead of a Python loop that boxes every sample.
    sample_labels = np.repeat(frame_labels, samples_per_frame)
    labelled_audio = audio[: len(sample_labels)]
    return {
        int(speaker): labelled_audio[sample_labels == speaker]
        for speaker in np.unique(frame_labels)
    }


def process_audio(audio_path: Optional[str]) -> str:
    """Separate an audio file into two clusters and transcribe each one.

    Only the first 30 seconds of the file are processed.

    Args:
        audio_path: Path of the audio file to process, or ``None`` when no
            file was provided.

    Returns:
        One ``"Speaker <id>: <text>"`` line per non-empty cluster, joined by
        newlines; or a short error message if no file was given or any
        processing step failed.
    """
    print(f"Fichier reçu : {audio_path}")
    if audio_path is None:
        # Gradio passes None when the form is submitted without a file.
        return "Aucun fichier audio fourni."

    try:
        # Load only the first 30 seconds, at the file's native sample rate.
        audio, sr = librosa.load(audio_path, sr=None, duration=30)
        print(f"Audio chargé : {len(audio)} échantillons à {sr} Hz")

        # Noise reduction.
        audio_denoised = nr.reduce_noise(y=audio, sr=sr)
        print("Bruit réduit.")

        # MFCC extraction.
        mfccs = librosa.feature.mfcc(y=audio_denoised, sr=sr, n_mfcc=13)
        print(f"MFCC extrait, shape: {mfccs.shape}")

        # Standardise each coefficient; rows of the transposed matrix are frames.
        mfccs_scaled = StandardScaler().fit_transform(mfccs.T)
        print("MFCC normalisé.")

        # Cluster the frames into two groups, each treated as one speaker.
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)
        print(
            f"Clustering terminé, {len(np.unique(speaker_labels))} "
            "locuteurs détectés."
        )

        # Gather the audio belonging to each speaker.
        speaker_audio = _split_by_speaker(audio_denoised, speaker_labels)

        # Transcribe the merged segments. The WAV files live in a private
        # temporary directory so concurrent requests cannot overwrite each
        # other's files and nothing is left on disk afterwards.
        result = []
        with tempfile.TemporaryDirectory() as tmp_dir:
            for speaker, audio_segment in speaker_audio.items():
                if audio_segment.size == 0:
                    continue

                wav_path = os.path.join(tmp_dir, f"temp_speaker_{speaker}.wav")
                sf.write(wav_path, audio_segment, sr)
                transcription = stt_pipeline(wav_path)
                result.append(f"Speaker {speaker}: {transcription['text']}")
                print(f"Transcription Speaker {speaker} terminée.")

        return "\n".join(result)

    except Exception as e:
        # Top-level boundary for the UI callback: log the failure and show a
        # generic message instead of surfacing a stack trace to the user.
        print(f"Erreur : {e}")
        return "Une erreur s'est produite."


# Gradio interface
print("Démarrage de Gradio...")
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Speaker Diarization & Transcription",
    description=(
        "Upload an audio file to detect speakers and transcribe speech "
        "for each segment."
    ),
)

iface.launch()
# NOTE(review): when run as a plain script, launch() is expected to block until
# the server stops, so this message would only appear at shutdown - confirm.
print("Interface lancée avec succès !")