# Spaces: Running
import gradio as gr | |
import librosa | |
import numpy as np | |
import soundfile as sf | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.cluster import KMeans | |
from transformers import pipeline | |
# Load the speech-to-text model once, at import time, so every request handled
# later reuses the same pipeline object instead of reloading the weights.
print("Chargement du modèle Wav2Vec2...")
stt_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="boumehdi/wav2vec2-large-xlsr-moroccan-darija",
)
print("Modèle chargé avec succès !")
def process_audio(audio_path):
    """Split a recording into two clusters ("speakers") and transcribe each.

    Only the first 30 seconds of the file are analysed. Every MFCC frame is
    assigned to one of two KMeans clusters; the audio samples belonging to each
    cluster are concatenated, written to a temporary WAV file and passed to
    ``stt_pipeline``.

    Args:
        audio_path: Path of the audio file to process. May be ``None`` or
            empty when no file was provided.

    Returns:
        One ``"Speaker <id>: <text>"`` line per non-empty cluster, joined with
        newlines, or the generic message ``"Une erreur s'est produite."`` when
        the input is missing or any processing step fails.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile
    import traceback

    max_seconds = 30   # only the beginning of the recording is analysed
    hop_length = 512   # samples between consecutive MFCC frames
    n_speakers = 2     # number of KMeans clusters treated as speakers

    print(f"Fichier reçu : {audio_path}")
    if not audio_path:
        # Fail fast with the same user-facing message instead of relying on
        # the loader to raise on a missing path.
        print("Erreur : aucun fichier audio fourni.")
        return "Une erreur s'est produite."

    try:
        # Keep the file's native sampling rate; load only the first seconds.
        audio, sr = librosa.load(audio_path, sr=None, duration=max_seconds)
        print(f"Audio chargé : {len(audio)} échantillons à {sr} Hz")

        # The hop length is passed explicitly so frames can be mapped back to
        # sample positions exactly (see the frame/sample mapping below).
        mfccs = librosa.feature.mfcc(
            y=audio, sr=sr, n_mfcc=13, hop_length=hop_length
        )
        print(f"MFCC extrait, shape: {mfccs.shape}")

        # Standardise each coefficient across frames (rows = frames).
        mfccs_scaled = StandardScaler().fit_transform(mfccs.T)
        print("MFCC normalisé.")

        kmeans = KMeans(n_clusters=n_speakers, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)
        speakers = np.unique(speaker_labels)
        print(f"Clustering terminé, {len(speakers)} locuteurs détectés.")

        # Map every sample to the frame that starts at or before it, clipping
        # to the last frame so the mask always has exactly len(audio) entries.
        # This avoids the drift and dropped tail of an integer-divided
        # "average" segment length.
        frame_of_sample = np.minimum(
            np.arange(len(audio)) // hop_length, len(speaker_labels) - 1
        )
        sample_labels = speaker_labels[frame_of_sample]

        result = []
        # A per-call temporary directory avoids filename collisions between
        # concurrent requests and guarantees the WAV files are removed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for speaker in speakers:
                segment = audio[sample_labels == speaker]
                if segment.size == 0:
                    continue
                wav_path = os.path.join(tmp_dir, f"speaker_{speaker}.wav")
                sf.write(wav_path, segment, sr)
                transcription = stt_pipeline(wav_path)
                result.append(f"Speaker {speaker}: {transcription['text']}")
                print(f"Transcription Speaker {speaker} terminée.")
        return "\n".join(result)
    except Exception as e:
        # Top-level boundary for the UI callback: log the full traceback for
        # debugging but show the user only a generic message.
        print(f"Erreur : {e}")
        traceback.print_exc()
        return "Une erreur s'est produite."
# Gradio interface: one audio upload (handed to the callback as a file path)
# in, plain text out.
print("Démarrage de Gradio...")
_audio_input = gr.Audio(type="filepath")
_interface_options = {
    "fn": process_audio,
    "inputs": _audio_input,
    "outputs": "text",
    "title": "Speaker Diarization & Transcription",
    "description": "Upload an audio file to detect speakers and transcribe speech for each segment.",
}
iface = gr.Interface(**_interface_options)
iface.launch()
# NOTE(review): when run as a plain script, launch() presumably blocks until
# the server stops, so this message may only appear after shutdown — confirm.
print("Interface lancée avec succès !")