Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.cluster import KMeans | |
| from transformers import pipeline | |
# Build the speech-to-text pipeline once, at import time, so that every
# request handled by the app reuses the same loaded model.
print("Chargement du modèle Wav2Vec2...")
stt_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="boumehdi/wav2vec2-large-xlsr-moroccan-darija",
)
print("Modèle chargé avec succès !")
def _dominant_speaker(frame_labels, start_sample, end_sample, hop_length):
    """Return the most frequent cluster label among the frames of a sample span.

    Args:
        frame_labels: 1-D integer array with one cluster label per MFCC frame.
        start_sample: Index of the first audio sample of the span.
        end_sample: Index one past the last audio sample of the span.
        hop_length: Number of audio samples between consecutive MFCC frames.

    Returns:
        The label (as a plain ``int``) that occurs most often in the span.
    """
    first_frame = start_sample // hop_length
    # Ceil division so a partially covered trailing frame is still counted.
    last_frame = max(first_frame + 1, -(-end_sample // hop_length))
    window = frame_labels[first_frame:last_frame]
    if len(window) == 0:
        # Span starts past the last frame: fall back to the final label.
        window = frame_labels[-1:]
    return int(np.bincount(window).argmax())


def process_audio(audio_path):
    """Transcribe the first 30 seconds of an audio file with a speaker tag.

    The audio is cut into 5-second segments. MFCC frames are clustered into
    two groups with KMeans, and each segment is attributed to the cluster
    that is most frequent among its frames before being transcribed by the
    module-level ``stt_pipeline``.

    Args:
        audio_path: Path to the audio file (as passed by the Gradio
            ``Audio`` input in ``filepath`` mode). May be ``None`` or empty
            when nothing was uploaded.

    Returns:
        One line per transcribed segment, formatted as
        ``"Speaker <label>: <text>"`` and joined with newlines, or a short
        error message if no file was given or processing failed.
    """
    print(f"Fichier reçu : {audio_path}")
    if not audio_path:
        # Nothing was uploaded; do not try to decode a missing file.
        print("Erreur : aucun fichier audio fourni.")
        return "Aucun fichier audio fourni."

    # Wav2Vec2 XLSR checkpoints expect 16 kHz input; loading at the file's
    # native rate and passing a bare array would be misread by the model.
    target_sr = 16000
    hop_length = 512  # samples between consecutive MFCC frames
    segment_seconds = 5

    try:
        # Load only the first 30 seconds, resampled to the model's rate.
        audio, sr = librosa.load(audio_path, sr=target_sr, duration=30)
        print(f"Audio chargé : {len(audio)} échantillons à {sr} Hz")

        # MFCC extraction (explicit hop so frames map back to samples).
        mfccs = librosa.feature.mfcc(
            y=audio, sr=sr, n_mfcc=13, hop_length=hop_length
        )
        print(f"MFCC extrait, shape: {mfccs.shape}")

        # Standardise each coefficient across frames.
        mfccs_scaled = StandardScaler().fit_transform(mfccs.T)
        print("MFCC normalisé.")

        # Cluster frames into two groups, used as speaker labels.
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)
        print(f"Clustering terminé, {len(set(speaker_labels))} locuteurs détectés.")

        # Segment the signal and transcribe each piece.
        transcriptions = []
        segment_len = sr * segment_seconds
        print("Début de la transcription...")
        for start in range(0, len(audio), segment_len):
            segment = audio[start:start + segment_len]
            if len(segment) < sr:
                # Skip trailing pieces shorter than one second.
                continue
            speaker = _dominant_speaker(
                speaker_labels, start, start + len(segment), hop_length
            )
            # Pass the sampling rate explicitly so the pipeline never has to
            # guess what rate the raw array is at.
            transcription = stt_pipeline({"raw": segment, "sampling_rate": sr})
            transcriptions.append(f"Speaker {speaker}: {transcription['text']}")
            print(f"Segment {start // sr}-{(start + segment_len) // sr}s transcrit.")
        print("Transcription terminée !")
        return "\n".join(transcriptions)
    except Exception as e:
        # UI boundary: report the failure instead of crashing the app.
        print(f"Erreur : {e}")
        return "Une erreur s'est produite."
# Gradio interface: a single audio upload (handed to the callback as a file
# path) mapped to a plain-text result produced by process_audio.
print("Démarrage de Gradio...")
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Speaker Diarization & Transcription",
    description="Upload an audio file to detect speakers and transcribe speech for each segment.",
)
# NOTE(review): launch() presumably blocks when run as a script, in which
# case the message below is only printed once the server stops — confirm.
iface.launch()
print("Interface lancée avec succès !")