Spaces:
Running
Running
import gradio as gr | |
import librosa | |
import numpy as np | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.cluster import KMeans | |
from transformers import pipeline | |
# Load the speech-to-text model once, at import time; process_audio() below
# reads this module-level pipeline on every call.
print("Chargement du modèle Wav2Vec2...")
stt_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="boumehdi/wav2vec2-large-xlsr-moroccan-darija",
)
print("Modèle chargé avec succès !")
def process_audio(audio_path):
    """Transcribe the start of an audio file, tagging each chunk with a speaker.

    Only the first 30 seconds are processed. MFCC frames are clustered into
    two groups with KMeans as a rough stand-in for speaker identity, then the
    audio is cut into 5-second chunks. Each chunk is transcribed with the
    module-level ``stt_pipeline`` and labelled with the cluster that is most
    frequent among the MFCC frames falling inside that chunk.

    Args:
        audio_path: Path of the audio file to process. A falsy value (for
            example ``None`` when nothing was uploaded) is rejected up front.

    Returns:
        One ``"Speaker <k>: <text>"`` line per transcribed chunk, joined with
        newlines; an empty string when the audio is shorter than one second;
        or a short French error message when no file was given or when
        processing failed.
    """
    print(f"Fichier reçu : {audio_path}")
    if not audio_path:
        # Nothing to load; answer explicitly instead of relying on the
        # generic error path below.
        return "Aucun fichier audio fourni."

    hop_length = 512    # samples between consecutive MFCC frames
    chunk_seconds = 5   # length of each transcribed chunk
    max_seconds = 30    # only the beginning of the file is processed

    try:
        # Resample to the rate the model's feature extractor expects. A bare
        # array handed to the pipeline is assumed to already be at that rate,
        # so loading at the file's native rate would corrupt transcriptions.
        target_sr = stt_pipeline.feature_extractor.sampling_rate
        audio, sr = librosa.load(audio_path, sr=target_sr, duration=max_seconds)
        print(f"Audio chargé : {len(audio)} échantillons à {sr} Hz")

        if len(audio) < sr:
            # Chunks shorter than one second are never transcribed, so there
            # is nothing to output; returning here also avoids clustering a
            # handful of frames.
            return ""

        # MFCC extraction (one column per frame, `hop_length` samples apart).
        mfccs = librosa.feature.mfcc(
            y=audio, sr=sr, n_mfcc=13, hop_length=hop_length
        )
        print(f"MFCC extrait, shape: {mfccs.shape}")

        # Standardise each coefficient across frames.
        mfccs_scaled = StandardScaler().fit_transform(mfccs.T)
        print("MFCC normalisé.")

        # Cluster frames into two groups used as speaker labels.
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)
        print(f"Clustering terminé, {len(set(speaker_labels))} locuteurs détectés.")

        # Chunking and transcription.
        chunk_len = sr * chunk_seconds
        transcriptions = []
        print("Début de la transcription...")
        for start in range(0, len(audio), chunk_len):
            segment = audio[start : start + chunk_len]
            if len(segment) < sr:
                continue

            # Majority vote over the frames inside this chunk, rather than
            # trusting the label of a single frame.
            first_frame = start // hop_length
            last_frame = min(
                (start + len(segment)) // hop_length, len(speaker_labels)
            )
            frame_labels = speaker_labels[
                first_frame : max(last_frame, first_frame + 1)
            ]
            speaker = int(np.bincount(frame_labels).argmax())

            transcription = stt_pipeline(segment)
            transcriptions.append(f"Speaker {speaker}: {transcription['text']}")
            print(f"Segment {start // sr}-{(start + chunk_len) // sr}s transcrit.")

        print("Transcription terminée !")
        return "\n".join(transcriptions)
    except Exception as e:
        # UI boundary: log the failure and show a generic message instead of
        # propagating the exception to the caller.
        print(f"Erreur : {e}")
        return "Une erreur s'est produite."
# Gradio interface: a single audio upload (delivered to the callback as a
# file path) mapped to a plain-text result.
print("Démarrage de Gradio...")
_audio_input = gr.Audio(type="filepath")
iface = gr.Interface(
    fn=process_audio,
    inputs=_audio_input,
    outputs="text",
    title="Speaker Diarization & Transcription",
    description="Upload an audio file to detect speakers and transcribe speech for each segment.",
)
iface.launch()
# NOTE(review): launch() presumably blocks while the server runs when this is
# executed as a script, so this message likely appears only after shutdown —
# confirm against the Gradio version in use.
print("Interface lancée avec succès !")