# NOTE(review): non-code scrape residue removed here (Hugging Face Spaces page
# header, git blob hashes, and a line-number gutter) — it was not part of the
# Python source and prevented the file from parsing.
import gradio as gr
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from transformers import pipeline
# Load the Moroccan Darija Wav2Vec2 ASR model once at module import time so
# every Gradio request reuses the same pipeline. On first run this downloads
# the model weights, which can take a while.
print("Chargement du modèle Wav2Vec2...")
stt_pipeline = pipeline("automatic-speech-recognition", model="boumehdi/wav2vec2-large-xlsr-moroccan-darija")
print("Modèle chargé avec succès !")
def process_audio(audio_path):
    """Transcribe an uploaded audio file with per-segment speaker labels.

    Loads the first 30 seconds of the file, clusters per-frame MFCC vectors
    into two speakers with KMeans (a naive diarization), then transcribes
    consecutive 5-second windows with the module-level ``stt_pipeline`` and
    prefixes each transcription with the cluster label of its first frame.

    Args:
        audio_path: Filesystem path to the audio file (Gradio filepath input).

    Returns:
        Newline-joined "Speaker N: text" lines, or a generic French error
        message if any step fails.
    """
    print(f"Fichier reçu : {audio_path}")
    try:
        # Only the first 30 seconds are processed, to bound latency.
        audio, sr = librosa.load(audio_path, sr=None, duration=30)
        print(f"Audio chargé : {len(audio)} échantillons à {sr} Hz")

        # MFCC features: one 13-dim vector per analysis frame.
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        print(f"MFCC extrait, shape: {mfccs.shape}")

        # Standardize so KMeans distances are not dominated by
        # high-variance coefficients.
        mfccs_scaled = StandardScaler().fit_transform(mfccs.T)
        print("MFCC normalisé.")

        # Naive diarization: cluster frames into exactly two speakers.
        # NOTE(review): the two-speaker assumption is hard-coded by design
        # of the original; changing it would alter the output labels.
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)
        print(f"Clustering terminé, {len(set(speaker_labels))} locuteurs détectés.")

        transcriptions = []
        # Samples covered by each per-frame label. max(1, ...) guards the
        # (degenerate) case of more frames than samples.
        segment_duration = max(1, len(audio) // len(speaker_labels))
        print("Début de la transcription...")
        window = sr * 5  # 5-second transcription windows
        for i in range(0, len(audio), window):
            segment = audio[i : i + window]
            # Skip trailing fragments shorter than one second.
            if len(segment) < sr:
                continue
            # NOTE(review): the raw numpy segment is passed without a
            # sampling_rate; the pipeline assumes its model's rate —
            # confirm inputs are 16 kHz or pass {"raw": ..., "sampling_rate": sr}.
            transcription = stt_pipeline(segment)
            # Clamp the frame index: integer division floors, so for the
            # final window i // segment_duration could overshoot the last
            # label (the original code could raise IndexError here).
            label_idx = min(i // segment_duration, len(speaker_labels) - 1)
            transcriptions.append(
                f"Speaker {speaker_labels[label_idx]}: {transcription['text']}"
            )
            print(f"Segment {i // sr}-{(i + window) // sr}s transcrit.")
        print("Transcription terminée !")
        return "\n".join(transcriptions)
    except Exception as e:
        # Broad catch keeps the Gradio worker alive; log the error and
        # return a user-facing message instead of crashing the request.
        print(f"Erreur : {e}")
        return "Une erreur s'est produite."
# Gradio UI: one audio-file input, plain-text output.
print("Démarrage de Gradio...")
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # hands process_audio a filesystem path
    outputs="text",
    title="Speaker Diarization & Transcription",
    description="Upload an audio file to detect speakers and transcribe speech for each segment."
)
iface.launch()
# NOTE(review): launch() normally blocks in a script, so this print only runs
# after the server shuts down — confirm that is intended.
print("Interface lancée avec succès !")