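# Speaker diarization and transcription demo: MFCC features are clustered with
# KMeans to approximate "who spoke when", and each 5-second chunk is transcribed
# with a Wav2Vec2 model fine-tuned for Moroccan Darija.
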
import gradio as gr
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from transformers import pipeline

print("Chargement du modèle Wav2Vec2...")  
stt_pipeline = pipeline("automatic-speech-recognition", model="boumehdi/wav2vec2-large-xlsr-moroccan-darija")
print("Modèle chargé avec succès !")

def process_audio(audio_path):
    print(f"Fichier reçu : {audio_path}")

    try:
        # Load only the first 30 seconds, resampled to 16 kHz (the rate Wav2Vec2 models expect)
        audio, sr = librosa.load(audio_path, sr=16000, duration=30)
        print(f"Audio loaded: {len(audio)} samples at {sr} Hz")

        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        print(f"MFCCs extracted, shape: {mfccs.shape}")

        # Normalize (transpose so each MFCC frame is one sample)
        scaler = StandardScaler()
        mfccs_scaled = scaler.fit_transform(mfccs.T)
        print("MFCCs normalized.")

        # Cluster frames with KMeans
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        speaker_labels = kmeans.fit_predict(mfccs_scaled)
        print(f"Clustering done, {len(set(speaker_labels))} speakers detected.")

        # Segmentation and transcription
        transcriptions = []
        # Approximate samples per MFCC frame, used to map a sample index back to a frame label
        samples_per_frame = len(audio) // len(speaker_labels)

        print("Starting transcription...")
        for i in range(0, len(audio), sr * 5):
            segment = audio[i : i + sr * 5]
            if len(segment) < sr:  # skip trailing chunks shorter than one second
                continue

            # Pass the sampling rate explicitly so the pipeline can resample if needed
            transcription = stt_pipeline({"raw": segment, "sampling_rate": sr})
            # Clamp the frame index so the last segment cannot run past the label array
            frame_idx = min(i // samples_per_frame, len(speaker_labels) - 1)
            transcriptions.append(f"Speaker {speaker_labels[frame_idx]}: {transcription['text']}")
            print(f"Segment {i // sr}-{(i + sr * 5) // sr}s transcribed.")

        print("Transcription finished!")
        return "\n".join(transcriptions)

    except Exception as e:
        print(f"Error: {e}")
        return "An error occurred."

# Gradio interface
print("Starting Gradio...")
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Speaker Diarization & Transcription",
    description="Upload an audio file to detect speakers and transcribe speech for each segment."
)

print("Launching the interface...")
iface.launch()  # blocks here until the server is stopped