Mohssinibra committed on
Commit
d19f3e0
·
verified ·
1 Parent(s): 54472e1

diarizationNative

Files changed (1)
  1. app.py +35 -90
app.py CHANGED
@@ -1,113 +1,59 @@
  import gradio as gr
  import librosa
  import numpy as np
- import whisperx
  from transformers import pipeline
- from pydub import AudioSegment
- import os
- import scipy.signal as signal
- import torch
- from pydub.utils import mediainfo
- from pydub.silence import detect_nonsilent  # Correct import
- import pandas as pd

- # Load Hugging Face token
- hf_token = os.getenv('diarizationToken')
-
- print("Initializing Speech-to-Text Model...")
  stt_pipeline = pipeline("automatic-speech-recognition", model="boumehdi/wav2vec2-large-xlsr-moroccan-darija")
- print("Model Loaded Successfully.")
-
- # Initialize WhisperX with diarization
- device = "cuda" if torch.cuda.is_available() else "cpu"
- whisper_model = whisperx.load_model("large-v2", device)
- diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
- print("WhisperX Model Loaded Successfully.")
-
- def remove_phone_tonalities(audio, sr):
-     nyquist = 0.5 * sr
-     low_cut = 300 / nyquist
-     high_cut = 3400 / nyquist
-     b, a = signal.butter(1, [low_cut, high_cut], btype='band')
-     filtered_audio = signal.filtfilt(b, a, audio)
-     return filtered_audio
-
- def convert_audio_to_wav(audio_path):
-     """Convert any supported audio format to WAV."""
-     audio_info = mediainfo(audio_path)
-     print(f"Audio file info: {audio_info}")
-
-     if audio_info['format_name'] not in ['wav', 'mp3', 'flac', 'ogg']:
-         raise ValueError(f"Unsupported audio format: {audio_info['format_name']}")
-
-     try:
-         sound = AudioSegment.from_file(audio_path)
-         wav_path = "converted_audio.wav"
-         sound.export(wav_path, format="wav")
-         return wav_path
-     except Exception as e:
-         print(f"Error converting audio: {e}")
-         raise

  def process_audio(audio_path):
-     """Process the audio: remove noise, split, diarize, and transcribe."""
-     print(f"Received audio file: {audio_path}")

      try:
-         # Load the audio file
          audio, sr = librosa.load(audio_path, sr=None, duration=30)
-         print(f"Audio loaded: {len(audio)} samples at {sr} Hz")
-
-         # Remove phone tonalities
-         audio = remove_phone_tonalities(audio, sr)
-         print("Phone tonalities removed")
-
-         # Convert to AudioSegment for silence detection
-         sound = AudioSegment.from_wav(audio_path)

-         # Silence detection
-         min_silence_len = 1000  # Minimum silence length in ms
-         silence_thresh = sound.dBFS - 14  # Threshold for silence detection

-         nonsilent_chunks = detect_nonsilent(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

-         # Apply diarization
-         diarization = diarize_model(audio_path)
-
-         if isinstance(diarization, pd.DataFrame):
-             diarization = diarization.to_dict(orient="records")

          transcriptions = []

-         for start, end in nonsilent_chunks:
-             chunk = sound[start:end]
-             chunk.export("chunk.wav", format="wav")
-
-             # Track start time manually
-             chunk_start_time = start / 1000.0  # Convert ms to seconds

-             chunk_audio, chunk_sr = librosa.load("chunk.wav", sr=None)
-             transcription = stt_pipeline(chunk_audio)

-             # Match transcription segment with diarization
-             speaker_label = "Unknown"
-             for speaker in diarization:
-                 spk_start, spk_end, label = speaker['start'], speaker['end'], speaker['label']
-                 if spk_start <= chunk_start_time <= spk_end:  # Use manually tracked start time
-                     speaker_label = label
-                     break
-
-             transcriptions.append(f"Speaker {speaker_label}: {transcription['text']}")
-
-             os.remove("chunk.wav")  # Clean up temporary file
-
          return "\n".join(transcriptions)
-
      except Exception as e:
-         print(f"Error: {str(e)}")
-         return f"Error: {str(e)}"

- # Create Gradio interface
  iface = gr.Interface(
      fn=process_audio,
      inputs=gr.Audio(type="filepath"),
@@ -116,6 +62,5 @@ iface = gr.Interface(
      description="Upload an audio file to detect speakers and transcribe speech for each segment."
  )

- print("Launching Gradio Interface...")
  iface.launch()
- print("Gradio Interface Launched Successfully.")
 
  import gradio as gr
  import librosa
  import numpy as np
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.cluster import KMeans
  from transformers import pipeline

+ print("Loading the Wav2Vec2 model...")
  stt_pipeline = pipeline("automatic-speech-recognition", model="boumehdi/wav2vec2-large-xlsr-moroccan-darija")
+ print("Model loaded successfully!")

  def process_audio(audio_path):
+     print(f"Received file: {audio_path}")

      try:
+         # Load only the first 30 seconds
          audio, sr = librosa.load(audio_path, sr=None, duration=30)
+         print(f"Audio loaded: {len(audio)} samples at {sr} Hz")

+         # Extract MFCC features
+         mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
+         print(f"MFCCs extracted, shape: {mfccs.shape}")

+         # Normalize the features
+         scaler = StandardScaler()
+         mfccs_scaled = scaler.fit_transform(mfccs.T)
+         print("MFCCs normalized.")

+         # Cluster frames into speakers with KMeans
+         kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
+         speaker_labels = kmeans.fit_predict(mfccs_scaled)
+         print(f"Clustering done, {len(set(speaker_labels))} speakers detected.")

+         # Segmentation and transcription
          transcriptions = []
+         segment_duration = len(audio) // len(speaker_labels)  # samples per MFCC frame

+         print("Starting transcription...")
+         for i in range(0, len(audio), sr * 5):
+             segment = audio[i : i + sr * 5]
+             if len(segment) < sr:
+                 continue

+             transcription = stt_pipeline(segment)  # Transcribe the 5-second segment
+             transcriptions.append(f"Speaker {speaker_labels[i // segment_duration]}: {transcription['text']}")
+             print(f"Segment {i // sr}-{(i + sr * 5) // sr}s transcribed.")

+         print("Transcription finished!")

          return "\n".join(transcriptions)
+
      except Exception as e:
+         print(f"Error: {e}")
+         return "An error occurred."

+ # Gradio interface
+ print("Starting Gradio...")
  iface = gr.Interface(
      fn=process_audio,
      inputs=gr.Audio(type="filepath"),

      description="Upload an audio file to detect speakers and transcribe speech for each segment."
  )

  iface.launch()
+ print("Interface launched successfully!")