Mohssinibra committed on
Commit
f9b4788
·
verified ·
1 Parent(s): 4bfde29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -3
app.py CHANGED
@@ -38,6 +38,38 @@ def convert_audio_to_wav(audio_path):
38
 
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def process_audio(audio_path):
42
  print(f"Received audio file: {audio_path}")
43
 
@@ -71,6 +103,11 @@ def process_audio(audio_path):
71
  # Apply diarization (WhisperX)
72
  diarization = diarize_model(audio_path)
73
 
 
 
 
 
 
74
  transcriptions = []
75
  for chunk in non_silent_chunks:
76
  chunk.export("chunk.wav", format="wav")
@@ -79,8 +116,8 @@ def process_audio(audio_path):
79
 
80
  # Match transcription segment with diarization result
81
  speaker_label = "Unknown"
82
- for segment in diarization.itertracks(yield_label=True):
83
- spk_start, spk_end, label = segment
84
  # Adjust timestamp matching
85
  if spk_start <= (chunk.start_time / 1000) <= spk_end: # Convert ms to seconds
86
  speaker_label = label
@@ -97,7 +134,6 @@ def process_audio(audio_path):
97
  print(f"Error: {str(e)}")
98
  return f"Error: {str(e)}"
99
 
100
-
101
  # Create Gradio interface
102
  iface = gr.Interface(
103
  fn=process_audio,
 
38
 
39
 
40
 
41
+ import gradio as gr
42
+ import librosa
43
+ import numpy as np
44
+ import whisperx
45
+ from transformers import pipeline
46
+ from pydub import AudioSegment
47
+ import os
48
+ import scipy.signal as signal
49
+ import torch
50
+ import pandas as pd
51
+ from pydub.silence import detect_nonsilent
52
+
53
# Read the Hugging Face auth token for the diarization pipeline from the
# environment (None when the variable is unset).
hf_token = os.getenv('diarizationToken')

print("Initializing Speech-to-Text Model...")
# Darija (Moroccan Arabic) wav2vec2 ASR model served through the
# transformers pipeline API.
stt_pipeline = pipeline("automatic-speech-recognition", model="boumehdi/wav2vec2-large-xlsr-moroccan-darija")
print("Model Loaded Successfully.")

# Initialize WhisperX with diarization
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
whisper_model = whisperx.load_model("large-v2", device)
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
print("WhisperX Model Loaded Successfully.")
64
+
65
def remove_phone_tonalities(audio, sr, low_hz=300.0, high_hz=3400.0):
    """Band-pass filter an audio signal to the telephone voice band.

    Applies a first-order Butterworth band-pass filter (defaulting to the
    classic 300-3400 Hz narrowband telephony range) using zero-phase
    forward-backward filtering, so the output is not time-shifted
    relative to the input.

    Parameters
    ----------
    audio : array_like
        1-D array of audio samples.
    sr : int or float
        Sample rate of ``audio`` in Hz; must satisfy ``high_hz < sr / 2``.
    low_hz : float, optional
        Lower cutoff frequency in Hz (default 300.0).
    high_hz : float, optional
        Upper cutoff frequency in Hz (default 3400.0).

    Returns
    -------
    numpy.ndarray
        The filtered signal, same length as ``audio``.
    """
    nyquist = 0.5 * sr
    # butter() expects cutoffs normalized to the Nyquist frequency (0..1).
    low_cut = low_hz / nyquist
    high_cut = high_hz / nyquist
    b, a = signal.butter(1, [low_cut, high_cut], btype='band')
    # filtfilt runs the filter forward then backward: zero phase distortion.
    return signal.filtfilt(b, a, audio)
72
+
73
  def process_audio(audio_path):
74
  print(f"Received audio file: {audio_path}")
75
 
 
103
  # Apply diarization (WhisperX)
104
  diarization = diarize_model(audio_path)
105
 
106
+ # Check if diarization is a DataFrame and process accordingly
107
+ if isinstance(diarization, pd.DataFrame):
108
+ print("Diarization is a DataFrame")
109
+ diarization = diarization.to_dict(orient="records") # Convert DataFrame to a list of dicts
110
+
111
  transcriptions = []
112
  for chunk in non_silent_chunks:
113
  chunk.export("chunk.wav", format="wav")
 
116
 
117
  # Match transcription segment with diarization result
118
  speaker_label = "Unknown"
119
+ for speaker in diarization:
120
+ spk_start, spk_end, label = speaker['start'], speaker['end'], speaker['label']
121
  # Adjust timestamp matching
122
  if spk_start <= (chunk.start_time / 1000) <= spk_end: # Convert ms to seconds
123
  speaker_label = label
 
134
  print(f"Error: {str(e)}")
135
  return f"Error: {str(e)}"
136
 
 
137
  # Create Gradio interface
138
  iface = gr.Interface(
139
  fn=process_audio,