Mohssinibra committed (verified)
Commit 6f061b9 · 1 Parent(s): 99623d3

Update app.py

Files changed (1): app.py (+19 -9)
app.py CHANGED
@@ -8,6 +8,8 @@ import os
 import scipy.signal as signal
 import torch
 
+from pydub.silence import detect_nonsilent  # Correct import
+
 hf_token = os.getenv('diarizationToken')
 
 print("Initializing Speech-to-Text Model...")
@@ -34,32 +36,40 @@ def convert_audio_to_wav(audio_path):
     sound.export(wav_path, format="wav")
     return wav_path
 
+
+
 def process_audio(audio_path):
     print(f"Received audio file: {audio_path}")
 
     try:
-        # Convert the input audio to WAV format
-        wav_path = convert_audio_to_wav(audio_path)
-        print(f"Audio converted to WAV: {wav_path}")
-
         # Load the audio file using librosa
-        audio, sr = librosa.load(wav_path, sr=None, duration=30)
+        audio, sr = librosa.load(audio_path, sr=None, duration=30)
         print(f"Audio loaded: {len(audio)} samples at {sr} Hz")
 
         # Remove phone tonalities (if any)
         audio = remove_phone_tonalities(audio, sr)
         print("Phone tonalities removed")
 
+        # Convert to AudioSegment for silence detection
+        sound = AudioSegment.from_wav(audio_path)
+
         # Silence detection: split based on silence
-        sound = AudioSegment.from_wav(wav_path)
         min_silence_len = 1000  # minimum silence length in ms
        silence_thresh = sound.dBFS - 14  # threshold for silence (adjust as needed)
+
+        # Correct usage of detect_nonsilent from pydub.silence
+        nonsilent_chunks = detect_nonsilent(
+            sound,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh
+        )
+
         non_silent_chunks = [
-            sound[start:end] for start, end in sound.detect_nonsilent(min_silence_len=min_silence_len, silence_thresh=silence_thresh)
+            sound[start:end] for start, end in nonsilent_chunks
         ]
 
         # Apply diarization (WhisperX)
-        diarization = diarize_model(wav_path)
+        diarization = diarize_model(audio_path)
 
         transcriptions = []
         for chunk in non_silent_chunks:
@@ -80,7 +90,6 @@ def process_audio(audio_path):
 
         # Clean up temporary files
         os.remove("chunk.wav")
-        os.remove(wav_path)  # Remove converted wav file
 
         return "\n".join(transcriptions)
 
@@ -88,6 +97,7 @@ def process_audio(audio_path):
         print(f"Error: {str(e)}")
         return f"Error: {str(e)}"
 
+
 # Create Gradio interface
 iface = gr.Interface(
     fn=process_audio,
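
The substance of this commit is an API fix: detect_nonsilent is a module-level function in pydub.silence, not a method on AudioSegment, so the old sound.detect_nonsilent(...) call would raise an AttributeError at runtime. A minimal standalone sketch of the corrected pattern (the input filename here is hypothetical):

from pydub import AudioSegment
from pydub.silence import detect_nonsilent

# Load the recording. from_wav(), as used in app.py, assumes WAV input;
# from_file() would auto-detect the container format instead.
sound = AudioSegment.from_wav("call_recording.wav")

# detect_nonsilent returns a list of [start_ms, end_ms] pairs.
nonsilent_ranges = detect_nonsilent(
    sound,
    min_silence_len=1000,           # a pause must last at least 1 s to count as silence
    silence_thresh=sound.dBFS - 14  # 14 dB below the clip's average loudness
)

# AudioSegment slicing is millisecond-based, so the ranges apply directly.
chunks = [sound[start:end] for start, end in nonsilent_ranges]
for i, chunk in enumerate(chunks):
    chunk.export(f"chunk_{i}.wav", format="wav")

Note that with the convert_audio_to_wav step removed, AudioSegment.from_wav(audio_path) now relies on Gradio delivering an actual WAV file; AudioSegment.from_file would tolerate other upload formats.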
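
The diarization call likewise switches from the deleted wav_path to the original audio_path. The construction of diarize_model is outside this diff; assuming it is a whisperx DiarizationPipeline (consistent with the "WhisperX" comment and the Hugging Face token read at startup), the surrounding setup might look like this sketch, where the device choice and filename are assumptions, not shown in the commit:

import os
import whisperx

hf_token = os.getenv("diarizationToken")

# Assumption: this mirrors how diarize_model is created earlier in app.py.
# DiarizationPipeline wraps pyannote speaker diarization and needs an HF token.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device="cpu")

# Calling the pipeline on a file path returns a pandas DataFrame of speaker
# turns (start, end, speaker) that can be aligned with the transcript chunks.
diarization = diarize_model("call_recording.wav")
print(diarization.head())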