Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,8 @@ import os
|
|
8 |
import scipy.signal as signal
|
9 |
import torch
|
10 |
|
|
|
|
|
11 |
hf_token = os.getenv('diarizationToken')
|
12 |
|
13 |
print("Initializing Speech-to-Text Model...")
|
@@ -34,32 +36,40 @@ def convert_audio_to_wav(audio_path):
|
|
34 |
sound.export(wav_path, format="wav")
|
35 |
return wav_path
|
36 |
|
|
|
|
|
37 |
def process_audio(audio_path):
|
38 |
print(f"Received audio file: {audio_path}")
|
39 |
|
40 |
try:
|
41 |
-
# Convert the input audio to WAV format
|
42 |
-
wav_path = convert_audio_to_wav(audio_path)
|
43 |
-
print(f"Audio converted to WAV: {wav_path}")
|
44 |
-
|
45 |
# Load the audio file using librosa
|
46 |
-
audio, sr = librosa.load(
|
47 |
print(f"Audio loaded: {len(audio)} samples at {sr} Hz")
|
48 |
|
49 |
# Remove phone tonalities (if any)
|
50 |
audio = remove_phone_tonalities(audio, sr)
|
51 |
print("Phone tonalities removed")
|
52 |
|
|
|
|
|
|
|
53 |
# Silence detection: split based on silence
|
54 |
-
sound = AudioSegment.from_wav(wav_path)
|
55 |
min_silence_len = 1000 # minimum silence length in ms
|
56 |
silence_thresh = sound.dBFS - 14 # threshold for silence (adjust as needed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
non_silent_chunks = [
|
58 |
-
sound[start:end] for start, end in
|
59 |
]
|
60 |
|
61 |
# Apply diarization (WhisperX)
|
62 |
-
diarization = diarize_model(
|
63 |
|
64 |
transcriptions = []
|
65 |
for chunk in non_silent_chunks:
|
@@ -80,7 +90,6 @@ def process_audio(audio_path):
|
|
80 |
|
81 |
# Clean up temporary files
|
82 |
os.remove("chunk.wav")
|
83 |
-
os.remove(wav_path) # Remove converted wav file
|
84 |
|
85 |
return "\n".join(transcriptions)
|
86 |
|
@@ -88,6 +97,7 @@ def process_audio(audio_path):
|
|
88 |
print(f"Error: {str(e)}")
|
89 |
return f"Error: {str(e)}"
|
90 |
|
|
|
91 |
# Create Gradio interface
|
92 |
iface = gr.Interface(
|
93 |
fn=process_audio,
|
|
|
8 |
import scipy.signal as signal
|
9 |
import torch
|
10 |
|
11 |
+
from pydub.silence import detect_nonsilent # Correct import
|
12 |
+
|
13 |
hf_token = os.getenv('diarizationToken')
|
14 |
|
15 |
print("Initializing Speech-to-Text Model...")
|
|
|
36 |
sound.export(wav_path, format="wav")
|
37 |
return wav_path
|
38 |
|
39 |
+
|
40 |
+
|
41 |
def process_audio(audio_path):
|
42 |
print(f"Received audio file: {audio_path}")
|
43 |
|
44 |
try:
|
|
|
|
|
|
|
|
|
45 |
# Load the audio file using librosa
|
46 |
+
audio, sr = librosa.load(audio_path, sr=None, duration=30)
|
47 |
print(f"Audio loaded: {len(audio)} samples at {sr} Hz")
|
48 |
|
49 |
# Remove phone tonalities (if any)
|
50 |
audio = remove_phone_tonalities(audio, sr)
|
51 |
print("Phone tonalities removed")
|
52 |
|
53 |
+
# Convert to AudioSegment for silence detection
|
54 |
+
sound = AudioSegment.from_wav(audio_path)
|
55 |
+
|
56 |
# Silence detection: split based on silence
|
|
|
57 |
min_silence_len = 1000 # minimum silence length in ms
|
58 |
silence_thresh = sound.dBFS - 14 # threshold for silence (adjust as needed)
|
59 |
+
|
60 |
+
# Correct usage of detect_nonsilent from pydub.silence
|
61 |
+
nonsilent_chunks = detect_nonsilent(
|
62 |
+
sound,
|
63 |
+
min_silence_len=min_silence_len,
|
64 |
+
silence_thresh=silence_thresh
|
65 |
+
)
|
66 |
+
|
67 |
non_silent_chunks = [
|
68 |
+
sound[start:end] for start, end in nonsilent_chunks
|
69 |
]
|
70 |
|
71 |
# Apply diarization (WhisperX)
|
72 |
+
diarization = diarize_model(audio_path)
|
73 |
|
74 |
transcriptions = []
|
75 |
for chunk in non_silent_chunks:
|
|
|
90 |
|
91 |
# Clean up temporary files
|
92 |
os.remove("chunk.wav")
|
|
|
93 |
|
94 |
return "\n".join(transcriptions)
|
95 |
|
|
|
97 |
print(f"Error: {str(e)}")
|
98 |
return f"Error: {str(e)}"
|
99 |
|
100 |
+
|
101 |
# Create Gradio interface
|
102 |
iface = gr.Interface(
|
103 |
fn=process_audio,
|