Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -6,15 +6,15 @@ from transformers import (
|
|
6 |
SpeechT5Processor,
|
7 |
SpeechT5ForTextToSpeech,
|
8 |
SpeechT5HifiGan,
|
9 |
-
WhisperProcessor, #
|
10 |
-
WhisperForConditionalGeneration #
|
11 |
)
|
12 |
from datasets import load_dataset # To get a speaker embedding for TTS
|
13 |
import os
|
14 |
import spaces # Import the spaces library for GPU decorator
|
15 |
import tempfile # For creating temporary audio files
|
16 |
import soundfile as sf # To save audio files
|
17 |
-
import librosa #
|
18 |
|
19 |
# --- Configuration for Language Model (LLM) ---
|
20 |
HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
|
@@ -30,7 +30,7 @@ TTS_MODEL_ID = "microsoft/speecht5_tts"
|
|
30 |
TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
|
31 |
|
32 |
# --- Configuration for Speech-to-Text (STT) ---
|
33 |
-
STT_MODEL_ID = "openai/whisper-tiny"
|
34 |
|
35 |
# --- Global variables for models and tokenizers/processors ---
|
36 |
tokenizer = None
|
@@ -39,8 +39,8 @@ tts_processor = None
|
|
39 |
tts_model = None
|
40 |
tts_vocoder = None
|
41 |
speaker_embeddings = None
|
42 |
-
whisper_processor = None
|
43 |
-
whisper_model = None
|
44 |
|
45 |
# --- Load All Models Function ---
|
46 |
@spaces.GPU # Decorate with @spaces.GPU to signal this function needs GPU access
|
@@ -235,6 +235,8 @@ def transcribe_audio(audio_filepath):
|
|
235 |
audio, sample_rate = librosa.load(audio_filepath, sr=16000)
|
236 |
|
237 |
# Process audio input for the Whisper model
|
|
|
|
|
238 |
input_features = whisper_processor(
|
239 |
audio,
|
240 |
sampling_rate=sample_rate,
|
@@ -298,6 +300,8 @@ with gr.Blocks() as demo:
|
|
298 |
type="filepath",
|
299 |
label="Upload Audio or Record from Microphone",
|
300 |
# Removed 'microphone=True' and 'source' as they cause TypeError with older Gradio versions
|
|
|
|
|
301 |
format="wav" # Ensure consistent format
|
302 |
)
|
303 |
transcribe_button = gr.Button("Transcribe Audio")
|
|
|
6 |
SpeechT5Processor,
|
7 |
SpeechT5ForTextToSpeech,
|
8 |
SpeechT5HifiGan,
|
9 |
+
WhisperProcessor, # For Speech-to-Text
|
10 |
+
WhisperForConditionalGeneration # For Speech-to-Text
|
11 |
)
|
12 |
from datasets import load_dataset # To get a speaker embedding for TTS
|
13 |
import os
|
14 |
import spaces # Import the spaces library for GPU decorator
|
15 |
import tempfile # For creating temporary audio files
|
16 |
import soundfile as sf # To save audio files
|
17 |
+
import librosa # For loading audio files for transcription
|
18 |
|
19 |
# --- Configuration for Language Model (LLM) ---
|
20 |
HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
|
|
|
30 |
TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
|
31 |
|
32 |
# --- Configuration for Speech-to-Text (STT) ---
|
33 |
+
STT_MODEL_ID = "openai/whisper-small" # Changed from 'openai/whisper-tiny' for better long audio transcription
|
34 |
|
35 |
# --- Global variables for models and tokenizers/processors ---
|
36 |
tokenizer = None
|
|
|
39 |
tts_model = None
|
40 |
tts_vocoder = None
|
41 |
speaker_embeddings = None
|
42 |
+
whisper_processor = None
|
43 |
+
whisper_model = None
|
44 |
|
45 |
# --- Load All Models Function ---
|
46 |
@spaces.GPU # Decorate with @spaces.GPU to signal this function needs GPU access
|
|
|
235 |
audio, sample_rate = librosa.load(audio_filepath, sr=16000)
|
236 |
|
237 |
# Process audio input for the Whisper model
|
238 |
+
# NOTE: by default the Whisper processor pads or truncates audio to a 30-second window,
|
239 |
+
# so longer recordings need chunking or long-form settings (e.g. truncation=False) - verify the call below.
|
240 |
input_features = whisper_processor(
|
241 |
audio,
|
242 |
sampling_rate=sample_rate,
|
|
|
300 |
type="filepath",
|
301 |
label="Upload Audio or Record from Microphone",
|
302 |
# 'microphone=True' and 'source' are omitted because they raise TypeError with the installed Gradio version
|
303 |
+
# NOTE: Gradio 4+ replaced the `source` argument with `sources=[...]`, so this TypeError may indicate a newer (not older) Gradio version.
|
304 |
+
# Without an explicit `sources` argument, the available inputs (upload and/or microphone) depend on that version's defaults.
|
305 |
format="wav" # Ensure consistent format
|
306 |
)
|
307 |
transcribe_button = gr.Button("Transcribe Audio")
|