Spaces:

ajsbsd
/

Qwen2.5-1.5B-Instruct-gkd-demo

Running on Zero

App Files Files Community

ajsbsd committed 15 days ago

Commit

4d692df

verified ·

1 Parent(s): 1ffb73e

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -6

app.py CHANGED Viewed

@@ -6,15 +6,15 @@ from transformers import (
     SpeechT5Processor,
     SpeechT5ForTextToSpeech,
     SpeechT5HifiGan,
-    WhisperProcessor, # New: For Speech-to-Text
-    WhisperForConditionalGeneration # New: For Speech-to-Text
 )
 from datasets import load_dataset # To get a speaker embedding for TTS
 import os
 import spaces # Import the spaces library for GPU decorator
 import tempfile # For creating temporary audio files
 import soundfile as sf # To save audio files
-import librosa # New: For loading audio files for transcription
 # --- Configuration for Language Model (LLM) ---
 HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
@@ -30,7 +30,7 @@ TTS_MODEL_ID = "microsoft/speecht5_tts"
 TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
 # --- Configuration for Speech-to-Text (STT) ---
-STT_MODEL_ID = "openai/whisper-tiny" # Using a smaller Whisper model for faster inference
 # --- Global variables for models and tokenizers/processors ---
 tokenizer = None
@@ -39,8 +39,8 @@ tts_processor = None
 tts_model = None
 tts_vocoder = None
 speaker_embeddings = None
-whisper_processor = None # New: Global for Whisper processor
-whisper_model = None # New: Global for Whisper model
 # --- Load All Models Function ---
 @spaces.GPU # Decorate with @spaces.GPU to signal this function needs GPU access
@@ -235,6 +235,8 @@ def transcribe_audio(audio_filepath):
         audio, sample_rate = librosa.load(audio_filepath, sr=16000)
         # Process audio input for the Whisper model
         input_features = whisper_processor(
             audio,
             sampling_rate=sample_rate,
@@ -298,6 +300,8 @@ with gr.Blocks() as demo:
             type="filepath",
             label="Upload Audio or Record from Microphone",
             # Removed 'microphone=True' and 'source' as they cause TypeError with older Gradio versions
             format="wav" # Ensure consistent format
         )
         transcribe_button = gr.Button("Transcribe Audio")

     SpeechT5Processor,
     SpeechT5ForTextToSpeech,
     SpeechT5HifiGan,
+    WhisperProcessor, # For Speech-to-Text
+    WhisperForConditionalGeneration # For Speech-to-Text
 )
 from datasets import load_dataset # To get a speaker embedding for TTS
 import os
 import spaces # Import the spaces library for GPU decorator
 import tempfile # For creating temporary audio files
 import soundfile as sf # To save audio files
+import librosa # For loading audio files for transcription
 # --- Configuration for Language Model (LLM) ---
 HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
 TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
 # --- Configuration for Speech-to-Text (STT) ---
+STT_MODEL_ID = "openai/whisper-small" # Changed from 'openai/whisper-tiny' for better long audio transcription
 # --- Global variables for models and tokenizers/processors ---
 tokenizer = None
 tts_model = None
 tts_vocoder = None
 speaker_embeddings = None
+whisper_processor = None
+whisper_model = None
 # --- Load All Models Function ---
 @spaces.GPU # Decorate with @spaces.GPU to signal this function needs GPU access
         audio, sample_rate = librosa.load(audio_filepath, sr=16000)
         # Process audio input for the Whisper model
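+        # Whisper operates on 30-second audio windows. `generate` can transcribe longer
+        # audio (long-form mode), but only if the processor does not truncate the input;
+        # this depends on the call arguments, not on model size -- verify for long files.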
         input_features = whisper_processor(
             audio,
             sampling_rate=sample_rate,
             type="filepath",
             label="Upload Audio or Record from Microphone",
             # Removed 'microphone=True' and 'source' as they cause TypeError with older Gradio versions
+            # If you are still seeing TypeError for 'microphone', your Gradio version might be very old.
+            # In that case, this component will only support file uploads.
             format="wav" # Ensure consistent format
         )
         transcribe_button = gr.Button("Transcribe Audio")