ajsbsd committed on
Commit
4d692df
·
verified ·
1 Parent(s): 1ffb73e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -6,15 +6,15 @@ from transformers import (
6
  SpeechT5Processor,
7
  SpeechT5ForTextToSpeech,
8
  SpeechT5HifiGan,
9
- WhisperProcessor, # New: For Speech-to-Text
10
- WhisperForConditionalGeneration # New: For Speech-to-Text
11
  )
12
  from datasets import load_dataset # To get a speaker embedding for TTS
13
  import os
14
  import spaces # Import the spaces library for GPU decorator
15
  import tempfile # For creating temporary audio files
16
  import soundfile as sf # To save audio files
17
- import librosa # New: For loading audio files for transcription
18
 
19
  # --- Configuration for Language Model (LLM) ---
20
  HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
@@ -30,7 +30,7 @@ TTS_MODEL_ID = "microsoft/speecht5_tts"
30
  TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
31
 
32
  # --- Configuration for Speech-to-Text (STT) ---
33
- STT_MODEL_ID = "openai/whisper-tiny" # Using a smaller Whisper model for faster inference
34
 
35
  # --- Global variables for models and tokenizers/processors ---
36
  tokenizer = None
@@ -39,8 +39,8 @@ tts_processor = None
39
  tts_model = None
40
  tts_vocoder = None
41
  speaker_embeddings = None
42
- whisper_processor = None # New: Global for Whisper processor
43
- whisper_model = None # New: Global for Whisper model
44
 
45
  # --- Load All Models Function ---
46
  @spaces.GPU # Decorate with @spaces.GPU to signal this function needs GPU access
@@ -235,6 +235,8 @@ def transcribe_audio(audio_filepath):
235
  audio, sample_rate = librosa.load(audio_filepath, sr=16000)
236
 
237
  # Process audio input for the Whisper model
 
 
238
  input_features = whisper_processor(
239
  audio,
240
  sampling_rate=sample_rate,
@@ -298,6 +300,8 @@ with gr.Blocks() as demo:
298
  type="filepath",
299
  label="Upload Audio or Record from Microphone",
300
  # Removed 'microphone=True' and 'source' as they cause TypeError with older Gradio versions
 
 
301
  format="wav" # Ensure consistent format
302
  )
303
  transcribe_button = gr.Button("Transcribe Audio")
 
6
  SpeechT5Processor,
7
  SpeechT5ForTextToSpeech,
8
  SpeechT5HifiGan,
9
+ WhisperProcessor, # For Speech-to-Text
10
+ WhisperForConditionalGeneration # For Speech-to-Text
11
  )
12
  from datasets import load_dataset # To get a speaker embedding for TTS
13
  import os
14
  import spaces # Import the spaces library for GPU decorator
15
  import tempfile # For creating temporary audio files
16
  import soundfile as sf # To save audio files
17
+ import librosa # For loading audio files for transcription
18
 
19
  # --- Configuration for Language Model (LLM) ---
20
  HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
 
30
  TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
31
 
32
  # --- Configuration for Speech-to-Text (STT) ---
33
+ STT_MODEL_ID = "openai/whisper-small" # Changed from 'openai/whisper-tiny' for better long audio transcription
34
 
35
  # --- Global variables for models and tokenizers/processors ---
36
  tokenizer = None
 
39
  tts_model = None
40
  tts_vocoder = None
41
  speaker_embeddings = None
42
+ whisper_processor = None
43
+ whisper_model = None
44
 
45
  # --- Load All Models Function ---
46
  @spaces.GPU # Decorate with @spaces.GPU to signal this function needs GPU access
 
235
  audio, sample_rate = librosa.load(audio_filepath, sr=16000)
236
 
237
  # Process audio input for the Whisper model
238
+ # NOTE: by default the Whisper processor pads or truncates audio to a 30-second window,
239
+ # so longer recordings need long-form generation (e.g. truncation=False) or chunking to be fully transcribed.
240
  input_features = whisper_processor(
241
  audio,
242
  sampling_rate=sample_rate,
 
300
  type="filepath",
301
  label="Upload Audio or Record from Microphone",
302
  # Removed 'microphone=True' and 'source' as they cause TypeError with older Gradio versions
303
+ # If you are still seeing TypeError for 'microphone', your Gradio version might be very old.
304
+ # In that case, this component will only support file uploads.
305
  format="wav" # Ensure consistent format
306
  )
307
  transcribe_button = gr.Button("Transcribe Audio")