Spaces:

Emmanuel08
/

CCI_Realtime_Transcribing_model

Sleeping

App Files Community

Emmanuel08 committed on Mar 5

Commit

a612649

verified ·

1 Parent(s): e8b3dee

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -8

app.py CHANGED Viewed

@@ -6,12 +6,12 @@ import numpy as np
 import scipy.io.wavfile
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-# ✅ 1️⃣ Use "whisper-small" for better accuracy
 device = "cpu"
 torch_dtype = torch.float32
-MODEL_NAME = "openai/whisper-small"
-# ✅ 2️⃣ Load Whisper Model on CPU (Removed bitsandbytes)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
 )
@@ -29,10 +29,10 @@ pipe = pipeline(
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    chunk_length_s=5,  # ✅ Better balance between speed & accuracy
     torch_dtype=torch_dtype,
     device=device,
-    generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
 )
 # ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
@@ -48,7 +48,7 @@ def stream_transcribe(stream, new_chunk):
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
-        # ✅ Resample audio using optimized torchaudio method
         y_tensor = torch.tensor(y)
         y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
@@ -96,8 +96,8 @@ def clear():
 # ✅ 8️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
-    gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
-    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
     with gr.Row():
         input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)

 import scipy.io.wavfile
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+# ✅ 1️⃣ Use "whisper-medium" for the best balance of speed & accuracy
 device = "cpu"
 torch_dtype = torch.float32
+MODEL_NAME = "openai/whisper-medium"
+# ✅ 2️⃣ Load Whisper Model on CPU
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
 )
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
+    chunk_length_s=10,  # ✅ Longer chunks for better accuracy
     torch_dtype=torch_dtype,
     device=device,
+    generate_kwargs={"num_beams": 5, "language": "en", "temperature": 0.1},  # ✅ Beam search + English
 )
 # ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
+        # ✅ Resample audio to 16kHz using torchaudio
         y_tensor = torch.tensor(y)
         y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
 # ✅ 8️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
+    gr.Markdown(f"# Whisper Medium - High Accuracy Transcription (Optimized CPU) 🎙️")
+    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for best speech-to-text performance.")
     with gr.Row():
         input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)