Spaces:

Emmanuel08
/

CCI_Realtime_Transcribing_model

Sleeping

App Files Community

Emmanuel08 committed on Mar 5

Commit

d63bba0

verified ·

1 Parent(s): f33e6ad

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -20

app.py CHANGED Viewed

@@ -4,39 +4,38 @@ import gradio as gr
 import time
 import numpy as np
 import scipy.io.wavfile
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, BitsAndBytesConfig
-# ✅ 1️⃣ Optimize Model Selection
 device = "cpu"
-torch_dtype = torch.float32  # Use CPU-friendly float type
-MODEL_NAME = "openai/whisper-small"  # ✅ Switched to "small" for better accuracy
-# ✅ 2️⃣ Enable Quantization (Reduces Memory Usage, Speeds Up Inference)
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-# ✅ 3️⃣ Load Whisper Model on CPU with Optimized Settings
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_NAME, quantization_config=quantization_config, torch_dtype=torch_dtype, use_safetensors=True
 )
 model.to(device)
-# ✅ 4️⃣ Load Processor & Set Default Sampling Rate
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
 processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate
-# ✅ 5️⃣ Optimized Pipeline with Beam Search for Better Accuracy
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    chunk_length_s=5,  # ✅ Increase chunk size for better performance
     torch_dtype=torch_dtype,
     device=device,
     generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
 )
-# ✅ 6️⃣ Real-Time Streaming Transcription (Microphone)
 def stream_transcribe(stream, new_chunk):
     start_time = time.time()
     try:
@@ -49,7 +48,7 @@ def stream_transcribe(stream, new_chunk):
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
-        # ✅ Resample audio to 16kHz using optimized torchaudio method
         y_tensor = torch.tensor(y)
         y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
@@ -69,7 +68,7 @@ def stream_transcribe(stream, new_chunk):
         print(f"Error: {e}")
         return stream, str(e), "Error"
-# ✅ 7️⃣ Transcription for File Upload
 def transcribe(inputs, previous_transcription):
     start_time = time.time()
     try:
@@ -91,11 +90,11 @@ def transcribe(inputs, previous_transcription):
         print(f"Error: {e}")
         return previous_transcription, "Error"
-# ✅ 8️⃣ Clear Function
 def clear():
     """Return an empty string; wired to the clear buttons to reset the output component."""
     return ""
-# ✅ 9️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
     gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
     gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
@@ -115,7 +114,7 @@ with gr.Blocks() as microphone:
     )
     clear_button.click(clear, outputs=[output])
-# ✅ 🔟 Gradio Interface (File Upload)
 with gr.Blocks() as file:
     gr.Markdown(f"# Upload Audio File for Transcription 🎵")
     gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
@@ -132,10 +131,10 @@ with gr.Blocks() as file:
     submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
     clear_button.click(clear, outputs=[output])
-# ✅ 1️⃣1️⃣ Final Gradio App
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
-# ✅ 1️⃣2️⃣ Run Gradio Locally
 if __name__ == "__main__":
     demo.launch()

 import time
 import numpy as np
 import scipy.io.wavfile
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+# ✅ 1️⃣ Use "whisper-small" for better accuracy
 device = "cpu"
+torch_dtype = torch.float32
+MODEL_NAME = "openai/whisper-small"
+# ✅ 2️⃣ Load Whisper Model on CPU (Removed bitsandbytes)
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
 )
 model.to(device)
+# ✅ 3️⃣ Speed up execution with torch.compile()
+model = torch.compile(model)  # ✅ Faster inference on CPU
+# ✅ 4️⃣ Load Processor & Pipeline
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
 processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
+    chunk_length_s=5,  # ✅ Better balance between speed & accuracy
     torch_dtype=torch_dtype,
     device=device,
     generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
 )
+# ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
 def stream_transcribe(stream, new_chunk):
     start_time = time.time()
     try:
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
+        # ✅ Resample audio using optimized torchaudio method
         y_tensor = torch.tensor(y)
         y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
         print(f"Error: {e}")
         return stream, str(e), "Error"
+# ✅ 6️⃣ Transcription for File Upload
 def transcribe(inputs, previous_transcription):
     start_time = time.time()
     try:
         print(f"Error: {e}")
         return previous_transcription, "Error"
+# ✅ 7️⃣ Clear Function
 def clear():
     """Return an empty string; wired to the clear buttons to reset the output component."""
     return ""
+# ✅ 8️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
     gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
     gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
     )
     clear_button.click(clear, outputs=[output])
+# ✅ 9️⃣ Gradio Interface (File Upload)
 with gr.Blocks() as file:
     gr.Markdown(f"# Upload Audio File for Transcription 🎵")
     gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
     submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
     clear_button.click(clear, outputs=[output])
+# ✅ 🔟 Final Gradio App
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
+# ✅ 1️⃣1️⃣ Run Gradio Locally
 if __name__ == "__main__":
     demo.launch()