Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -4,39 +4,38 @@ import gradio as gr 
     | 
|
| 4 | 
         
             
            import time
         
     | 
| 5 | 
         
             
            import numpy as np
         
     | 
| 6 | 
         
             
            import scipy.io.wavfile
         
     | 
| 7 | 
         
            -
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline 
     | 
| 8 | 
         | 
| 9 | 
         
            -
            # β
 1οΈβ£  
     | 
| 10 | 
         
             
            device = "cpu"
         
     | 
| 11 | 
         
            -
            torch_dtype = torch.float32 
     | 
| 12 | 
         
            -
            MODEL_NAME = "openai/whisper-small" 
     | 
| 13 | 
         | 
| 14 | 
         
            -
            # β
 2οΈβ£  
     | 
| 15 | 
         
            -
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
         
     | 
| 16 | 
         
            -
             
     | 
| 17 | 
         
            -
            # ✅ 3️⃣ Load Whisper Model on CPU with Optimized Settings
         
     | 
| 18 | 
         
             
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
         
     | 
| 19 | 
         
            -
                MODEL_NAME,  
     | 
| 20 | 
         
             
            )
         
     | 
| 21 | 
         
             
            model.to(device)
         
     | 
| 22 | 
         | 
| 23 | 
         
            -
            # β
  
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 24 | 
         
             
            processor = AutoProcessor.from_pretrained(MODEL_NAME)
         
     | 
| 25 | 
         
             
            processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate
         
     | 
| 26 | 
         | 
| 27 | 
         
            -
            # ✅ 5️⃣ Optimized Pipeline with Beam Search for Better Accuracy
         
     | 
| 28 | 
         
             
            pipe = pipeline(
         
     | 
| 29 | 
         
             
                task="automatic-speech-recognition",
         
     | 
| 30 | 
         
             
                model=model,
         
     | 
| 31 | 
         
             
                tokenizer=processor.tokenizer,
         
     | 
| 32 | 
         
             
                feature_extractor=processor.feature_extractor,
         
     | 
| 33 | 
         
            -
                chunk_length_s=5,  # β
  
     | 
| 34 | 
         
             
                torch_dtype=torch_dtype,
         
     | 
| 35 | 
         
             
                device=device,
         
     | 
| 36 | 
         
             
                generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
         
     | 
| 37 | 
         
             
            )
         
     | 
| 38 | 
         | 
| 39 | 
         
            -
            # β
  
     | 
| 40 | 
         
             
            def stream_transcribe(stream, new_chunk):
         
     | 
| 41 | 
         
             
                start_time = time.time()
         
     | 
| 42 | 
         
             
                try:
         
     | 
| 
         @@ -49,7 +48,7 @@ def stream_transcribe(stream, new_chunk): 
     | 
|
| 49 | 
         
             
                    y = y.astype(np.float32)
         
     | 
| 50 | 
         
             
                    y /= np.max(np.abs(y))
         
     | 
| 51 | 
         | 
| 52 | 
         
            -
                    # ✅ Resample audio
     | 
| 53 | 
         
             
                    y_tensor = torch.tensor(y)
         
     | 
| 54 | 
         
             
                    y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
         
     | 
| 55 | 
         | 
| 
         @@ -69,7 +68,7 @@ def stream_transcribe(stream, new_chunk): 
     | 
|
| 69 | 
         
             
                    print(f"Error: {e}")
         
     | 
| 70 | 
         
             
                    return stream, str(e), "Error"
         
     | 
| 71 | 
         | 
| 72 | 
         
            -
            # β
  
     | 
| 73 | 
         
             
            def transcribe(inputs, previous_transcription):
         
     | 
| 74 | 
         
             
                start_time = time.time()
         
     | 
| 75 | 
         
             
                try:
         
     | 
| 
         @@ -91,11 +90,11 @@ def transcribe(inputs, previous_transcription): 
     | 
|
| 91 | 
         
             
                    print(f"Error: {e}")
         
     | 
| 92 | 
         
             
                    return previous_transcription, "Error"
         
     | 
| 93 | 
         | 
| 94 | 
         
            -
            # β
  
     | 
| 95 | 
         
             
            def clear():
         
     | 
| 96 | 
         
             
                return ""
         
     | 
| 97 | 
         | 
| 98 | 
         
            -
            # β
  
     | 
| 99 | 
         
             
            with gr.Blocks() as microphone:
         
     | 
| 100 | 
         
             
                gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
         
     | 
| 101 | 
         
             
                gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
         
     | 
| 
         @@ -115,7 +114,7 @@ with gr.Blocks() as microphone: 
     | 
|
| 115 | 
         
             
                )
         
     | 
| 116 | 
         
             
                clear_button.click(clear, outputs=[output])
         
     | 
| 117 | 
         | 
| 118 | 
         
            -
            # β
  
     | 
| 119 | 
         
             
            with gr.Blocks() as file:
         
     | 
| 120 | 
         
             
                gr.Markdown(f"# Upload Audio File for Transcription 🎵")
         
     | 
| 121 | 
         
             
                gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
         
     | 
| 
         @@ -132,10 +131,10 @@ with gr.Blocks() as file: 
     | 
|
| 132 | 
         
             
                submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
         
     | 
| 133 | 
         
             
                clear_button.click(clear, outputs=[output])
         
     | 
| 134 | 
         | 
| 135 | 
         
            -
            # β
  
     | 
| 136 | 
         
             
            with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         
     | 
| 137 | 
         
             
                gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
         
     | 
| 138 | 
         | 
| 139 | 
         
            -
            # β
 1οΈβ£ 
     | 
| 140 | 
         
             
            if __name__ == "__main__":
         
     | 
| 141 | 
         
             
                demo.launch()
         
     | 
| 
         | 
|
| 4 | 
         
             
            import time
         
     | 
| 5 | 
         
             
            import numpy as np
         
     | 
| 6 | 
         
             
            import scipy.io.wavfile
         
     | 
| 7 | 
         
            +
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
         
     | 
| 8 | 
         | 
| 9 | 
         
            +
            # ✅ 1️⃣ Use "whisper-small" for better accuracy
         
     | 
| 10 | 
         
             
            device = "cpu"
         
     | 
| 11 | 
         
            +
            torch_dtype = torch.float32
         
     | 
| 12 | 
         
            +
            MODEL_NAME = "openai/whisper-small"
         
     | 
| 13 | 
         | 
| 14 | 
         
            +
            # ✅ 2️⃣ Load Whisper Model on CPU (Removed bitsandbytes)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 15 | 
         
             
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
         
     | 
| 16 | 
         
            +
                MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
         
     | 
| 17 | 
         
             
            )
         
     | 
| 18 | 
         
             
            model.to(device)
         
     | 
| 19 | 
         | 
| 20 | 
         
            +
            # ✅ 3️⃣ Speed up execution with torch.compile()
         
     | 
| 21 | 
         
            +
            model = torch.compile(model)  # ✅ Faster inference on CPU
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            # ✅ 4️⃣ Load Processor & Pipeline
         
     | 
| 24 | 
         
             
            processor = AutoProcessor.from_pretrained(MODEL_NAME)
         
     | 
| 25 | 
         
             
            processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate
         
     | 
| 26 | 
         | 
| 
         | 
|
| 27 | 
         
             
            pipe = pipeline(
         
     | 
| 28 | 
         
             
                task="automatic-speech-recognition",
         
     | 
| 29 | 
         
             
                model=model,
         
     | 
| 30 | 
         
             
                tokenizer=processor.tokenizer,
         
     | 
| 31 | 
         
             
                feature_extractor=processor.feature_extractor,
         
     | 
| 32 | 
         
            +
                chunk_length_s=5,  # ✅ Better balance between speed & accuracy
         
     | 
| 33 | 
         
             
                torch_dtype=torch_dtype,
         
     | 
| 34 | 
         
             
                device=device,
         
     | 
| 35 | 
         
             
                generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
         
     | 
| 36 | 
         
             
            )
         
     | 
| 37 | 
         | 
| 38 | 
         
            +
            # ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
         
     | 
| 39 | 
         
             
            def stream_transcribe(stream, new_chunk):
         
     | 
| 40 | 
         
             
                start_time = time.time()
         
     | 
| 41 | 
         
             
                try:
         
     | 
| 
         | 
|
| 48 | 
         
             
                    y = y.astype(np.float32)
         
     | 
| 49 | 
         
             
                    y /= np.max(np.abs(y))
         
     | 
| 50 | 
         | 
| 51 | 
         
            +
                    # ✅ Resample audio using optimized torchaudio method
         
     | 
| 52 | 
         
             
                    y_tensor = torch.tensor(y)
         
     | 
| 53 | 
         
             
                    y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
         
     | 
| 54 | 
         | 
| 
         | 
|
| 68 | 
         
             
                    print(f"Error: {e}")
         
     | 
| 69 | 
         
             
                    return stream, str(e), "Error"
         
     | 
| 70 | 
         | 
| 71 | 
         
            +
            # ✅ 6️⃣ Transcription for File Upload
         
     | 
| 72 | 
         
             
            def transcribe(inputs, previous_transcription):
         
     | 
| 73 | 
         
             
                start_time = time.time()
         
     | 
| 74 | 
         
             
                try:
         
     | 
| 
         | 
|
| 90 | 
         
             
                    print(f"Error: {e}")
         
     | 
| 91 | 
         
             
                    return previous_transcription, "Error"
         
     | 
| 92 | 
         | 
| 93 | 
         
            +
            # ✅ 7️⃣ Clear Function
         
     | 
| 94 | 
         
             
            def clear():
         
     | 
| 95 | 
         
             
                return ""
         
     | 
| 96 | 
         | 
| 97 | 
         
            +
            # ✅ 8️⃣ Gradio Interface (Microphone Streaming)
         
     | 
| 98 | 
         
             
            with gr.Blocks() as microphone:
         
     | 
| 99 | 
         
             
                gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
         
     | 
| 100 | 
         
             
                gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
         
     | 
| 
         | 
|
| 114 | 
         
             
                )
         
     | 
| 115 | 
         
             
                clear_button.click(clear, outputs=[output])
         
     | 
| 116 | 
         | 
| 117 | 
         
            +
            # ✅ 9️⃣ Gradio Interface (File Upload)
         
     | 
| 118 | 
         
             
            with gr.Blocks() as file:
         
     | 
| 119 | 
         
             
                gr.Markdown(f"# Upload Audio File for Transcription 🎵")
         
     | 
| 120 | 
         
             
                gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
         
     | 
| 
         | 
|
| 131 | 
         
             
                submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
         
     | 
| 132 | 
         
             
                clear_button.click(clear, outputs=[output])
         
     | 
| 133 | 
         | 
| 134 | 
         
            +
            # ✅ 🔟 Final Gradio App
         
     | 
| 135 | 
         
             
            with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         
     | 
| 136 | 
         
             
                gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
         
     | 
| 137 | 
         | 
| 138 | 
         
            +
            # ✅ 1️⃣1️⃣ Run Gradio Locally
         
     | 
| 139 | 
         
             
            if __name__ == "__main__":
         
     | 
| 140 | 
         
             
                demo.launch()
         
     |