import gradio as gr
import torch
import librosa
import numpy as np
import webrtcvad
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
# Model names
TN_MODEL_NAME = "amenIKh/Tunisian_Checkpoint12"
WHISPER_MODEL_NAME = "openai/whisper-small"
# Initialize the Tunisian ASR pipeline
pipe_tn = pipeline(
    task="automatic-speech-recognition",
    model=TN_MODEL_NAME,
    device=0 if torch.cuda.is_available() else -1,
)
# Load the Whisper model and processor
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model.to(device)
# Apply voice activity detection (VAD) and keep only voiced frames.
# webrtcvad expects 16-bit mono PCM at 8/16/32/48 kHz in 10/20/30 ms frames.
def apply_vad(audio, sr, frame_duration_ms=30):
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # Aggressiveness mode: 0 (least) to 3 (most aggressive)
    frame_size = int(sr * frame_duration_ms / 1000)
    # librosa returns float32 in [-1, 1]; convert to 16-bit PCM for webrtcvad
    pcm = (audio * 32767).astype(np.int16)
    offset = 0
    voiced_frames = []
    while offset + frame_size <= len(pcm):
        frame = pcm[offset:offset + frame_size]
        if vad.is_speech(frame.tobytes(), sr):
            voiced_frames.append(frame)
        offset += frame_size
    if len(voiced_frames) == 0:
        return audio  # Return the original audio if no voiced frames are detected
    # Convert back to float32 for the downstream feature extractors
    voiced_audio = np.concatenate(voiced_frames).astype(np.float32) / 32767.0
    return voiced_audio
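# A minimal usage sketch for apply_vad ("sample.wav" is a placeholder file
# name, not part of this Space):
#   y, _ = librosa.load("sample.wav", sr=16000)
#   trimmed = apply_vad(y, 16000)  # float32 audio with non-speech frames dropped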
# Transcribe audio with the model matching the selected language
def transcribe_audio(audio, language):
    try:
        # Load the file and resample to 16 kHz (required by webrtcvad and Whisper)
        sr = 16000
        audio, _ = librosa.load(audio, sr=sr)
        # Apply VAD to drop non-speech segments
        voiced_audio = apply_vad(audio, sr)
        # Select the correct model based on language
        if language == "tn":
            result = pipe_tn({"raw": voiced_audio, "sampling_rate": sr})
            transcription = result.get("text", "")
        elif language in ["fr", "en"]:
            forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe")
            input_features = whisper_processor(
                voiced_audio, sampling_rate=sr, return_tensors="pt"
            ).input_features.to(device)
            generated_ids = whisper_model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids,
            )
            transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        else:
            return "Unsupported language specified"
        return transcription
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
# Gradio callback: with type="filepath", Gradio passes a path string for both
# uploaded files and microphone recordings
def gradio_interface(audio, language):
    try:
        if audio is None:
            return "No audio provided"
        return transcribe_audio(audio, language)
    except Exception as e:
        return f"An error occurred: {str(e)}"
# Create the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio"),
        gr.Dropdown(choices=["tn", "fr", "en"], label="Select Language"),
    ],
    outputs="text",
    title="ASR Transcription Service",
    description="Upload an audio file and select the language to transcribe the audio.",
)
# Launch the app
iface.launch()
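# Note (assumption): a module-level launch() is sufficient on Hugging Face
# Spaces, which executes this file directly; for local debugging you could
# run `python app.py` and open the printed local URL.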