import gradio as gr
import torch
import librosa
import numpy as np
import webrtcvad
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

# Model names
TN_MODEL_NAME = "amenIKh/Tunisian_Checkpoint12"
WHISPER_MODEL_NAME = "openai/whisper-small"

# Initialize the Tunisian ASR pipeline (GPU if available, else CPU)
pipe_tn = pipeline(
    task="automatic-speech-recognition",
    model=TN_MODEL_NAME,
    device=0 if torch.cuda.is_available() else -1,
)

# Load Whisper model and processor
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model.to(device)

# Function to apply VAD (webrtcvad expects 16-bit mono PCM at 8/16/32/48 kHz
# in frames of 10, 20, or 30 ms)
def apply_vad(audio, sr, frame_duration_ms=30):
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # Aggressiveness mode 0-3; higher is more aggressive

    frame_size = int(sr * frame_duration_ms / 1000)
    offset = 0
    voiced_frames = []

    while offset + frame_size <= len(audio):
        frame = audio[offset:offset + frame_size]
        # librosa returns float32 in [-1, 1]; convert to int16 PCM for the VAD
        pcm = (frame * 32767).astype(np.int16)
        if vad.is_speech(pcm.tobytes(), sr):
            voiced_frames.append(frame)  # keep the float frame for the models
        offset += frame_size

    if len(voiced_frames) == 0:
        return audio  # Return original audio if no voiced frames are detected

    voiced_audio = np.concatenate(voiced_frames)
    return voiced_audio
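
# A hedged sanity check (illustrative only, not part of the app): a 30 ms
# frame at 16 kHz holds 16000 * 30 / 1000 = 480 samples, one of the frame
# sizes webrtcvad accepts. On pure silence no frame should be voiced, so
# apply_vad falls back to returning the input unchanged:
#   silence = np.zeros(16000, dtype=np.float32)
#   assert len(apply_vad(silence, 16000)) == 16000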

# Function to transcribe audio based on language
def transcribe_audio(audio, language):
    try:
        # Load audio; librosa resamples to 16 kHz, the rate both models expect
        sr = 16000
        audio, _ = librosa.load(audio, sr=sr)

        # Apply VAD
        voiced_audio = apply_vad(audio, sr)

        # Select the correct model based on language
        if language == "tn":
            result = pipe_tn({"array": voiced_audio, "sampling_rate": sr})
            transcription = result.get("text", "")
        elif language in ["fr", "en"]:
            forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe")
            input_features = whisper_processor(
                voiced_audio, sampling_rate=sr, return_tensors="pt"
            ).input_features.to(device)
            generated_ids = whisper_model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids,
            )
            transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        else:
            return "Unsupported language specified"

        return transcription
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"

# Define Gradio interface
def gradio_interface(audio, language):
    try:
        # With type="filepath", Gradio passes a temporary file path string
        # for both uploads and microphone recordings (or None if no input)
        if audio is None:
            return "No audio provided"

        # Perform transcription
        return transcribe_audio(audio, language)
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Create the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio"),
        gr.Dropdown(choices=["tn", "fr", "en"], label="Select Language")
    ],
    outputs="text",
    title="ASR Transcription Service",
    description="Upload an audio file and select the language to transcribe the audio."
)

# Launch the Gradio app
iface.launch()
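
# Hedged variant for demos: share=True requests a temporary public URL via
# Gradio's tunnel (assumes outbound network access is allowed):
#   iface.launch(share=True)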