import gradio as gr
import torch
import librosa
import numpy as np
import webrtcvad
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline


# Model names
TN_MODEL_NAME = "amenIKh/Tunisian_Checkpoint12"
WHISPER_MODEL_NAME = "openai/whisper-small"

# Initialize pipelines
pipe_tn = pipeline(
    task="automatic-speech-recognition",
    model=TN_MODEL_NAME,
    device=0 if torch.cuda.is_available() else -1,
)

# Load Whisper model and processor
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model.to(device)

# Function to apply voice activity detection (VAD).
# Note: webrtcvad only accepts 8/16/32/48 kHz audio and 10/20/30 ms frames.
def apply_vad(audio, sr, frame_duration_ms=30):
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # Aggressiveness mode, 0-3; higher is more aggressive

    frame_size = int(sr * frame_duration_ms / 1000)
    offset = 0
    voiced_frames = []

    while offset + frame_size <= len(audio):
        frame = audio[offset:offset + frame_size]
        # webrtcvad expects 16-bit PCM bytes; librosa returns float32 in
        # [-1, 1], so scale before converting instead of truncating with a
        # bare astype(np.int16), which would zero out the signal.
        pcm16 = (frame * 32767).astype(np.int16)
        if vad.is_speech(pcm16.tobytes(), sr):
            voiced_frames.append(frame)

        offset += frame_size

    if len(voiced_frames) == 0:
        return audio  # Return original audio if no voiced frames are detected

    voiced_audio = np.concatenate(voiced_frames)
    return voiced_audio
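
# Illustrative sanity check (a sketch, not executed at import time): pure
# silence should yield no voiced frames, so apply_vad falls back to
# returning the input unchanged.
#
#   silence = np.zeros(16000, dtype=np.float32)  # one second at 16 kHz
#   assert apply_vad(silence, 16000).shape == silence.shape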

# Function to transcribe audio based on language
def transcribe_audio(audio, language):
    try:
        # Load audio
        sr = 16000  # librosa resamples to 16 kHz, the rate both models expect
        audio, _ = librosa.load(audio, sr=sr)

        # Apply VAD
        voiced_audio = apply_vad(audio, sr)

        # Select the correct model based on language
        if language == "tn":
            # Pass the sampling rate explicitly alongside the raw array.
            result = pipe_tn({"raw": voiced_audio, "sampling_rate": sr})
            transcription = result.get("text", "")
        elif language in ["fr", "en"]:
            forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe")
            input_features = whisper_processor(
                voiced_audio, sampling_rate=sr, return_tensors="pt"
            ).input_features.to(device)
            generated_ids = whisper_model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids
            )
            transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        else:
            return "Unsupported language specified"

        return transcription
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
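
# Example of calling the transcription helper directly (hypothetical file
# name, shown for illustration only):
#
#   print(transcribe_audio("sample_16k.wav", "en"))
#   print(transcribe_audio("sample_16k.wav", "tn"))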
# Define Gradio interface
def gradio_interface(audio, language):
    try:
        # With type="filepath", Gradio passes a temporary file path string
        # for both uploaded files and microphone recordings.
        temp_file_path = audio

        # Perform transcription
        result = transcribe_audio(temp_file_path, language)
        
        return result
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Create the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio"),
        gr.Dropdown(choices=["tn", "fr", "en"], label="Select Language")
    ],
    outputs="text",
    title="ASR Transcription Service",
    description="Upload an audio file and select the language to transcribe the audio."
)

# Launch the Gradio app
iface.launch()
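
# To get a temporary public link when running locally, launch with
# share=True instead (optional):
#
#   iface.launch(share=True)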