File size: 3,597 Bytes
9a374f9
 
 
 
 
 
 
 
 
3d78fb4
9a374f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from gtts import gTTS
import tempfile

# Build every model the chatbot needs; failures are reported, not raised.
def initialize_model():
    """Load the speech-recognition, translation and dialogue models.

    Returns:
        A 4-tuple ``(asr_model, translation_model, tokenizer, model)``.
        If anything raises while loading, the error is printed and a tuple
        of four ``None`` values is returned instead.
    """
    dialogue_checkpoint = "microsoft/DialoGPT-medium"  # Example conversational model

    try:
        # Voice-to-text pipeline (ASR).
        speech_recognizer = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
        )

        # Text translation pipeline.
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")

        # Tokenizer and causal language model share the same checkpoint.
        dialogue_tokenizer = AutoTokenizer.from_pretrained(dialogue_checkpoint)
        dialogue_model = AutoModelForCausalLM.from_pretrained(dialogue_checkpoint)
    except Exception as e:
        print(f"Error initializing models: {e}")
        return (None,) * 4

    return speech_recognizer, translator, dialogue_tokenizer, dialogue_model

# Initialize the models once, at import time. initialize_model() returns four
# None values when loading fails, so each of these names may be None.
asr_model, translation_model, tokenizer, conversation_model = initialize_model()

def chatbot_speech_to_speech(audio_input, target_language):
    """Answer a spoken message with a spoken reply in ``target_language``.

    Steps: transcribe the recording, translate the transcript to English
    (skipped when the language is ``"en"``), generate a reply with the
    dialogue model, translate the reply back, and synthesise it with gTTS.

    Args:
        audio_input: The recording handed to the ASR pipeline. ``None`` means
            nothing was recorded.
        target_language: Language code used both for translation and for
            gTTS, e.g. ``"en"`` or ``"fr"``.

    Returns:
        Path of a newly created ``.mp3`` file holding the spoken reply. The
        file is created with ``delete=False``, so it is not removed here.

    Raises:
        gr.Error: If the audio or the language is missing, or if any
            processing step fails. The result of this function is used as an
            audio file path, so returning an error message as a string would
            be mistaken for a path; raising lets Gradio show the message.
    """
    # Validate up front, outside the try block, so these messages are not
    # re-wrapped by the generic handler below.
    if audio_input is None:
        raise gr.Error("Please record a message before submitting.")
    if not target_language:
        raise gr.Error("Please select a language before submitting.")

    needs_translation = target_language != "en"

    try:
        # Step 1: Convert Audio to Text
        text_input = asr_model(audio_input)["text"]

        # Step 2: Translate Text to English if the input language is not English
        if needs_translation:
            # NOTE(review): the translation pipeline is created outside this
            # function; confirm that it can translate *into* English and that
            # it honours src_lang/tgt_lang.
            translated_text = translation_model(
                text_input, src_lang=target_language, tgt_lang="en"
            )[0]["translation_text"]
        else:
            translated_text = text_input

        # Step 3: Generate conversational response using the dialogue model
        inputs = tokenizer.encode(
            translated_text + tokenizer.eos_token, return_tensors="pt"
        )
        # max_new_tokens bounds only the reply. max_length would also count
        # the prompt tokens and leave no room for a reply to a long utterance.
        response_ids = conversation_model.generate(
            inputs,
            max_new_tokens=100,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Drop the prompt tokens so only the newly generated reply is decoded.
        response_text = tokenizer.decode(
            response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True
        )

        # Step 4: Translate the response text back to the target language
        if needs_translation:
            final_response = translation_model(
                response_text, src_lang="en", tgt_lang=target_language
            )[0]["translation_text"]
        else:
            final_response = response_text

        # Step 5: Convert text to speech using gTTS
        tts = gTTS(final_response, lang=target_language)
        # Reserve a unique path, then close our handle before gTTS writes to
        # it: the original left the handle open for the life of the process.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            output_path = temp_file.name
        tts.save(output_path)

        return output_path
    except Exception as e:
        raise gr.Error(f"Error in processing: {e}") from e

# Gradio click handler
def interface(audio, language):
    """Forward the recording and language code to the chatbot pipeline.

    Returns whatever ``chatbot_speech_to_speech`` returns, unchanged.
    """
    return chatbot_speech_to_speech(audio, language)

# Define the Gradio app with Blocks using the latest syntax
with gr.Blocks() as gradio_ui:
    # Page title and short usage hint.
    gr.Markdown("# Multilingual Voice-to-Voice Chatbot for Kids")
    gr.Markdown("### Speak to the chatbot in your selected language and receive a spoken response.")
    
    # Input recording; type="filepath" hands the handler a path to the audio file.
    audio_input = gr.Audio(type="filepath", label="Record your message")
    # Language codes the user can pick from; no default value is set here.
    language_dropdown = gr.Dropdown(choices=["en", "fr", "es", "de", "zh", "ur"], label="Select Language")
    
    # Output player for the value returned by the click handler.
    result_audio = gr.Audio(type="filepath", label="Chatbot Response")
    
    
    # Clicking the button calls interface(audio, language) and routes its
    # return value to result_audio.
    submit_btn = gr.Button("Submit")
    submit_btn.click(fn=interface, inputs=[audio_input, language_dropdown], outputs=result_audio)

# Start the UI only when every model object is usable; otherwise report why
# the app was not launched.
if not all((asr_model, translation_model, tokenizer, conversation_model)):
    print("Error initializing one or more models. Please check your model configuration.")
else:
    gradio_ui.launch()