"""Multilingual voice-to-voice chatbot demo (Gradio + Transformers + gTTS)."""
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from gtts import gTTS
import tempfile
def initialize_model():
    """Load every model the chatbot needs, guarding against download failures.

    Returns:
        A 4-tuple ``(asr, translator, tokenizer, dialogue_model)``; each slot
        is ``None`` when any model fails to initialize (e.g. no network or a
        bad model id), so callers can detect the failure before launching.
    """
    try:
        # Whisper handles the voice-to-text step.
        speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-small")
        # Fixed English -> multilingual translation model.
        translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")
        # DialoGPT generates the conversational reply.
        dialogue_name = "microsoft/DialoGPT-medium"
        dialogue_tokenizer = AutoTokenizer.from_pretrained(dialogue_name)
        dialogue_model = AutoModelForCausalLM.from_pretrained(dialogue_name)
    except Exception as exc:
        # Boundary handler: log and signal failure via a tuple of Nones.
        print(f"Error initializing models: {exc}")
        return None, None, None, None
    return speech_to_text, translator, dialogue_tokenizer, dialogue_model
# Load all models once at import time; each global may be None if loading failed.
asr_model, translation_model, tokenizer, conversation_model = initialize_model()
def chatbot_speech_to_speech(audio_input, target_language):
    """Full voice-to-voice pipeline: transcribe, translate, reply, speak.

    Args:
        audio_input: Path to the recorded audio clip (Gradio ``type="filepath"``).
        target_language: Two-letter language code selected in the UI.

    Returns:
        Path to an MP3 file containing the spoken response, or ``None`` when
        any stage fails (a ``gr.Audio`` output cannot render an error string,
        so ``None`` is the correct failure signal).
    """
    # Guard: initialize_model() returns all-None on failure; fail fast here
    # instead of raising an opaque AttributeError mid-pipeline.
    if not all(m is not None for m in (asr_model, translation_model, tokenizer, conversation_model)):
        print("Error in processing: models are not initialized")
        return None
    try:
        # Step 1: Convert audio to text with Whisper.
        text_input = asr_model(audio_input)["text"]
        # Step 2: Translate the input to English if needed.
        # NOTE(review): Helsinki-NLP/opus-mt-en-mul is a fixed en->multilingual
        # model; the src_lang/tgt_lang kwargs are likely ignored, so this call
        # probably does not translate target_language -> en as intended.
        # TODO: use a dedicated <lang>->en model (e.g. opus-mt-mul-en) here.
        if target_language != "en":
            translated_text = translation_model(text_input, src_lang=target_language, tgt_lang="en")[0]['translation_text']
        else:
            translated_text = text_input
        # Step 3: Generate a conversational response with DialoGPT.
        inputs = tokenizer.encode(translated_text + tokenizer.eos_token, return_tensors='pt')
        response_ids = conversation_model.generate(inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)
        # Decode only the newly generated tokens (skip the echoed prompt).
        response_text = tokenizer.decode(response_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True)
        # Step 4: Translate the reply back into the target language.
        # NOTE(review): opus-mt-en-mul normally selects the target language via
        # a ">>lang<<" prefix token, not tgt_lang — TODO confirm and fix.
        if target_language != "en":
            final_response = translation_model(response_text, src_lang="en", tgt_lang=target_language)[0]['translation_text']
        else:
            final_response = response_text
        # Step 5: Synthesize speech with gTTS. Reserve a temp path and close
        # the handle before gTTS writes to it (the original leaked the handle,
        # which also blocks the write on Windows).
        tts = gTTS(final_response, lang=target_language)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            output_path = tmp.name
        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error in processing: {e}")
        return None
# Gradio callback: thin adapter between the UI and the chatbot pipeline.
def interface(audio, language):
    """Forward the recorded clip and language choice to the speech pipeline."""
    return chatbot_speech_to_speech(audio, language)
# Assemble the Gradio UI: record a message, pick a language, hear the reply.
with gr.Blocks() as gradio_ui:
    gr.Markdown("# Multilingual Voice-to-Voice Chatbot for Kids")
    gr.Markdown("### Speak to the chatbot in your selected language and receive a spoken response.")
    # Input side: microphone recording plus language selector.
    recorded_message = gr.Audio(type="filepath", label="Record your message")
    chosen_language = gr.Dropdown(choices=["en", "fr", "es", "de", "zh", "ur"], label="Select Language")
    # Output side: the synthesized spoken reply.
    reply_audio = gr.Audio(type="filepath", label="Chatbot Response")
    send_button = gr.Button("Submit")
    send_button.click(fn=interface, inputs=[recorded_message, chosen_language], outputs=reply_audio)
# Launch only when every model initialized; initialize_model() yields either
# real model objects or None, so a None-check is equivalent to truthiness here.
if all(m is not None for m in (asr_model, translation_model, tokenizer, conversation_model)):
    gradio_ui.launch()
else:
    print("Error initializing one or more models. Please check your model configuration.")