import gradio as gr
from transformers import pipeline
import requests
import json
import edge_tts
from edge_tts import VoicesManager
import asyncio
import random
import tempfile
import os
import inflect
from huggingface_hub import InferenceClient
import re
import time
from streaming_stt_nemo import Model
# Map of human-readable voice labels (shown in the UI) to Edge-TTS voice IDs.
# Keys are display strings; values are the "ShortName" identifiers that
# edge_tts.Communicate expects.
Female_language_dict = {
'English-Jenny (Female)': 'en-US-JennyNeural',
'English-Ana (Female)': 'en-US-AnaNeural',
'English-Aria (Female)': 'en-US-AriaNeural',
'English-Michelle (Female)': 'en-US-MichelleNeural',
'English (Australia)-Natasha- (Female)': 'en-AU-NatashaNeural',
'English (Canada)-Clara- (Female)': 'en-CA-ClaraNeural',
'English (UK)-Libby- (Female)': 'en-GB-LibbyNeural',
'English (UK)-Maisie- (Female)': 'en-GB-MaisieNeural',
'English (UK)-Sonia- (Female)': 'en-GB-SoniaNeural',
'English (Ireland)-Emily- (Female)': 'en-IE-EmilyNeural',
}
# Speech-to-text engines, keyed by language code; only English is loaded.
# NOTE(review): Model comes from streaming_stt_nemo and is instantiated at
# import time — loading happens once, on module import.
default_lang = "en"
engines = { default_lang: Model(default_lang) }
# Hugging Face Inference client for the chat LLM (Mixtral 8x7B Instruct).
client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
# Persona/system prompt prepended to every user message before generation.
system_instructions1 = "[SYSTEM] YOU must Output only plain text. Do not use **bold**, *italic*, ### headings, **number** or any other markdown-specific formatting in content. Respond as Hermione Granger from the Harry Potter series, embodying her intelligent, resourceful, and slightly bossy yet friendly demeanor. Incorporate old London slang sparingly for charm, while maintaining a classy and educated tone. Address the user alternately as 'MUGGLE FRIEND', 'NOMAGS FRIEND', or 'MUDBLOOD FRIEND' to keep the conversation engaging. Ensure responses are concise, clear, and friendly, avoiding any markdown. Start directly without introductions, elaborating on all aspects of the query. Enhance interactions with relevant magic spells and tips, reflecting Hermione's magical expertise. Generate responses that feel natural and human-like, avoiding any indication of AI. Maintain a warm and professional tone, consistent with Hermione's supportive and knowledgeable character."
def transcribe(audio):
    """Transcribe an audio file to English text via the preloaded STT engine.

    Parameters
    ----------
    audio : path-like
        Audio file accepted by the engine's ``stt_file`` method.

    Returns
    -------
    str
        The first transcript candidate returned by the engine.
    """
    engine = engines["en"]
    # stt_file returns a sequence; the transcript is its first element.
    transcript, *_ = engine.stt_file(audio)
    return transcript
def model(text):
    """Generate a Hermione-persona reply for *text* using the Mixtral endpoint.

    The system persona prompt is prepended and the model's streamed tokens
    are concatenated into a single plain-text string.
    """
    sampling_params = {
        "temperature": 0.7,
        "max_new_tokens": 512,
        "top_p": 0.95,
        "repetition_penalty": 1,
        "do_sample": True,
        "seed": 42,
    }
    full_prompt = system_instructions1 + text + "[Hermione]"
    token_stream = client1.text_generation(
        full_prompt,
        stream=True,
        details=True,
        return_full_text=False,
        **sampling_params,
    )
    # Collect non-empty token texts and join them at the end.
    pieces = [chunk.token.text for chunk in token_stream if chunk.token.text != ""]
    return "".join(pieces)
async def respond(language_code, audio):
    """Voice-to-voice turn: transcribe *audio*, generate a reply, speak it.

    Parameters
    ----------
    language_code : str
        Display label used to look up the Edge-TTS voice in
        ``Female_language_dict``.
    audio : path-like
        Recorded user audio, passed to :func:`transcribe`.

    Yields
    ------
    str
        Path to a temporary ``.wav`` file containing the synthesized reply.
        The caller is responsible for cleaning up the file (``delete=False``).
    """
    user_text = transcribe(audio)
    reply = model(user_text)
    # BUGFIX: the previous fallback "default_voice" is not a valid Edge-TTS
    # voice ID and would make synthesis fail for any unknown language_code;
    # fall back to a real voice instead.
    voice = Female_language_dict.get(language_code, "en-US-JennyNeural")
    communicate = edge_tts.Communicate(reply, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    # Save after the handle is closed so edge_tts writes to a free path
    # (matters on platforms where an open handle locks the file).
    await communicate.save(tmp_path)
    yield tmp_path
async def generate1(language_code, prompt):
    """Text-to-voice turn: generate a reply for *prompt* and synthesize it.

    Parameters
    ----------
    language_code : str
        Display label used to look up the Edge-TTS voice in
        ``Female_language_dict``.
    prompt : str
        The user's text input.

    Yields
    ------
    str
        Path to a temporary ``.wav`` file with the spoken reply
        (``delete=False`` — caller cleans up).
    """
    generate_kwargs = dict(
        temperature=0.7,
        max_new_tokens=512,
        top_p=0.95,
        repetition_penalty=1,
        # NOTE(review): with do_sample=False the endpoint decodes greedily and
        # temperature/top_p are ignored — kept as-is to preserve behavior.
        do_sample=False,
    )
    formatted_prompt = system_instructions1 + prompt + "[Hermione]"
    # BUGFIX: return_full_text was True, which echoes the entire system
    # persona prompt back into the stream — the TTS would then read the
    # system instructions aloud. Use False, consistent with model().
    stream = client1.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True,
        return_full_text=False)
    output = ""
    for response in stream:
        if response.token.text != "":
            output += response.token.text
    # BUGFIX: "default_voice" is not a valid Edge-TTS voice ID; fall back to
    # a real voice so unknown language_code values don't break synthesis.
    voice = Female_language_dict.get(language_code, "en-US-JennyNeural")
    communicate = edge_tts.Communicate(output, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    yield tmp_path
with gr.Blocks(gr.themes.Origin()) as demo:
gr.HTML(""" """
"""