import os
import tempfile

import edge_tts
import gradio as gr
from huggingface_hub import InferenceClient

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

client = InferenceClient("google/gemma-3-27b-it", token=os.getenv("TOKEN"))
# client = InferenceClient(
#     provider="fireworks-ai",
#     api_key=os.getenv("TOKEN"),
# )

history = []


async def respond(
    message,
    history=[],
    system_message=(
        "You are DorjGPT, created by Dorjzodovsuren. You are a helpful assistant: "
        "always reply in Mongolian, and only return Mongolian text within 50 words."
    ),
    max_tokens=512,
    temperature=0.001,
    top_p=0.95,
):
    messages = [{"role": "system", "content": system_message}]

    # Replay prior (user, assistant) turns so the model sees the full conversation.
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""
    # Stream the completion; the loop variable is named `chunk` to avoid
    # shadowing the `message` argument.
    for chunk in client.chat_completion(
        model="google/gemma-3-27b-it",
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final streamed delta may carry no content
            response += token

    # completion = client.chat.completions.create(
    #     model="deepseek-ai/DeepSeek-R1",
    #     messages=messages,
    #     max_tokens=500,
    # )
    # response = completion.choices[0].message.content
    # print(response)

    # Synthesize the reply with Edge TTS. edge-tts produces MP3 audio by
    # default, so the temporary file gets a matching .mp3 suffix.
    communicate = edge_tts.Communicate(response, voice="mn-MN-YesuiNeural")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    yield tmp_path


with gr.Blocks(theme="gradio/monochrome", title="Dorj Assistant") as demo:
    gr.HTML("""