import random

import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the inference client for the hosted model
model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
client = InferenceClient(model)


def chat_response(prompt, history, seed, temp, tokens, top_p, rep_p):
    generate_kwargs = {
        "temperature": temp,
        "max_new_tokens": int(tokens),  # sliders may deliver floats; the API expects ints
        "top_p": top_p,
        "repetition_penalty": rep_p,
        "do_sample": True,
        "seed": int(seed),
    }

    # Fold the chat history into the prompt, then append the new question
    formatted_prompt = "\n".join(
        f"Q: {user_prompt}\nA: {bot_response}" for user_prompt, bot_response in history
    ) + f"\nQ: {prompt}\nA:"

    output = ""
    # Generate text in streaming mode; with details=False (the default),
    # each streamed chunk is a plain string token
    for response in client.text_generation(formatted_prompt, **generate_kwargs, stream=True):
        output += response
        # Yield the prior history plus the in-progress turn so earlier
        # messages stay visible while the answer streams in
        yield history + [(prompt, output)]

    # Append the full response to history once generation completes
    history.append((prompt, output))
    yield history


def clear_chat():
    return [], ""  # Empty chat history and a cleared prompt box


# Gradio interface
with gr.Blocks() as app:
    gr.HTML("<center><h1>Mixtral 8x7B Chat</h1></center>")  # placeholder header; the original HTML string was truncated
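    # --- NOTE: the source is truncated at this point. Below is a minimal
    # sketch of the rest of the interface, assuming one Chatbot, a prompt
    # Textbox, one Slider per sampling parameter, and a clear button; all
    # component names, ranges, and defaults are illustrative, not from the
    # original. ---
    chatbot = gr.Chatbot(label="Chat")
    prompt_box = gr.Textbox(label="Prompt", placeholder="Ask a question...")
    with gr.Row():
        seed = gr.Slider(label="Seed", minimum=1, maximum=999_999_999, step=1,
                         value=random.randint(1, 999_999_999))
        temp = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.05, value=0.9)
        tokens = gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=64, value=1024)
        top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.95)
        rep_p = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
    clear_btn = gr.Button("Clear")

    # The chatbot component doubles as the history argument, so streaming
    # updates and the final history both render in the same place
    prompt_box.submit(
        chat_response,
        inputs=[prompt_box, chatbot, seed, temp, tokens, top_p, rep_p],
        outputs=chatbot,
    )
    clear_btn.click(clear_chat, outputs=[chatbot, prompt_box])

app.launch()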