import gradio as gr
from huggingface_hub import InferenceClient

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")


def respond(
    message: str,
    history: list[tuple[str, str]],  # Required by the ChatInterface signature; not used
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    messages = [{"role": "system", "content": system_message}]
    # Append only the latest user message; earlier turns are deliberately dropped
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        # Stream tokens from the model, yielding the growing response so the
        # UI updates incrementally
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # The delta content can be None (e.g. on the final chunk), so guard it
            token = chunk.choices[0].delta.content
            if token is not None:
                response += token
                yield response
    except Exception as e:
        yield f"An error occurred: {e}"


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
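# A minimal sketch of exercising `respond` outside the UI, assuming access to the
# gated Llama 3 model has been granted and credentials are available (e.g. via
# `huggingface-cli login` or the HF_TOKEN environment variable):
#
#     for partial in respond("Hello!", [], "You are a friendly Chatbot.", 64, 0.7, 0.95):
#         print(partial)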