import gradio as gr
from huggingface_hub import InferenceClient
import spaces
import torch
import os
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Tesla T4
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", token=os.getenv('deepseekv2'))
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=os.getenv('deepseekv2'))
# client = InferenceClient("meta-llama/Llama-3.1-8B-Instruct", token=os.getenv('deepseekv2'))
def choose_model(model_name):
    # Map the dropdown label to the corresponding Hub model id.
    if model_name == "Qwen1.5":
        model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    elif model_name == "Llama3-8b-Instruct":
        model = "meta-llama/Meta-Llama-3-8B-Instruct"
    elif model_name == "Llama3.1-8b-Instruct":
        model = "meta-llama/Llama-3.1-8B-Instruct"
    else:
        # Fall back to the smallest model if nothing is selected.
        model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    # Pass the model id and token as separate arguments (not inside a single string).
    client = InferenceClient(model, token=os.getenv('deepseekv2'))
    return client
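# Illustrative usage only (assumes the Space secret 'deepseekv2' holds a valid HF token):
#     client = choose_model("Llama3.1-8b-Instruct")
#     client.chat_completion([{"role": "user", "content": "Hi"}], max_tokens=16)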
@spaces.GPU(duration=1)  # request a ZeroGPU slot for this call (duration is in seconds)
def respond(message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p):
    print(model)
    # Create an InferenceClient for the selected model.
    client = choose_model(model)
    # Rebuild the conversation in the chat-completion message format.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    response = ""
    # Stream tokens and yield the partial response so the UI updates live.
    for chunk in client.chat_completion(messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p):
        token = chunk.choices[0].delta.content
        if token:
            response += token
        yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Dropdown(["Qwen1.5", "Llama3-8b-Instruct", "Llama3.1-8b-Instruct"], value="Qwen1.5", label="Select Model"),
        gr.Textbox(value="You are a friendly and helpful Chatbot, be concise and straight to the point, avoid excessive reasoning.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
if __name__ == "__main__":
    demo.launch()