Spaces:
Running
Running
File size: 3,921 Bytes
7831eba 9d49e57 7831eba a7d91d4 37a3c87 a7d91d4 b752df1 7831eba b752df1 446b913 b752df1 7831eba 555ac42 7831eba c7fd9ac 7831eba b5fab19 8baca64 7831eba 0cd27a0 7831eba 555ac42 8baca64 408d3e1 7831eba 408d3e1 8baca64 408d3e1 7831eba d5b1c0a d0520f9 d5b1c0a 70ac69e 9436706 7831eba d5b1c0a 9436706 257a390 793da93 7831eba 793da93 7831eba 555ac42 7831eba 408d3e1 d8d19ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import gradio as gr
from huggingface_hub import InferenceClient
import os
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
import requests
from openai import OpenAI, AsyncOpenAI
# Registry of usable backends: display label -> [OpenAI client, served model id].
# Entries are added best-effort at import time; a backend that is not
# configured or not reachable is simply absent from the dict.
clients = {}

# (display label, environment variable holding the OpenAI-compatible base URL)
_MODEL_ENDPOINTS = (
    ('32B-Pro (beta)', 'MODEL_NAME_OR_PATH_32B'),
    ('32B QWQ (experimental, without any additional tuning after LEP!)', 'MODEL_NAME_OR_PATH_QWQ'),
    ('7B (work in progress)', 'MODEL_NAME_OR_PATH_7B'),
    ('3B', 'MODEL_NAME_OR_PATH_3B'),
)

# Upper bound (seconds) for the model-listing request, so an unreachable
# backend cannot block application startup forever.
_MODEL_LIST_TIMEOUT = 30


def _register_client(label, env_var):
    """Try to register the backend configured through ``env_var``.

    Reads the base URL from the environment, asks ``<base_url>/models`` for
    the served model list and stores ``[OpenAI client, first model id]``
    in ``clients[label]``.

    Args:
        label: Key under which the backend is stored in ``clients``.
        env_var: Name of the environment variable holding the base URL.

    Returns:
        True if the backend was registered, False if it was skipped.
    """
    base_url = os.getenv(env_var)
    if not base_url:
        print(f"[clients] skipping {label!r}: environment variable {env_var} is not set")
        return False
    try:
        listing = requests.get(base_url + '/models', timeout=_MODEL_LIST_TIMEOUT)
        listing.raise_for_status()
        model_id = listing.json()['data'][0]['id']
        clients[label] = [OpenAI(api_key='123', base_url=base_url), model_id]
    except Exception as exc:
        # Deliberately best-effort: one broken backend must not prevent the
        # others from being served, but the reason is reported.
        print(f"[clients] skipping {label!r}: {type(exc).__name__}: {exc}")
        return False
    return True


for _label, _env_var in _MODEL_ENDPOINTS:
    _register_client(_label, _env_var)
def respond(
    message,
    history: list[tuple[str, str]],
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
    repetition_penalty
):
    """Stream a chat completion for ``message`` from the selected backend.

    Args:
        message: The new user message.
        history: Previous turns as ``(user_text, assistant_text)`` pairs;
            empty/None halves of a pair are skipped.
        model_name: Key into the module-level ``clients`` registry.
        system_message: Optional system prompt; ignored when blank.
        max_tokens: Forwarded as ``max_tokens`` to the completion request.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.
        repetition_penalty: Forwarded through ``extra_body``.

    Yields:
        The reply text accumulated so far, once per received content chunk.

    Raises:
        KeyError: If ``model_name`` has no entry in ``clients``.
    """
    try:
        client, model_id = clients[model_name]
    except KeyError:
        raise KeyError(
            f"Model {model_name!r} is not available (its backend was not registered)"
        ) from None

    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    for user_text, assistant_text in history or []:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model=model_id,
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        stream=True,
        extra_body={
            "repetition_penalty": repetition_penalty,
            "add_generation_prompt": True,
        },
    )

    response = ""
    for chunk in stream:
        # Some stream chunks carry no text (no choices at all, or a delta
        # whose content is None); appending those would raise, so skip them.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token is None:
            continue
        response += token
        yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Every model label the app knows about, in display order.
_all_options = [
    "32B-Pro (beta)",
    "32B QWQ (experimental, without any additional tuning after LEP!)",
    "7B (work in progress)",
    "3B",
]
# Only the first two labels are offered in the model selector.
options = _all_options[:2]

# Candidate system prompts; system_new2 is the one used as the UI default.
system_old = "You are a helpful and harmless assistant. You should think step-by-step. First, reason (the user does not see your reasoning), then give your final answer."
system_new = "Ты Руадапт - полезный и дружелюбный интеллектуальный ассистент для помощи пользователям в их вопросах."
system_new2 = "Ты — Руадапт, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им."

# Extra controls, listed in the order `respond` receives them after
# (message, history): model, system prompt, max tokens, temperature,
# top-p, repetition penalty.
_model_radio = gr.Radio(label="Model:", choices=options, value=options[0])
_system_box = gr.Textbox(label="System message", value=system_new2)
_max_tokens_slider = gr.Slider(label="Max new tokens", minimum=1, maximum=4096, step=1, value=2048)
_temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, step=0.1, value=0.3)
_top_p_slider = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.95)
_rep_penalty_slider = gr.Slider(label="repetition_penalty", minimum=0.9, maximum=1.5, step=0.05, value=1.05)

demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        _model_radio,
        _system_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _rep_penalty_slider,
    ],
    concurrency_limit=10,
)
if __name__ == "__main__":
    # Launch the chat UI only when this file is executed as a script,
    # not when it is imported as a module.
    # NOTE(review): share=True requests a public Gradio share link — confirm
    # this is wanted in the deployment environment.
    demo.launch(share=True)
|