import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
model = AutoModelForCausalLM.from_pretrained("BatsResearch/bonito-v1")
tokenizer = AutoTokenizer.from_pretrained("BatsResearch/bonito-v1")
model.to("cuda")

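# @spaces.GPU requests a GPU for the duration of each call on Hugging Face ZeroGPU Spaces.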
@spaces.GPU
def respond(
    message,
    task_type,
    max_tokens,
    temperature,
    top_p,
):
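    """Generate an (instruction, response) pair from the given context using Bonito.

    The prompt follows the format the model expects: the task type after
    <|tasktype|>, the context passage after <|context|>, then <|task|>.
    The generated text contains an instruction and a response separated by
    <|pipe|>; any {{context}} placeholder in the instruction is filled in
    with the original passage.
    """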
    task_type = task_type.lower()
    input_text = "<|tasktype|>\n" + task_type.strip()
    input_text += "\n<|context|>\n" + message.strip() + "\n<|task|>\n"

    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
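    # Sample a completion and keep only the newly generated tokens for decoding.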
    output = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    pred_start = int(input_ids.shape[-1])

    response = tokenizer.decode(output[0][pred_start:], skip_special_tokens=True)

    # The model separates the generated instruction and response with <|pipe|>.
    if "<|pipe|>" in response:
        pair = response.split("<|pipe|>")
        instruction = pair[0].strip().replace("{{context}}", message)
        response = pair[1].strip()
    else:
        # Fallback: no <|pipe|> separator was generated, so treat the whole
        # output as the instruction and ask the user to regenerate.
        instruction = response.strip().replace("{{context}}", message)
        response = "Unable to generate response. Please regenerate."

    return instruction, response

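# Task types accepted in the prompt's <|tasktype|> field.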
task_types = [
    "extractive question answering",
    "multiple-choice question answering",
    "question generation",
    "question answering without choices",
    "yes-no question answering",
    "coreference resolution",
    "paraphrase generation",
    "paraphrase identification",
    "sentence completion",
    "sentiment",
    "summarization",
    "text generation",
    "topic classification",
    "word sense disambiguation",
    "textual entailment",
    "natural language inference",
]
# capitalize for better readability
task_types = [task_type.capitalize() for task_type in task_types]

demo = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(lines=5, label="Context", placeholder="Paste the passage to generate a task from."),
        gr.Dropdown(task_types, label="Task type"),
    ],
    outputs=[gr.Textbox(label="Generated instruction"), gr.Textbox(label="Generated response")],
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Zephyr Chatbot",
    description="A chatbot that uses the Hugging Face Zephyr model.",
)


if __name__ == "__main__":
    demo.launch()