File size: 4,281 Bytes
bcae2af
 
c566ded
 
 
 
967f284
b1e539f
29ac499
c566ded
967f284
b1e539f
 
 
c566ded
86ef0b6
 
 
 
41dc826
b1e539f
 
 
 
 
c566ded
 
b1e539f
 
 
 
c566ded
bcae2af
 
b1e539f
5f14f54
b1e539f
 
 
c566ded
 
 
b1e539f
bcae2af
c566ded
 
 
 
 
bcae2af
c566ded
 
b1e539f
 
 
 
 
 
 
 
 
 
 
 
 
c566ded
 
 
 
 
b1e539f
 
 
c566ded
 
 
b1e539f
 
 
c566ded
 
b1e539f
 
 
 
 
 
 
 
 
 
 
c566ded
 
 
 
 
 
b1e539f
c566ded
 
b1e539f
bcae2af
b1e539f
c566ded
 
 
bcae2af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1e539f
 
 
 
 
 
 
 
 
c566ded
 
5f14f54
bcae2af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python

import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# ---------------------------------------------------------------------------
# 1) Styling: pastel left-to-right gradient applied to the Gradio container
# ---------------------------------------------------------------------------
CUSTOM_CSS = (
    "\n"
    ".gradio-container {\n"
    "    background: linear-gradient(to right, #FFDEE9, #B5FFFC);\n"
    "}\n"
)

# ---------------------------------------------------------------------------
# 2) Markdown header shown above the chat (French greeting + model name)
# ---------------------------------------------------------------------------
DESCRIPTION = "# Bonjour Dans le chat du consentement\nMistral-7B Instruct Demo\n"

# Without CUDA, append a red warning paragraph to the header.
if not torch.cuda.is_available():
    DESCRIPTION = (
        DESCRIPTION
        + "\n<p style='color:red;'>Running on CPU - This is likely too large to run effectively.</p>"
    )

# Generation limits: slider ceiling, slider default, and the prompt-length cap
# (the cap can be overridden through the MAX_INPUT_TOKEN_LENGTH env variable).
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# ---------------------------------------------------------------------------
# 3) Model setup: Mistral-7B Instruct (requires gating, GPU recommended).
#    Nothing is loaded when CUDA is unavailable, so `tokenizer` and `model`
#    only exist on GPU machines.
# ---------------------------------------------------------------------------
if torch.cuda.is_available():
    model_id = "mistralai/Mistral-7B-Instruct-v0.3"

    # NOTE(review): trust_remote_code=True permits running code shipped in the
    # model repository — confirm this checkpoint actually needs it.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Half-precision weights; device placement is delegated via device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.float16,
    )

def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` as it is being generated.

    The prior turns plus the new user message are rendered with the
    tokenizer's chat template, generation runs on a background thread, and the
    text produced so far is yielded every time the streamer emits a new piece.

    Args:
        message: The new user message; appended to the history as a user turn.
        chat_history: Earlier turns as dicts (``role``/``content`` entries, as
            built for the new turn below).
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        temperature: Forwarded to ``model.generate`` (sampling is enabled).
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Yields:
        The cumulative reply text generated so far (not just the latest piece).

    Raises:
        gr.Error: If CUDA is unavailable, because the model and tokenizer are
            only loaded at import time when a GPU is present.
    """
    # `tokenizer` and `model` are only defined at module level when CUDA is
    # available; without this guard a CPU-only run fails with a bare NameError.
    if not torch.cuda.is_available():
        raise gr.Error(
            "The model is not loaded because no GPU is available; generation is disabled."
        )

    conversation = [*chat_history, {"role": "user", "content": message}]

    # add_generation_prompt asks the template to append the assistant-turn
    # prefix when the template defines one; templates that ignore it are
    # unaffected.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens so the newest turns survive.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(
            f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens."
        )
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=20.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }

    # model.generate blocks until completion, so it runs on a worker thread
    # while this generator drains the streamer.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_reply = ""
    for text in streamer:
        partial_reply += text
        # Stream the reply-so-far so the UI can show incremental progress.
        yield partial_reply

# ---------------------------------------------------------------------------
# 4) Chat UI: ChatInterface around `generate`, plus one slider per sampling
#    parameter. Slider order follows the order of `generate`'s extra
#    parameters (max_new_tokens, temperature, top_p, top_k,
#    repetition_penalty).
# ---------------------------------------------------------------------------
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    description=DESCRIPTION,
    css=CUSTOM_CSS,  # pastel gradient background
    stop_btn=None,
    additional_inputs=[
        gr.Slider(label=label, minimum=low, maximum=high, step=step, value=default)
        # Each row: (label, minimum, maximum, step, initial value)
        for label, low, high, step, default in (
            ("Max new tokens", 1, MAX_MAX_NEW_TOKENS, 1, DEFAULT_MAX_NEW_TOKENS),
            ("Temperature", 0.1, 4.0, 0.1, 0.6),
            ("Top-p (nucleus sampling)", 0.05, 1.0, 0.05, 0.9),
            ("Top-k", 1, 1000, 1, 50),
            ("Repetition penalty", 1.0, 2.0, 0.05, 1.2),
        )
    ],
    # Every example is a single-element list holding just the prompt text.
    examples=[
        [prompt]
        for prompt in (
            "Hello there! How are you doing?",
            "Can you explain briefly what the Python programming language is?",
            "Explain the plot of Cinderella in a sentence.",
            "How many hours does it take a man to eat a Helicopter?",
            "Write a 100-word article on 'Benefits of Open-Source in AI research'",
        )
    ],
)

if __name__ == "__main__":
    # Enable the request queue (max_size=20), then launch with share=True.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(share=True)