Spaces:

Mahavaury2
/

mistralai-Mistral-7B-Instruct-v0.3

Running

File size: 4,281 Bytes

bcae2af
 
c566ded
 
 
 
967f284
b1e539f
29ac499
c566ded
967f284
b1e539f
 
 
c566ded
86ef0b6
 
 
 
41dc826
b1e539f
 
 
 
 
c566ded
 
b1e539f
 
 
 
c566ded
bcae2af
 
b1e539f
5f14f54
b1e539f
 
 
c566ded
 
 
b1e539f
bcae2af
c566ded
 
 
 
 
bcae2af
c566ded
 
b1e539f
 
 
 
 
 
 
 
 
 
 
 
 
c566ded
 
 
 
 
b1e539f
 
 
c566ded
 
 
b1e539f
 
 
c566ded
 
b1e539f
 
 
 
 
 
 
 
 
 
 
c566ded
 
 
 
 
 
b1e539f
c566ded
 
b1e539f
bcae2af
b1e539f
c566ded
 
 
bcae2af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1e539f
 
 
 
 
 
 
 
 
c566ded
 
5f14f54
bcae2af

#!/usr/bin/env python

import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

#
# 1) Page styling: pastel left-to-right gradient behind the whole app
#
CUSTOM_CSS = """
.gradio-container {
    background: linear-gradient(to right, #FFDEE9, #B5FFFC);
}
"""

#
# 2) Header text (Markdown): French greeting followed by the model name
#
DESCRIPTION = "# Bonjour Dans le chat du consentement\nMistral-7B Instruct Demo\n"

# Append a red warning paragraph when no CUDA device is visible.
if not torch.cuda.is_available():
    DESCRIPTION = (
        DESCRIPTION
        + "\n<p style='color:red;'>Running on CPU - This is likely too large to run effectively.</p>"
    )

# Token limits: default and upper bound for newly generated tokens, and the
# prompt budget (overridable through the MAX_INPUT_TOKEN_LENGTH env variable).
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

#
# 3) Load Mistral-7B Instruct (requires gating, GPU recommended)
#
# NOTE: loading is only attempted when CUDA is available; otherwise `model`
# and `tokenizer` are never defined at module level.
if torch.cuda.is_available():
    model_id = "mistralai/Mistral-7B-Instruct-v0.3"
    # `trust_remote_code` is deliberately left at its default (False): this
    # checkpoint loads with the stock transformers classes, so there is no
    # reason to allow execution of Python code fetched from the Hub repo.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,  # half precision weights
        device_map="auto",
    )

def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` as it is being generated.

    The new user message is appended to ``chat_history``, the conversation is
    tokenized with the tokenizer's chat template, and generation runs in a
    background thread while this generator relays the decoded text.

    Args:
        message: The latest user message.
        chat_history: Earlier turns as ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.
        repetition_penalty: Penalty forwarded to ``model.generate``.

    Yields:
        The reply accumulated so far (each item contains all previous text
        plus the newly decoded chunk).

    Raises:
        gr.Error: If the module-level ``model``/``tokenizer`` were never
            created (they are only loaded when CUDA is available).
    """
    # The loader only defines `model`/`tokenizer` on CUDA hosts; report that
    # to the user explicitly instead of failing with an opaque NameError.
    try:
        chat_tokenizer, chat_model = tokenizer, model
    except NameError as err:
        raise gr.Error(
            "Model is not loaded: this demo needs a CUDA GPU, and none is available."
        ) from err

    conversation = [*chat_history, {"role": "user", "content": message}]

    input_ids = chat_tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens so the prompt fits the budget.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(
            f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens."
        )
    input_ids = input_ids.to(chat_model.device)

    streamer = TextIteratorStreamer(
        chat_tokenizer,
        timeout=20.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generate_kwargs = {
        "input_ids": input_ids,
        # Single, unpadded prompt: every position is a real token. Passing the
        # mask explicitly avoids relying on generate() to guess it.
        "attention_mask": torch.ones_like(input_ids),
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }
    # Generation blocks until finished, so it runs in a worker thread while
    # this generator consumes the streamer.
    t = Thread(target=chat_model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        # Stream partial output as it's generated
        yield "".join(outputs)

#
# 4) Chat UI: chat box wired to `generate`, plus sliders for its sampling knobs
#
demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    description=DESCRIPTION,
    css=CUSTOM_CSS,  # pastel gradient background
    stop_btn=None,
    # One slider per extra `generate` parameter, listed in the same order as
    # in its signature: (label, minimum, maximum, step, initial value).
    additional_inputs=[
        gr.Slider(label=label, minimum=low, maximum=high, step=step, value=default)
        for label, low, high, step, default in (
            ("Max new tokens", 1, MAX_MAX_NEW_TOKENS, 1, DEFAULT_MAX_NEW_TOKENS),
            ("Temperature", 0.1, 4.0, 0.1, 0.6),
            ("Top-p (nucleus sampling)", 0.05, 1.0, 0.05, 0.9),
            ("Top-k", 1, 1000, 1, 50),
            ("Repetition penalty", 1.0, 2.0, 0.05, 1.2),
        )
    ],
    # Each example is a one-element row holding only the user message.
    examples=[
        [prompt]
        for prompt in (
            "Hello there! How are you doing?",
            "Can you explain briefly what the Python programming language is?",
            "Explain the plot of Cinderella in a sentence.",
            "How many hours does it take a man to eat a Helicopter?",
            "Write a 100-word article on 'Benefits of Open-Source in AI research'",
        )
    ],
)

if __name__ == "__main__":
    # Cap the request queue at 20 entries, then start the server.
    # NOTE(review): share=True requests a public share link; confirm this is
    # still wanted for the environment the app is hosted in.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch(share=True)