#!/usr/bin/env python

import os
from collections.abc import Iterator
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

#
# 1) Custom Pastel Gradient CSS, and force text to black
#
CUSTOM_CSS = """
.gradio-container {
    background: linear-gradient(to right, #FFDEE9, #B5FFFC);
    color: black; /* ensure text appears in black */
}
"""

#
# 2) Description: "Bonjour Dans le chat du consentement" in black
#    Also add a CPU notice in black if no GPU is found.
#
DESCRIPTION = """# Bonjour Dans le chat du consentement  
Mistral-7B Instruct Demo  
"""

if not torch.cuda.is_available():
    DESCRIPTION += "Running on CPU. This model is likely too large to run effectively without a GPU.\n"

MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

#
# 3) Load Mistral-7B Instruct (gated model; a GPU is strongly recommended).
#    Loaded unconditionally so the app does not fail with a NameError on CPU;
#    fall back to float32 there, since float16 is poorly supported on CPU.
#
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
)
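
# Optional alternative (a sketch, not part of the original demo): if GPU memory
# is tight, the same model could instead be loaded in 4-bit via bitsandbytes,
# roughly along these lines:
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.float16,
#   )
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id,
#       quantization_config=quant_config,
#       device_map="auto",
#       trust_remote_code=True,
#   )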

def generate(
    message: str,
    chat_history: list[dict],
) -> Iterator[str]:
    """
    Minimal chat generation function: no sliders, no extra params.
    """
    conversation = [*chat_history, {"role": "user", "content": message}]

    # Convert conversation to tokens
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    # If it exceeds max token length, trim
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

    input_ids = input_ids.to(model.device)

    # Use a streamer to yield tokens as they are generated
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=20.0,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Basic generation settings (adjust as needed)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=512,        # Adjust if you want more or fewer tokens
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    # Run generation in a background thread
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

#
# 4) Build the Chat Interface
#    - No additional sliders
#    - No pre-filled example questions
#
demo = gr.ChatInterface(
    fn=generate,
    description=DESCRIPTION,
    css=CUSTOM_CSS,
    examples=None,  # remove example prompts
    type="messages"
)

if __name__ == "__main__":
    demo.queue().launch()
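
# Running the demo (a sketch; the file name and port are assumptions, not from
# the original): the Mistral weights are gated, so accept the model license on
# the Hugging Face Hub and authenticate once with `huggingface-cli login`,
# then start the app with `python app.py`. Gradio serves the UI on
# http://127.0.0.1:7860 by default.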