#!/usr/bin/env python
import os
from collections.abc import Iterator
from threading import Thread
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
#
# 1) Custom Pastel Gradient CSS, and force text to black
#
CUSTOM_CSS = """
.gradio-container {
    background: linear-gradient(to right, #FFDEE9, #B5FFFC);
    color: black; /* ensure text appears in black */
}
"""
#
# 2) Description: "Bonjour Dans le chat du consentement" in black
# Also add a CPU notice in black if no GPU is found.
#
DESCRIPTION = """# Bonjour Dans le chat du consentement
Mistral-7B Instruct Demo
"""
if not torch.cuda.is_available():
    DESCRIPTION += "Running on CPU - this model is likely too large to run effectively.\n"
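# Maximum number of prompt tokens kept; anything longer is trimmed in generate().
# Overridable via the MAX_INPUT_TOKEN_LENGTH environment variable.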
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
#
# 3) Load Mistral-7B Instruct (requires gating, GPU recommended)
#
if torch.cuda.is_available():
    model_id = "mistralai/Mistral-7B-Instruct-v0.3"
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
def generate(
    message: str,
    chat_history: list[dict],
) -> Iterator[str]:
    """
    Minimal chat generation function: no sliders, no extra params.
    """
    conversation = [*chat_history, {"role": "user", "content": message}]

    # Convert conversation to tokens
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")

    # If it exceeds max token length, trim
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    # Use a streamer to yield tokens as they are generated
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=20.0,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Basic generation settings (feel free to adjust if you want)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=512,  # Adjust if you want more or fewer tokens
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    # Run generation in a background thread
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
#
# 4) Build the Chat Interface
# - No additional sliders
# - No pre-filled example questions
#
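# type="messages" makes Gradio pass the history as a list of
# {"role": ..., "content": ...} dicts, matching generate()'s chat_history parameter.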
demo = gr.ChatInterface(
    fn=generate,
    description=DESCRIPTION,
    css=CUSTOM_CSS,
    examples=None,  # remove example prompts
    type="messages"
)
if __name__ == "__main__":
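    # queue() enables request queuing so concurrent users are handled in order;
    # add share=True to launch() if a temporary public URL is needed outside Spaces.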
    demo.queue().launch()