Spaces:

Mahavaury2
/

mistralai-Mistral-7B-Instruct-v0.3

Running

App Files Community

Mahavaury2 committed on Jan 22

Commit

eb04a36

verified ·

1 Parent(s): 30169f7

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -43

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
-#!/usr/bin/env python
 import os
 from collections.abc import Iterator
 from threading import Thread
@@ -8,32 +6,24 @@ import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-#
-# 1) Custom Pastel Gradient CSS, and force text to black
-#
 CUSTOM_CSS = """
 .gradio-container {
     background: linear-gradient(to right, #FFDEE9, #B5FFFC);
-    color: black; /* ensure text appears in black */
 }
 """
-#
-# 2) Description: "Bonjour Dans le chat du consentement" in black
-#    Also add a CPU notice in black if no GPU is found.
-#
 DESCRIPTION = """# Bonjour Dans le chat du consentement
 Mistral-7B Instruct Demo
 """
-if not torch.cuda.is_available():
-    DESCRIPTION += "Running on CPU - This is likely too large to run effectively.\n"
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-#
-# 3) Load Mistral-7B Instruct (requires gating, GPU recommended)
-#
 if torch.cuda.is_available():
     model_id = "mistralai/Mistral-7B-Instruct-v0.3"
     tokenizer = AutoTokenizer.from_pretrained(
@@ -46,26 +36,25 @@ if torch.cuda.is_available():
         device_map="auto",
         trust_remote_code=True
     )
-def generate(
-    message: str,
-    chat_history: list[dict],
-) -> Iterator[str]:
-    """
-    Minimal chat generation function: no sliders, no extra params.
-    """
     conversation = [*chat_history, {"role": "user", "content": message}]
-    # Convert conversation to tokens
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-    # If it exceeds max token length, trim
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
-    # Use a streamer to yield tokens as they are generated
     streamer = TextIteratorStreamer(
         tokenizer,
         timeout=20.0,
@@ -73,18 +62,16 @@ def generate(
         skip_special_tokens=True
     )
-    # Basic generation settings (feel free to adjust if you want)
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        streamer=streamer,
-        max_new_tokens=512,        # Adjust if you want more or fewer tokens
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-        repetition_penalty=1.1,
-    )
-    # Run generation in a background thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
@@ -93,16 +80,11 @@ def generate(
         outputs.append(text)
         yield "".join(outputs)
-#
-# 4) Build the Chat Interface
-#    - No additional sliders
-#    - No pre-filled example questions
-#
 demo = gr.ChatInterface(
     fn=generate,
     description=DESCRIPTION,
     css=CUSTOM_CSS,
-    examples=None,  # remove example prompts
     type="messages"
 )

 import os
 from collections.abc import Iterator
 from threading import Thread
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 CUSTOM_CSS = """
 .gradio-container {
     background: linear-gradient(to right, #FFDEE9, #B5FFFC);
+    color: black;
 }
 """
 DESCRIPTION = """# Bonjour Dans le chat du consentement
 Mistral-7B Instruct Demo
 """
+MAX_INPUT_TOKEN_LENGTH = 4096  # just a default
+# Define model/tokenizer at the top so they're visible in all scopes
+tokenizer = None
+model = None
+# Try to load the model only if GPU is available
 if torch.cuda.is_available():
     model_id = "mistralai/Mistral-7B-Instruct-v0.3"
     tokenizer = AutoTokenizer.from_pretrained(
         device_map="auto",
         trust_remote_code=True
     )
+else:
+    # Show a warning in the description
+    DESCRIPTION += "\n**Running on CPU** — This model is too large for CPU inference!"
+def generate(message: str, chat_history: list[dict]) -> Iterator[str]:
+    # If there's no GPU (thus no tokenizer/model), return an error
+    if tokenizer is None or model is None:
+        yield "Error: No GPU available. Unable to load Mistral-7B-Instruct."
+        return
     conversation = [*chat_history, {"role": "user", "content": message}]
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
     streamer = TextIteratorStreamer(
         tokenizer,
         timeout=20.0,
         skip_special_tokens=True
     )
+    generate_kwargs = {
+        "input_ids": input_ids,
+        "streamer": streamer,
+        "max_new_tokens": 512,
+        "do_sample": True,
+        "temperature": 0.7,
+        "top_p": 0.9,
+        "repetition_penalty": 1.1,
+    }
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
         outputs.append(text)
         yield "".join(outputs)
 demo = gr.ChatInterface(
     fn=generate,
     description=DESCRIPTION,
     css=CUSTOM_CSS,
+    examples=None,
     type="messages"
 )