Spaces:

Mahavaury2
/

mistralai-Mistral-7B-Instruct-v0.3

Running

App Files Files Community

Mahavaury2 committed on Jan 22

Commit

c566ded

verified ·

1 Parent(s): 0387457

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -48

app.py CHANGED Viewed

@@ -1,61 +1,159 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-# 1) Define pastel gradient CSS
-css = """
 .gradio-container {
     background: linear-gradient(to right, #FFDEE9, #B5FFFC);
 }
 """
-title = "Bonjour Dans le chat du consentement"
-# 2) Load the Mistral model & tokenizer from HF Hub
-model_id = "mistralai/Mistral-7B-Instruct-v0.3"
-# If you're on a GPU Space, you can do:
-#    device_map = "auto"
-#    torch_dtype = torch.bfloat16
-# If you're on a CPU-only Space, remove those arguments or set device_map="cpu"
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",          # "auto" if you have GPU
-    torch_dtype=torch.bfloat16, # for GPU. Remove or use float32 on CPU
-    trust_remote_code=True
-)
-# 3) Create a text-generation pipeline
-generate_text = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_length=512,        # adjust as needed
-    temperature=0.7,       # adjust as needed
-    do_sample=True
-)
-def mistral_inference(prompt):
     """
-    Passes user prompt to the pipeline and returns the generated text.
-    We'll strip any special tokens and limit the output.
     """
-    # The pipeline returns a list of dicts [{"generated_text": "..."}]
-    outputs = generate_text(prompt)
-    text_out = outputs[0]["generated_text"]
-    return text_out
-# 4) Build the Gradio interface with a pastel background & greeting
-with gr.Blocks(css=css) as demo:
-    gr.Markdown(f"<h1 style='text-align:center;'>{title}</h1>")
-    user_input = gr.Textbox(label="Entrez votre message ici:", lines=3)
-    output = gr.Textbox(label="Réponse du Modèle:", lines=5)
-    send_button = gr.Button("Envoyer")
-    # Link the button to the inference function
-    send_button.click(fn=mistral_inference, inputs=user_input, outputs=output)
-# 5) Launch the app
 if __name__ == "__main__":
-    demo.launch()

+#!/usr/bin/env python
+import os
+from collections.abc import Iterator
+from threading import Thread
 import gradio as gr
+import spaces
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+#
+# 1) Custom Pastel Gradient CSS
+#
+CUSTOM_CSS = """
 .gradio-container {
     background: linear-gradient(to right, #FFDEE9, #B5FFFC);
 }
 """
+#
+# 2) Description: French greeting plus a short model note (a CPU warning is appended below)
+#
+DESCRIPTION = """# Bonjour Dans le chat du consentement
+Mistral-7B Instruct Demo
+"""
+if not torch.cuda.is_available():
+    DESCRIPTION += (
+        "\n<p style='color:red;'>Running on CPU - This is likely too large to run effectively.</p>"
+    )
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+#
+# 3) Load Mistral-7B Instruct (gated model: access must be granted first; loaded only when a GPU is available)
+#
+if torch.cuda.is_available():
+    model_id = "mistralai/Mistral-7B-Instruct-v0.3"
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_id,
+        trust_remote_code=True  # Might be needed for custom code
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+def generate(
+    message: str,
+    chat_history: list[dict],
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> Iterator[str]:
     """
+    This function handles streaming chat text as the model generates it.
+    Uses the tokenizer's 'apply_chat_template' to build the chat-style prompt.
     """
+    conversation = [*chat_history, {"role": "user", "content": message}]
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(
+            f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens."
+        )
+    input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=20.0,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        # Stream partial output as it's generated
+        yield "".join(outputs)
+#
+# 4) Build the Chat Interface with extra sliders
+#
+demo = gr.ChatInterface(
+    fn=generate,
+    description=DESCRIPTION,
+    css=CUSTOM_CSS,  # Use our pastel gradient
+    additional_inputs=[
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Hello there! How are you doing?"],
+        ["Can you explain briefly what the Python programming language is?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+    ],
+    type="messages",
+)
 if __name__ == "__main__":
+    demo.queue(max_size=20).launch()