michailroussos committed on
Commit 7c34777 · 1 Parent(s): e82c023
Files changed (1)
  1. app.py +32 -64
app.py CHANGED
@@ -1,13 +1,14 @@
import gradio as gr
from unsloth import FastLanguageModel
-from transformers import AutoTokenizer
import torch

-# Load the model and tokenizer
-model_name_or_path = "michailroussos/model_llama_8d"
max_seq_length = 2048
dtype = None

print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
@@ -15,85 +16,52 @@ model, tokenizer = FastLanguageModel.from_pretrained(
    dtype=dtype,
    load_in_4bit=True,
)
-FastLanguageModel.for_inference(model)
print("Model loaded successfully!")

-# Define response function
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message: str,
-    max_tokens: int,
-    temperature: float,
-    top_p: float,
-):
    try:
-        # Debug: Print inputs
-        print("\n[DEBUG] Incoming user message:", message)
-        print("[DEBUG] Chat history before appending:", history)
-
-        # Prepare messages
-        messages = [{"role": "system", "content": system_message}]
-        for user, assistant in history:
-            if user:
-                messages.append({"role": "user", "content": user})
-            if assistant:
-                messages.append({"role": "assistant", "content": assistant})
        messages.append({"role": "user", "content": message})

-        # Debug: Print prepared messages
-        print("[DEBUG] Prepared messages:", messages)
-
-        # Tokenize and prepare inputs
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
-        )
-
-        # Ensure tensor shapes are correct
-        input_ids = inputs["input_ids"].squeeze(0).to("cuda")
-        attention_mask = inputs["attention_mask"].squeeze(0).to("cuda")

-        # Debug: Print tokenized inputs
-        print("[DEBUG] Tokenized input_ids shape:", input_ids.shape)
-        print("[DEBUG] Tokenized attention_mask shape:", attention_mask.shape)
-
-        # Generate response
-        output_ids = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=max_tokens,
            temperature=temperature,
-            top_p=top_p,
            use_cache=True,
        )
-
-        # Decode response
-        response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
-        print("[DEBUG] Decoded response:", response)
-
-        # Update history
-        history.append((message, response))
-        return response, history
-
    except Exception as e:
-        print("[ERROR] Exception in respond function:", str(e))
-        return f"Error: {str(e)}", history
-

-# Create ChatInterface
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

-# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
 
import gradio as gr
+from transformers import TextStreamer
from unsloth import FastLanguageModel
import torch

+# Model Configuration
max_seq_length = 2048
dtype = None
+model_name_or_path = "michailroussos/model_llama_8d"

+# Load Model and Tokenizer
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
+FastLanguageModel.for_inference(model)  # Enable faster inference
print("Model loaded successfully!")

+# Gradio Response Function
+def respond(message, max_new_tokens, temperature, system_message=""):
    try:
+        # Prepare input messages
+        messages = [{"role": "system", "content": system_message}] if system_message else []
        messages.append({"role": "user", "content": message})

+        # Tokenize inputs
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
+        ).to("cuda")

+        # Stream response
+        response = []
+        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+        _ = model.generate(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_new_tokens,
            temperature=temperature,
            use_cache=True,
+            streamer=text_streamer,
        )
+        return "".join(response)
    except Exception as e:
+        return f"Error: {str(e)}"

+# Gradio UI
+demo = gr.Interface(
+    fn=respond,
+    inputs=[
+        gr.Textbox(label="Your Message", placeholder="Enter your prompt here..."),
+        gr.Slider(minimum=1, maximum=512, step=1, value=128, label="Max New Tokens"),
+        gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
+        gr.Textbox(label="System Message", placeholder="Optional system instructions."),
    ],
+    outputs="text",
+    title="LLama-based Chatbot",
+    description="Interact with the model. Enter a prompt and receive a response.",
)

if __name__ == "__main__":
    demo.launch(share=True)
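
Note on the committed respond(): TextStreamer only writes generated tokens to stdout, so the response list is never populated and "".join(response) hands an empty string back to Gradio. Also, depending on the installed transformers version, apply_chat_template(..., return_tensors="pt") returns a bare tensor rather than a dict unless return_dict=True is passed, in which case inputs["input_ids"] raises. Below is a minimal sketch (not part of the commit) of a respond() that returns the generated text, assuming model and tokenizer are loaded exactly as above and a transformers release that supports return_dict=True:

```python
# Hypothetical reworked respond() -- a sketch, not the committed code.
# Assumes `model` and `tokenizer` are already loaded as in app.py above,
# and that apply_chat_template accepts return_dict=True.
import torch


def respond(message, max_new_tokens, temperature, system_message=""):
    messages = [{"role": "system", "content": system_message}] if system_message else []
    messages.append({"role": "user", "content": message})

    # return_dict=True yields {"input_ids": ..., "attention_mask": ...}
    # instead of a bare tensor, so the dict-style indexing below works.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,          # sample so the temperature slider has an effect
            temperature=temperature,
            use_cache=True,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
```

For token-by-token streaming in the UI itself, a generator that yields partial text (for example via transformers' TextIteratorStreamer running generate on a background thread) would be needed rather than TextStreamer.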