michailroussos committed
Commit 0787acc · 1 Parent(s): 029560f
Files changed (1)
  1. app.py +58 -31
app.py CHANGED
@@ -1,30 +1,50 @@
 import gradio as gr
-from transformers import TextStreamer
 from unsloth import FastLanguageModel
+from transformers import AutoTokenizer
+import torch
 
-# Define constants
+# Load the model and tokenizer
+model_name_or_path = "michailroussos/model_llama_8d"
 max_seq_length = 2048
 dtype = None
-model_name_or_path = "michailroussos/model_llama_8d"
 
-# Load the model and tokenizer
+print("Loading model...")
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name_or_path,
     max_seq_length=max_seq_length,
     dtype=dtype,
     load_in_4bit=True,
 )
-
-# Optimize model for inference
 FastLanguageModel.for_inference(model)
+print("Model loaded successfully!")
 
-# Function to generate a response
-def chat_with_model(user_message, chat_history=None):
+# Define response function
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+):
     try:
-        # Prepare the input messages
-        messages = [{"role": "user", "content": user_message}]
-
-        # Tokenize and prepare inputs for the model
+        # Debug: Print inputs
+        print("\n[DEBUG] Incoming user message:", message)
+        print("[DEBUG] Chat history before appending:", history)
+
+        # Prepare messages
+        messages = [{"role": "system", "content": system_message}]
+        for user, assistant in history:
+            if user:
+                messages.append({"role": "user", "content": user})
+            if assistant:
+                messages.append({"role": "assistant", "content": assistant})
+        messages.append({"role": "user", "content": message})
+
+        # Debug: Print prepared messages
+        print("[DEBUG] Prepared messages:", messages)
+
+        # Tokenize and prepare inputs
         inputs = tokenizer.apply_chat_template(
             messages,
             tokenize=True,
@@ -32,36 +52,43 @@ def chat_with_model(user_message, chat_history=None):
             return_tensors="pt",
         ).to("cuda")
 
+        # Debug: Print tokenized inputs
+        print("[DEBUG] Tokenized inputs:", inputs)
+
         # Generate response
         output_ids = model.generate(
             input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],  # Ensure attention_mask is included
-            streamer=None,  # Collect output as tensor
-            max_new_tokens=128,
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
             use_cache=True,
-            temperature=1.5,
-            min_p=0.1,
         )
 
-        # Decode the generated tokens into a string
-        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Decode response
+        response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+        print("[DEBUG] Decoded response:", response)
+
+        # Update history
+        history.append((message, response))
+        return response, history
 
-        # Append the response to the chat history
-        if chat_history is None:
-            chat_history = []
-        chat_history.append((user_message, response))
-        return "", chat_history
     except Exception as e:
-        return f"Error: {str(e)}", chat_history
+        print("[ERROR] Exception in respond function:", str(e))
+        return f"Error: {str(e)}", history
+
 
-# Create the chat interface
+# Create ChatInterface
 demo = gr.ChatInterface(
-    fn=chat_with_model,
-    chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
-    title="Hugging Face Chat Model",
-    description="Chat with a Hugging Face model using FastLanguageModel.",
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+    ],
 )
 
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
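Note on the decoding step in the new respond(): tokenizer.decode(output_ids[0], ...) decodes the full sequence returned by model.generate, which includes the prompt (system message and prior turns) as well as the newly generated reply. If only the new reply should be shown in the chat window, a common pattern is to slice off the prompt tokens before decoding. A minimal sketch under that assumption; the helper name decode_new_tokens is illustrative and not part of this commit:

def decode_new_tokens(tokenizer, inputs, output_ids):
    # `inputs` is the dict produced by tokenizer.apply_chat_template(...) and
    # `output_ids` the tensor returned by model.generate(...), as in respond().
    prompt_length = inputs["input_ids"].shape[-1]
    # Keep only the tokens generated after the prompt, then decode them.
    new_token_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()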