Spaces:

michailroussos
/

ID2223_9D_withGPU

Runtime error

App Files Files Community

michailroussos committed on Dec 9, 2024

Commit

04cf79a

1 Parent(s): 07df911

debugging

Browse files

Files changed (1) hide show

app.py +22 -3

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import torch
 # Load the model and tokenizer locally
 max_seq_length = 2048
-model_name_or_path = "unsloth/Llama-3.2-3B-Instruct"
 # Load model and tokenizer using unsloth
 model, tokenizer = FastLanguageModel.from_pretrained(
@@ -16,17 +16,25 @@ FastLanguageModel.for_inference(model)  # Enable optimized inference
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Start by preparing only the conversation history (user-assistant pairs)
     messages = []
     if history:
         for entry in history:
             messages.append({"role": "user", "content": entry["user"]})
             messages.append({"role": "assistant", "content": entry["assistant"]})
     # Add the user's new message to the list
     messages.append({"role": "user", "content": message})
     # Tokenize the input (prepare the data for the model)
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
@@ -34,8 +42,12 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         return_tensors="pt",
     ).to("cuda" if torch.cuda.is_available() else "cpu")
     # Generate the response
     attention_mask = inputs.ne(tokenizer.pad_token_id).long()
     generated_tokens = model.generate(
         input_ids=inputs,
         attention_mask=attention_mask,
@@ -45,19 +57,26 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         top_p=top_p,
     )
     response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
     # Update the conversation history with the new user-assistant pair
     if history is None:
         history = []
     history.append({"user": message, "assistant": response})
-    # Prepare the history for Gradio
     formatted_history = []
     for entry in history:
         formatted_history.append({"role": "user", "content": entry["user"]})
         formatted_history.append({"role": "assistant", "content": entry["assistant"]})
     # Return the formatted history for Gradio to display
     return formatted_history

 # Load the model and tokenizer locally
 max_seq_length = 2048
+model_name_or_path = "michailroussos/model_llama_8d"
 # Load model and tokenizer using unsloth
 model, tokenizer = FastLanguageModel.from_pretrained(
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
+    # Print to show the inputs at the start
+    print(f"Received message: {message}")
+    print(f"Current history: {history}")
+    # Prepare the messages for the model: Exclude the system message for now
     messages = []
     if history:
         for entry in history:
+            print(f"Adding user message to history: {entry['user']}")
+            print(f"Adding assistant message to history: {entry['assistant']}")
             messages.append({"role": "user", "content": entry["user"]})
             messages.append({"role": "assistant", "content": entry["assistant"]})
     # Add the user's new message to the list
+    print(f"Adding current user message: {message}")
     messages.append({"role": "user", "content": message})
     # Tokenize the input (prepare the data for the model)
+    print("Preparing the input for the model...")
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         return_tensors="pt",
     ).to("cuda" if torch.cuda.is_available() else "cpu")
+    # Print the tokenized inputs
+    print(f"Tokenized inputs: {inputs}")
     # Generate the response
     attention_mask = inputs.ne(tokenizer.pad_token_id).long()
+    print(f"Attention mask: {attention_mask}")
     generated_tokens = model.generate(
         input_ids=inputs,
         attention_mask=attention_mask,
         top_p=top_p,
     )
+    # Decode the generated response
     response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+    print(f"Generated response: {response}")
     # Update the conversation history with the new user-assistant pair
     if history is None:
         history = []
     history.append({"user": message, "assistant": response})
+    # Prepare the history for Gradio: Formatting it correctly
     formatted_history = []
     for entry in history:
+        print(f"Formatting user message for history: {entry['user']}")
+        print(f"Formatting assistant message for history: {entry['assistant']}")
         formatted_history.append({"role": "user", "content": entry["user"]})
         formatted_history.append({"role": "assistant", "content": entry["assistant"]})
+    # Print the final formatted history before returning
+    print(f"Formatted history for Gradio: {formatted_history}")
     # Return the formatted history for Gradio to display
     return formatted_history