update app.py
app.py CHANGED
@@ -3,9 +3,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 # Load your custom model and tokenizer
-MODEL_NAME = "Qwen/Qwen2.5-1.5B
+MODEL_NAME = "Qwen/Qwen2.5-1.5B"  # Replace with your model's Hugging Face repo ID or local path
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+# Ensure the model is on the CPU
+model.to("cpu")
 
 def respond(
     message,
@@ -24,13 +27,14 @@ def respond(
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
 
+    # Add the latest user message
     messages.append({"role": "user", "content": message})
 
     # Format the input for the model
     input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
 
     # Generate a response
-    inputs = tokenizer(input_text, return_tensors="pt").to(
+    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # Move inputs to CPU
     outputs = model.generate(
         inputs.input_ids,
         max_new_tokens=max_tokens,
@@ -39,10 +43,18 @@ def respond(
         do_sample=True,
     )
 
+    # Decode the response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     # Extract only the assistant's response
+    # Split the response by "assistant:" and take the last part
     assistant_response = response.split("assistant:")[-1].strip()
+
+    # Remove any repeated history from the response
+    # This ensures the response doesn't include the entire conversation
+    if "user:" in assistant_response:
+        assistant_response = assistant_response.split("user:")[0].strip()
+
     yield assistant_response
 
 
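A note on the device handling: the commit pins both the model and the tokenized inputs to the CPU with matching .to("cpu") calls. A minimal sketch of the same setup, assuming a Space that may or may not have a GPU, picks the device once so the two calls cannot drift apart; the short prompt at the end is only for illustration:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Choose the device once; model and inputs then always agree
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

inputs = tokenizer("user: Hi there", return_tensors="pt").to(device)
outputs = model.generate(inputs.input_ids, max_new_tokens=32, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))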
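To sanity-check the new history-trimming logic outside the app, the snippet below replays the two split() calls the patch adds to respond(). The extract_assistant_reply helper and the sample decoded string are hypothetical, for illustration only:

# Hypothetical helper mirroring the extraction logic added in this commit
def extract_assistant_reply(decoded: str) -> str:
    # Keep only the text after the last "assistant:" marker
    reply = decoded.split("assistant:")[-1].strip()
    # Drop any follow-on "user:" turn the model may have invented
    if "user:" in reply:
        reply = reply.split("user:")[0].strip()
    return reply

# Sample decoded output: the prompt echoed back plus a fabricated extra turn
decoded = (
    "system: You are a friendly chatbot.\n"
    "user: Hi there\n"
    "assistant: Hello! How can I help you today?\n"
    "user: (fabricated continuation)"
)
print(extract_assistant_reply(decoded))  # -> Hello! How can I help you today?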