update app.py
app.py CHANGED
@@ -3,9 +3,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 # Load your custom model and tokenizer
-MODEL_NAME = "Qwen/Qwen2.5-1.5B
+MODEL_NAME = "Qwen/Qwen2.5-1.5B"  # Replace with your model's Hugging Face repo ID or local path
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+# Ensure the model is on the CPU
+model.to("cpu")
 
 def respond(
     message,
@@ -24,13 +27,14 @@ def respond(
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
 
+    # Add the latest user message
     messages.append({"role": "user", "content": message})
 
     # Format the input for the model
     input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
 
     # Generate a response
-    inputs = tokenizer(input_text, return_tensors="pt").to(
+    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # Move inputs to CPU
     outputs = model.generate(
         inputs.input_ids,
         max_new_tokens=max_tokens,
@@ -39,10 +43,18 @@ def respond(
         do_sample=True,
     )
 
+    # Decode the response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     # Extract only the assistant's response
+    # Split the response by "assistant:" and take the last part
     assistant_response = response.split("assistant:")[-1].strip()
+
+    # Remove any repeated history from the response
+    # This ensures the response doesn't include the entire conversation
+    if "user:" in assistant_response:
+        assistant_response = assistant_response.split("user:")[0].strip()
+
     yield assistant_response
 
 
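A note on the device handling: the commit pins both the model and the tokenized inputs to the CPU with matching .to("cpu") calls. A minimal sketch of the same setup, assuming a Space that may or may not have a GPU, picks the device once so the two calls cannot drift apart; the short prompt at the end is only for illustration:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Choose the device once; model and inputs then always agree
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

inputs = tokenizer("user: Hi there", return_tensors="pt").to(device)
outputs = model.generate(inputs.input_ids, max_new_tokens=32, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))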
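To sanity-check the new history-trimming logic outside the app, the snippet below replays the two split() calls the patch adds to respond(). The extract_assistant_reply helper and the sample decoded string are hypothetical, for illustration only:

# Hypothetical helper mirroring the extraction logic added in this commit
def extract_assistant_reply(decoded: str) -> str:
    # Keep only the text after the last "assistant:" marker
    reply = decoded.split("assistant:")[-1].strip()
    # Drop any follow-on "user:" turn the model may have invented
    if "user:" in reply:
        reply = reply.split("user:")[0].strip()
    return reply

# Sample decoded output: the prompt echoed back plus a fabricated extra turn
decoded = (
    "system: You are a friendly chatbot.\n"
    "user: Hi there\n"
    "assistant: Hello! How can I help you today?\n"
    "user: (fabricated continuation)"
)
print(extract_assistant_reply(decoded))  # -> Hello! How can I help you today?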