Adoetz committed (verified)
Commit 0880f7b
1 Parent(s): fb3ab8e

update app.py

Files changed (1): app.py +15 -3
app.py CHANGED
@@ -3,9 +3,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 # Load your custom model and tokenizer
-MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # Replace with your model's Hugging Face repo ID or local path
+MODEL_NAME = "Qwen/Qwen2.5-1.5B"  # Replace with your model's Hugging Face repo ID or local path
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+# Ensure the model is on the CPU
+model.to("cpu")
 
 def respond(
     message,
@@ -24,13 +27,14 @@ def respond(
     if assistant_msg:
         messages.append({"role": "assistant", "content": assistant_msg})
 
+    # Add the latest user message
     messages.append({"role": "user", "content": message})
 
     # Format the input for the model
     input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
 
     # Generate a response
-    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
+    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # Move inputs to CPU
     outputs = model.generate(
         inputs.input_ids,
         max_new_tokens=max_tokens,
@@ -39,10 +43,18 @@
         do_sample=True,
     )
 
+    # Decode the response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     # Extract only the assistant's response
+    # Split the response by "assistant:" and take the last part
     assistant_response = response.split("assistant:")[-1].strip()
+
+    # Remove any repeated history from the response
+    # This ensures the response doesn't include the entire conversation
+    if "user:" in assistant_response:
+        assistant_response = assistant_response.split("user:")[0].strip()
+
     yield assistant_response
 
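The commit only touches model loading and the respond generator; how respond is exposed to users is not part of this diff. For reference, a minimal sketch of the usual Gradio wiring for a chat Space follows. The extra inputs are assumptions and must line up with respond's actual parameters (history, max tokens, temperature, and so on), so adjust to the real signature in app.py.

import gradio as gr

# Hypothetical wiring (not part of this commit): expose the respond generator as a chat UI.
# The additional_inputs below are assumed; they must match respond's real parameter list.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Because respond yields its result, ChatInterface treats it as a streaming function and displays whatever the generator produces.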