michailroussos committed
Commit 04cf79a · 1 Parent(s): 07df911
Files changed (1)
  1. app.py +22 -3
app.py CHANGED
@@ -4,7 +4,7 @@ import torch
 
 # Load the model and tokenizer locally
 max_seq_length = 2048
-model_name_or_path = "unsloth/Llama-3.2-3B-Instruct"
+model_name_or_path = "michailroussos/model_llama_8d"
 
 # Load model and tokenizer using unsloth
 model, tokenizer = FastLanguageModel.from_pretrained(
@@ -16,17 +16,25 @@ FastLanguageModel.for_inference(model) # Enable optimized inference
 
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Start by preparing only the conversation history (user-assistant pairs)
+    # Print to show the inputs at the start
+    print(f"Received message: {message}")
+    print(f"Current history: {history}")
+
+    # Prepare the messages for the model: Exclude the system message for now
     messages = []
     if history:
         for entry in history:
+            print(f"Adding user message to history: {entry['user']}")
+            print(f"Adding assistant message to history: {entry['assistant']}")
             messages.append({"role": "user", "content": entry["user"]})
             messages.append({"role": "assistant", "content": entry["assistant"]})
 
     # Add the user's new message to the list
+    print(f"Adding current user message: {message}")
     messages.append({"role": "user", "content": message})
 
     # Tokenize the input (prepare the data for the model)
+    print("Preparing the input for the model...")
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
@@ -34,8 +42,12 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         return_tensors="pt",
     ).to("cuda" if torch.cuda.is_available() else "cpu")
 
+    # Print the tokenized inputs
+    print(f"Tokenized inputs: {inputs}")
+
     # Generate the response
     attention_mask = inputs.ne(tokenizer.pad_token_id).long()
+    print(f"Attention mask: {attention_mask}")
     generated_tokens = model.generate(
         input_ids=inputs,
         attention_mask=attention_mask,
@@ -45,19 +57,26 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         top_p=top_p,
     )
 
+    # Decode the generated response
     response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+    print(f"Generated response: {response}")
 
     # Update the conversation history with the new user-assistant pair
     if history is None:
         history = []
     history.append({"user": message, "assistant": response})
 
-    # Prepare the history for Gradio
+    # Prepare the history for Gradio: Formatting it correctly
     formatted_history = []
     for entry in history:
+        print(f"Formatting user message for history: {entry['user']}")
+        print(f"Formatting assistant message for history: {entry['assistant']}")
         formatted_history.append({"role": "user", "content": entry["user"]})
         formatted_history.append({"role": "assistant", "content": entry["assistant"]})
 
+    # Print the final formatted history before returning
+    print(f"Formatted history for Gradio: {formatted_history}")
+
     # Return the formatted history for Gradio to display
     return formatted_history
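The first hunk truncates at the opening of the FastLanguageModel.from_pretrained(...) call. A minimal sketch of how that loading block typically looks with unsloth follows; everything beyond model_name and max_seq_length (the dtype and load_in_4bit values) is an assumption based on common unsloth usage, not something shown in this commit.

from unsloth import FastLanguageModel

max_seq_length = 2048
model_name_or_path = "michailroussos/model_llama_8d"

# Sketch of the truncated call; dtype and load_in_4bit are assumptions,
# not values taken from this commit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=None,          # assumption: let unsloth auto-select fp16/bf16
    load_in_4bit=True,   # assumption: common default in unsloth examples
)
FastLanguageModel.for_inference(model)  # enable optimized inference (appears in the hunk context)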
 
 
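The commit does not show how respond() is wired into the Gradio UI. A hypothetical sketch follows, assuming a gr.Blocks layout with a messages-format gr.Chatbot, since respond() returns the full history as {"role", "content"} dicts rather than just the new reply; all component labels, ranges, and defaults are placeholders, not taken from this commit.

import gradio as gr

# Hypothetical UI wiring for respond(); not part of this commit.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Message")
    system_message = gr.Textbox(value="You are a helpful assistant.", label="System message")
    max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")
    temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
    state = gr.State([])  # holds the [{"user": ..., "assistant": ...}] pairs respond() appends to

    # respond() returns formatted_history, which feeds the messages-format chatbot directly
    msg.submit(
        respond,
        inputs=[msg, state, system_message, max_tokens, temperature, top_p],
        outputs=[chatbot],
    )

demo.launch()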