Luigi committed
Commit b6b3940 · 1 Parent(s): 4731160

add attention mask

Files changed (1): app.py +12 -2
app.py CHANGED
@@ -86,6 +86,12 @@ def load_model(model_name):
     # Load the model and tokenizer using Transformers.
     model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
+
+    # If the pad token is missing or the same as the eos token, add a new pad token.
+    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
+        tokenizer.add_special_tokens({'pad_token': '<pad>'})
+        model.resize_token_embeddings(len(tokenizer))
+
     LOADED_MODELS[model_name] = (model, tokenizer)
     CURRENT_MODEL_NAME = model_name
     return model, tokenizer
@@ -158,12 +164,16 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
     model, tokenizer = load_model(model_name)
     # Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
     model = model.to('cuda')
-    # Tokenize the augmented prompt and move input tensors to GPU.
-    input_ids = tokenizer(augmented_user_input, return_tensors="pt").input_ids.to('cuda')
+
+    # Tokenize the augmented prompt with padding and retrieve the attention mask.
+    encoding = tokenizer(augmented_user_input, return_tensors="pt", padding=True)
+    input_ids = encoding["input_ids"].to('cuda')
+    attention_mask = encoding["attention_mask"].to('cuda')
 
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
+            attention_mask=attention_mask,
             max_new_tokens=max_tokens,
             temperature=temperature,
             top_k=top_k,
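
For context, the snippet below replays the changed generation path as a standalone script. It is a minimal sketch, not the app itself: the repo id ("gpt2"), the prompt, and the sampling values are placeholders for what the app reads from selected_model["repo_id"] and its UI controls; pad_token_id and do_sample=True are assumed additions (not in this diff) so the example runs cleanly on CPU with temperature/top_k honored.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "gpt2"  # placeholder; the app uses selected_model["repo_id"]
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    # As in the commit: give the tokenizer a dedicated pad token when it is
    # missing or aliased to eos, and grow the embedding matrix to match.
    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
        model.resize_token_embeddings(len(tokenizer))

    prompt = "Hello!"  # stands in for augmented_user_input
    encoding = tokenizer(prompt, return_tensors="pt", padding=True)

    with torch.no_grad():
        output_ids = model.generate(
            encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
            pad_token_id=tokenizer.pad_token_id,  # assumed addition; not in the diff
            max_new_tokens=64,
            temperature=0.7,
            top_k=40,
            do_sample=True,  # assumed so temperature/top_k take effect
        )

    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Passing the attention mask (together with a pad token distinct from eos) matters because generate otherwise has to guess which trailing tokens are padding and emits the "attention mask and the pad token id were not set" warning; for a single unpadded prompt the mask is all ones anyway, but with batched or padded inputs the explicit mask keeps the outputs consistent.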