Spaces: Running on Zero
add attention mask
app.py CHANGED
@@ -86,6 +86,12 @@ def load_model(model_name):
     # Load the model and tokenizer using Transformers.
     model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
     tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
+
+    # If the pad token is missing or the same as the eos token, add a new pad token.
+    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
+        tokenizer.add_special_tokens({'pad_token': '<pad>'})
+        model.resize_token_embeddings(len(tokenizer))
+
     LOADED_MODELS[model_name] = (model, tokenizer)
     CURRENT_MODEL_NAME = model_name
     return model, tokenizer
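The pad-token branch above can be tried in isolation. A minimal sketch, assuming "gpt2" as a stand-in checkpoint (it ships without a pad token, so it takes the same branch; the app's own models come from its repo_id list):

from transformers import AutoModelForCausalLM, AutoTokenizer

# "gpt2" is an assumed example checkpoint, not one of the app's models.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
    # Grow the embedding matrix to cover the new token id; without this,
    # the model would index past its vocabulary when it sees '<pad>'.
    model.resize_token_embeddings(len(tokenizer))

assert tokenizer.pad_token == '<pad>'
assert model.get_input_embeddings().num_embeddings == len(tokenizer)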
@@ -158,12 +164,16 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
     model, tokenizer = load_model(model_name)
     # Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
     model = model.to('cuda')
-
-
+
+    # Tokenize the augmented prompt with padding and retrieve the attention mask.
+    encoding = tokenizer(augmented_user_input, return_tensors="pt", padding=True)
+    input_ids = encoding["input_ids"].to('cuda')
+    attention_mask = encoding["attention_mask"].to('cuda')
 
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
+            attention_mask=attention_mask,
             max_new_tokens=max_tokens,
             temperature=temperature,
             top_k=top_k,
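Why the mask matters: with padded inputs, generate() cannot tell pad tokens from real ones unless the attention mask says so, and Transformers warns about unreliable results when the mask is omitted. A runnable sketch of the same pattern, under assumptions (the "gpt2" checkpoint, illustrative prompts, and a CPU fallback so it runs without a GPU):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed example checkpoint and prompts; the app passes a single
# augmented_user_input string instead.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
    model.resize_token_embeddings(len(tokenizer))
# Decoder-only models continue from the right edge, so pad on the left.
tokenizer.padding_side = 'left'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

encoding = tokenizer(["Hi", "A much longer prompt to force padding"],
                     return_tensors="pt", padding=True)
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)  # 1 = real token, 0 = pad

with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=20,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))

For a single prompt, padding=True adds no pad tokens and the mask is all ones; passing it anyway keeps generate() from having to guess the mask from pad_token_id and silences the warning.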