Twelve2five committed
Commit af0160e · verified · 1 Parent(s): 0586d21

Update app.py

Files changed (1): app.py (+45 -36)
app.py CHANGED
@@ -207,54 +207,63 @@ def load_model():
             log.append(f"Alternative loading also failed: {e2}")
             return "\n".join(log)
 
-    # Try to load the tokenizer from the model repository directly
+    # --- Load Tokenizer ---
     progress(0.3, desc="Loading tokenizer...")
     try:
-        # First attempt: Try loading from local path
-        tokenizer = AutoTokenizer.from_pretrained(
-            local_model_path,
-            padding_side="right",
-            use_fast=True,
-        )
-        log.append("Tokenizer loaded from local files")
-    except Exception as e:
-        log.append(f"Could not load tokenizer from local files: {e}")
+        log.append("Loading a compatible tokenizer...")
+        # Use the tokenizer from Meta's official Llama models - should be compatible with Llama 3.2
+        tokenizer_id = "meta-llama/Llama-3-8B"  # This is a reliable source for a Llama tokenizer
 
-        # Second attempt: Try loading directly from HF repo
+        # Try with the specified tokenizer first
         try:
-            log.append("Attempting to load tokenizer directly from Hugging Face...")
             tokenizer = AutoTokenizer.from_pretrained(
-                hf_model_repo_id,
-                padding_side="right",
+                tokenizer_id,
                 use_fast=True,
+                padding_side="right",
+                trust_remote_code=True
             )
-            log.append("Tokenizer loaded from Hugging Face repository")
-        except Exception as e2:
-            # Third attempt: Try loading a compatible tokenizer
-            log.append(f"Could not load tokenizer from repo: {e2}")
-            log.append("Attempting to load a compatible LlamaTokenizer...")
+            log.append(f"Successfully loaded tokenizer from {tokenizer_id}")
+        except Exception as e:
+            log.append(f"Could not load from {tokenizer_id}: {e}")
+            # Fall back to the Llama-2 tokenizer
             try:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    "meta-llama/Llama-2-7b-hf",
+                    use_fast=True,
+                    padding_side="right"
+                )
+                log.append("Loaded Llama-2 tokenizer as fallback")
+            except Exception as e2:
+                # If that fails too, try the most basic option
                 from transformers import LlamaTokenizer
-
-                # Try Meta's standard Llama tokenizer
                 tokenizer = LlamaTokenizer.from_pretrained(
-                    "meta-llama/Llama-2-7b-hf",  # Standard Llama tokenizer
-                    padding_side="right",
-                    use_fast=False,  # Try the Python version
+                    "hf-internal-testing/llama-tokenizer",
+                    use_fast=False,
+                    padding_side="right"
                 )
-                log.append("Loaded a compatible LlamaTokenizer as fallback")
-            except Exception as e3:
-                error_msg = f"Failed to load any compatible tokenizer: {e3}"
-                log.append(error_msg)
-                return "\n".join(log)
+                log.append("Loaded basic Llama tokenizer from testing repo")
+
+        # Set pad token if not already set
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            log.append("Set pad_token to eos_token")
+
+        # Make sure we have the necessary special tokens
+        if tokenizer.bos_token is None:
+            tokenizer.bos_token = "<s>"
+            log.append("Set bos_token to <s>")
+
+        if tokenizer.eos_token is None:
+            tokenizer.eos_token = "</s>"
+            log.append("Set eos_token to </s>")
+
+        log.append(f"Loaded tokenizer vocab size: {len(tokenizer)}")
+
+    except Exception as e:
+        error_msg = f"All attempts to load a tokenizer failed: {e}"
+        log.append(error_msg)
+        return "\n".join(log)
 
-    # Set pad token if not already set
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        log.append("Set pad_token to eos_token")
-
-    print(f"Loaded tokenizer vocabulary size: {len(tokenizer)}")
-
     # Print information about input embeddings
     print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
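A note on this fallback chain: the meta-llama repos are gated on the Hub, and the Llama 3 8B checkpoint was published as meta-llama/Meta-Llama-3-8B, so the shorter meta-llama/Llama-3-8B id may not resolve at all; in an unauthenticated Space, the ungated hf-internal-testing/llama-tokenizer branch is the one most likely to actually run. Vocabulary size is the catch: the Llama-2-style tokenizers carry about 32,000 tokens while Llama 3.x models expect 128,256 rows in the embedding table, which is presumably why the code logs the tokenizer vocab size next to the embedding shape. A minimal sketch of that compatibility check follows; the helper name load_tokenizer_with_fallbacks and the model id your-org/your-llama-checkpoint are hypothetical placeholders, not part of the commit:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    def load_tokenizer_with_fallbacks(candidate_ids):
        """Return (tokenizer, repo_id) for the first candidate that loads."""
        last_err = None
        for repo_id in candidate_ids:
            try:
                tok = AutoTokenizer.from_pretrained(
                    repo_id,
                    use_fast=True,
                    padding_side="right",
                )
                return tok, repo_id
            except Exception as err:  # gated repo, unknown id, network failure...
                last_err = err
        raise RuntimeError(f"no tokenizer could be loaded: {last_err}")

    tokenizer, source = load_tokenizer_with_fallbacks([
        "meta-llama/Meta-Llama-3-8B",           # gated: needs accepted license + HF token
        "hf-internal-testing/llama-tokenizer",  # ungated, loads without credentials
    ])
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # same default the commit applies

    # your-org/your-llama-checkpoint is a placeholder for the Space's real model.
    model = AutoModelForCausalLM.from_pretrained("your-org/your-llama-checkpoint")
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        # Grow the embedding table so no token id indexes past its end.
        model.resize_token_embeddings(len(tokenizer))
    print(f"tokenizer from {source}: {len(tokenizer)} tokens, "
          f"embedding rows: {embedding_rows}")

Resizing only papers over a size mismatch: if the fallback tokenizer's vocabulary differs from the one the model was trained with, the same ids will decode to different text, so the gated-but-matching tokenizer is worth the authentication step.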