Twelve2five committed
Commit e302645 · verified · 1 Parent(s): 154b3c1

Update app.py

Files changed (1)
  1. app.py +58 -23
app.py CHANGED
@@ -373,14 +373,28 @@ def train_model(
     model_repo_name,
     dataset_repo_name,
     epochs=1,
-    batch_size=4,  # Increased for A100
-    grad_accum_steps=4,
+    batch_size=4,
+    grad_accum_steps=2,
     learning_rate=2e-4,
     progress=gr.Progress()
 ):
     progress(0, desc="Setting up environment...")
     log = []
 
+    # Install sentencepiece if it's not already installed
+    progress(0.02, desc="Installing required dependencies...")
+    try:
+        import sentencepiece
+        log.append("SentencePiece already installed")
+    except ImportError:
+        log.append("Installing SentencePiece...")
+        try:
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
+            log.append("SentencePiece installed successfully")
+        except Exception as e:
+            log.append(f"Error installing SentencePiece: {e}")
+            # Continue anyway, we'll try other tokenizer approaches if this fails
+
     # Clean up any existing model files to save space
     if os.path.exists("./model_files"):
         try:
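Note on the hunk above: with batch_size=4 and grad_accum_steps=2, the effective batch size is 4 × 2 = 8, down from 4 × 4 = 16 before this commit. The SentencePiece check is the usual install-at-runtime pattern (it assumes subprocess and sys are imported near the top of app.py). A standalone sketch of the same idea; the ensure_package helper is hypothetical, not part of this commit:

import importlib.util
import subprocess
import sys

def ensure_package(name: str) -> bool:
    # True if `name` is importable; otherwise try to pip-install it.
    # find_spec tests for presence without actually importing the module.
    if importlib.util.find_spec(name) is not None:
        return True
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", name])
        return True
    except subprocess.CalledProcessError:
        return False

ensure_package("sentencepiece")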
 
@@ -508,45 +522,66 @@ def train_model(
         torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
     )
 
-    # --- Load Tokenizer (from a compatible model) ---
-    # Following the pattern from reference code
+    # --- Load Tokenizer (using public models) ---
     progress(0.3, desc="Loading tokenizer...")
 
     # Try to load a compatible tokenizer
     try:
-        # First try loading from standard Llama 3 model
+        # First try TinyLlama which is open and uses Llama tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
-            "meta-llama/Llama-3-8B",  # Using standard Llama 3 tokenizer
+            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Public model with Llama tokenizer
             padding_side="right",
             use_fast=True,
-            trust_remote_code=True
         )
-        log.append("Loaded tokenizer from meta-llama/Llama-3-8B")
+        log.append("Loaded tokenizer from TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     except Exception as e1:
-        log.append(f"Couldn't load Llama-3 tokenizer: {e1}")
+        log.append(f"Couldn't load TinyLlama tokenizer: {e1}")
         try:
-            # Fallback to Llama 2
+            # Try Phi-2 which is also public
             tokenizer = AutoTokenizer.from_pretrained(
-                "meta-llama/Llama-2-7b-hf",
+                "microsoft/phi-2",
                 padding_side="right",
                 use_fast=True
             )
-            log.append("Loaded Llama-2 tokenizer as fallback")
+            log.append("Loaded Phi-2 tokenizer as fallback")
         except Exception as e2:
-            log.append(f"Couldn't load Llama-2 tokenizer: {e2}")
-            # Final fallback
-            from transformers import LlamaTokenizer
-            tokenizer = LlamaTokenizer.from_pretrained(
-                "hf-internal-testing/llama-tokenizer",
-                padding_side="right"
-            )
-            log.append("Loaded testing Llama tokenizer as final fallback")
+            log.append(f"Couldn't load Phi-2 tokenizer: {e2}")
+            try:
+                # Try CodeLlama which is popular and public
+                tokenizer = AutoTokenizer.from_pretrained(
+                    "codellama/CodeLlama-7b-hf",
+                    padding_side="right"
+                )
+                log.append("Loaded CodeLlama tokenizer as fallback")
+            except Exception as e3:
+                log.append(f"Couldn't load any standard tokenizers. Using a basic tokenizer instead.")
+
+                # Create a minimal tokenizer that works with our format
+                # Assuming the vocab size is 2048 (from the RVQ token count)
+                from transformers import PreTrainedTokenizerFast
+
+                # Create a very basic tokenizer
+                tokenizer = PreTrainedTokenizerFast(
+                    tokenizer_file=None,  # No file needed
+                    bos_token="<s>",
+                    eos_token="</s>",
+                    unk_token="<unk>",
+                    pad_token="<pad>",
+                    model_max_length=2048  # Safe default value
+                )
+
+                # Add vocabulary - creating a minimal vocab for the RVQ tokens
+                vocab = {f"<token_{i}>": i for i in range(model.config.vocab_size)}
+                tokenizer.add_tokens(list(vocab.keys()))
+                log.append(f"Created basic tokenizer with {len(tokenizer)} tokens")
 
-    # Set pad token and ensure it's usable
+    # Set pad token if not already set
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token is not None else "<pad>"
+        log.append("Set pad_token to eos_token or <pad>")
 
-    log.append(f"Loaded model vocab size: {model.config.vocab_size}")
+    log.append(f"Tokenizer loaded with vocab size: {len(tokenizer)}")
+    log.append(f"Model vocab size: {model.config.vocab_size}")
     log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
 
     # --- QLoRA Preparation ---
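Note on the final fallback above: PreTrainedTokenizerFast raises a ValueError when it is given neither a tokenizer_object nor a tokenizer_file to build its backend from, so the tokenizer_file=None branch will likely fail before add_tokens is ever reached. A minimal sketch of a fallback that constructs a real backend first, assuming the placeholder <token_i> vocabulary and the 2048-entry RVQ codebook mentioned in the commit's comments:

from tokenizers import Tokenizer, models, pre_tokenizers
from transformers import PreTrainedTokenizerFast

vocab_size = 2048  # assumption: the RVQ token count referenced above
vocab = {f"<token_{i}>": i for i in range(vocab_size)}
for offset, special in enumerate(["<s>", "</s>", "<unk>", "<pad>"]):
    vocab[special] = vocab_size + offset

# Give PreTrainedTokenizerFast a real backend to wrap: a word-level model,
# since each RVQ code is a single whitespace-delimited token here.
backend = Tokenizer(models.WordLevel(vocab, unk_token="<unk>"))
backend.pre_tokenizer = pre_tokenizers.Whitespace()

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=backend,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    model_max_length=2048,
)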