Update app.py
app.py
CHANGED
@@ -373,14 +373,28 @@ def train_model(
     model_repo_name,
     dataset_repo_name,
     epochs=1,
-    batch_size=4,
-    grad_accum_steps=
+    batch_size=4,
+    grad_accum_steps=2,
     learning_rate=2e-4,
     progress=gr.Progress()
 ):
     progress(0, desc="Setting up environment...")
     log = []
 
+    # Install sentencepiece if it's not already installed
+    progress(0.02, desc="Installing required dependencies...")
+    try:
+        import sentencepiece
+        log.append("SentencePiece already installed")
+    except ImportError:
+        log.append("Installing SentencePiece...")
+        try:
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
+            log.append("SentencePiece installed successfully")
+        except Exception as e:
+            log.append(f"Error installing SentencePiece: {e}")
+            # Continue anyway, we'll try other tokenizer approaches if this fails
+
     # Clean up any existing model files to save space
     if os.path.exists("./model_files"):
         try:
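A note on the new dependency block: it calls subprocess.check_call and sys.executable, so the module must import subprocess and sys at top level, and a package installed mid-process is only importable after the import caches are refreshed. A minimal sketch of that pattern using only the standard library (the helper name ensure_package is illustrative, not from this commit):

import importlib
import subprocess
import sys

def ensure_package(name: str) -> None:
    # Import if already available; otherwise pip-install into the current interpreter.
    try:
        importlib.import_module(name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", name])
        importlib.invalidate_caches()  # make the fresh install visible to import
        importlib.import_module(name)

ensure_package("sentencepiece")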
@@ -508,45 +522,66 @@ def train_model(
         torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
     )
 
-    # --- Load Tokenizer (
-    # Following the pattern from reference code
+    # --- Load Tokenizer (using public models) ---
     progress(0.3, desc="Loading tokenizer...")
 
     # Try to load a compatible tokenizer
     try:
-        # First try
+        # First try TinyLlama which is open and uses Llama tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
-            "
+            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Public model with Llama tokenizer
             padding_side="right",
             use_fast=True,
-            trust_remote_code=True
         )
-        log.append("Loaded tokenizer from
+        log.append("Loaded tokenizer from TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     except Exception as e1:
-        log.append(f"Couldn't load
+        log.append(f"Couldn't load TinyLlama tokenizer: {e1}")
         try:
-            #
+            # Try Phi-2 which is also public
             tokenizer = AutoTokenizer.from_pretrained(
-                "
+                "microsoft/phi-2",
                 padding_side="right",
                 use_fast=True
             )
-            log.append("Loaded
+            log.append("Loaded Phi-2 tokenizer as fallback")
         except Exception as e2:
-            log.append(f"Couldn't load
+            log.append(f"Couldn't load Phi-2 tokenizer: {e2}")
-
-
-
-
-
-
-
+            try:
+                # Try CodeLlama which is popular and public
+                tokenizer = AutoTokenizer.from_pretrained(
+                    "codellama/CodeLlama-7b-hf",
+                    padding_side="right"
+                )
+                log.append("Loaded CodeLlama tokenizer as fallback")
+            except Exception as e3:
+                log.append(f"Couldn't load any standard tokenizers. Using a basic tokenizer instead.")
+
+                # Create a minimal tokenizer that works with our format
+                # Assuming the vocab size is 2048 (from the RVQ token count)
+                from transformers import PreTrainedTokenizerFast
+
+                # Create a very basic tokenizer
+                tokenizer = PreTrainedTokenizerFast(
+                    tokenizer_file=None,  # No file needed
+                    bos_token="<s>",
+                    eos_token="</s>",
+                    unk_token="<unk>",
+                    pad_token="<pad>",
+                    model_max_length=2048  # Safe default value
+                )
+
+                # Add vocabulary - creating a minimal vocab for the RVQ tokens
+                vocab = {f"<token_{i}>": i for i in range(model.config.vocab_size)}
+                tokenizer.add_tokens(list(vocab.keys()))
+                log.append(f"Created basic tokenizer with {len(tokenizer)} tokens")
 
-    # Set pad token
+    # Set pad token if not already set
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token is not None else "<pad>"
+        log.append("Set pad_token to eos_token or <pad>")
 
-    log.append(f"
+    log.append(f"Tokenizer loaded with vocab size: {len(tokenizer)}")
+    log.append(f"Model vocab size: {model.config.vocab_size}")
     log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
 
     # --- QLoRA Preparation ---
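One caveat on the last fallback branch: recent transformers releases cannot construct a PreTrainedTokenizerFast from tokenizer_file=None alone; without a backend tokenizer object the constructor raises a ValueError, so the minimal-tokenizer path may itself fail at runtime. A sketch of the same idea that supplies a WordLevel backend from the tokenizers library; the 2048-entry vocabulary mirrors the RVQ token count assumed in the commit, and build_fallback_tokenizer is an illustrative name:

from tokenizers import Tokenizer, models, pre_tokenizers
from transformers import PreTrainedTokenizerFast

def build_fallback_tokenizer(vocab_size: int = 2048) -> PreTrainedTokenizerFast:
    # One entry per RVQ token, plus the four special tokens appended at the end.
    vocab = {f"<token_{i}>": i for i in range(vocab_size)}
    for offset, special in enumerate(["<s>", "</s>", "<unk>", "<pad>"]):
        vocab[special] = vocab_size + offset
    backend = Tokenizer(models.WordLevel(vocab, unk_token="<unk>"))
    backend.pre_tokenizer = pre_tokenizers.Whitespace()
    return PreTrainedTokenizerFast(
        tokenizer_object=backend,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        model_max_length=2048,
    )

If the resulting vocabulary size differs from model.config.vocab_size (the commit logs both for exactly this comparison), model.resize_token_embeddings(len(tokenizer)) is the usual way to realign the input embeddings before training.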
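As a design note, the three-deep nested try/except could also be flattened into a loop over candidate repos, which keeps the fallback order explicit and easier to extend. A sketch under the same assumptions (log stands in for the function's log list, and build_fallback_tokenizer is the illustrative helper from the sketch above):

from transformers import AutoTokenizer

CANDIDATE_REPOS = [
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "microsoft/phi-2",
    "codellama/CodeLlama-7b-hf",
]

log = []  # stands in for the function's log list
tokenizer = None
for repo in CANDIDATE_REPOS:
    try:
        tokenizer = AutoTokenizer.from_pretrained(repo, padding_side="right")
        log.append(f"Loaded tokenizer from {repo}")
        break
    except Exception as exc:  # deliberately broad, mirroring the commit
        log.append(f"Couldn't load {repo} tokenizer: {exc}")
if tokenizer is None:
    tokenizer = build_fallback_tokenizer()  # minimal WordLevel tokenizer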