Guetat Youssef committed
Commit 9774f95 · 1 Parent(s): aba82e3
Files changed (1):
  1. app.py +65 -43
app.py CHANGED
@@ -69,7 +69,7 @@ def train_model_background(job_id):
 
     # Import heavy libraries after setting cache paths
     import torch
-    from datasets import load_dataset
+    from datasets import load_dataset, Dataset
     from huggingface_hub import login
     from transformers import (
         AutoModelForCausalLM,
@@ -77,7 +77,6 @@ def train_model_background(job_id):
         TrainingArguments,
         Trainer,
         TrainerCallback,
-        DataCollatorForLanguageModeling
     )
     from peft import (
         LoraConfig,
@@ -93,9 +92,10 @@ def train_model_background(job_id):
     progress.message = "Loading base model and tokenizer..."
 
     # === Configuration ===
-    base_model = "microsoft/DialoGPT-small"  # Smaller model for testing
+    base_model = "microsoft/DialoGPT-small"
     dataset_name = "ruslanmv/ai-medical-chatbot"
     new_model = f"trained-model-{job_id}"
+    max_length = 256
 
     # === Load Model and Tokenizer ===
     model = AutoModelForCausalLM.from_pretrained(
@@ -115,6 +115,9 @@ def train_model_background(job_id):
     # Add padding token if not present
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
+
+    # Resize token embeddings if needed
+    model.resize_token_embeddings(len(tokenizer))
 
     progress.status = "preparing_model"
     progress.message = "Setting up LoRA configuration..."
@@ -139,49 +142,62 @@ def train_model_background(job_id):
         cache_dir=temp_dir,
         trust_remote_code=True
     )
-    dataset = dataset.shuffle(seed=65).select(range(50))  # Use only 50 samples for faster testing
+    dataset = dataset.shuffle(seed=65).select(range(30))  # Use only 30 samples for faster testing
 
-    def tokenize_function(examples):
-        # Format the text
-        texts = []
-        for i in range(len(examples['Patient'])):
-            text = f"Patient: {examples['Patient'][i]}\nDoctor: {examples['Doctor'][i]}{tokenizer.eos_token}"
-            texts.append(text)
-
-        # Tokenize
-        tokenized = tokenizer(
-            texts,
-            truncation=True,
-            padding=False,
-            max_length=256,
-            return_tensors=None
-        )
-
-        # For causal LM, labels are the same as input_ids
-        tokenized["labels"] = tokenized["input_ids"].copy()
-        return tokenized
+    # Custom dataset class for proper handling
+    class CustomDataset(torch.utils.data.Dataset):
+        def __init__(self, texts, tokenizer, max_length):
+            self.texts = texts
+            self.tokenizer = tokenizer
+            self.max_length = max_length
 
-    # Tokenize dataset
-    tokenized_dataset = dataset.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=dataset.column_names,
-        desc="Tokenizing dataset"
-    )
+        def __len__(self):
+            return len(self.texts)
 
-    # Data collator for language modeling
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False,  # We're doing causal LM, not masked LM
-    )
+        def __getitem__(self, idx):
+            text = self.texts[idx]
+
+            # Tokenize the text
+            encoding = self.tokenizer(
+                text,
+                truncation=True,
+                padding='max_length',
+                max_length=self.max_length,
+                return_tensors='pt'
+            )
+
+            # Flatten the tensors (remove batch dimension)
+            input_ids = encoding['input_ids'].squeeze()
+            attention_mask = encoding['attention_mask'].squeeze()
+
+            # For causal language modeling, labels start as a copy of input_ids;
+            # the model shifts them internally so each position predicts the next token
+            labels = input_ids.clone()
+
+            # Set labels to -100 for padding tokens (they won't contribute to loss)
+            labels[attention_mask == 0] = -100
+
+            return {
+                'input_ids': input_ids,
+                'attention_mask': attention_mask,
+                'labels': labels
+            }
+
+    # Prepare texts
+    texts = []
+    for item in dataset:
+        text = f"Patient: {item['Patient']}\nDoctor: {item['Doctor']}{tokenizer.eos_token}"
+        texts.append(text)
+
+    # Create custom dataset
+    train_dataset = CustomDataset(texts, tokenizer, max_length)
 
     # Calculate total training steps
-    train_size = len(tokenized_dataset)
     batch_size = 2
     gradient_accumulation_steps = 1
    num_epochs = 1
 
-    steps_per_epoch = train_size // (batch_size * gradient_accumulation_steps)
+    steps_per_epoch = len(train_dataset) // (batch_size * gradient_accumulation_steps)
     total_steps = steps_per_epoch * num_epochs
 
     progress.total_steps = total_steps
@@ -198,10 +214,10 @@ def train_model_background(job_id):
         gradient_accumulation_steps=gradient_accumulation_steps,
         num_train_epochs=num_epochs,
         logging_steps=1,
-        save_steps=20,
+        save_steps=15,
         save_total_limit=1,
         learning_rate=5e-5,
-        warmup_steps=5,
+        warmup_steps=2,
         logging_strategy="steps",
         save_strategy="steps",
         fp16=False,
@@ -209,6 +225,7 @@ def train_model_background(job_id):
         dataloader_num_workers=0,
         remove_unused_columns=False,
         report_to=None,
+        prediction_loss_only=True,
     )
 
     # Custom callback to track progress
@@ -219,14 +236,19 @@ def train_model_background(job_id):
 
        def on_log(self, args, state, control, model=None, logs=None, **kwargs):
            current_time = time.time()
-            # Update every 5 seconds or on significant step changes
-            if current_time - self.last_update >= 5 or state.global_step % 2 == 0:
+            # Update every 3 seconds
+            if current_time - self.last_update >= 3:
                self.progress_tracker.update_progress(
                    state.global_step,
                    state.max_steps,
                    f"Training step {state.global_step}/{state.max_steps}"
                )
                self.last_update = current_time
+
+            # Log training metrics if available
+            if logs:
+                loss = logs.get('train_loss', logs.get('loss', 'N/A'))
+                self.progress_tracker.message = f"Step {state.global_step}/{state.max_steps}, Loss: {loss}"
 
        def on_train_begin(self, args, state, control, **kwargs):
            self.progress_tracker.status = "training"
@@ -240,9 +262,9 @@ def train_model_background(job_id):
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=tokenized_dataset,
-        data_collator=data_collator,
+        train_dataset=train_dataset,
         callbacks=[ProgressCallback(progress)],
+        tokenizer=tokenizer,
     )
 
     # === Train & Save ===
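Note (not part of the commit): the core behavioral change here is replacing DataCollatorForLanguageModeling with explicit label masking in CustomDataset, where padded positions get label -100 so they are skipped by the loss (the default ignore_index of torch.nn.CrossEntropyLoss, which the Hugging Face causal-LM head uses). The sketch below is a minimal standalone check of that scheme under the same assumptions as the diff (microsoft/DialoGPT-small tokenizer, max_length=256, pad token aliased to EOS); the Patient/Doctor text is made up for illustration.

# Standalone sanity check of the -100 label masking used by CustomDataset.__getitem__
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as in app.py

max_length = 256
text = f"Patient: I have a mild headache.\nDoctor: Rest and stay hydrated.{tokenizer.eos_token}"

encoding = tokenizer(
    text,
    truncation=True,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
)
input_ids = encoding["input_ids"].squeeze()
attention_mask = encoding["attention_mask"].squeeze()

labels = input_ids.clone()
labels[attention_mask == 0] = -100  # mask padding exactly as __getitem__ does

print(input_ids.shape, attention_mask.shape, labels.shape)  # all torch.Size([256])
print("real tokens:", int(attention_mask.sum()), "| positions ignored by the loss:", int((labels == -100).sum()))

Because the pad token is aliased to EOS, only the trailing padding (attention mask 0) is masked; the genuine EOS that ends the Doctor turn keeps its label and is still trained on.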