Guetat Youssef committed on
Commit aba82e3 · 1 Parent(s): 10b3fe6
Files changed (2)
  1. app.py +53 -35
  2. requirements.txt +5 -5
app.py CHANGED
@@ -72,19 +72,17 @@ def train_model_background(job_id):
     from datasets import load_dataset
     from huggingface_hub import login
     from transformers import (
-        AutoConfig,
         AutoModelForCausalLM,
         AutoTokenizer,
-        BitsAndBytesConfig,
         TrainingArguments,
-        logging,
-        TrainerCallback
+        Trainer,
+        TrainerCallback,
+        DataCollatorForLanguageModeling
     )
     from peft import (
         LoraConfig,
         get_peft_model,
     )
-    from trl import SFTTrainer, setup_chat_format
 
     # === Authentication ===
     hf_token = os.getenv('HF_TOKEN')
@@ -99,11 +97,11 @@ def train_model_background(job_id):
     dataset_name = "ruslanmv/ai-medical-chatbot"
     new_model = f"trained-model-{job_id}"
 
-    # === Load Model and Tokenizer (without quantization for simplicity) ===
+    # === Load Model and Tokenizer ===
     model = AutoModelForCausalLM.from_pretrained(
         base_model,
         cache_dir=temp_dir,
-        torch_dtype=torch.float32, # Use float32 for compatibility
+        torch_dtype=torch.float32,
         device_map="auto" if torch.cuda.is_available() else "cpu",
         trust_remote_code=True
     )
@@ -121,9 +119,9 @@ def train_model_background(job_id):
     progress.status = "preparing_model"
     progress.message = "Setting up LoRA configuration..."
 
-    # === LoRA Config (simplified) ===
+    # === LoRA Config ===
     peft_config = LoraConfig(
-        r=8, # Smaller rank
+        r=8,
         lora_alpha=16,
         lora_dropout=0.1,
         bias="none",
@@ -141,19 +139,45 @@ def train_model_background(job_id):
         cache_dir=temp_dir,
         trust_remote_code=True
     )
-    dataset = dataset.shuffle(seed=65).select(range(100)) # Use only 100 samples for testing
+    dataset = dataset.shuffle(seed=65).select(range(50)) # Use only 50 samples for faster testing
 
-    def format_chat_template(row):
-        # Simple formatting without chat template
-        text = f"Patient: {row['Patient']}\nDoctor: {row['Doctor']}"
-        return {"text": text}
+    def tokenize_function(examples):
+        # Format the text
+        texts = []
+        for i in range(len(examples['Patient'])):
+            text = f"Patient: {examples['Patient'][i]}\nDoctor: {examples['Doctor'][i]}{tokenizer.eos_token}"
+            texts.append(text)
+
+        # Tokenize
+        tokenized = tokenizer(
+            texts,
+            truncation=True,
+            padding=False,
+            max_length=256,
+            return_tensors=None
+        )
+
+        # For causal LM, labels are the same as input_ids
+        tokenized["labels"] = tokenized["input_ids"].copy()
+        return tokenized
 
-    dataset = dataset.map(format_chat_template, num_proc=1)
-    dataset = dataset.train_test_split(test_size=0.1)
+    # Tokenize dataset
+    tokenized_dataset = dataset.map(
+        tokenize_function,
+        batched=True,
+        remove_columns=dataset.column_names,
+        desc="Tokenizing dataset"
+    )
+
+    # Data collator for language modeling
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False, # We're doing causal LM, not masked LM
+    )
 
     # Calculate total training steps
-    train_size = len(dataset["train"])
-    batch_size = 1
+    train_size = len(tokenized_dataset)
+    batch_size = 2
     gradient_accumulation_steps = 1
     num_epochs = 1
 
@@ -171,25 +195,20 @@ def train_model_background(job_id):
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=batch_size,
-        per_device_eval_batch_size=1,
         gradient_accumulation_steps=gradient_accumulation_steps,
-        optim="adamw_torch", # Use standard optimizer
         num_train_epochs=num_epochs,
-        eval_steps=0.5,
         logging_steps=1,
+        save_steps=20,
+        save_total_limit=1,
+        learning_rate=5e-5,
         warmup_steps=5,
         logging_strategy="steps",
-        learning_rate=5e-5,
+        save_strategy="steps",
         fp16=False,
         bf16=False,
-        group_by_length=True,
-        save_steps=10,
-        save_total_limit=1,
-        report_to=None,
         dataloader_num_workers=0,
         remove_unused_columns=False,
-        load_best_model_at_end=False,
-        # Remove evaluation_strategy parameter - not supported in this version
+        report_to=None,
     )
 
     # Custom callback to track progress
@@ -200,8 +219,8 @@ def train_model_background(job_id):
 
         def on_log(self, args, state, control, model=None, logs=None, **kwargs):
             current_time = time.time()
-            # Update every 10 seconds or on significant step changes
-            if current_time - self.last_update >= 10 or state.global_step % 5 == 0:
+            # Update every 5 seconds or on significant step changes
+            if current_time - self.last_update >= 5 or state.global_step % 2 == 0:
                 self.progress_tracker.update_progress(
                     state.global_step,
                     state.max_steps,
@@ -218,19 +237,18 @@ def train_model_background(job_id):
             self.progress_tracker.message = "Training complete, saving model..."
 
     # === Trainer Initialization ===
-    trainer = SFTTrainer(
+    trainer = Trainer(
         model=model,
-        train_dataset=dataset["train"],
-        peft_config=peft_config,
         args=training_args,
+        train_dataset=tokenized_dataset,
+        data_collator=data_collator,
         callbacks=[ProgressCallback(progress)],
-        tokenizer=tokenizer,
-        max_seq_length=256, # Shorter sequences
     )
 
     # === Train & Save ===
     trainer.train()
     trainer.save_model(output_dir)
+    tokenizer.save_pretrained(output_dir)
 
     progress.status = "completed"
     progress.progress = 100
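
Note (illustration only, not part of the commit): the change above swaps TRL's SFTTrainer for the plain transformers Trainer plus DataCollatorForLanguageModeling. Below is a minimal standalone sketch of that pattern; the tiny base model name is a placeholder, the dataset name and LoRA values come from the diff, and label creation is left to the collator (with mlm=False it copies input_ids into labels and masks padding), a simpler variant of what app.py does.

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

base_model = "sshleifer/tiny-gpt2"  # placeholder; any causal LM from the Hub works
dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="train").shuffle(seed=65).select(range(50))

tokenizer = AutoTokenizer.from_pretrained(base_model)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2-style tokenizers ship without a pad token

def tokenize_function(examples):
    # Same "Patient: ... / Doctor: ..." formatting used in app.py
    texts = [
        f"Patient: {p}\nDoctor: {d}{tokenizer.eos_token}"
        for p, d in zip(examples["Patient"], examples["Doctor"])
    ]
    return tokenizer(texts, truncation=True, max_length=256)

tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)

# Attach LoRA adapters with the same hyperparameters as the diff
model = AutoModelForCausalLM.from_pretrained(base_model)
model = get_peft_model(
    model,
    LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"),
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="lora-out",
        per_device_train_batch_size=2,
        num_train_epochs=1,
        logging_steps=1,
        report_to="none",
    ),
    train_dataset=tokenized_dataset,
    # With mlm=False the collator pads each batch and builds causal-LM labels from input_ids
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()
trainer.save_model("lora-out")
tokenizer.save_pretrained("lora-out")
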
requirements.txt CHANGED
@@ -1,9 +1,9 @@
 flask==2.3.3
-transformers>=4.36.0,<4.45.0
-datasets>=2.14.0
-accelerate>=0.24.0
-peft>=0.6.0,<0.8.0
-trl>=0.7.0
+transformers==4.44.2
+datasets==2.20.0
+accelerate==0.33.0
+peft==0.12.0
+trl==0.9.6
 bitsandbytes
 torch>=2.0.0
 torchvision