Guetat Youssef committed on
Commit
28d8f8e
·
1 Parent(s): 9774f95
Files changed (1) hide show
  1. app.py +169 -18
app.py CHANGED
@@ -1,10 +1,12 @@
1
- from flask import Flask, jsonify, request
2
  import threading
3
  import time
4
  import os
5
  import tempfile
6
  import shutil
7
  import uuid
 
 
8
  from datetime import datetime, timedelta
9
 
10
  app = Flask(__name__)
@@ -23,6 +25,8 @@ class TrainingProgress:
23
  self.estimated_finish_time = None
24
  self.message = "Starting training..."
25
  self.error = None
 
 
26
 
27
  def update_progress(self, current_step, total_steps, message=""):
28
  self.current_step = current_step
@@ -47,10 +51,67 @@ class TrainingProgress:
47
  "total_steps": self.total_steps,
48
  "message": self.message,
49
  "estimated_finish_time": self.estimated_finish_time.isoformat() if self.estimated_finish_time else None,
50
- "error": self.error
 
 
51
  }
52
 
53
- def train_model_background(job_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """Background training function with progress tracking"""
55
  progress = training_jobs[job_id]
56
 
@@ -92,8 +153,7 @@ def train_model_background(job_id):
92
  progress.message = "Loading base model and tokenizer..."
93
 
94
  # === Configuration ===
95
- base_model = "microsoft/DialoGPT-small"
96
- dataset_name = "ruslanmv/ai-medical-chatbot"
97
  new_model = f"trained-model-{job_id}"
98
  max_length = 256
99
 
@@ -138,11 +198,22 @@ def train_model_background(job_id):
138
  # === Load & Prepare Dataset ===
139
  dataset = load_dataset(
140
  dataset_name,
141
- split="all",
142
  cache_dir=temp_dir,
143
  trust_remote_code=True
144
  )
145
- dataset = dataset.shuffle(seed=65).select(range(30)) # Use only 30 samples for faster testing
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  # Custom dataset class for proper handling
148
  class CustomDataset(torch.utils.data.Dataset):
@@ -171,7 +242,6 @@ def train_model_background(job_id):
171
  attention_mask = encoding['attention_mask'].squeeze()
172
 
173
  # For causal language modeling, labels are the same as input_ids
174
- # But we shift them so the model predicts the next token
175
  labels = input_ids.clone()
176
 
177
  # Set labels to -100 for padding tokens (they won't contribute to loss)
@@ -183,10 +253,12 @@ def train_model_background(job_id):
183
  'labels': labels
184
  }
185
 
186
- # Prepare texts
187
  texts = []
188
  for item in dataset:
189
- text = f"Patient: {item['Patient']}\nDoctor: {item['Doctor']}{tokenizer.eos_token}"
 
 
190
  texts.append(text)
191
 
192
  # Create custom dataset
@@ -214,7 +286,7 @@ def train_model_background(job_id):
214
  gradient_accumulation_steps=gradient_accumulation_steps,
215
  num_train_epochs=num_epochs,
216
  logging_steps=1,
217
- save_steps=15,
218
  save_total_limit=1,
219
  learning_rate=5e-5,
220
  warmup_steps=2,
@@ -272,15 +344,20 @@ def train_model_background(job_id):
272
  trainer.save_model(output_dir)
273
  tokenizer.save_pretrained(output_dir)
274
 
 
 
275
  progress.status = "completed"
276
  progress.progress = 100
277
- progress.message = f"Training completed! Model saved to {output_dir}"
278
 
279
- # Clean up temporary directory after a delay
280
  def cleanup_temp_dir():
281
- time.sleep(300) # Wait 5 minutes before cleanup
282
  try:
283
  shutil.rmtree(temp_dir)
 
 
 
284
  except:
285
  pass
286
 
@@ -300,23 +377,46 @@ def train_model_background(job_id):
300
  except:
301
  pass
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  # ============== API ROUTES ==============
304
  @app.route('/api/train', methods=['POST'])
305
  def start_training():
306
  """Start training and return job ID for tracking"""
307
  try:
 
 
 
 
308
  job_id = str(uuid.uuid4())[:8] # Short UUID
309
  progress = TrainingProgress(job_id)
310
  training_jobs[job_id] = progress
311
 
312
  # Start training in background thread
313
- training_thread = threading.Thread(target=train_model_background, args=(job_id,))
 
 
 
314
  training_thread.daemon = True
315
  training_thread.start()
316
 
317
  return jsonify({
318
  "status": "started",
319
  "job_id": job_id,
 
 
320
  "message": "Training started. Use /api/status/<job_id> to track progress."
321
  })
322
 
@@ -332,6 +432,40 @@ def get_training_status(job_id):
332
  progress = training_jobs[job_id]
333
  return jsonify(progress.to_dict())
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  @app.route('/api/jobs', methods=['GET'])
336
  def list_jobs():
337
  """List all training jobs"""
@@ -341,11 +475,28 @@ def list_jobs():
341
  @app.route('/')
342
  def home():
343
  return jsonify({
344
- "message": "Welcome to LLaMA Fine-tuning API!",
 
 
 
 
 
 
345
  "endpoints": {
346
- "POST /api/train": "Start training",
347
- "GET /api/status/<job_id>": "Get training status",
 
348
  "GET /api/jobs": "List all jobs"
 
 
 
 
 
 
 
 
 
 
349
  }
350
  })
351
 
 
1
+ from flask import Flask, jsonify, request, send_file
2
  import threading
3
  import time
4
  import os
5
  import tempfile
6
  import shutil
7
  import uuid
8
+ import zipfile
9
+ import io
10
  from datetime import datetime, timedelta
11
 
12
  app = Flask(__name__)
 
25
  self.estimated_finish_time = None
26
  self.message = "Starting training..."
27
  self.error = None
28
+ self.model_path = None
29
+ self.detected_columns = None
30
 
31
  def update_progress(self, current_step, total_steps, message=""):
32
  self.current_step = current_step
 
51
  "total_steps": self.total_steps,
52
  "message": self.message,
53
  "estimated_finish_time": self.estimated_finish_time.isoformat() if self.estimated_finish_time else None,
54
+ "error": self.error,
55
+ "model_path": self.model_path,
56
+ "detected_columns": self.detected_columns
57
  }
58
 
59
def detect_qa_columns(dataset):
    """Automatically detect the question and answer columns of a dataset.

    Args:
        dataset: A dataset-like object exposing ``column_names`` and
            integer row indexing (``dataset[0]`` returns a dict-like row).

    Returns:
        A ``(question_col, answer_col)`` tuple of column names. Both may
        refer to the same column for single-text-column datasets, and
        either may be ``None`` when nothing suitable is found.
    """
    # Name patterns commonly used for the "question" side, in priority order.
    question_patterns = [
        'question', 'prompt', 'input', 'query', 'patient', 'user', 'human',
        'instruction', 'context', 'q', 'text', 'source'
    ]

    # Name patterns commonly used for the "answer" side, in priority order.
    answer_patterns = [
        'answer', 'response', 'output', 'reply', 'doctor', 'assistant', 'ai',
        'completion', 'target', 'a', 'label', 'ground_truth'
    ]

    columns = list(dataset.column_names)

    def _find(patterns, exclude=None):
        """Return the first column matching any pattern (pattern priority wins).

        Patterns of one or two characters ('q', 'a', 'ai') must equal the
        column name exactly; longer patterns match as case-insensitive
        substrings. A plain substring test for 'a' would otherwise hijack
        almost any column name ('date', 'metadata', ...).
        """
        for pattern in patterns:
            for col in columns:
                if col == exclude:
                    continue
                name = col.lower()
                matched = (name == pattern) if len(pattern) <= 2 else (pattern in name)
                if matched:
                    return col
        return None

    question_col = _find(question_patterns)
    answer_col = _find(answer_patterns, exclude=question_col)

    # Fallback: fill in only the *missing* side from the text-bearing columns,
    # keeping any column already detected by pattern (the previous behavior of
    # overwriting both sides could discard a correct pattern match).
    if not question_col or not answer_col:
        text_columns = [
            col for col in columns
            if isinstance(dataset[0][col], str) and dataset[0][col].strip()
        ]
        if not question_col:
            candidates = [c for c in text_columns if c != answer_col]
            if candidates:
                question_col = candidates[0]
        if not answer_col:
            candidates = [c for c in text_columns if c != question_col]
            if candidates:
                answer_col = candidates[0]
            elif question_col in text_columns:
                # Single text column: use it for both (self-supervised).
                answer_col = question_col

    return question_col, answer_col
113
+
114
+ def train_model_background(job_id, dataset_name, base_model_name=None):
115
  """Background training function with progress tracking"""
116
  progress = training_jobs[job_id]
117
 
 
153
  progress.message = "Loading base model and tokenizer..."
154
 
155
  # === Configuration ===
156
+ base_model = base_model_name or "microsoft/DialoGPT-small"
 
157
  new_model = f"trained-model-{job_id}"
158
  max_length = 256
159
 
 
198
  # === Load & Prepare Dataset ===
199
  dataset = load_dataset(
200
  dataset_name,
201
+ split="train" if "train" in load_dataset(dataset_name, cache_dir=temp_dir).keys() else "all",
202
  cache_dir=temp_dir,
203
  trust_remote_code=True
204
  )
205
+
206
+ # Automatically detect question and answer columns
207
+ question_col, answer_col = detect_qa_columns(dataset)
208
+
209
+ if not question_col or not answer_col:
210
+ raise ValueError("Could not automatically detect question and answer columns in the dataset")
211
+
212
+ progress.detected_columns = {"question": question_col, "answer": answer_col}
213
+ progress.message = f"Detected columns - Question: {question_col}, Answer: {answer_col}"
214
+
215
+ # Use subset for faster testing (can be made configurable)
216
+ dataset = dataset.shuffle(seed=65).select(range(min(100, len(dataset))))
217
 
218
  # Custom dataset class for proper handling
219
  class CustomDataset(torch.utils.data.Dataset):
 
242
  attention_mask = encoding['attention_mask'].squeeze()
243
 
244
  # For causal language modeling, labels are the same as input_ids
 
245
  labels = input_ids.clone()
246
 
247
  # Set labels to -100 for padding tokens (they won't contribute to loss)
 
253
  'labels': labels
254
  }
255
 
256
+ # Prepare texts using detected columns
257
  texts = []
258
  for item in dataset:
259
+ question = str(item[question_col]).strip()
260
+ answer = str(item[answer_col]).strip()
261
+ text = f"Question: {question}\nAnswer: {answer}{tokenizer.eos_token}"
262
  texts.append(text)
263
 
264
  # Create custom dataset
 
286
  gradient_accumulation_steps=gradient_accumulation_steps,
287
  num_train_epochs=num_epochs,
288
  logging_steps=1,
289
+ save_steps=max(1, total_steps // 2),
290
  save_total_limit=1,
291
  learning_rate=5e-5,
292
  warmup_steps=2,
 
344
  trainer.save_model(output_dir)
345
  tokenizer.save_pretrained(output_dir)
346
 
347
+ # Save model info
348
+ progress.model_path = output_dir
349
  progress.status = "completed"
350
  progress.progress = 100
351
+ progress.message = f"Training completed! Model ready for download."
352
 
353
+ # Keep the temp directory for download (cleanup after 1 hour)
354
  def cleanup_temp_dir():
355
+ time.sleep(3600) # Wait 1 hour before cleanup
356
  try:
357
  shutil.rmtree(temp_dir)
358
+ # Remove from training_jobs after cleanup
359
+ if job_id in training_jobs:
360
+ del training_jobs[job_id]
361
  except:
362
  pass
363
 
 
377
  except:
378
  pass
379
 
380
def create_model_zip(model_path, job_id):
    """Bundle every file under *model_path* into an in-memory zip archive.

    Args:
        model_path: Directory holding the saved model files.
        job_id: Training job identifier (kept for interface compatibility;
            not used when building the archive).

    Returns:
        io.BytesIO: Seekable buffer positioned at the start of the zip data.
    """
    buffer = io.BytesIO()
    archive = zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED)
    try:
        for directory, _subdirs, filenames in os.walk(model_path):
            for filename in filenames:
                absolute = os.path.join(directory, filename)
                # Store entries relative to the model root so the archive
                # unpacks without the temporary-directory prefix.
                archive.write(absolute, os.path.relpath(absolute, model_path))
    finally:
        archive.close()
    buffer.seek(0)
    return buffer
393
+
394
  # ============== API ROUTES ==============
395
  @app.route('/api/train', methods=['POST'])
396
  def start_training():
397
  """Start training and return job ID for tracking"""
398
  try:
399
+ data = request.get_json() if request.is_json else {}
400
+ dataset_name = data.get('dataset_name', 'ruslanmv/ai-medical-chatbot')
401
+ base_model_name = data.get('base_model', 'microsoft/DialoGPT-small')
402
+
403
  job_id = str(uuid.uuid4())[:8] # Short UUID
404
  progress = TrainingProgress(job_id)
405
  training_jobs[job_id] = progress
406
 
407
  # Start training in background thread
408
+ training_thread = threading.Thread(
409
+ target=train_model_background,
410
+ args=(job_id, dataset_name, base_model_name)
411
+ )
412
  training_thread.daemon = True
413
  training_thread.start()
414
 
415
  return jsonify({
416
  "status": "started",
417
  "job_id": job_id,
418
+ "dataset_name": dataset_name,
419
+ "base_model": base_model_name,
420
  "message": "Training started. Use /api/status/<job_id> to track progress."
421
  })
422
 
 
432
  progress = training_jobs[job_id]
433
  return jsonify(progress.to_dict())
434
 
435
@app.route('/api/download/<job_id>', methods=['GET'])
def download_model(job_id):
    """Download the trained model as a zip file"""
    # Guard clauses: unknown job, unfinished job, or already-cleaned files.
    job = training_jobs.get(job_id)
    if job is None:
        return jsonify({"status": "error", "message": "Job not found"}), 404

    if job.status != "completed":
        return jsonify({
            "status": "error",
            "message": f"Model not ready for download. Current status: {job.status}"
        }), 400

    if not job.model_path or not os.path.exists(job.model_path):
        return jsonify({
            "status": "error",
            "message": "Model files not found. They may have been cleaned up."
        }), 404

    try:
        # Build the archive in memory and stream it straight to the client.
        return send_file(
            create_model_zip(job.model_path, job_id),
            as_attachment=True,
            download_name=f"trained_model_{job_id}.zip",
            mimetype='application/zip'
        )
    except Exception as e:
        return jsonify({"status": "error", "message": f"Download failed: {str(e)}"}), 500
468
+
469
  @app.route('/api/jobs', methods=['GET'])
470
  def list_jobs():
471
  """List all training jobs"""
 
475
@app.route('/')
def home():
    """Landing page: advertise the API's features, endpoints and example usage."""
    # Static description payload; assembled once per request and serialized.
    api_description = {
        "message": "Welcome to Enhanced LLaMA Fine-tuning API!",
        "features": [
            "Automatic question/answer column detection",
            "Configurable base model and dataset",
            "Local model download",
            "Progress tracking with ETA"
        ],
        "endpoints": {
            "POST /api/train": "Start training (accepts dataset_name and base_model in JSON)",
            "GET /api/status/<job_id>": "Get training status and detected columns",
            "GET /api/download/<job_id>": "Download trained model as zip",
            "GET /api/jobs": "List all jobs"
        },
        "usage_example": {
            "start_training": {
                "method": "POST",
                "url": "/api/train",
                "body": {
                    "dataset_name": "your-dataset-name",
                    "base_model": "microsoft/DialoGPT-small"
                }
            }
        }
    }
    return jsonify(api_description)
502