Spaces:

Shilpaj
/

MnistStudio

Sleeping

App Files Community

Shilpaj committed on Nov 17, 2024

Commit

61f0070

1 Parent(s): c0a458a

Feat: Complete single model training and inference

Browse files

Files changed (5) hide show

app.py +121 -13
scripts/training/train.py +36 -25
static/js/inference.js +10 -0
templates/train_compare.html +20 -2
templates/train_single.html +34 -1

app.py CHANGED Viewed

@@ -8,11 +8,11 @@ import uvicorn
 import torch
 from scripts.model import Net
 from scripts.training.train import train
-import json
-import os
 from pathlib import Path
-import asyncio
 from fastapi import BackgroundTasks
 app = FastAPI()
@@ -83,10 +83,8 @@ async def train_model(config: TrainingConfig, background_tasks: BackgroundTasks)
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     try:
-        # Wait for configuration from client
         config_data = await websocket.receive_json()
-        # Create model instance with the configuration
         model = Net(
             kernels=[
                 config_data['block1'],
@@ -95,7 +93,6 @@ async def websocket_endpoint(websocket: WebSocket):
             ]
         )
-        # Create config object
         from scripts.training.config import NetworkConfig
         config = NetworkConfig()
         config.update(
@@ -104,14 +101,14 @@ async def websocket_endpoint(websocket: WebSocket):
             block3=config_data['block3'],
             optimizer=config_data['optimizer'],
             batch_size=config_data['batch_size'],
-            epochs=1
         )
         print(f"Starting training with config: {config_data}")
-        # Start training with websocket for real-time updates
         try:
-            await train(model, config, websocket)
             await websocket.send_json({
                 "type": "training_complete",
                 "data": {
@@ -134,6 +131,66 @@ async def websocket_endpoint(websocket: WebSocket):
     finally:
         print("WebSocket connection closed")
 # @app.post("/api/train_single")
 # async def train_single_model(config: TrainingConfig):
 #     try:
@@ -165,6 +222,36 @@ async def train_compare_models(config: ComparisonConfig):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/api/inference")
 async def perform_inference(data: dict):
     try:
@@ -175,10 +262,23 @@ async def perform_inference(data: dict):
         model_path = Path("scripts/training/models") / f"{model_name}.pth"
         if not model_path.exists():
             raise HTTPException(status_code=404, detail=f"Model not found: {model_path}")
-        # Load model and perform inference
-        model = Net()
-        model.load_state_dict(torch.load(str(model_path), map_location=torch.device('cpu')))
         model.eval()
         # Process image data and get prediction
@@ -216,7 +316,15 @@ async def perform_inference(data: dict):
                 output = model(image_tensor)
                 prediction = output.argmax(dim=1).item()
-            return {"prediction": prediction}
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")

 import torch
 from scripts.model import Net
 from scripts.training.train import train
 from pathlib import Path
 from fastapi import BackgroundTasks
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning, module="torchvision.transforms")
 app = FastAPI()
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     try:
         config_data = await websocket.receive_json()
         model = Net(
             kernels=[
                 config_data['block1'],
             ]
         )
         from scripts.training.config import NetworkConfig
         config = NetworkConfig()
         config.update(
             block3=config_data['block3'],
             optimizer=config_data['optimizer'],
             batch_size=config_data['batch_size'],
+            epochs=config_data['epochs']
         )
         print(f"Starting training with config: {config_data}")
         try:
+            # Pass "single" as model_type for single model training
+            await train(model, config, websocket, model_type="single")
             await websocket.send_json({
                 "type": "training_complete",
                 "data": {
     finally:
         print("WebSocket connection closed")
+@app.websocket("/ws/compare")
+async def websocket_compare_endpoint(websocket: WebSocket):
+    await websocket.accept()
+    try:
+        data = await websocket.receive_json()
+        if data.get("type") == "start_comparison":
+            from scripts.training.config import NetworkConfig
+            # Create and train both models
+            model1_config = NetworkConfig()
+            model2_config = NetworkConfig()
+            # Update configs with received data
+            model1_config.update(**data["model1"])
+            model2_config.update(**data["model2"])
+            # Create models with respective configurations
+            model1 = Net(
+                kernels=[
+                    model1_config.block1,
+                    model1_config.block2,
+                    model1_config.block3
+                ]
+            )
+            model2 = Net(
+                kernels=[
+                    model2_config.block1,
+                    model2_config.block2,
+                    model2_config.block3
+                ]
+            )
+            # Train both models with appropriate model_type
+            try:
+                await train(model1, model1_config, websocket, model_type="model_1")
+                await train(model2, model2_config, websocket, model_type="model_2")
+                await websocket.send_json({
+                    "type": "comparison_complete",
+                    "data": {
+                        "message": "Training completed successfully!"
+                    }
+                })
+            except Exception as e:
+                print(f"Training error: {str(e)}")
+                await websocket.send_json({
+                    "type": "training_error",
+                    "data": {
+                        "message": f"Training failed: {str(e)}"
+                    }
+                })
+    except WebSocketDisconnect:
+        print("WebSocket disconnected")
+    except Exception as e:
+        print(f"WebSocket error: {str(e)}")
+    finally:
+        print("WebSocket connection closed")
 # @app.post("/api/train_single")
 # async def train_single_model(config: TrainingConfig):
 #     try:
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+def parse_model_filename(filename):
+    """Extract configuration from model filename"""
+    # Example filename: single_arch_32_64_128_opt_adam_batch_64_20240322_123456.pth
+    try:
+        parts = filename.split('_')
+        # Find architecture values
+        arch_index = parts.index('arch')
+        block1 = int(parts[arch_index + 1])
+        block2 = int(parts[arch_index + 2])
+        block3 = int(parts[arch_index + 3])
+        # Find optimizer
+        opt_index = parts.index('opt')
+        optimizer = parts[opt_index + 1]
+        # Find batch size
+        batch_index = parts.index('batch')
+        batch_size = int(parts[batch_index + 1])
+        return {
+            'block1': block1,
+            'block2': block2,
+            'block3': block3,
+            'optimizer': optimizer,
+            'batch_size': batch_size
+        }
+    except Exception as e:
+        print(f"Error parsing model filename: {e}")
+        return None
 @app.post("/api/inference")
 async def perform_inference(data: dict):
     try:
         model_path = Path("scripts/training/models") / f"{model_name}.pth"
         if not model_path.exists():
             raise HTTPException(status_code=404, detail=f"Model not found: {model_path}")
+        # Parse model configuration from filename
+        config = parse_model_filename(model_name)
+        if not config:
+            raise HTTPException(status_code=500, detail="Could not parse model configuration")
+        # Create model with the correct configuration
+        model = Net(
+            kernels=[
+                config['block1'],
+                config['block2'],
+                config['block3']
+            ]
+        )
+        # Load model weights
+        model.load_state_dict(torch.load(str(model_path), map_location=torch.device('cpu'), weights_only=True))
         model.eval()
         # Process image data and get prediction
                 output = model(image_tensor)
                 prediction = output.argmax(dim=1).item()
+            # Add configuration info to response
+            return {
+                "prediction": prediction,
+                "model_config": {
+                    "architecture": f"{config['block1']}-{config['block2']}-{config['block3']}",
+                    "optimizer": config['optimizer'],
+                    "batch_size": config['batch_size']
+                }
+            }
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")

scripts/training/train.py CHANGED Viewed

@@ -13,6 +13,17 @@ import shutil
 from tqdm import tqdm
 import asyncio
 def download_and_extract_mnist_data():
     """Download and extract MNIST dataset from a reliable mirror"""
     base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"
@@ -107,7 +118,7 @@ def validate(model, test_loader, criterion, device):
     return val_loss, val_acc
-async def train(model, config, websocket=None):
     print("\nStarting training...")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")
@@ -140,27 +151,25 @@ async def train(model, config, websocket=None):
     print(f"Dataset loaded. Training samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
-    # Initialize optimizer based on config
-    if config.optimizer.lower() == 'adam':
-        optimizer = optim.Adam(model.parameters())
-    else:
-        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
-    criterion = nn.CrossEntropyLoss()
     print("\nTraining Configuration:")
     print(f"Optimizer: {config.optimizer}")
     print(f"Batch Size: {config.batch_size}")
     print(f"Network Architecture: {config.block1}-{config.block2}-{config.block3}")
     print("\nStarting training loop...")
     best_val_acc = 0
-    history = {
-        'train_loss': [],
-        'train_acc': [],
-        'val_loss': [],
-        'val_acc': []
-    }
     try:
         for epoch in range(config.epochs):
@@ -204,10 +213,11 @@ async def train(model, config, websocket=None):
                 # Send training update through websocket
                 if websocket:
                     try:
                         await websocket.send_json({
                             'type': 'training_update',
                             'data': {
-                                'step': batch_idx + epoch * len(train_loader),
                                 'train_loss': current_loss,
                                 'train_acc': current_acc
                             }
@@ -215,10 +225,6 @@ async def train(model, config, websocket=None):
                     except Exception as e:
                         print(f"Error sending websocket update: {e}")
-            # Calculate epoch metrics
-            train_loss = total_loss / len(train_loader)
-            train_acc = 100. * correct / total
             # Validation phase
             model.eval()
             val_loss = 0
@@ -240,7 +246,7 @@ async def train(model, config, websocket=None):
             # Print epoch results
             print(f"\nEpoch {epoch+1}/{config.epochs} Results:")
-            print(f"Training Loss: {train_loss:.4f} | Training Accuracy: {train_acc:.2f}%")
             print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.2f}%")
             # Send validation update through websocket
@@ -257,12 +263,17 @@ async def train(model, config, websocket=None):
                 except Exception as e:
                     print(f"Error sending websocket update: {e}")
-            # Save best model
             if val_acc > best_val_acc:
                 best_val_acc = val_acc
                 print(f"\nNew best validation accuracy: {val_acc:.2f}%")
-                print("Saving model...")
-                torch.save(model.state_dict(), 'best_model.pth')
     except Exception as e:
         print(f"\nError during training: {e}")
@@ -270,4 +281,4 @@ async def train(model, config, websocket=None):
     print("\nTraining completed!")
     print(f"Best validation accuracy: {best_val_acc:.2f}%")
-    return history

 from tqdm import tqdm
 import asyncio
+def generate_model_filename(config, model_type="single"):
+    """Generate a filename based on model configuration
+    model_type can be "single", "model_1", or "model_2"
+    """
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    arch = f"{config.block1}_{config.block2}_{config.block3}"
+    opt = config.optimizer.lower()
+    batch = str(config.batch_size)
+    return f"{model_type}_arch_{arch}_opt_{opt}_batch_{batch}_{timestamp}.pth"
 def download_and_extract_mnist_data():
     """Download and extract MNIST dataset from a reliable mirror"""
     base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"
     return val_loss, val_acc
+async def train(model, config, websocket=None, model_type="single"):
     print("\nStarting training...")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")
     print(f"Dataset loaded. Training samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
     print("\nTraining Configuration:")
+    print(f"Epochs: {config.epochs}")
     print(f"Optimizer: {config.optimizer}")
     print(f"Batch Size: {config.batch_size}")
     print(f"Network Architecture: {config.block1}-{config.block2}-{config.block3}")
     print("\nStarting training loop...")
     best_val_acc = 0
+    criterion = nn.CrossEntropyLoss()
+    # Initialize optimizer based on config
+    if config.optimizer.lower() == 'adam':
+        optimizer = optim.Adam(model.parameters())
+    else:
+        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
+    # Create models directory if it doesn't exist
+    models_dir = Path("scripts/training/models")
+    models_dir.mkdir(parents=True, exist_ok=True)
     try:
         for epoch in range(config.epochs):
                 # Send training update through websocket
                 if websocket:
                     try:
+                        step = batch_idx + epoch * len(train_loader)
                         await websocket.send_json({
                             'type': 'training_update',
                             'data': {
+                                'step': step,
                                 'train_loss': current_loss,
                                 'train_acc': current_acc
                             }
                     except Exception as e:
                         print(f"Error sending websocket update: {e}")
             # Validation phase
             model.eval()
             val_loss = 0
             # Print epoch results
             print(f"\nEpoch {epoch+1}/{config.epochs} Results:")
+            print(f"Training Loss: {current_loss:.4f} | Training Accuracy: {current_acc:.2f}%")
             print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.2f}%")
             # Send validation update through websocket
                 except Exception as e:
                     print(f"Error sending websocket update: {e}")
+            # Save best model with configuration in filename
             if val_acc > best_val_acc:
                 best_val_acc = val_acc
                 print(f"\nNew best validation accuracy: {val_acc:.2f}%")
+                # Generate filename with configuration
+                model_filename = generate_model_filename(config, model_type)
+                model_path = models_dir / model_filename
+                print(f"Saving model as: {model_filename}")
+                torch.save(model.state_dict(), model_path)
     except Exception as e:
         print(f"\nError during training: {e}")
     print("\nTraining completed!")
     print(f"Best validation accuracy: {best_val_acc:.2f}%")
+    return None

static/js/inference.js CHANGED Viewed

@@ -46,8 +46,18 @@ function setupCanvas() {
 }
 function clearCanvas() {
     ctx.fillStyle = "white";
     ctx.fillRect(0, 0, canvas.width, canvas.height);
 }
 async function predict() {

 }
 function clearCanvas() {
+    const canvas = document.getElementById('drawing-canvas');
+    const ctx = canvas.getContext('2d');
+    // Clear the canvas
     ctx.fillStyle = "white";
     ctx.fillRect(0, 0, canvas.width, canvas.height);
+    ctx.beginPath();
+    // Hide and clear prediction result
+    const resultDiv = document.getElementById('prediction-result');
+    resultDiv.classList.add('hidden');
+    resultDiv.innerHTML = '';
 }
 async function predict() {

templates/train_compare.html CHANGED Viewed

@@ -67,6 +67,14 @@
                             <option value="128">128</option>
                         </select>
                     </div>
                 </div>
             </div>
@@ -126,6 +134,14 @@
                             <option value="128">128</option>
                         </select>
                     </div>
                 </div>
             </div>
         </div>
@@ -343,7 +359,8 @@
                 block2: parseInt(document.getElementById('model1_block2').value),
                 block3: parseInt(document.getElementById('model1_block3').value),
                 optimizer: document.getElementById('model1_optimizer').value,
-                batch_size: parseInt(document.getElementById('model1_batch_size').value)
             };
             const model2Config = {
@@ -351,7 +368,8 @@
                 block2: parseInt(document.getElementById('model2_block2').value),
                 block3: parseInt(document.getElementById('model2_block3').value),
                 optimizer: document.getElementById('model2_optimizer').value,
-                batch_size: parseInt(document.getElementById('model2_batch_size').value)
             };
             // Setup WebSocket connection

                             <option value="128">128</option>
                         </select>
                     </div>
+                    <div class="config-item">
+                        <label for="model1_epochs">Epochs:</label>
+                        <select id="model1_epochs" name="epochs">
+                            <option value="1">1</option>
+                            <option value="2">2</option>
+                            <option value="3">3</option>
+                        </select>
+                    </div>
                 </div>
             </div>
                             <option value="128">128</option>
                         </select>
                     </div>
+                    <div class="config-item">
+                        <label for="model2_epochs">Epochs:</label>
+                        <select id="model2_epochs" name="epochs">
+                            <option value="1">1</option>
+                            <option value="2">2</option>
+                            <option value="3">3</option>
+                        </select>
+                    </div>
                 </div>
             </div>
         </div>
                 block2: parseInt(document.getElementById('model1_block2').value),
                 block3: parseInt(document.getElementById('model1_block3').value),
                 optimizer: document.getElementById('model1_optimizer').value,
+                batch_size: parseInt(document.getElementById('model1_batch_size').value),
+                epochs: parseInt(document.getElementById('model1_epochs').value)
             };
             const model2Config = {
                 block2: parseInt(document.getElementById('model2_block2').value),
                 block3: parseInt(document.getElementById('model2_block3').value),
                 optimizer: document.getElementById('model2_optimizer').value,
+                batch_size: parseInt(document.getElementById('model2_batch_size').value),
+                epochs: parseInt(document.getElementById('model2_epochs').value)
             };
             // Setup WebSocket connection

templates/train_single.html CHANGED Viewed

@@ -67,6 +67,14 @@
                         <option value="128">128</option>
                     </select>
                 </div>
             </div>
         </div>
@@ -81,6 +89,13 @@
             <div id="lossChart"></div>
             <div id="accuracyChart"></div>
         </div>
     </div>
     <script>
@@ -215,7 +230,7 @@
                     block3: parseInt(document.getElementById('block3').value),
                     optimizer: document.getElementById('optimizer').value,
                     batch_size: parseInt(document.getElementById('batch_size').value),
-                    epochs: 1  // Add default epochs value
                 };
                 ws.send(JSON.stringify(config));
             };
@@ -261,6 +276,8 @@
                 else if (data.type === 'training_complete') {
                     alert(data.data.message);
                     stopTraining();
                 }
                 else if (data.type === 'training_error') {
                     alert(data.data.message);
@@ -369,6 +386,22 @@
             height: 400px;
             width: 100%;
         }
     </style>
 </body>
 </html>

                         <option value="128">128</option>
                     </select>
                 </div>
+                <div class="config-item">
+                    <label for="epochs">Epochs:</label>
+                    <select id="epochs" name="epochs">
+                        <option value="1">1</option>
+                        <option value="2">2</option>
+                        <option value="3">3</option>
+                    </select>
+                </div>
             </div>
         </div>
             <div id="lossChart"></div>
             <div id="accuracyChart"></div>
         </div>
+        <!-- Inference Controls -->
+        <div class="inference-controls" style="display: none;">
+            <button id="goToInference" onclick="window.location.href='/inference'" class="inference-button">
+                Try Model Inference
+            </button>
+        </div>
     </div>
     <script>
                     block3: parseInt(document.getElementById('block3').value),
                     optimizer: document.getElementById('optimizer').value,
                     batch_size: parseInt(document.getElementById('batch_size').value),
+                    epochs: parseInt(document.getElementById('epochs').value)
                 };
                 ws.send(JSON.stringify(config));
             };
                 else if (data.type === 'training_complete') {
                     alert(data.data.message);
                     stopTraining();
+                    // Show the inference button
+                    document.querySelector('.inference-controls').style.display = 'block';
                 }
                 else if (data.type === 'training_error') {
                     alert(data.data.message);
             height: 400px;
             width: 100%;
         }
+        .inference-controls {
+            margin: 20px 0;
+            text-align: center;
+        }
+        .inference-button {
+            background-color: #28a745;
+            padding: 12px 24px;
+            font-size: 1.1em;
+            transition: background-color 0.3s;
+        }
+        .inference-button:hover {
+            background-color: #218838;
+        }
     </style>
 </body>
 </html>