Spaces:
Running
Running
adds single token logic read/write , adds gpt-oss demo space , adds spaces refactor , adds new version of track tonic , adds logic in launch.sh
Browse files- docs/datasetflow.svg +1 -0
- launch.sh +18 -28
- scripts/dataset_tonic/setup_hf_dataset.py +86 -80
- scripts/deploy_demo_space.py +133 -30
- scripts/model_tonic/push_gpt_oss_to_huggingface.py +2 -2
- scripts/trackio_tonic/configure_trackio.py +22 -48
- scripts/trackio_tonic/deploy_trackio_space.py +11 -38
- scripts/trackio_tonic/switch_to_read_token.py +14 -11
- src/dataset_utils.py +328 -0
- src/monitoring.py +70 -48
- templates/spaces/demo_gpt/README.md +15 -0
- templates/spaces/demo_gpt/app.py +262 -0
- templates/spaces/demo_gpt/requirements.txt +9 -0
- templates/spaces/{demo β demo_smol}/README.md +0 -0
- templates/spaces/{demo β demo_smol}/app.py +0 -0
- templates/spaces/{demo β demo_smol}/requirements.txt +0 -0
- templates/spaces/{README.md β trackio/README.md} +0 -0
- templates/spaces/{app.py β trackio/app.py} +1154 -303
- templates/spaces/trackio/dataset_utils.py +328 -0
- templates/spaces/{requirements.txt β trackio/requirements.txt} +0 -0
- templates/spaces/trackio/trackio_api_client.py +320 -0
- tests/test_data_preservation.py +187 -0
- tests/test_demo_deployment.py +9 -6
- tests/test_deployment.py +11 -7
- tests/test_hf_datasets.py +2 -2
- tests/test_latest_deployment.py +12 -8
- tests/test_readme_template.py +2 -2
- tests/test_real_dataset_access.py +201 -0
- tests/test_trackio_dataset_fix.py +167 -0
- tests/test_trackio_deployment.py +5 -5
- tests/test_trackio_space_diagnostics.py +191 -0
docs/datasetflow.svg
ADDED
|
|
launch.sh
CHANGED
|
@@ -452,8 +452,10 @@ print_step "Step 1: User Authentication"
|
|
| 452 |
echo "================================"
|
| 453 |
|
| 454 |
print_info "You'll need two Hugging Face tokens:"
|
| 455 |
-
echo "1. Write Token - Used
|
| 456 |
-
echo "2. Read Token -
|
|
|
|
|
|
|
| 457 |
echo ""
|
| 458 |
|
| 459 |
print_info "Getting Write Token (for training operations)..."
|
|
@@ -489,7 +491,7 @@ else
|
|
| 489 |
exit 1
|
| 490 |
fi
|
| 491 |
|
| 492 |
-
# Set the main HF_TOKEN to write token for training operations
|
| 493 |
HF_TOKEN="$HF_WRITE_TOKEN"
|
| 494 |
|
| 495 |
# Step 2: Select training configuration
|
|
@@ -669,8 +671,6 @@ fi
|
|
| 669 |
|
| 670 |
# Set environment variables before creating virtual environment
|
| 671 |
print_info "Setting up environment variables..."
|
| 672 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
| 673 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
| 674 |
export HF_TOKEN="$HF_TOKEN"
|
| 675 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
| 676 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
|
@@ -682,8 +682,6 @@ source smollm3_env/bin/activate
|
|
| 682 |
|
| 683 |
# Re-export environment variables in the virtual environment
|
| 684 |
print_info "Configuring environment variables in virtual environment..."
|
| 685 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
| 686 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
| 687 |
export HF_TOKEN="$HF_TOKEN"
|
| 688 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
| 689 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
|
@@ -712,16 +710,16 @@ print_status "HF token configured for Python API usage"
|
|
| 712 |
print_info "Username: $HF_USERNAME (auto-detected from token)"
|
| 713 |
print_info "Token available in environment: ${HF_TOKEN:0:10}...${HF_TOKEN: -4}"
|
| 714 |
|
| 715 |
-
# Verify
|
| 716 |
print_info "Verifying token availability in virtual environment..."
|
| 717 |
-
if [ -n "$
|
| 718 |
-
print_status "β
|
| 719 |
-
print_info "
|
| 720 |
-
print_info " HF_READ_TOKEN: ${HF_READ_TOKEN:0:10}...${HF_READ_TOKEN: -4}"
|
| 721 |
print_info " HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN:0:10}...${HUGGING_FACE_HUB_TOKEN: -4}"
|
|
|
|
| 722 |
else
|
| 723 |
-
print_error "β
|
| 724 |
-
print_error "Please check your
|
| 725 |
exit 1
|
| 726 |
fi
|
| 727 |
|
|
@@ -771,8 +769,6 @@ print_info "Username will be auto-detected from token"
|
|
| 771 |
print_info "Secrets will be set automatically via API"
|
| 772 |
|
| 773 |
# Ensure environment variables are available for the script
|
| 774 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
| 775 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
| 776 |
export HF_TOKEN="$HF_TOKEN"
|
| 777 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 778 |
export HF_USERNAME="$HF_USERNAME"
|
|
@@ -792,8 +788,6 @@ print_info "Username will be auto-detected from token"
|
|
| 792 |
print_info "Dataset repository: $TRACKIO_DATASET_REPO"
|
| 793 |
|
| 794 |
# Ensure environment variables are available for the script
|
| 795 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
| 796 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
| 797 |
export HF_TOKEN="$HF_TOKEN"
|
| 798 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 799 |
export HF_USERNAME="$HF_USERNAME"
|
|
@@ -809,8 +803,6 @@ print_info "Configuring Trackio ..."
|
|
| 809 |
print_info "Username will be auto-detected from token"
|
| 810 |
|
| 811 |
# Ensure environment variables are available for the script
|
| 812 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
| 813 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
| 814 |
export HF_TOKEN="$HF_TOKEN"
|
| 815 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 816 |
export HF_USERNAME="$HF_USERNAME"
|
|
@@ -920,7 +912,7 @@ fi
|
|
| 920 |
print_step "Step 16.5: Switching to Read Token for Security"
|
| 921 |
echo "===================================================="
|
| 922 |
|
| 923 |
-
print_info "Switching Trackio Space from write token to read token for security..."
|
| 924 |
print_info "This ensures the space can only read datasets, not write to repositories"
|
| 925 |
|
| 926 |
# Ensure environment variables are available for token switch
|
|
@@ -928,12 +920,12 @@ export HF_TOKEN="$HF_WRITE_TOKEN" # Use write token to update space
|
|
| 928 |
export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
|
| 929 |
export HF_USERNAME="$HF_USERNAME"
|
| 930 |
|
| 931 |
-
# Switch
|
| 932 |
cd scripts/trackio_tonic
|
| 933 |
python switch_to_read_token.py "$HF_USERNAME/$TRACKIO_SPACE_NAME" "$HF_READ_TOKEN" "$HF_WRITE_TOKEN"
|
| 934 |
|
| 935 |
if [ $? -eq 0 ]; then
|
| 936 |
-
print_status "β
Successfully switched Trackio Space to read token"
|
| 937 |
print_info "π Space now uses read-only permissions for security"
|
| 938 |
else
|
| 939 |
print_warning "β οΈ Failed to switch to read token, but continuing with pipeline"
|
|
@@ -957,8 +949,6 @@ if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
|
| 957 |
DEMO_SUBFOLDER=""
|
| 958 |
|
| 959 |
# Ensure environment variables are available for demo deployment
|
| 960 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
| 961 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
| 962 |
export HF_TOKEN="$HF_TOKEN"
|
| 963 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 964 |
export HF_USERNAME="$HF_USERNAME"
|
|
@@ -999,7 +989,7 @@ cat > training_summary.md << EOF
|
|
| 999 |
- **HF Dataset**: $TRACKIO_DATASET_REPO
|
| 1000 |
- **Training Config**: $TRAINING_CONFIG_TYPE
|
| 1001 |
- **Trainer Type**: $TRAINER_TYPE
|
| 1002 |
-
- **Security**:
|
| 1003 |
$(if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
|
| 1004 |
echo "- **Dataset Sample Size**: ${DATASET_SAMPLE_SIZE:-80000}"
|
| 1005 |
fi)
|
|
@@ -1015,7 +1005,7 @@ fi)
|
|
| 1015 |
- **Model Repository**: https://huggingface.co/$REPO_NAME
|
| 1016 |
- **Trackio Monitoring**: $TRACKIO_URL
|
| 1017 |
- **Experiment Data**: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO
|
| 1018 |
-
- **Security**: Trackio Space switched to read-only token for security
|
| 1019 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
| 1020 |
echo "- **Demo Space**: https://huggingface.co/spaces/$HF_USERNAME/${REPO_NAME}-demo"
|
| 1021 |
fi)
|
|
@@ -1053,7 +1043,7 @@ echo ""
|
|
| 1053 |
echo "π Next steps:"
|
| 1054 |
echo "1. Monitor training progress in your Trackio Space"
|
| 1055 |
echo "2. Check the model repository on Hugging Face Hub"
|
| 1056 |
-
echo "3. Your Trackio Space is now secured with read-only permissions"
|
| 1057 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
| 1058 |
echo "3. Make your huggingface space a ZeroGPU Space & Test your model"
|
| 1059 |
fi)
|
|
|
|
| 452 |
echo "================================"
|
| 453 |
|
| 454 |
print_info "You'll need two Hugging Face tokens:"
|
| 455 |
+
echo "1. Write Token - Used initially for training and creating repositories"
|
| 456 |
+
echo "2. Read Token - Will replace the write token in Trackio Space after training for security"
|
| 457 |
+
echo ""
|
| 458 |
+
print_info "The pipeline will start with the write token in HF_TOKEN, then switch to read token automatically."
|
| 459 |
echo ""
|
| 460 |
|
| 461 |
print_info "Getting Write Token (for training operations)..."
|
|
|
|
| 491 |
exit 1
|
| 492 |
fi
|
| 493 |
|
| 494 |
+
# Set the main HF_TOKEN to write token for training operations (will be switched later)
|
| 495 |
HF_TOKEN="$HF_WRITE_TOKEN"
|
| 496 |
|
| 497 |
# Step 2: Select training configuration
|
|
|
|
| 671 |
|
| 672 |
# Set environment variables before creating virtual environment
|
| 673 |
print_info "Setting up environment variables..."
|
|
|
|
|
|
|
| 674 |
export HF_TOKEN="$HF_TOKEN"
|
| 675 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
| 676 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
|
|
|
| 682 |
|
| 683 |
# Re-export environment variables in the virtual environment
|
| 684 |
print_info "Configuring environment variables in virtual environment..."
|
|
|
|
|
|
|
| 685 |
export HF_TOKEN="$HF_TOKEN"
|
| 686 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
| 687 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
|
|
|
| 710 |
print_info "Username: $HF_USERNAME (auto-detected from token)"
|
| 711 |
print_info "Token available in environment: ${HF_TOKEN:0:10}...${HF_TOKEN: -4}"
|
| 712 |
|
| 713 |
+
# Verify token is available in the virtual environment
|
| 714 |
print_info "Verifying token availability in virtual environment..."
|
| 715 |
+
if [ -n "$HF_TOKEN" ] && [ -n "$HUGGING_FACE_HUB_TOKEN" ]; then
|
| 716 |
+
print_status "β
Token properly configured in virtual environment"
|
| 717 |
+
print_info " HF_TOKEN: ${HF_TOKEN:0:10}...${HF_TOKEN: -4} (currently using WRITE token)"
|
|
|
|
| 718 |
print_info " HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN:0:10}...${HUGGING_FACE_HUB_TOKEN: -4}"
|
| 719 |
+
print_info " Will be switched to READ token after training for security"
|
| 720 |
else
|
| 721 |
+
print_error "β Token not properly configured in virtual environment"
|
| 722 |
+
print_error "Please check your token and try again"
|
| 723 |
exit 1
|
| 724 |
fi
|
| 725 |
|
|
|
|
| 769 |
print_info "Secrets will be set automatically via API"
|
| 770 |
|
| 771 |
# Ensure environment variables are available for the script
|
|
|
|
|
|
|
| 772 |
export HF_TOKEN="$HF_TOKEN"
|
| 773 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 774 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
|
| 788 |
print_info "Dataset repository: $TRACKIO_DATASET_REPO"
|
| 789 |
|
| 790 |
# Ensure environment variables are available for the script
|
|
|
|
|
|
|
| 791 |
export HF_TOKEN="$HF_TOKEN"
|
| 792 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 793 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
|
| 803 |
print_info "Username will be auto-detected from token"
|
| 804 |
|
| 805 |
# Ensure environment variables are available for the script
|
|
|
|
|
|
|
| 806 |
export HF_TOKEN="$HF_TOKEN"
|
| 807 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 808 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
|
| 912 |
print_step "Step 16.5: Switching to Read Token for Security"
|
| 913 |
echo "===================================================="
|
| 914 |
|
| 915 |
+
print_info "Switching Trackio Space HF_TOKEN from write token to read token for security..."
|
| 916 |
print_info "This ensures the space can only read datasets, not write to repositories"
|
| 917 |
|
| 918 |
# Ensure environment variables are available for token switch
|
|
|
|
| 920 |
export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
|
| 921 |
export HF_USERNAME="$HF_USERNAME"
|
| 922 |
|
| 923 |
+
# Switch HF_TOKEN in Trackio Space from write to read token
|
| 924 |
cd scripts/trackio_tonic
|
| 925 |
python switch_to_read_token.py "$HF_USERNAME/$TRACKIO_SPACE_NAME" "$HF_READ_TOKEN" "$HF_WRITE_TOKEN"
|
| 926 |
|
| 927 |
if [ $? -eq 0 ]; then
|
| 928 |
+
print_status "β
Successfully switched Trackio Space HF_TOKEN to read token"
|
| 929 |
print_info "π Space now uses read-only permissions for security"
|
| 930 |
else
|
| 931 |
print_warning "β οΈ Failed to switch to read token, but continuing with pipeline"
|
|
|
|
| 949 |
DEMO_SUBFOLDER=""
|
| 950 |
|
| 951 |
# Ensure environment variables are available for demo deployment
|
|
|
|
|
|
|
| 952 |
export HF_TOKEN="$HF_TOKEN"
|
| 953 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
| 954 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
|
| 989 |
- **HF Dataset**: $TRACKIO_DATASET_REPO
|
| 990 |
- **Training Config**: $TRAINING_CONFIG_TYPE
|
| 991 |
- **Trainer Type**: $TRAINER_TYPE
|
| 992 |
+
- **Security**: Single HF_TOKEN switched from write to read token
|
| 993 |
$(if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
|
| 994 |
echo "- **Dataset Sample Size**: ${DATASET_SAMPLE_SIZE:-80000}"
|
| 995 |
fi)
|
|
|
|
| 1005 |
- **Model Repository**: https://huggingface.co/$REPO_NAME
|
| 1006 |
- **Trackio Monitoring**: $TRACKIO_URL
|
| 1007 |
- **Experiment Data**: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO
|
| 1008 |
+
- **Security**: Trackio Space HF_TOKEN switched to read-only token for security
|
| 1009 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
| 1010 |
echo "- **Demo Space**: https://huggingface.co/spaces/$HF_USERNAME/${REPO_NAME}-demo"
|
| 1011 |
fi)
|
|
|
|
| 1043 |
echo "π Next steps:"
|
| 1044 |
echo "1. Monitor training progress in your Trackio Space"
|
| 1045 |
echo "2. Check the model repository on Hugging Face Hub"
|
| 1046 |
+
echo "3. Your Trackio Space HF_TOKEN is now secured with read-only permissions"
|
| 1047 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
| 1048 |
echo "3. Make your huggingface space a ZeroGPU Space & Test your model"
|
| 1049 |
fi)
|
scripts/dataset_tonic/setup_hf_dataset.py
CHANGED
|
@@ -145,7 +145,7 @@ def setup_trackio_dataset(dataset_name: str = None, token: str = None) -> bool:
|
|
| 145 |
|
| 146 |
def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
| 147 |
"""
|
| 148 |
-
Add initial experiment data to the dataset.
|
| 149 |
|
| 150 |
Args:
|
| 151 |
repo_id (str): Dataset repository ID
|
|
@@ -163,89 +163,95 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
| 163 |
print("β οΈ No token available for uploading data")
|
| 164 |
return False
|
| 165 |
|
| 166 |
-
#
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
'name': 'smollm3-finetune-demo',
|
| 171 |
-
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
| 172 |
-
'created_at': datetime.now().isoformat(),
|
| 173 |
-
'status': 'completed',
|
| 174 |
-
'metrics': json.dumps([
|
| 175 |
-
{
|
| 176 |
-
'timestamp': datetime.now().isoformat(),
|
| 177 |
-
'step': 100,
|
| 178 |
-
'metrics': {
|
| 179 |
-
'loss': 1.15,
|
| 180 |
-
'grad_norm': 10.5,
|
| 181 |
-
'learning_rate': 5e-6,
|
| 182 |
-
'num_tokens': 1000000.0,
|
| 183 |
-
'mean_token_accuracy': 0.76,
|
| 184 |
-
'epoch': 0.1,
|
| 185 |
-
'total_tokens': 1000000.0,
|
| 186 |
-
'throughput': 2000000.0,
|
| 187 |
-
'step_time': 0.5,
|
| 188 |
-
'batch_size': 2,
|
| 189 |
-
'seq_len': 4096,
|
| 190 |
-
'token_acc': 0.76,
|
| 191 |
-
'gpu_memory_allocated': 15.2,
|
| 192 |
-
'gpu_memory_reserved': 70.1,
|
| 193 |
-
'gpu_utilization': 85.2,
|
| 194 |
-
'cpu_percent': 2.7,
|
| 195 |
-
'memory_percent': 10.1
|
| 196 |
-
}
|
| 197 |
-
}
|
| 198 |
-
]),
|
| 199 |
-
'parameters': json.dumps({
|
| 200 |
-
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
| 201 |
-
'max_seq_length': 4096,
|
| 202 |
-
'batch_size': 2,
|
| 203 |
-
'learning_rate': 5e-6,
|
| 204 |
-
'epochs': 3,
|
| 205 |
-
'dataset': 'OpenHermes-FR',
|
| 206 |
-
'trainer_type': 'SFTTrainer',
|
| 207 |
-
'hardware': 'GPU (H100/A100)',
|
| 208 |
-
'mixed_precision': True,
|
| 209 |
-
'gradient_checkpointing': True,
|
| 210 |
-
'flash_attention': True
|
| 211 |
-
}),
|
| 212 |
-
'artifacts': json.dumps([]),
|
| 213 |
-
'logs': json.dumps([
|
| 214 |
-
{
|
| 215 |
-
'timestamp': datetime.now().isoformat(),
|
| 216 |
-
'level': 'INFO',
|
| 217 |
-
'message': 'Training started successfully'
|
| 218 |
-
},
|
| 219 |
-
{
|
| 220 |
-
'timestamp': datetime.now().isoformat(),
|
| 221 |
-
'level': 'INFO',
|
| 222 |
-
'message': 'Model loaded and configured'
|
| 223 |
-
},
|
| 224 |
-
{
|
| 225 |
-
'timestamp': datetime.now().isoformat(),
|
| 226 |
-
'level': 'INFO',
|
| 227 |
-
'message': 'Dataset loaded and preprocessed'
|
| 228 |
-
}
|
| 229 |
-
]),
|
| 230 |
-
'last_updated': datetime.now().isoformat()
|
| 231 |
-
}
|
| 232 |
-
]
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
|
| 236 |
|
| 237 |
-
#
|
| 238 |
-
|
|
|
|
|
|
|
| 239 |
|
| 240 |
-
#
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
# Add README template
|
| 251 |
add_dataset_readme(repo_id, token)
|
|
|
|
| 145 |
|
| 146 |
def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
| 147 |
"""
|
| 148 |
+
Add initial experiment data to the dataset using data preservation.
|
| 149 |
|
| 150 |
Args:
|
| 151 |
repo_id (str): Dataset repository ID
|
|
|
|
| 163 |
print("β οΈ No token available for uploading data")
|
| 164 |
return False
|
| 165 |
|
| 166 |
+
# Import dataset manager
|
| 167 |
+
import sys
|
| 168 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
| 169 |
+
from dataset_utils import TrackioDatasetManager
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
+
# Initialize dataset manager
|
| 172 |
+
dataset_manager = TrackioDatasetManager(repo_id, token)
|
| 173 |
|
| 174 |
+
# Check if dataset already has data
|
| 175 |
+
existing_experiments = dataset_manager.load_existing_experiments()
|
| 176 |
+
if existing_experiments:
|
| 177 |
+
print(f"βΉοΈ Dataset already contains {len(existing_experiments)} experiments, preserving existing data")
|
| 178 |
|
| 179 |
+
# Initial experiment data
|
| 180 |
+
initial_experiment = {
|
| 181 |
+
'experiment_id': f'exp_demo_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
|
| 182 |
+
'name': 'smollm3-finetune-demo',
|
| 183 |
+
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
| 184 |
+
'created_at': datetime.now().isoformat(),
|
| 185 |
+
'status': 'completed',
|
| 186 |
+
'metrics': json.dumps([
|
| 187 |
+
{
|
| 188 |
+
'timestamp': datetime.now().isoformat(),
|
| 189 |
+
'step': 100,
|
| 190 |
+
'metrics': {
|
| 191 |
+
'loss': 1.15,
|
| 192 |
+
'grad_norm': 10.5,
|
| 193 |
+
'learning_rate': 5e-6,
|
| 194 |
+
'num_tokens': 1000000.0,
|
| 195 |
+
'mean_token_accuracy': 0.76,
|
| 196 |
+
'epoch': 0.1,
|
| 197 |
+
'total_tokens': 1000000.0,
|
| 198 |
+
'throughput': 2000000.0,
|
| 199 |
+
'step_time': 0.5,
|
| 200 |
+
'batch_size': 2,
|
| 201 |
+
'seq_len': 4096,
|
| 202 |
+
'token_acc': 0.76,
|
| 203 |
+
'gpu_memory_allocated': 15.2,
|
| 204 |
+
'gpu_memory_reserved': 70.1,
|
| 205 |
+
'gpu_utilization': 85.2,
|
| 206 |
+
'cpu_percent': 2.7,
|
| 207 |
+
'memory_percent': 10.1
|
| 208 |
+
}
|
| 209 |
+
}
|
| 210 |
+
]),
|
| 211 |
+
'parameters': json.dumps({
|
| 212 |
+
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
| 213 |
+
'max_seq_length': 4096,
|
| 214 |
+
'batch_size': 2,
|
| 215 |
+
'learning_rate': 5e-6,
|
| 216 |
+
'epochs': 3,
|
| 217 |
+
'dataset': 'OpenHermes-FR',
|
| 218 |
+
'trainer_type': 'SFTTrainer',
|
| 219 |
+
'hardware': 'GPU (H100/A100)',
|
| 220 |
+
'mixed_precision': True,
|
| 221 |
+
'gradient_checkpointing': True,
|
| 222 |
+
'flash_attention': True
|
| 223 |
+
}),
|
| 224 |
+
'artifacts': json.dumps([]),
|
| 225 |
+
'logs': json.dumps([
|
| 226 |
+
{
|
| 227 |
+
'timestamp': datetime.now().isoformat(),
|
| 228 |
+
'level': 'INFO',
|
| 229 |
+
'message': 'Training started successfully'
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
'timestamp': datetime.now().isoformat(),
|
| 233 |
+
'level': 'INFO',
|
| 234 |
+
'message': 'Model loaded and configured'
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
'timestamp': datetime.now().isoformat(),
|
| 238 |
+
'level': 'INFO',
|
| 239 |
+
'message': 'Dataset loaded and preprocessed'
|
| 240 |
+
}
|
| 241 |
+
]),
|
| 242 |
+
'last_updated': datetime.now().isoformat()
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
# Use dataset manager to safely add the experiment
|
| 246 |
+
success = dataset_manager.upsert_experiment(initial_experiment)
|
| 247 |
|
| 248 |
+
if success:
|
| 249 |
+
print(f"β
Successfully added initial experiment data to {repo_id}")
|
| 250 |
+
final_count = len(dataset_manager.load_existing_experiments())
|
| 251 |
+
print(f"π Dataset now contains {final_count} total experiments")
|
| 252 |
+
else:
|
| 253 |
+
print(f"β Failed to add initial experiment data to {repo_id}")
|
| 254 |
+
return False
|
| 255 |
|
| 256 |
# Add README template
|
| 257 |
add_dataset_readme(repo_id, token)
|
scripts/deploy_demo_space.py
CHANGED
|
@@ -38,7 +38,8 @@ class DemoSpaceDeployer:
|
|
| 38 |
"""Deploy demo space to Hugging Face Spaces"""
|
| 39 |
|
| 40 |
def __init__(self, hf_token: str, hf_username: str, model_id: str,
|
| 41 |
-
subfolder: str = "int4", space_name: Optional[str] = None
|
|
|
|
| 42 |
self.hf_token = hf_token
|
| 43 |
self.hf_username = hf_username
|
| 44 |
self.model_id = model_id
|
|
@@ -47,8 +48,13 @@ class DemoSpaceDeployer:
|
|
| 47 |
self.space_id = f"{hf_username}/{self.space_name}"
|
| 48 |
self.space_url = f"https://huggingface.co/spaces/{self.space_id}"
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
self.workspace_dir = Path.cwd()
|
| 53 |
|
| 54 |
# Initialize HF API
|
|
@@ -58,6 +64,107 @@ class DemoSpaceDeployer:
|
|
| 58 |
self.api = None
|
| 59 |
logger.warning("huggingface_hub not available, using CLI fallback")
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def validate_model_exists(self) -> bool:
|
| 62 |
"""Validate that the model exists on Hugging Face Hub"""
|
| 63 |
try:
|
|
@@ -187,14 +294,7 @@ class DemoSpaceDeployer:
|
|
| 187 |
content = f.read()
|
| 188 |
|
| 189 |
# Add environment variable setup at the top
|
| 190 |
-
env_setup =
|
| 191 |
-
# Environment variables for model configuration
|
| 192 |
-
import os
|
| 193 |
-
os.environ['HF_MODEL_ID'] = '{self.model_id}'
|
| 194 |
-
os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
|
| 195 |
-
os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'
|
| 196 |
-
|
| 197 |
-
"""
|
| 198 |
|
| 199 |
# Insert after imports
|
| 200 |
lines = content.split('\n')
|
|
@@ -335,24 +435,7 @@ Simply start chatting with the model using the interface below!
|
|
| 335 |
logger.info("β
Successfully set HF_TOKEN secret via API")
|
| 336 |
|
| 337 |
# Set model-specific environment variables
|
| 338 |
-
self.
|
| 339 |
-
repo_id=self.space_id,
|
| 340 |
-
key="HF_MODEL_ID",
|
| 341 |
-
value=self.model_id,
|
| 342 |
-
description="Model ID for the demo"
|
| 343 |
-
)
|
| 344 |
-
logger.info(f"β
Successfully set HF_MODEL_ID variable: {self.model_id}")
|
| 345 |
-
|
| 346 |
-
if self.subfolder and self.subfolder.strip():
|
| 347 |
-
self.api.add_space_variable(
|
| 348 |
-
repo_id=self.space_id,
|
| 349 |
-
key="MODEL_SUBFOLDER",
|
| 350 |
-
value=self.subfolder,
|
| 351 |
-
description="Model subfolder for the demo"
|
| 352 |
-
)
|
| 353 |
-
logger.info(f"β
Successfully set MODEL_SUBFOLDER variable: {self.subfolder}")
|
| 354 |
-
else:
|
| 355 |
-
logger.info("βΉοΈ No subfolder specified, using main model")
|
| 356 |
|
| 357 |
return True
|
| 358 |
|
|
@@ -375,6 +458,13 @@ Simply start chatting with the model using the interface below!
|
|
| 375 |
else:
|
| 376 |
logger.info(" MODEL_SUBFOLDER=(empty - using main model)")
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
logger.info(f"\nπ§ To set secrets in your Space:")
|
| 379 |
logger.info(f"1. Go to your Space settings: {self.space_url}/settings")
|
| 380 |
logger.info("2. Navigate to the 'Repository secrets' section")
|
|
@@ -389,6 +479,17 @@ Simply start chatting with the model using the interface below!
|
|
| 389 |
else:
|
| 390 |
logger.info(" Name: MODEL_SUBFOLDER")
|
| 391 |
logger.info(" Value: (leave empty)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
logger.info("4. Save the secrets")
|
| 393 |
|
| 394 |
return True
|
|
@@ -471,6 +572,7 @@ def main():
|
|
| 471 |
parser.add_argument("--model-id", required=True, help="Model ID to deploy demo for")
|
| 472 |
parser.add_argument("--subfolder", default="int4", help="Model subfolder (default: int4)")
|
| 473 |
parser.add_argument("--space-name", help="Custom space name (optional)")
|
|
|
|
| 474 |
|
| 475 |
args = parser.parse_args()
|
| 476 |
|
|
@@ -479,7 +581,8 @@ def main():
|
|
| 479 |
hf_username=args.hf_username,
|
| 480 |
model_id=args.model_id,
|
| 481 |
subfolder=args.subfolder,
|
| 482 |
-
space_name=args.space_name
|
|
|
|
| 483 |
)
|
| 484 |
|
| 485 |
success = deployer.deploy()
|
|
|
|
| 38 |
"""Deploy demo space to Hugging Face Spaces"""
|
| 39 |
|
| 40 |
def __init__(self, hf_token: str, hf_username: str, model_id: str,
|
| 41 |
+
subfolder: str = "int4", space_name: Optional[str] = None,
|
| 42 |
+
demo_type: Optional[str] = None):
|
| 43 |
self.hf_token = hf_token
|
| 44 |
self.hf_username = hf_username
|
| 45 |
self.model_id = model_id
|
|
|
|
| 48 |
self.space_id = f"{hf_username}/{self.space_name}"
|
| 49 |
self.space_url = f"https://huggingface.co/spaces/{self.space_id}"
|
| 50 |
|
| 51 |
+
# Determine demo type from model_id if not provided
|
| 52 |
+
if demo_type is None:
|
| 53 |
+
demo_type = self._detect_demo_type(model_id)
|
| 54 |
+
|
| 55 |
+
# Template paths based on model type
|
| 56 |
+
self.demo_type = demo_type
|
| 57 |
+
self.template_dir = Path(__file__).parent.parent / "templates" / "spaces" / f"demo_{demo_type}"
|
| 58 |
self.workspace_dir = Path.cwd()
|
| 59 |
|
| 60 |
# Initialize HF API
|
|
|
|
| 64 |
self.api = None
|
| 65 |
logger.warning("huggingface_hub not available, using CLI fallback")
|
| 66 |
|
| 67 |
+
def _detect_demo_type(self, model_id: str) -> str:
|
| 68 |
+
"""Detect the appropriate demo type based on model ID"""
|
| 69 |
+
model_id_lower = model_id.lower()
|
| 70 |
+
|
| 71 |
+
# Check for GPT-OSS models
|
| 72 |
+
if "gpt-oss" in model_id_lower or "gpt_oss" in model_id_lower:
|
| 73 |
+
logger.info(f"Detected GPT-OSS model, using demo_gpt template")
|
| 74 |
+
return "gpt"
|
| 75 |
+
|
| 76 |
+
# Check for SmolLM models (default)
|
| 77 |
+
elif "smollm" in model_id_lower or "smol" in model_id_lower:
|
| 78 |
+
logger.info(f"Detected SmolLM model, using demo_smol template")
|
| 79 |
+
return "smol"
|
| 80 |
+
|
| 81 |
+
# Default to SmolLM for unknown models
|
| 82 |
+
else:
|
| 83 |
+
logger.info(f"Unknown model type, defaulting to demo_smol template")
|
| 84 |
+
return "smol"
|
| 85 |
+
|
| 86 |
+
def _generate_env_setup(self) -> str:
|
| 87 |
+
"""Generate environment variable setup based on demo type and model"""
|
| 88 |
+
if self.demo_type == "gpt":
|
| 89 |
+
# For GPT-OSS models, we need more sophisticated environment setup
|
| 90 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
| 91 |
+
|
| 92 |
+
env_setup = f"""
|
| 93 |
+
# Environment variables for GPT-OSS model configuration
|
| 94 |
+
import os
|
| 95 |
+
os.environ['HF_MODEL_ID'] = '{self.model_id}'
|
| 96 |
+
os.environ['LORA_MODEL_ID'] = '{self.model_id}'
|
| 97 |
+
os.environ['BASE_MODEL_ID'] = 'openai/gpt-oss-20b'
|
| 98 |
+
os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
|
| 99 |
+
os.environ['MODEL_NAME'] = '{model_name}'
|
| 100 |
+
|
| 101 |
+
"""
|
| 102 |
+
else:
|
| 103 |
+
# For SmolLM models, use simpler setup
|
| 104 |
+
env_setup = f"""
|
| 105 |
+
# Environment variables for model configuration
|
| 106 |
+
import os
|
| 107 |
+
os.environ['HF_MODEL_ID'] = '{self.model_id}'
|
| 108 |
+
os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
|
| 109 |
+
os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'
|
| 110 |
+
|
| 111 |
+
"""
|
| 112 |
+
return env_setup
|
| 113 |
+
|
| 114 |
+
def _set_model_variables(self):
|
| 115 |
+
"""Set model-specific environment variables in the space"""
|
| 116 |
+
try:
|
| 117 |
+
# Common variables for all models
|
| 118 |
+
self.api.add_space_variable(
|
| 119 |
+
repo_id=self.space_id,
|
| 120 |
+
key="HF_MODEL_ID",
|
| 121 |
+
value=self.model_id,
|
| 122 |
+
description="Model ID for the demo"
|
| 123 |
+
)
|
| 124 |
+
logger.info(f"β
Successfully set HF_MODEL_ID variable: {self.model_id}")
|
| 125 |
+
|
| 126 |
+
if self.subfolder and self.subfolder.strip():
|
| 127 |
+
self.api.add_space_variable(
|
| 128 |
+
repo_id=self.space_id,
|
| 129 |
+
key="MODEL_SUBFOLDER",
|
| 130 |
+
value=self.subfolder,
|
| 131 |
+
description="Model subfolder for the demo"
|
| 132 |
+
)
|
| 133 |
+
logger.info(f"β
Successfully set MODEL_SUBFOLDER variable: {self.subfolder}")
|
| 134 |
+
else:
|
| 135 |
+
logger.info("βΉοΈ No subfolder specified, using main model")
|
| 136 |
+
|
| 137 |
+
# GPT-OSS specific variables
|
| 138 |
+
if self.demo_type == "gpt":
|
| 139 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
| 140 |
+
|
| 141 |
+
self.api.add_space_variable(
|
| 142 |
+
repo_id=self.space_id,
|
| 143 |
+
key="LORA_MODEL_ID",
|
| 144 |
+
value=self.model_id,
|
| 145 |
+
description="LoRA/Fine-tuned model ID"
|
| 146 |
+
)
|
| 147 |
+
logger.info(f"β
Successfully set LORA_MODEL_ID variable: {self.model_id}")
|
| 148 |
+
|
| 149 |
+
self.api.add_space_variable(
|
| 150 |
+
repo_id=self.space_id,
|
| 151 |
+
key="BASE_MODEL_ID",
|
| 152 |
+
value="openai/gpt-oss-20b",
|
| 153 |
+
description="Base model ID for GPT-OSS"
|
| 154 |
+
)
|
| 155 |
+
logger.info("β
Successfully set BASE_MODEL_ID variable: openai/gpt-oss-20b")
|
| 156 |
+
|
| 157 |
+
self.api.add_space_variable(
|
| 158 |
+
repo_id=self.space_id,
|
| 159 |
+
key="MODEL_NAME",
|
| 160 |
+
value=model_name,
|
| 161 |
+
description="Display name for the model"
|
| 162 |
+
)
|
| 163 |
+
logger.info(f"β
Successfully set MODEL_NAME variable: {model_name}")
|
| 164 |
+
|
| 165 |
+
except Exception as e:
|
| 166 |
+
logger.error(f"β Failed to set model variables: {e}")
|
| 167 |
+
|
| 168 |
def validate_model_exists(self) -> bool:
|
| 169 |
"""Validate that the model exists on Hugging Face Hub"""
|
| 170 |
try:
|
|
|
|
| 294 |
content = f.read()
|
| 295 |
|
| 296 |
# Add environment variable setup at the top
|
| 297 |
+
env_setup = self._generate_env_setup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
# Insert after imports
|
| 300 |
lines = content.split('\n')
|
|
|
|
| 435 |
logger.info("β
Successfully set HF_TOKEN secret via API")
|
| 436 |
|
| 437 |
# Set model-specific environment variables
|
| 438 |
+
self._set_model_variables()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
return True
|
| 441 |
|
|
|
|
| 458 |
else:
|
| 459 |
logger.info(" MODEL_SUBFOLDER=(empty - using main model)")
|
| 460 |
|
| 461 |
+
# GPT-OSS specific variables
|
| 462 |
+
if self.demo_type == "gpt":
|
| 463 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
| 464 |
+
logger.info(f" LORA_MODEL_ID={self.model_id}")
|
| 465 |
+
logger.info(f" BASE_MODEL_ID=openai/gpt-oss-20b")
|
| 466 |
+
logger.info(f" MODEL_NAME={model_name}")
|
| 467 |
+
|
| 468 |
logger.info(f"\nπ§ To set secrets in your Space:")
|
| 469 |
logger.info(f"1. Go to your Space settings: {self.space_url}/settings")
|
| 470 |
logger.info("2. Navigate to the 'Repository secrets' section")
|
|
|
|
| 479 |
else:
|
| 480 |
logger.info(" Name: MODEL_SUBFOLDER")
|
| 481 |
logger.info(" Value: (leave empty)")
|
| 482 |
+
|
| 483 |
+
# GPT-OSS specific variables
|
| 484 |
+
if self.demo_type == "gpt":
|
| 485 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
| 486 |
+
logger.info(f" Name: LORA_MODEL_ID")
|
| 487 |
+
logger.info(f" Value: {self.model_id}")
|
| 488 |
+
logger.info(f" Name: BASE_MODEL_ID")
|
| 489 |
+
logger.info(f" Value: openai/gpt-oss-20b")
|
| 490 |
+
logger.info(f" Name: MODEL_NAME")
|
| 491 |
+
logger.info(f" Value: {model_name}")
|
| 492 |
+
|
| 493 |
logger.info("4. Save the secrets")
|
| 494 |
|
| 495 |
return True
|
|
|
|
| 572 |
parser.add_argument("--model-id", required=True, help="Model ID to deploy demo for")
|
| 573 |
parser.add_argument("--subfolder", default="int4", help="Model subfolder (default: int4)")
|
| 574 |
parser.add_argument("--space-name", help="Custom space name (optional)")
|
| 575 |
+
parser.add_argument("--demo-type", choices=["smol", "gpt"], help="Demo type: 'smol' for SmolLM, 'gpt' for GPT-OSS (auto-detected if not specified)")
|
| 576 |
|
| 577 |
args = parser.parse_args()
|
| 578 |
|
|
|
|
| 581 |
hf_username=args.hf_username,
|
| 582 |
model_id=args.model_id,
|
| 583 |
subfolder=args.subfolder,
|
| 584 |
+
space_name=args.space_name,
|
| 585 |
+
demo_type=args.demo_type
|
| 586 |
)
|
| 587 |
|
| 588 |
success = deployer.deploy()
|
scripts/model_tonic/push_gpt_oss_to_huggingface.py
CHANGED
|
@@ -169,8 +169,8 @@ If you use this model in your research, please cite:
|
|
| 169 |
author = {{{author_name}}},
|
| 170 |
title = {{{model_name}}},
|
| 171 |
year = {{{datetime.now().year}}},
|
| 172 |
-
publisher = {Hugging Face},
|
| 173 |
-
journal = {Hugging Face repository},
|
| 174 |
howpublished = {{\\url{{https://huggingface.co/{model_name}}}}}
|
| 175 |
}}
|
| 176 |
```
|
|
|
|
| 169 |
author = {{{author_name}}},
|
| 170 |
title = {{{model_name}}},
|
| 171 |
year = {{{datetime.now().year}}},
|
| 172 |
+
publisher = {{Hugging Face}},
|
| 173 |
+
journal = {{Hugging Face repository}},
|
| 174 |
howpublished = {{\\url{{https://huggingface.co/{model_name}}}}}
|
| 175 |
}}
|
| 176 |
```
|
scripts/trackio_tonic/configure_trackio.py
CHANGED
|
@@ -79,13 +79,11 @@ def configure_trackio():
|
|
| 79 |
print("π§ Trackio Configuration")
|
| 80 |
print("=" * 40)
|
| 81 |
|
| 82 |
-
# Get HF
|
| 83 |
-
|
| 84 |
-
hf_read_token = os.environ.get('HF_READ_TOKEN')
|
| 85 |
-
hf_token = os.environ.get('HF_TOKEN') # Legacy support
|
| 86 |
|
| 87 |
-
# Use
|
| 88 |
-
active_token =
|
| 89 |
|
| 90 |
if active_token:
|
| 91 |
username = get_username_from_token(active_token)
|
|
@@ -102,9 +100,7 @@ def configure_trackio():
|
|
| 102 |
|
| 103 |
# Current configuration
|
| 104 |
current_config = {
|
| 105 |
-
'
|
| 106 |
-
'HF_READ_TOKEN': hf_read_token or 'Not set',
|
| 107 |
-
'HF_TOKEN': hf_token or 'Not set', # Legacy
|
| 108 |
'TRACKIO_DATASET_REPO': dataset_repo,
|
| 109 |
'SPACE_ID': os.environ.get('SPACE_ID', 'Not set'),
|
| 110 |
'TRACKIO_URL': os.environ.get('TRACKIO_URL', 'Not set')
|
|
@@ -116,12 +112,10 @@ def configure_trackio():
|
|
| 116 |
print(f" {status} {key}: {value}")
|
| 117 |
|
| 118 |
print("\nπ― Configuration Options:")
|
| 119 |
-
print("1. Set
|
| 120 |
-
print("2. Set
|
| 121 |
-
print("3. Set
|
| 122 |
-
print("4. Set
|
| 123 |
-
print("5. Set SPACE_ID - HF Space ID (auto-detected)")
|
| 124 |
-
print("6. Set TRACKIO_URL - Trackio Space URL (auto-detected)")
|
| 125 |
|
| 126 |
# Check if running on HF Spaces
|
| 127 |
if os.environ.get('SPACE_ID'):
|
|
@@ -131,37 +125,21 @@ def configure_trackio():
|
|
| 131 |
# Validate configuration
|
| 132 |
print("\nπ Configuration Validation:")
|
| 133 |
|
| 134 |
-
# Check
|
| 135 |
-
if current_config['HF_WRITE_TOKEN'] != 'Not set':
|
| 136 |
-
print("β
HF_WRITE_TOKEN is set")
|
| 137 |
-
print(" This allows training operations and repository creation")
|
| 138 |
-
else:
|
| 139 |
-
print("β HF_WRITE_TOKEN is not set")
|
| 140 |
-
print(" Please set HF_WRITE_TOKEN for training operations")
|
| 141 |
-
print(" Get your token from: https://huggingface.co/settings/tokens")
|
| 142 |
-
|
| 143 |
-
# Check HF_READ_TOKEN
|
| 144 |
-
if current_config['HF_READ_TOKEN'] != 'Not set':
|
| 145 |
-
print("β
HF_READ_TOKEN is set")
|
| 146 |
-
print(" This will be used for Trackio Space security")
|
| 147 |
-
else:
|
| 148 |
-
print("β HF_READ_TOKEN is not set")
|
| 149 |
-
print(" Please set HF_READ_TOKEN for Space security")
|
| 150 |
-
print(" Get your token from: https://huggingface.co/settings/tokens")
|
| 151 |
-
|
| 152 |
-
# Check legacy HF_TOKEN
|
| 153 |
if current_config['HF_TOKEN'] != 'Not set':
|
| 154 |
-
print("β
HF_TOKEN
|
| 155 |
-
print(" This
|
|
|
|
| 156 |
else:
|
| 157 |
-
print("
|
| 158 |
-
print("
|
|
|
|
| 159 |
|
| 160 |
# Check dataset repository
|
| 161 |
print(f"π Dataset Repository: {dataset_repo}")
|
| 162 |
|
| 163 |
# Test dataset access if token is available
|
| 164 |
-
test_token = current_config['
|
| 165 |
if test_token != 'Not set':
|
| 166 |
print("\nπ§ͺ Testing Dataset Access...")
|
| 167 |
try:
|
|
@@ -216,15 +194,13 @@ def configure_trackio():
|
|
| 216 |
# Generate configuration file
|
| 217 |
config_file = "trackio_config.json"
|
| 218 |
config_data = {
|
| 219 |
-
'
|
| 220 |
-
'hf_read_token': current_config['HF_READ_TOKEN'],
|
| 221 |
-
'hf_token': current_config['HF_TOKEN'], # Legacy
|
| 222 |
'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
|
| 223 |
'space_id': current_config['SPACE_ID'],
|
| 224 |
'trackio_url': current_config['TRACKIO_URL'],
|
| 225 |
'username': username,
|
| 226 |
'last_updated': datetime.now().isoformat(),
|
| 227 |
-
'notes': 'Trackio configuration -
|
| 228 |
}
|
| 229 |
|
| 230 |
with open(config_file, 'w') as f:
|
|
@@ -235,16 +211,14 @@ def configure_trackio():
|
|
| 235 |
# Show environment variable commands
|
| 236 |
print("\nπ Environment Variables for HF Space:")
|
| 237 |
print("=" * 50)
|
| 238 |
-
print(f"
|
| 239 |
-
print(f"HF_READ_TOKEN={current_config['HF_READ_TOKEN']}")
|
| 240 |
-
print(f"HF_TOKEN={current_config['HF_TOKEN']}") # Legacy
|
| 241 |
print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
|
| 242 |
if current_config['TRACKIO_URL'] != 'Not set':
|
| 243 |
print(f"TRACKIO_URL={current_config['TRACKIO_URL']}")
|
| 244 |
|
| 245 |
print("\nπ― Next Steps:")
|
| 246 |
-
print("1.
|
| 247 |
-
print("2.
|
| 248 |
print("3. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
|
| 249 |
print("4. Deploy your updated app.py to the Space")
|
| 250 |
print("5. Run setup_hf_dataset.py if you haven't created the dataset yet")
|
|
|
|
| 79 |
print("π§ Trackio Configuration")
|
| 80 |
print("=" * 40)
|
| 81 |
|
| 82 |
+
# Get HF token (single token approach)
|
| 83 |
+
hf_token = os.environ.get('HF_TOKEN')
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
# Use the single HF_TOKEN
|
| 86 |
+
active_token = hf_token
|
| 87 |
|
| 88 |
if active_token:
|
| 89 |
username = get_username_from_token(active_token)
|
|
|
|
| 100 |
|
| 101 |
# Current configuration
|
| 102 |
current_config = {
|
| 103 |
+
'HF_TOKEN': hf_token or 'Not set',
|
|
|
|
|
|
|
| 104 |
'TRACKIO_DATASET_REPO': dataset_repo,
|
| 105 |
'SPACE_ID': os.environ.get('SPACE_ID', 'Not set'),
|
| 106 |
'TRACKIO_URL': os.environ.get('TRACKIO_URL', 'Not set')
|
|
|
|
| 112 |
print(f" {status} {key}: {value}")
|
| 113 |
|
| 114 |
print("\nπ― Configuration Options:")
|
| 115 |
+
print("1. Set HF_TOKEN - Main token (starts as write, switches to read after training)")
|
| 116 |
+
print("2. Set TRACKIO_DATASET_REPO - Dataset repository (optional)")
|
| 117 |
+
print("3. Set SPACE_ID - HF Space ID (auto-detected)")
|
| 118 |
+
print("4. Set TRACKIO_URL - Trackio Space URL (auto-detected)")
|
|
|
|
|
|
|
| 119 |
|
| 120 |
# Check if running on HF Spaces
|
| 121 |
if os.environ.get('SPACE_ID'):
|
|
|
|
| 125 |
# Validate configuration
|
| 126 |
print("\nπ Configuration Validation:")
|
| 127 |
|
| 128 |
+
# Check HF_TOKEN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
if current_config['HF_TOKEN'] != 'Not set':
|
| 130 |
+
print("β
HF_TOKEN is set")
|
| 131 |
+
print(" This allows training operations and dataset access")
|
| 132 |
+
print(" Note: Token will be automatically switched from write to read after training")
|
| 133 |
else:
|
| 134 |
+
print("β HF_TOKEN is not set")
|
| 135 |
+
print(" Please set HF_TOKEN for training operations")
|
| 136 |
+
print(" Get your token from: https://huggingface.co/settings/tokens")
|
| 137 |
|
| 138 |
# Check dataset repository
|
| 139 |
print(f"π Dataset Repository: {dataset_repo}")
|
| 140 |
|
| 141 |
# Test dataset access if token is available
|
| 142 |
+
test_token = current_config['HF_TOKEN']
|
| 143 |
if test_token != 'Not set':
|
| 144 |
print("\nπ§ͺ Testing Dataset Access...")
|
| 145 |
try:
|
|
|
|
| 194 |
# Generate configuration file
|
| 195 |
config_file = "trackio_config.json"
|
| 196 |
config_data = {
|
| 197 |
+
'hf_token': current_config['HF_TOKEN'],
|
|
|
|
|
|
|
| 198 |
'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
|
| 199 |
'space_id': current_config['SPACE_ID'],
|
| 200 |
'trackio_url': current_config['TRACKIO_URL'],
|
| 201 |
'username': username,
|
| 202 |
'last_updated': datetime.now().isoformat(),
|
| 203 |
+
'notes': 'Trackio configuration - HF_TOKEN starts as write token, switches to read token after training'
|
| 204 |
}
|
| 205 |
|
| 206 |
with open(config_file, 'w') as f:
|
|
|
|
| 211 |
# Show environment variable commands
|
| 212 |
print("\nπ Environment Variables for HF Space:")
|
| 213 |
print("=" * 50)
|
| 214 |
+
print(f"HF_TOKEN={current_config['HF_TOKEN']}")
|
|
|
|
|
|
|
| 215 |
print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
|
| 216 |
if current_config['TRACKIO_URL'] != 'Not set':
|
| 217 |
print(f"TRACKIO_URL={current_config['TRACKIO_URL']}")
|
| 218 |
|
| 219 |
print("\nπ― Next Steps:")
|
| 220 |
+
print("1. HF_TOKEN will be automatically set during deployment (starts as write token)")
|
| 221 |
+
print("2. HF_TOKEN will be automatically switched to read token after training")
|
| 222 |
print("3. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
|
| 223 |
print("4. Deploy your updated app.py to the Space")
|
| 224 |
print("5. Run setup_hf_dataset.py if you haven't created the dataset yet")
|
scripts/trackio_tonic/deploy_trackio_space.py
CHANGED
|
@@ -196,16 +196,16 @@ class TrackioSpaceDeployer:
|
|
| 196 |
|
| 197 |
# Get the project root directory (3 levels up from this script)
|
| 198 |
project_root = Path(__file__).parent.parent.parent
|
| 199 |
-
templates_dir = project_root / "templates" / "spaces"
|
| 200 |
|
| 201 |
-
# Files to copy from templates/spaces
|
| 202 |
files_to_copy = [
|
| 203 |
"app.py",
|
| 204 |
"requirements.txt",
|
| 205 |
"README.md"
|
| 206 |
]
|
| 207 |
|
| 208 |
-
# Copy files from templates/spaces to temp directory
|
| 209 |
copied_files = []
|
| 210 |
for file_name in files_to_copy:
|
| 211 |
source_path = templates_dir / file_name
|
|
@@ -334,36 +334,16 @@ class TrackioSpaceDeployer:
|
|
| 334 |
|
| 335 |
repo_id = f"{self.username}/{self.space_name}"
|
| 336 |
|
| 337 |
-
#
|
| 338 |
-
|
| 339 |
-
hf_read_token = os.getenv('HF_READ_TOKEN', self.token)
|
| 340 |
-
hf_token = os.getenv('HF_TOKEN', self.token) # Legacy
|
| 341 |
|
| 342 |
-
# Set the
|
| 343 |
try:
|
| 344 |
-
self.api.add_space_secret(
|
| 345 |
-
repo_id=repo_id,
|
| 346 |
-
key="HF_WRITE_TOKEN",
|
| 347 |
-
value=hf_write_token,
|
| 348 |
-
description="Hugging Face write token for training operations"
|
| 349 |
-
)
|
| 350 |
-
print("β
Successfully set HF_WRITE_TOKEN secret via API")
|
| 351 |
-
|
| 352 |
-
# Set the HF_READ_TOKEN secret for the space using the API
|
| 353 |
-
self.api.add_space_secret(
|
| 354 |
-
repo_id=repo_id,
|
| 355 |
-
key="HF_READ_TOKEN",
|
| 356 |
-
value=hf_read_token,
|
| 357 |
-
description="Hugging Face read token for security"
|
| 358 |
-
)
|
| 359 |
-
print("β
Successfully set HF_READ_TOKEN secret via API")
|
| 360 |
-
|
| 361 |
-
# Set legacy HF_TOKEN secret for backward compatibility
|
| 362 |
self.api.add_space_secret(
|
| 363 |
repo_id=repo_id,
|
| 364 |
key="HF_TOKEN",
|
| 365 |
value=hf_token,
|
| 366 |
-
description="Hugging Face token for dataset access (
|
| 367 |
)
|
| 368 |
print("β
Successfully set HF_TOKEN secret via API")
|
| 369 |
|
|
@@ -401,13 +381,9 @@ class TrackioSpaceDeployer:
|
|
| 401 |
"""Fallback method for manual secret setup"""
|
| 402 |
print("π Manual Space Secrets Configuration:")
|
| 403 |
|
| 404 |
-
#
|
| 405 |
-
|
| 406 |
-
hf_read_token = os.getenv('HF_READ_TOKEN', self.token)
|
| 407 |
-
hf_token = os.getenv('HF_TOKEN', self.token) # Legacy
|
| 408 |
|
| 409 |
-
print(f" HF_WRITE_TOKEN={hf_write_token}")
|
| 410 |
-
print(f" HF_READ_TOKEN={hf_read_token}")
|
| 411 |
print(f" HF_TOKEN={hf_token}")
|
| 412 |
|
| 413 |
dataset_repo = self.dataset_repo or f"{self.username}/trackio-experiments"
|
|
@@ -415,13 +391,9 @@ class TrackioSpaceDeployer:
|
|
| 415 |
print(f" TRACKIO_URL={self.space_url}")
|
| 416 |
|
| 417 |
print("\nπ§ To set secrets in your Space:")
|
| 418 |
-
print("1. Go to your Space settings: {self.space_url}/settings")
|
| 419 |
print("2. Navigate to the 'Repository secrets' section")
|
| 420 |
print("3. Add the following secrets:")
|
| 421 |
-
print(f" Name: HF_WRITE_TOKEN")
|
| 422 |
-
print(f" Value: {hf_write_token}")
|
| 423 |
-
print(f" Name: HF_READ_TOKEN")
|
| 424 |
-
print(f" Value: {hf_read_token}")
|
| 425 |
print(f" Name: HF_TOKEN")
|
| 426 |
print(f" Value: {hf_token}")
|
| 427 |
print(f" Name: TRACKIO_DATASET_REPO")
|
|
@@ -429,6 +401,7 @@ class TrackioSpaceDeployer:
|
|
| 429 |
print(f" Name: TRACKIO_URL")
|
| 430 |
print(f" Value: {self.space_url}")
|
| 431 |
print("4. Save the secrets")
|
|
|
|
| 432 |
|
| 433 |
return True
|
| 434 |
|
|
|
|
| 196 |
|
| 197 |
# Get the project root directory (3 levels up from this script)
|
| 198 |
project_root = Path(__file__).parent.parent.parent
|
| 199 |
+
templates_dir = project_root / "templates" / "spaces" / "trackio"
|
| 200 |
|
| 201 |
+
# Files to copy from templates/spaces/trackio
|
| 202 |
files_to_copy = [
|
| 203 |
"app.py",
|
| 204 |
"requirements.txt",
|
| 205 |
"README.md"
|
| 206 |
]
|
| 207 |
|
| 208 |
+
# Copy files from templates/spaces/trackio to temp directory
|
| 209 |
copied_files = []
|
| 210 |
for file_name in files_to_copy:
|
| 211 |
source_path = templates_dir / file_name
|
|
|
|
| 334 |
|
| 335 |
repo_id = f"{self.username}/{self.space_name}"
|
| 336 |
|
| 337 |
+
# Use the provided token as HF_TOKEN (starts as write token, will be switched to read token later)
|
| 338 |
+
hf_token = self.token
|
|
|
|
|
|
|
| 339 |
|
| 340 |
+
# Set the HF_TOKEN secret for the space using the API
|
| 341 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
self.api.add_space_secret(
|
| 343 |
repo_id=repo_id,
|
| 344 |
key="HF_TOKEN",
|
| 345 |
value=hf_token,
|
| 346 |
+
description="Hugging Face token for dataset access (starts as write, switches to read)"
|
| 347 |
)
|
| 348 |
print("β
Successfully set HF_TOKEN secret via API")
|
| 349 |
|
|
|
|
| 381 |
"""Fallback method for manual secret setup"""
|
| 382 |
print("π Manual Space Secrets Configuration:")
|
| 383 |
|
| 384 |
+
# Use the provided token as HF_TOKEN
|
| 385 |
+
hf_token = self.token
|
|
|
|
|
|
|
| 386 |
|
|
|
|
|
|
|
| 387 |
print(f" HF_TOKEN={hf_token}")
|
| 388 |
|
| 389 |
dataset_repo = self.dataset_repo or f"{self.username}/trackio-experiments"
|
|
|
|
| 391 |
print(f" TRACKIO_URL={self.space_url}")
|
| 392 |
|
| 393 |
print("\nπ§ To set secrets in your Space:")
|
| 394 |
+
print(f"1. Go to your Space settings: {self.space_url}/settings")
|
| 395 |
print("2. Navigate to the 'Repository secrets' section")
|
| 396 |
print("3. Add the following secrets:")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
print(f" Name: HF_TOKEN")
|
| 398 |
print(f" Value: {hf_token}")
|
| 399 |
print(f" Name: TRACKIO_DATASET_REPO")
|
|
|
|
| 401 |
print(f" Name: TRACKIO_URL")
|
| 402 |
print(f" Value: {self.space_url}")
|
| 403 |
print("4. Save the secrets")
|
| 404 |
+
print("\nNote: HF_TOKEN starts as write token and will be switched to read token after training")
|
| 405 |
|
| 406 |
return True
|
| 407 |
|
scripts/trackio_tonic/switch_to_read_token.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Switch Trackio Space from Write Token to Read Token
|
| 4 |
|
| 5 |
This script switches the HF_TOKEN secret in a Trackio Space from a write token
|
| 6 |
to a read token after the experiment is complete, for security purposes.
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
|
@@ -61,8 +62,8 @@ def switch_space_token(space_id: str, read_token: str, write_token: str) -> bool
|
|
| 61 |
|
| 62 |
Args:
|
| 63 |
space_id (str): The space ID (username/space-name)
|
| 64 |
-
read_token (str): The read token to set
|
| 65 |
-
write_token (str): The write token (for
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
bool: True if successful, False otherwise
|
|
@@ -93,23 +94,24 @@ def switch_space_token(space_id: str, read_token: str, write_token: str) -> bool
|
|
| 93 |
# Use the write token to update the space (since we need write access)
|
| 94 |
api = HfApi(token=write_token)
|
| 95 |
|
| 96 |
-
# Update the HF_TOKEN secret in the space
|
| 97 |
try:
|
| 98 |
api.add_space_secret(
|
| 99 |
repo_id=space_id,
|
| 100 |
key="HF_TOKEN",
|
| 101 |
value=read_token,
|
| 102 |
-
description="Hugging Face
|
| 103 |
)
|
| 104 |
-
print(f"β
Successfully switched HF_TOKEN to read token in space: {space_id}")
|
|
|
|
| 105 |
return True
|
| 106 |
|
| 107 |
except Exception as e:
|
| 108 |
-
print(f"β Failed to update
|
| 109 |
return False
|
| 110 |
|
| 111 |
except Exception as e:
|
| 112 |
-
print(f"β Error switching
|
| 113 |
return False
|
| 114 |
|
| 115 |
def main():
|
|
@@ -137,12 +139,13 @@ def main():
|
|
| 137 |
success = switch_space_token(space_id, read_token, write_token)
|
| 138 |
|
| 139 |
if success:
|
| 140 |
-
print("\nβ
|
| 141 |
print(f"π Space: {space_id}")
|
| 142 |
-
print("π HF_TOKEN now uses read-only permissions")
|
| 143 |
print("π‘ The space can still read datasets but cannot write to repositories")
|
|
|
|
| 144 |
else:
|
| 145 |
-
print("\nβ
|
| 146 |
print("Please check your tokens and try again.")
|
| 147 |
sys.exit(1)
|
| 148 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Switch Trackio Space HF_TOKEN from Write Token to Read Token
|
| 4 |
|
| 5 |
This script switches the HF_TOKEN secret in a Trackio Space from a write token
|
| 6 |
to a read token after the experiment is complete, for security purposes.
|
| 7 |
+
The space uses only HF_TOKEN, which starts as write token and gets switched to read token.
|
| 8 |
"""
|
| 9 |
|
| 10 |
import os
|
|
|
|
| 62 |
|
| 63 |
Args:
|
| 64 |
space_id (str): The space ID (username/space-name)
|
| 65 |
+
read_token (str): The read token to set as new HF_TOKEN
|
| 66 |
+
write_token (str): The write token (for authentication to update the space)
|
| 67 |
|
| 68 |
Returns:
|
| 69 |
bool: True if successful, False otherwise
|
|
|
|
| 94 |
# Use the write token to update the space (since we need write access)
|
| 95 |
api = HfApi(token=write_token)
|
| 96 |
|
| 97 |
+
# Update the HF_TOKEN secret in the space from write token to read token
|
| 98 |
try:
|
| 99 |
api.add_space_secret(
|
| 100 |
repo_id=space_id,
|
| 101 |
key="HF_TOKEN",
|
| 102 |
value=read_token,
|
| 103 |
+
description="Hugging Face token for dataset access (switched from write to read for security)"
|
| 104 |
)
|
| 105 |
+
print(f"β
Successfully switched HF_TOKEN from write to read token in space: {space_id}")
|
| 106 |
+
print(f"π Space now uses read-only permissions for enhanced security")
|
| 107 |
return True
|
| 108 |
|
| 109 |
except Exception as e:
|
| 110 |
+
print(f"β Failed to update HF_TOKEN secret: {e}")
|
| 111 |
return False
|
| 112 |
|
| 113 |
except Exception as e:
|
| 114 |
+
print(f"β Error switching HF_TOKEN: {e}")
|
| 115 |
return False
|
| 116 |
|
| 117 |
def main():
|
|
|
|
| 139 |
success = switch_space_token(space_id, read_token, write_token)
|
| 140 |
|
| 141 |
if success:
|
| 142 |
+
print("\nβ
HF_TOKEN switch completed successfully!")
|
| 143 |
print(f"π Space: {space_id}")
|
| 144 |
+
print("π HF_TOKEN now uses read-only permissions for enhanced security")
|
| 145 |
print("π‘ The space can still read datasets but cannot write to repositories")
|
| 146 |
+
print("π― Training is complete - space is now secure for monitoring")
|
| 147 |
else:
|
| 148 |
+
print("\nβ HF_TOKEN switch failed!")
|
| 149 |
print("Please check your tokens and try again.")
|
| 150 |
sys.exit(1)
|
| 151 |
|
src/dataset_utils.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Dataset utilities for Trackio experiment data management
|
| 4 |
+
Provides functions for safe dataset operations with data preservation
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from typing import Dict, Any, List, Optional, Union
|
| 11 |
+
from datasets import Dataset, load_dataset
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
class TrackioDatasetManager:
|
| 16 |
+
"""
|
| 17 |
+
Manager class for Trackio experiment datasets with data preservation.
|
| 18 |
+
|
| 19 |
+
This class ensures that existing experiment data is always preserved
|
| 20 |
+
when adding new experiments or updating existing ones.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
def __init__(self, dataset_repo: str, hf_token: str):
|
| 24 |
+
"""
|
| 25 |
+
Initialize the dataset manager.
|
| 26 |
+
|
| 27 |
+
Args:
|
| 28 |
+
dataset_repo (str): HF dataset repository ID (e.g., "username/dataset-name")
|
| 29 |
+
hf_token (str): Hugging Face token for authentication
|
| 30 |
+
"""
|
| 31 |
+
self.dataset_repo = dataset_repo
|
| 32 |
+
self.hf_token = hf_token
|
| 33 |
+
self._validate_repo_format()
|
| 34 |
+
|
| 35 |
+
def _validate_repo_format(self):
|
| 36 |
+
"""Validate dataset repository format"""
|
| 37 |
+
if not self.dataset_repo or '/' not in self.dataset_repo:
|
| 38 |
+
raise ValueError(f"Invalid dataset repository format: {self.dataset_repo}")
|
| 39 |
+
|
| 40 |
+
def check_dataset_exists(self) -> bool:
|
| 41 |
+
"""
|
| 42 |
+
Check if the dataset repository exists and is accessible.
|
| 43 |
+
|
| 44 |
+
Returns:
|
| 45 |
+
bool: True if dataset exists and is accessible, False otherwise
|
| 46 |
+
"""
|
| 47 |
+
try:
|
| 48 |
+
load_dataset(self.dataset_repo, token=self.hf_token)
|
| 49 |
+
logger.info(f"β
Dataset {self.dataset_repo} exists and is accessible")
|
| 50 |
+
return True
|
| 51 |
+
except Exception as e:
|
| 52 |
+
logger.info(f"π Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
|
| 53 |
+
return False
|
| 54 |
+
|
| 55 |
+
def load_existing_experiments(self) -> List[Dict[str, Any]]:
|
| 56 |
+
"""
|
| 57 |
+
Load all existing experiments from the dataset.
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
List[Dict[str, Any]]: List of existing experiment dictionaries
|
| 61 |
+
"""
|
| 62 |
+
try:
|
| 63 |
+
if not self.check_dataset_exists():
|
| 64 |
+
logger.info("π No existing dataset found, returning empty list")
|
| 65 |
+
return []
|
| 66 |
+
|
| 67 |
+
dataset = load_dataset(self.dataset_repo, token=self.hf_token)
|
| 68 |
+
|
| 69 |
+
if 'train' not in dataset:
|
| 70 |
+
logger.info("π No 'train' split found in dataset")
|
| 71 |
+
return []
|
| 72 |
+
|
| 73 |
+
experiments = list(dataset['train'])
|
| 74 |
+
logger.info(f"π Loaded {len(experiments)} existing experiments")
|
| 75 |
+
|
| 76 |
+
# Validate experiment structure
|
| 77 |
+
valid_experiments = []
|
| 78 |
+
for exp in experiments:
|
| 79 |
+
if self._validate_experiment_structure(exp):
|
| 80 |
+
valid_experiments.append(exp)
|
| 81 |
+
else:
|
| 82 |
+
logger.warning(f"β οΈ Skipping invalid experiment: {exp.get('experiment_id', 'unknown')}")
|
| 83 |
+
|
| 84 |
+
logger.info(f"π {len(valid_experiments)} valid experiments loaded")
|
| 85 |
+
return valid_experiments
|
| 86 |
+
|
| 87 |
+
except Exception as e:
|
| 88 |
+
logger.error(f"β Failed to load existing experiments: {e}")
|
| 89 |
+
return []
|
| 90 |
+
|
| 91 |
+
def _validate_experiment_structure(self, experiment: Dict[str, Any]) -> bool:
|
| 92 |
+
"""
|
| 93 |
+
Validate that an experiment has the required structure.
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
experiment (Dict[str, Any]): Experiment dictionary to validate
|
| 97 |
+
|
| 98 |
+
Returns:
|
| 99 |
+
bool: True if experiment structure is valid
|
| 100 |
+
"""
|
| 101 |
+
required_fields = [
|
| 102 |
+
'experiment_id', 'name', 'description', 'created_at',
|
| 103 |
+
'status', 'metrics', 'parameters', 'artifacts', 'logs'
|
| 104 |
+
]
|
| 105 |
+
|
| 106 |
+
for field in required_fields:
|
| 107 |
+
if field not in experiment:
|
| 108 |
+
logger.warning(f"β οΈ Missing required field '{field}' in experiment")
|
| 109 |
+
return False
|
| 110 |
+
|
| 111 |
+
# Validate JSON fields
|
| 112 |
+
json_fields = ['metrics', 'parameters', 'artifacts', 'logs']
|
| 113 |
+
for field in json_fields:
|
| 114 |
+
if isinstance(experiment[field], str):
|
| 115 |
+
try:
|
| 116 |
+
json.loads(experiment[field])
|
| 117 |
+
except json.JSONDecodeError:
|
| 118 |
+
logger.warning(f"β οΈ Invalid JSON in field '{field}' for experiment {experiment.get('experiment_id')}")
|
| 119 |
+
return False
|
| 120 |
+
|
| 121 |
+
return True
|
| 122 |
+
|
| 123 |
+
def save_experiments(self, experiments: List[Dict[str, Any]], commit_message: Optional[str] = None) -> bool:
|
| 124 |
+
"""
|
| 125 |
+
Save a list of experiments to the dataset, preserving data integrity.
|
| 126 |
+
|
| 127 |
+
Args:
|
| 128 |
+
experiments (List[Dict[str, Any]]): List of experiment dictionaries
|
| 129 |
+
commit_message (Optional[str]): Custom commit message
|
| 130 |
+
|
| 131 |
+
Returns:
|
| 132 |
+
bool: True if save was successful, False otherwise
|
| 133 |
+
"""
|
| 134 |
+
try:
|
| 135 |
+
if not experiments:
|
| 136 |
+
logger.warning("β οΈ No experiments to save")
|
| 137 |
+
return False
|
| 138 |
+
|
| 139 |
+
# Validate all experiments before saving
|
| 140 |
+
valid_experiments = []
|
| 141 |
+
for exp in experiments:
|
| 142 |
+
if self._validate_experiment_structure(exp):
|
| 143 |
+
# Ensure last_updated is set
|
| 144 |
+
if 'last_updated' not in exp:
|
| 145 |
+
exp['last_updated'] = datetime.now().isoformat()
|
| 146 |
+
valid_experiments.append(exp)
|
| 147 |
+
else:
|
| 148 |
+
logger.error(f"β Invalid experiment structure: {exp.get('experiment_id', 'unknown')}")
|
| 149 |
+
return False
|
| 150 |
+
|
| 151 |
+
# Create dataset
|
| 152 |
+
dataset = Dataset.from_list(valid_experiments)
|
| 153 |
+
|
| 154 |
+
# Generate commit message if not provided
|
| 155 |
+
if not commit_message:
|
| 156 |
+
commit_message = f"Update dataset with {len(valid_experiments)} experiments ({datetime.now().isoformat()})"
|
| 157 |
+
|
| 158 |
+
# Push to hub
|
| 159 |
+
dataset.push_to_hub(
|
| 160 |
+
self.dataset_repo,
|
| 161 |
+
token=self.hf_token,
|
| 162 |
+
private=True,
|
| 163 |
+
commit_message=commit_message
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
logger.info(f"β
Successfully saved {len(valid_experiments)} experiments to {self.dataset_repo}")
|
| 167 |
+
return True
|
| 168 |
+
|
| 169 |
+
except Exception as e:
|
| 170 |
+
logger.error(f"β Failed to save experiments to dataset: {e}")
|
| 171 |
+
return False
|
| 172 |
+
|
| 173 |
+
def upsert_experiment(self, experiment: Dict[str, Any]) -> bool:
    """
    Insert a new experiment or replace an existing one, leaving every other
    record untouched.

    Args:
        experiment (Dict[str, Any]): Experiment dictionary to upsert.

    Returns:
        bool: True if the dataset was updated successfully, False otherwise.
    """
    try:
        if not self._validate_experiment_structure(experiment):
            logger.error(f"β Invalid experiment structure for {experiment.get('experiment_id', 'unknown')}")
            return False

        exp_id = experiment['experiment_id']
        existing = self.load_existing_experiments()

        already_present = any(rec.get('experiment_id') == exp_id for rec in existing)
        experiment['last_updated'] = datetime.now().isoformat()

        if already_present:
            logger.info(f"π Updating existing experiment: {exp_id}")
            # Swap the matching record in place; all other rows are preserved as-is.
            merged = [experiment if rec.get('experiment_id') == exp_id else rec for rec in existing]
        else:
            logger.info(f"β Adding new experiment: {exp_id}")
            merged = existing + [experiment]

        verb = 'Update' if already_present else 'Add'
        message = f"{verb} experiment {exp_id} (preserving {len(existing)} existing experiments)"
        return self.save_experiments(merged, message)

    except Exception as e:
        logger.error(f"β Failed to upsert experiment: {e}")
        return False
|
| 222 |
+
|
| 223 |
+
def get_experiment_by_id(self, experiment_id: str) -> Optional[Dict[str, Any]]:
    """
    Look up a single experiment by its identifier.

    Args:
        experiment_id (str): The experiment ID to search for.

    Returns:
        Optional[Dict[str, Any]]: The matching experiment, or None when absent.
    """
    try:
        # First match wins; the generator stops scanning as soon as it is found.
        found = next(
            (rec for rec in self.load_existing_experiments()
             if rec.get('experiment_id') == experiment_id),
            None,
        )

        if found is not None:
            logger.info(f"β Found experiment: {experiment_id}")
            return found

        logger.info(f"π Experiment not found: {experiment_id}")
        return None

    except Exception as e:
        logger.error(f"β Failed to get experiment {experiment_id}: {e}")
        return None
|
| 247 |
+
|
| 248 |
+
def list_experiments(self, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    List all experiments, optionally narrowed to a single status.

    Args:
        status_filter (Optional[str]): Status to match (running, completed,
            failed, paused). A falsy filter returns everything.

    Returns:
        List[Dict[str, Any]]: Experiments matching the filter (empty on error).
    """
    try:
        all_experiments = self.load_existing_experiments()

        # Truthiness matters here: an empty-string filter behaves like "no filter".
        if not status_filter:
            logger.info(f"π Found {len(all_experiments)} total experiments")
            return all_experiments

        matching = [rec for rec in all_experiments if rec.get('status') == status_filter]
        logger.info(f"π Found {len(matching)} experiments with status '{status_filter}'")
        return matching

    except Exception as e:
        logger.error(f"β Failed to list experiments: {e}")
        return []
|
| 272 |
+
|
| 273 |
+
def backup_dataset(self, backup_suffix: Optional[str] = None) -> str:
    """
    Snapshot the current dataset into a sibling backup repository.

    Args:
        backup_suffix (Optional[str]): Suffix for the backup repo name;
            defaults to a timestamp.

    Returns:
        str: The backup repository name on success, otherwise an empty string.
    """
    try:
        experiments = self.load_existing_experiments()
        if not experiments:
            logger.warning("β οΈ No experiments to backup")
            return ""

        suffix = backup_suffix or datetime.now().strftime('%Y%m%d_%H%M%S')
        backup_repo = f"{self.dataset_repo}-backup-{suffix}"

        # Route the backup through another manager instance so it takes the
        # same validation path as a normal save.
        mirror = TrackioDatasetManager(backup_repo, self.hf_token)
        saved = mirror.save_experiments(
            experiments,
            f"Backup of {self.dataset_repo} created on {datetime.now().isoformat()}",
        )

        if not saved:
            logger.error("β Failed to create backup")
            return ""

        logger.info(f"β Backup created: {backup_repo}")
        return backup_repo

    except Exception as e:
        logger.error(f"β Failed to create backup: {e}")
        return ""
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def create_dataset_manager(dataset_repo: str, hf_token: str) -> TrackioDatasetManager:
    """
    Build a ``TrackioDatasetManager`` bound to the given repository and token.

    Args:
        dataset_repo (str): HF dataset repository ID (e.g. ``user/dataset``).
        hf_token (str): Hugging Face token used for dataset access.

    Returns:
        TrackioDatasetManager: Ready-to-use dataset manager instance.
    """
    manager = TrackioDatasetManager(dataset_repo, hf_token)
    return manager
|
src/monitoring.py
CHANGED
|
@@ -16,6 +16,7 @@ try:
|
|
| 16 |
from scripts.trackio_tonic.trackio_api_client import TrackioAPIClient
|
| 17 |
TRACKIO_AVAILABLE = True
|
| 18 |
except ImportError:
|
|
|
|
| 19 |
TRACKIO_AVAILABLE = False
|
| 20 |
print("Warning: Trackio API client not available. Install with: pip install requests")
|
| 21 |
|
|
@@ -87,20 +88,33 @@ class SmolLM3Monitor:
|
|
| 87 |
try:
|
| 88 |
from datasets import Dataset
|
| 89 |
from huggingface_hub import HfApi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
self.hf_dataset_client = {
|
| 92 |
'Dataset': Dataset,
|
| 93 |
'HfApi': HfApi,
|
| 94 |
'api': HfApi(token=self.hf_token)
|
| 95 |
}
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
except ImportError:
|
| 99 |
logger.warning("β οΈ datasets or huggingface-hub not available. Install with: pip install datasets huggingface-hub")
|
| 100 |
self.hf_dataset_client = None
|
|
|
|
| 101 |
except Exception as e:
|
| 102 |
logger.error("Failed to initialize HF Datasets client: %s", e)
|
| 103 |
self.hf_dataset_client = None
|
|
|
|
| 104 |
|
| 105 |
def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
|
| 106 |
"""Setup Trackio API client"""
|
|
@@ -184,55 +198,38 @@ class SmolLM3Monitor:
|
|
| 184 |
self.experiment_id = f"exp_{timestamp}"
|
| 185 |
|
| 186 |
def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
|
| 187 |
-
"""Save experiment data to HF Dataset"""
|
| 188 |
-
if not self.
|
| 189 |
-
logger.warning("β οΈ
|
| 190 |
return False
|
| 191 |
|
| 192 |
try:
|
| 193 |
-
#
|
| 194 |
-
|
| 195 |
-
logger.error("β Dataset repository is empty")
|
| 196 |
-
return False
|
| 197 |
-
|
| 198 |
-
# Validate dataset repository format
|
| 199 |
-
if '/' not in self.dataset_repo:
|
| 200 |
-
logger.error(f"β Invalid dataset repository format: {self.dataset_repo}")
|
| 201 |
-
return False
|
| 202 |
-
|
| 203 |
-
Dataset = self.hf_dataset_client['Dataset']
|
| 204 |
-
api = self.hf_dataset_client['api']
|
| 205 |
-
|
| 206 |
-
# Create dataset from experiment data with correct structure
|
| 207 |
-
# Match the structure used in setup_hf_dataset.py
|
| 208 |
-
dataset_data = [{
|
| 209 |
'experiment_id': self.experiment_id or f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
| 210 |
'name': self.experiment_name,
|
| 211 |
'description': "SmolLM3 fine-tuning experiment",
|
| 212 |
'created_at': self.start_time.isoformat(),
|
| 213 |
'status': 'running',
|
| 214 |
-
'metrics': json.dumps(self.metrics_history),
|
| 215 |
-
'parameters': json.dumps(experiment_data),
|
| 216 |
-
'artifacts': json.dumps(self.artifacts),
|
| 217 |
-
'logs': json.dumps([]),
|
| 218 |
'last_updated': datetime.now().isoformat()
|
| 219 |
-
}
|
| 220 |
-
|
| 221 |
-
# Create dataset from the experiment data
|
| 222 |
-
dataset = Dataset.from_list(dataset_data)
|
| 223 |
-
|
| 224 |
-
# Push to hub
|
| 225 |
-
dataset.push_to_hub(
|
| 226 |
-
self.dataset_repo,
|
| 227 |
-
token=self.hf_token,
|
| 228 |
-
private=True
|
| 229 |
-
)
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
except Exception as e:
|
| 235 |
-
logger.error(f"Failed to save to HF Dataset: {e}")
|
| 236 |
return False
|
| 237 |
|
| 238 |
def log_configuration(self, config: Dict[str, Any]):
|
|
@@ -556,25 +553,50 @@ class SmolLM3Monitor:
|
|
| 556 |
return "{}?tab=view_experiments".format(self.trackio_client.space_url)
|
| 557 |
return None
|
| 558 |
|
| 559 |
-
def close(self):
|
| 560 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
if self.enable_tracking and self.trackio_client:
|
| 562 |
try:
|
| 563 |
-
# Mark experiment as completed
|
| 564 |
result = self.trackio_client.update_experiment_status(
|
| 565 |
experiment_id=self.experiment_id,
|
| 566 |
-
status=
|
| 567 |
)
|
| 568 |
if "success" in result:
|
| 569 |
-
logger.info("
|
| 570 |
else:
|
| 571 |
-
logger.error("Failed to close monitoring session: %s", result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
except Exception as e:
|
| 573 |
-
logger.error("Failed to
|
| 574 |
|
| 575 |
-
|
| 576 |
-
if self.hf_dataset_client:
|
| 577 |
-
self._save_to_hf_dataset({'status': 'completed'})
|
| 578 |
|
| 579 |
# Utility function to create monitor from config
|
| 580 |
def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
|
|
|
|
| 16 |
from scripts.trackio_tonic.trackio_api_client import TrackioAPIClient
|
| 17 |
TRACKIO_AVAILABLE = True
|
| 18 |
except ImportError:
|
| 19 |
+
TrackioAPIClient = None
|
| 20 |
TRACKIO_AVAILABLE = False
|
| 21 |
print("Warning: Trackio API client not available. Install with: pip install requests")
|
| 22 |
|
|
|
|
| 88 |
try:
|
| 89 |
from datasets import Dataset
|
| 90 |
from huggingface_hub import HfApi
|
| 91 |
+
try:
|
| 92 |
+
from .dataset_utils import create_dataset_manager
|
| 93 |
+
except ImportError:
|
| 94 |
+
# Try importing from same directory
|
| 95 |
+
import sys
|
| 96 |
+
import os
|
| 97 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 98 |
+
from dataset_utils import create_dataset_manager
|
| 99 |
|
| 100 |
self.hf_dataset_client = {
|
| 101 |
'Dataset': Dataset,
|
| 102 |
'HfApi': HfApi,
|
| 103 |
'api': HfApi(token=self.hf_token)
|
| 104 |
}
|
| 105 |
+
|
| 106 |
+
# Initialize dataset manager for safe operations
|
| 107 |
+
self.dataset_manager = create_dataset_manager(self.dataset_repo, self.hf_token)
|
| 108 |
+
logger.info("β
HF Datasets client and manager initialized for %s", self.dataset_repo)
|
| 109 |
|
| 110 |
except ImportError:
|
| 111 |
logger.warning("β οΈ datasets or huggingface-hub not available. Install with: pip install datasets huggingface-hub")
|
| 112 |
self.hf_dataset_client = None
|
| 113 |
+
self.dataset_manager = None
|
| 114 |
except Exception as e:
|
| 115 |
logger.error("Failed to initialize HF Datasets client: %s", e)
|
| 116 |
self.hf_dataset_client = None
|
| 117 |
+
self.dataset_manager = None
|
| 118 |
|
| 119 |
def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
|
| 120 |
"""Setup Trackio API client"""
|
|
|
|
| 198 |
self.experiment_id = f"exp_{timestamp}"
|
| 199 |
|
| 200 |
def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
|
| 201 |
+
"""Save experiment data to HF Dataset with data preservation using dataset manager"""
|
| 202 |
+
if not self.dataset_manager:
|
| 203 |
+
logger.warning("β οΈ Dataset manager not available")
|
| 204 |
return False
|
| 205 |
|
| 206 |
try:
|
| 207 |
+
# Prepare current experiment data with standardized structure
|
| 208 |
+
current_experiment = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
'experiment_id': self.experiment_id or f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
| 210 |
'name': self.experiment_name,
|
| 211 |
'description': "SmolLM3 fine-tuning experiment",
|
| 212 |
'created_at': self.start_time.isoformat(),
|
| 213 |
'status': 'running',
|
| 214 |
+
'metrics': json.dumps(self.metrics_history, default=str),
|
| 215 |
+
'parameters': json.dumps(experiment_data, default=str),
|
| 216 |
+
'artifacts': json.dumps(self.artifacts, default=str),
|
| 217 |
+
'logs': json.dumps([], default=str),
|
| 218 |
'last_updated': datetime.now().isoformat()
|
| 219 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
+
# Use dataset manager to safely upsert the experiment
|
| 222 |
+
success = self.dataset_manager.upsert_experiment(current_experiment)
|
| 223 |
|
| 224 |
+
if success:
|
| 225 |
+
logger.info(f"β
Experiment data saved to HF Dataset: {self.dataset_repo}")
|
| 226 |
+
return True
|
| 227 |
+
else:
|
| 228 |
+
logger.error(f"β Failed to save experiment data to HF Dataset")
|
| 229 |
+
return False
|
| 230 |
+
|
| 231 |
except Exception as e:
|
| 232 |
+
logger.error(f"β Failed to save to HF Dataset: {e}")
|
| 233 |
return False
|
| 234 |
|
| 235 |
def log_configuration(self, config: Dict[str, Any]):
|
|
|
|
| 553 |
return "{}?tab=view_experiments".format(self.trackio_client.space_url)
|
| 554 |
return None
|
| 555 |
|
| 556 |
+
def close(self, final_status: str = "completed"):
|
| 557 |
+
"""
|
| 558 |
+
Close the monitoring session with final status update
|
| 559 |
+
|
| 560 |
+
Args:
|
| 561 |
+
final_status (str): Final status for the experiment (completed, failed, etc.)
|
| 562 |
+
"""
|
| 563 |
+
logger.info(f"π Closing monitoring session with status: {final_status}")
|
| 564 |
+
|
| 565 |
if self.enable_tracking and self.trackio_client:
|
| 566 |
try:
|
| 567 |
+
# Mark experiment as completed in Trackio
|
| 568 |
result = self.trackio_client.update_experiment_status(
|
| 569 |
experiment_id=self.experiment_id,
|
| 570 |
+
status=final_status
|
| 571 |
)
|
| 572 |
if "success" in result:
|
| 573 |
+
logger.info("β
Trackio monitoring session closed")
|
| 574 |
else:
|
| 575 |
+
logger.error("β Failed to close Trackio monitoring session: %s", result)
|
| 576 |
+
except Exception as e:
|
| 577 |
+
logger.error("β Failed to close Trackio monitoring session: %s", e)
|
| 578 |
+
|
| 579 |
+
# Final save to HF Dataset with proper status update
|
| 580 |
+
if self.dataset_manager:
|
| 581 |
+
try:
|
| 582 |
+
# Update experiment with final status
|
| 583 |
+
final_experiment_data = {
|
| 584 |
+
'status': final_status,
|
| 585 |
+
'experiment_end_time': datetime.now().isoformat(),
|
| 586 |
+
'final_metrics_count': len(self.metrics_history),
|
| 587 |
+
'total_artifacts': len(self.artifacts)
|
| 588 |
+
}
|
| 589 |
+
|
| 590 |
+
success = self._save_to_hf_dataset(final_experiment_data)
|
| 591 |
+
if success:
|
| 592 |
+
logger.info("β
Final experiment data saved to HF Dataset")
|
| 593 |
+
else:
|
| 594 |
+
logger.error("β Failed to save final experiment data")
|
| 595 |
+
|
| 596 |
except Exception as e:
|
| 597 |
+
logger.error(f"β Failed to save final experiment data: {e}")
|
| 598 |
|
| 599 |
+
logger.info(f"π― Monitoring session closed for experiment: {self.experiment_id}")
|
|
|
|
|
|
|
| 600 |
|
| 601 |
# Utility function to create monitor from config
|
| 602 |
def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
|
templates/spaces/demo_gpt/README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: GPT-OSS-20B Multilingual Reasoner Demo
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.40.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: true
|
| 10 |
+
short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
This demo showcases the GPT-OSS-20B model fine-tuned with LoRA for enhanced multilingual reasoning capabilities. The model is based on OpenAI's GPT-OSS-20B base model with a LoRA adapter from Tonic.
|
| 14 |
+
|
| 15 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
templates/spaces/demo_gpt/app.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
|
| 2 |
+
import torch
|
| 3 |
+
from threading import Thread
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import spaces
|
| 6 |
+
import re
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
from peft import PeftModel
|
| 10 |
+
|
| 11 |
+
# ----------------------------------------------------------------------
|
| 12 |
+
# Environment Variables Configuration
|
| 13 |
+
# ----------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
# Get model configuration from environment variables
|
| 16 |
+
BASE_MODEL_ID = os.getenv('BASE_MODEL_ID', 'openai/gpt-oss-20b')
|
| 17 |
+
LORA_MODEL_ID = os.getenv('LORA_MODEL_ID', os.getenv('HF_MODEL_ID', 'Tonic/gpt-oss-20b-multilingual-reasoner'))
|
| 18 |
+
MODEL_NAME = os.getenv('MODEL_NAME', 'GPT-OSS Multilingual Reasoner')
|
| 19 |
+
MODEL_SUBFOLDER = os.getenv('MODEL_SUBFOLDER', '')
|
| 20 |
+
|
| 21 |
+
# If the LORA_MODEL_ID is the same as BASE_MODEL_ID, this is a merged model, not LoRA
|
| 22 |
+
USE_LORA = LORA_MODEL_ID != BASE_MODEL_ID and not LORA_MODEL_ID.startswith(BASE_MODEL_ID)
|
| 23 |
+
|
| 24 |
+
print(f"π§ Configuration:")
|
| 25 |
+
print(f" Base Model: {BASE_MODEL_ID}")
|
| 26 |
+
print(f" Model ID: {LORA_MODEL_ID}")
|
| 27 |
+
print(f" Model Name: {MODEL_NAME}")
|
| 28 |
+
print(f" Model Subfolder: {MODEL_SUBFOLDER}")
|
| 29 |
+
print(f" Use LoRA: {USE_LORA}")
|
| 30 |
+
|
| 31 |
+
# ----------------------------------------------------------------------
|
| 32 |
+
# KaTeX delimiter config for Gradio
|
| 33 |
+
# ----------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
LATEX_DELIMS = [
|
| 36 |
+
{"left": "$$", "right": "$$", "display": True},
|
| 37 |
+
{"left": "$", "right": "$", "display": False},
|
| 38 |
+
{"left": "\\[", "right": "\\]", "display": True},
|
| 39 |
+
{"left": "\\(", "right": "\\)", "display": False},
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
# Configure logging
|
| 43 |
+
logging.basicConfig(level=logging.INFO)
|
| 44 |
+
|
| 45 |
+
# Load the model
|
| 46 |
+
try:
|
| 47 |
+
if USE_LORA:
|
| 48 |
+
# Load base model and LoRA adapter separately
|
| 49 |
+
print(f"π Loading base model: {BASE_MODEL_ID}")
|
| 50 |
+
base_model = AutoModelForCausalLM.from_pretrained(
|
| 51 |
+
BASE_MODEL_ID,
|
| 52 |
+
torch_dtype="auto",
|
| 53 |
+
device_map="auto",
|
| 54 |
+
attn_implementation="kernels-community/vllm-flash-attn3"
|
| 55 |
+
)
|
| 56 |
+
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
|
| 57 |
+
|
| 58 |
+
# Load the LoRA adapter
|
| 59 |
+
try:
|
| 60 |
+
print(f"π Loading LoRA adapter: {LORA_MODEL_ID}")
|
| 61 |
+
if MODEL_SUBFOLDER and MODEL_SUBFOLDER.strip():
|
| 62 |
+
model = PeftModel.from_pretrained(base_model, LORA_MODEL_ID, subfolder=MODEL_SUBFOLDER)
|
| 63 |
+
else:
|
| 64 |
+
model = PeftModel.from_pretrained(base_model, LORA_MODEL_ID)
|
| 65 |
+
print("β
LoRA model loaded successfully!")
|
| 66 |
+
except Exception as lora_error:
|
| 67 |
+
print(f"β οΈ LoRA adapter failed to load: {lora_error}")
|
| 68 |
+
print("π Falling back to base model...")
|
| 69 |
+
model = base_model
|
| 70 |
+
else:
|
| 71 |
+
# Load merged/fine-tuned model directly
|
| 72 |
+
print(f"π Loading merged model: {LORA_MODEL_ID}")
|
| 73 |
+
model_kwargs = {
|
| 74 |
+
"torch_dtype": "auto",
|
| 75 |
+
"device_map": "auto",
|
| 76 |
+
"attn_implementation": "kernels-community/vllm-flash-attn3"
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
if MODEL_SUBFOLDER and MODEL_SUBFOLDER.strip():
|
| 80 |
+
model = AutoModelForCausalLM.from_pretrained(LORA_MODEL_ID, subfolder=MODEL_SUBFOLDER, **model_kwargs)
|
| 81 |
+
tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_ID, subfolder=MODEL_SUBFOLDER)
|
| 82 |
+
else:
|
| 83 |
+
model = AutoModelForCausalLM.from_pretrained(LORA_MODEL_ID, **model_kwargs)
|
| 84 |
+
tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_ID)
|
| 85 |
+
print("β
Merged model loaded successfully!")
|
| 86 |
+
|
| 87 |
+
except Exception as e:
|
| 88 |
+
print(f"β Error loading model: {e}")
|
| 89 |
+
raise e
|
| 90 |
+
|
| 91 |
+
def format_conversation_history(chat_history):
    """Normalize Gradio chat history into plain ``{"role", "content"}`` dicts.

    Multimodal content arrives as a list of parts; the first part's text is
    used when present, otherwise the list is stringified.
    """
    def _flatten(value):
        if not isinstance(value, list):
            return value
        if value and "text" in value[0]:
            return value[0]["text"]
        return str(value)

    return [
        {"role": item["role"], "content": _flatten(item["content"])}
        for item in chat_history
    ]
|
| 100 |
+
|
| 101 |
+
def format_analysis_response(text):
    """Split a raw ``analysis ... assistantfinal ...`` completion into a
    formatted reasoning block plus final answer, balancing LaTeX ``$`` signs.

    Falls back to returning the cleaned text when no final marker is found.
    """
    match = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL | re.IGNORECASE)

    if not match:
        # No recognizable structure: strip a leading "analysis" tag and return.
        fallback = re.sub(r'^analysis\s*', '', text, flags=re.IGNORECASE).strip()
        return fallback + "$" if fallback.count("$") % 2 else fallback

    reasoning = match.group(1).strip()
    reasoning = re.sub(r'^analysis\s*', '', reasoning, flags=re.IGNORECASE).strip()
    answer = text.split("assistantfinal", 1)[-1].strip()

    pieces = (
        "**π€ Analysis & Reasoning:**\n\n",
        f"*{reasoning}*\n\n",
        "---\n\n",
        f"**π¬ Final Response:**\n\n{answer}",
    )
    formatted = "".join(pieces)

    # Close a dangling inline-math delimiter so the renderer stays sane.
    if formatted.count("$") % 2:
        formatted += "$"
    return formatted
|
| 131 |
+
|
| 132 |
+
@spaces.GPU(duration=60)
def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
    """
    Stream a formatted chat completion from the loaded model.

    Args:
        input_data: Raw user prompt text.
        chat_history: Gradio-style message history (list of role/content dicts).
        max_new_tokens: Generation budget for new tokens.
        system_prompt: Optional system message prepended to the conversation.
        temperature, top_p, top_k, repetition_penalty: Sampling parameters.

    Yields:
        str: Progressively formatted response text (reasoning + final answer).
    """
    if not input_data.strip():
        yield "Please enter a prompt."
        return

    # Log the request
    logging.info(f"[User] {input_data}")
    logging.info(f"[System] {system_prompt} | Temp={temperature} | Max tokens={max_new_tokens}")

    new_message = {"role": "user", "content": input_data}
    system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
    processed_history = format_conversation_history(chat_history)
    messages = system_message + processed_history + [new_message]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Streamer decouples token production (background thread) from yielding here.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
        "use_cache": True
    }

    # Tokenize input using the chat template
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs={**inputs, **generation_kwargs})
    thread.start()

    # Stream the response with enhanced formatting
    collected_text = ""
    buffer = ""
    yielded_once = False

    try:
        for chunk in streamer:
            if not chunk:
                continue

            collected_text += chunk
            buffer += chunk

            # Initial yield to show immediate response
            if not yielded_once:
                yield chunk
                buffer = ""
                yielded_once = True
                continue

            # Yield accumulated text periodically for smooth streaming
            if "\n" in buffer or len(buffer) > 150:
                yield format_analysis_response(collected_text)
                buffer = ""

        # Final formatting with complete text
        yield format_analysis_response(collected_text)

    except Exception as e:
        logging.exception("Generation streaming failed")
        yield f"β Error during generation: {e}"
    finally:
        # Bug fix: always reap the generation thread. Previously it was never
        # joined, so a streaming error or an early client disconnect left
        # model.generate() running past the @spaces.GPU allocation window.
        thread.join(timeout=60)
|
| 209 |
+
|
| 210 |
+
demo = gr.ChatInterface(
|
| 211 |
+
fn=generate_response,
|
| 212 |
+
additional_inputs=[
|
| 213 |
+
gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
|
| 214 |
+
gr.Textbox(
|
| 215 |
+
label="System Prompt",
|
| 216 |
+
value="You are a helpful assistant. Reasoning: medium",
|
| 217 |
+
lines=4,
|
| 218 |
+
placeholder="Change system prompt"
|
| 219 |
+
),
|
| 220 |
+
gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
|
| 221 |
+
gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
|
| 222 |
+
gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
|
| 223 |
+
gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0)
|
| 224 |
+
],
|
| 225 |
+
examples=[
|
| 226 |
+
[{"text": "Explain Newton's laws clearly and concisely with mathematical formulas"}],
|
| 227 |
+
[{"text": "Write a Python function to calculate the Fibonacci sequence"}],
|
| 228 |
+
[{"text": "What are the benefits of open weight AI models? Include analysis."}],
|
| 229 |
+
[{"text": "Solve this equation: $x^2 + 5x + 6 = 0$"}],
|
| 230 |
+
],
|
| 231 |
+
cache_examples=False,
|
| 232 |
+
type="messages",
|
| 233 |
+
description=f"""
|
| 234 |
+
|
| 235 |
+
# ππ»ββοΈWelcome to π{MODEL_NAME} Demo !
|
| 236 |
+
|
| 237 |
+
**Model**: `{LORA_MODEL_ID}`
|
| 238 |
+
**Base**: `{BASE_MODEL_ID}`
|
| 239 |
+
|
| 240 |
+
β¨ **Enhanced Features:**
|
| 241 |
+
- π§ **Advanced Reasoning**: Detailed analysis and step-by-step thinking
|
| 242 |
+
- π **LaTeX Support**: Mathematical formulas rendered beautifully (use `$` or `$$`)
|
| 243 |
+
- π― **Improved Formatting**: Clear separation of reasoning and final responses
|
| 244 |
+
- π **Smart Logging**: Better error handling and request tracking
|
| 245 |
+
|
| 246 |
+
π‘ **Usage Tips:**
|
| 247 |
+
- Adjust reasoning level in system prompt (e.g., "Reasoning: high")
|
| 248 |
+
- Use LaTeX for math: `$E = mc^2$` or `$$\\int x^2 dx$$`
|
| 249 |
+
- Wait a couple of seconds initially for model loading
|
| 250 |
+
""",
|
| 251 |
+
fill_height=True,
|
| 252 |
+
textbox=gr.Textbox(
|
| 253 |
+
label="Query Input",
|
| 254 |
+
placeholder="Type your prompt (supports LaTeX: $x^2 + y^2 = z^2$)"
|
| 255 |
+
),
|
| 256 |
+
stop_btn="Stop Generation",
|
| 257 |
+
multimodal=False,
|
| 258 |
+
theme=gr.themes.Soft()
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
if __name__ == "__main__":
|
| 262 |
+
demo.launch(share=True)
|
templates/spaces/demo_gpt/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
git+https://github.com/huggingface/transformers.git
|
| 3 |
+
peft
|
| 4 |
+
trl
|
| 5 |
+
bitsandbytes
|
| 6 |
+
triton
|
| 7 |
+
accelerate
|
| 8 |
+
kernels
|
| 9 |
+
openai-harmony
|
templates/spaces/{demo β demo_smol}/README.md
RENAMED
|
File without changes
|
templates/spaces/{demo β demo_smol}/app.py
RENAMED
|
File without changes
|
templates/spaces/{demo β demo_smol}/requirements.txt
RENAMED
|
File without changes
|
templates/spaces/{README.md β trackio/README.md}
RENAMED
|
File without changes
|
templates/spaces/{app.py β trackio/app.py}
RENAMED
|
@@ -14,6 +14,8 @@ import plotly.graph_objects as go
|
|
| 14 |
import plotly.express as px
|
| 15 |
import pandas as pd
|
| 16 |
import numpy as np
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Setup logging
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -27,9 +29,24 @@ class TrackioSpace:
|
|
| 27 |
self.current_experiment = None
|
| 28 |
|
| 29 |
# Get dataset repository and HF token from parameters or environment variables
|
| 30 |
-
self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', '
|
| 31 |
self.hf_token = hf_token or os.environ.get('HF_TOKEN')
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
logger.info(f"π§ Using dataset repository: {self.dataset_repo}")
|
| 34 |
|
| 35 |
if not self.hf_token:
|
|
@@ -38,47 +55,139 @@ class TrackioSpace:
|
|
| 38 |
self._load_experiments()
|
| 39 |
|
| 40 |
def _load_experiments(self):
|
| 41 |
-
"""Load experiments from HF Dataset"""
|
| 42 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
if self.hf_token:
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
if
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
if exp_id:
|
| 57 |
-
self.experiments[exp_id] = {
|
| 58 |
-
'id': exp_id,
|
| 59 |
-
'name': row.get('name', ''),
|
| 60 |
-
'description': row.get('description', ''),
|
| 61 |
-
'created_at': row.get('created_at', ''),
|
| 62 |
-
'status': row.get('status', 'running'),
|
| 63 |
-
'metrics': json.loads(row.get('metrics', '[]')),
|
| 64 |
-
'parameters': json.loads(row.get('parameters', '{}')),
|
| 65 |
-
'artifacts': json.loads(row.get('artifacts', '[]')),
|
| 66 |
-
'logs': json.loads(row.get('logs', '[]'))
|
| 67 |
-
}
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
except Exception as e:
|
| 80 |
-
logger.
|
| 81 |
-
|
| 82 |
|
| 83 |
def _load_backup_experiments(self):
|
| 84 |
"""Load backup experiments when dataset is not available"""
|
|
@@ -312,12 +421,61 @@ class TrackioSpace:
|
|
| 312 |
logger.info(f"β
Loaded {len(backup_experiments)} backup experiments")
|
| 313 |
|
| 314 |
def _save_experiments(self):
|
| 315 |
-
"""Save experiments to HF Dataset"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
try:
|
| 317 |
if self.hf_token:
|
| 318 |
from datasets import Dataset
|
| 319 |
from huggingface_hub import HfApi
|
| 320 |
|
|
|
|
|
|
|
| 321 |
# Convert experiments to dataset format
|
| 322 |
dataset_data = []
|
| 323 |
for exp_id, exp_data in self.experiments.items():
|
|
@@ -327,10 +485,10 @@ class TrackioSpace:
|
|
| 327 |
'description': exp_data.get('description', ''),
|
| 328 |
'created_at': exp_data.get('created_at', ''),
|
| 329 |
'status': exp_data.get('status', 'running'),
|
| 330 |
-
'metrics': json.dumps(exp_data.get('metrics', [])),
|
| 331 |
-
'parameters': json.dumps(exp_data.get('parameters', {})),
|
| 332 |
-
'artifacts': json.dumps(exp_data.get('artifacts', [])),
|
| 333 |
-
'logs': json.dumps(exp_data.get('logs', [])),
|
| 334 |
'last_updated': datetime.now().isoformat()
|
| 335 |
})
|
| 336 |
|
|
@@ -342,16 +500,17 @@ class TrackioSpace:
|
|
| 342 |
dataset.push_to_hub(
|
| 343 |
self.dataset_repo,
|
| 344 |
token=self.hf_token,
|
| 345 |
-
private=True
|
|
|
|
| 346 |
)
|
| 347 |
|
| 348 |
-
logger.info(f"β
Saved {len(dataset_data)} experiments to {self.dataset_repo}")
|
| 349 |
|
| 350 |
else:
|
| 351 |
logger.warning("β οΈ No HF_TOKEN available, experiments not saved to dataset")
|
| 352 |
|
| 353 |
except Exception as e:
|
| 354 |
-
logger.error(f"Failed to save experiments
|
| 355 |
# Fall back to local file for backup
|
| 356 |
try:
|
| 357 |
data = {
|
|
@@ -363,7 +522,7 @@ class TrackioSpace:
|
|
| 363 |
json.dump(data, f, indent=2, default=str)
|
| 364 |
logger.info("β
Saved backup to local file")
|
| 365 |
except Exception as backup_e:
|
| 366 |
-
logger.error(f"Failed to save backup: {backup_e}")
|
| 367 |
|
| 368 |
def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
|
| 369 |
"""Create a new experiment"""
|
|
@@ -483,7 +642,10 @@ def update_trackio_config(hf_token: str, dataset_repo: str) -> str:
|
|
| 483 |
# Reload experiments with new configuration
|
| 484 |
trackio_space._load_experiments()
|
| 485 |
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
except Exception as e:
|
| 489 |
return f"β Failed to update configuration: {str(e)}"
|
|
@@ -502,10 +664,42 @@ def test_dataset_connection(hf_token: str, dataset_repo: str) -> str:
|
|
| 502 |
# Test loading the dataset
|
| 503 |
dataset = load_dataset(dataset_repo, token=hf_token)
|
| 504 |
|
| 505 |
-
# Count experiments
|
| 506 |
experiment_count = len(dataset['train']) if 'train' in dataset else 0
|
| 507 |
|
| 508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
except Exception as e:
|
| 511 |
return f"β Connection failed: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token is correct\n2. Verify the dataset repository exists\n3. Ensure your token has read access to the dataset"
|
|
@@ -534,12 +728,34 @@ def create_dataset_repository(hf_token: str, dataset_repo: str) -> str:
|
|
| 534 |
# Check if dataset exists
|
| 535 |
try:
|
| 536 |
api.dataset_info(dataset_repo)
|
| 537 |
-
return f"β
Dataset {dataset_repo} already exists
|
| 538 |
except:
|
| 539 |
# Dataset doesn't exist, create it
|
| 540 |
pass
|
| 541 |
|
| 542 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
empty_dataset = Dataset.from_dict({
|
| 544 |
'experiment_id': [],
|
| 545 |
'name': [],
|
|
@@ -557,22 +773,34 @@ def create_dataset_repository(hf_token: str, dataset_repo: str) -> str:
|
|
| 557 |
empty_dataset.push_to_hub(
|
| 558 |
dataset_repo,
|
| 559 |
token=hf_token,
|
| 560 |
-
private=True
|
|
|
|
| 561 |
)
|
| 562 |
|
| 563 |
-
return f"β
Dataset {dataset_repo} created successfully!\nπ View at: https://huggingface.co/datasets/{dataset_repo}\nπ Ready to store experiments"
|
| 564 |
|
| 565 |
except Exception as e:
|
| 566 |
-
return f"β Failed to create dataset: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token has write permissions\n2. Verify the username in the repository name\n3. Ensure the dataset name is valid"
|
| 567 |
|
| 568 |
# Initialize API client for remote data
|
| 569 |
api_client = None
|
| 570 |
try:
|
| 571 |
from trackio_api_client import TrackioAPIClient
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
except ImportError:
|
| 575 |
logger.warning("β οΈ API client not available, using local data only")
|
|
|
|
|
|
|
| 576 |
|
| 577 |
# Add Hugging Face Spaces compatibility
|
| 578 |
def is_huggingface_spaces():
|
|
@@ -616,6 +844,7 @@ def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
|
|
| 616 |
lines = experiment_details.split('\n')
|
| 617 |
metrics_data = []
|
| 618 |
|
|
|
|
| 619 |
for line in lines:
|
| 620 |
if 'Step:' in line and 'Metrics:' in line:
|
| 621 |
# Extract step and metrics from the line
|
|
@@ -637,6 +866,11 @@ def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
|
|
| 637 |
logger.warning(f"Failed to parse metrics line: {line} - {e}")
|
| 638 |
continue
|
| 639 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
if metrics_data:
|
| 641 |
return pd.DataFrame(metrics_data)
|
| 642 |
else:
|
|
@@ -647,22 +881,65 @@ def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
|
|
| 647 |
return pd.DataFrame()
|
| 648 |
|
| 649 |
def get_metrics_dataframe(experiment_id: str) -> pd.DataFrame:
|
| 650 |
-
"""Get metrics as a pandas DataFrame for plotting - tries
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
|
| 667 |
def create_experiment_interface(name: str, description: str) -> str:
|
| 668 |
"""Create a new experiment"""
|
|
@@ -919,12 +1196,622 @@ def create_demo_experiment():
|
|
| 919 |
except Exception as e:
|
| 920 |
return f"β Error creating demo experiment: {str(e)}"
|
| 921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 922 |
# Create Gradio interface
|
| 923 |
with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
|
| 924 |
gr.Markdown("# π Trackio Experiment Tracking & Monitoring")
|
| 925 |
gr.Markdown("Monitor and track your ML experiments with real-time visualization!")
|
| 926 |
|
| 927 |
with gr.Tabs():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 928 |
# Configuration Tab
|
| 929 |
with gr.Tab("βοΈ Configuration"):
|
| 930 |
gr.Markdown("### Configure HF Datasets Connection")
|
|
@@ -941,7 +1828,7 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
| 941 |
dataset_repo_input = gr.Textbox(
|
| 942 |
label="Dataset Repository",
|
| 943 |
placeholder="your-username/your-dataset-name",
|
| 944 |
-
value="
|
| 945 |
info="HF Dataset repository for experiment storage"
|
| 946 |
)
|
| 947 |
|
|
@@ -953,9 +1840,9 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
| 953 |
gr.Markdown("### Current Configuration")
|
| 954 |
current_config_output = gr.Textbox(
|
| 955 |
label="Status",
|
| 956 |
-
lines=
|
| 957 |
interactive=False,
|
| 958 |
-
value=f"π Dataset: {trackio_space.dataset_repo}\nπ HF Token: {'Set' if trackio_space.hf_token else 'Not set'}\nπ Experiments: {len(trackio_space.experiments)}"
|
| 959 |
)
|
| 960 |
|
| 961 |
with gr.Column():
|
|
@@ -978,12 +1865,204 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
| 978 |
- `HF_TOKEN`: Your Hugging Face token
|
| 979 |
- `TRACKIO_DATASET_REPO`: Dataset repository
|
| 980 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 981 |
**Actions:**
|
| 982 |
- **Update Configuration**: Apply new settings and reload experiments
|
| 983 |
- **Test Connection**: Verify access to the dataset repository
|
| 984 |
- **Create Dataset**: Create a new dataset repository if it doesn't exist
|
| 985 |
""")
|
| 986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
update_config_btn.click(
|
| 988 |
update_trackio_config,
|
| 989 |
inputs=[hf_token_input, dataset_repo_input],
|
|
@@ -1001,237 +2080,9 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
| 1001 |
inputs=[hf_token_input, dataset_repo_input],
|
| 1002 |
outputs=current_config_output
|
| 1003 |
)
|
|
|
|
| 1004 |
|
| 1005 |
-
|
| 1006 |
-
with gr.Tab("Create Experiment"):
|
| 1007 |
-
gr.Markdown("### Create a New Experiment")
|
| 1008 |
-
with gr.Row():
|
| 1009 |
-
with gr.Column():
|
| 1010 |
-
experiment_name = gr.Textbox(
|
| 1011 |
-
label="Experiment Name",
|
| 1012 |
-
placeholder="my_smollm3_finetune",
|
| 1013 |
-
value="smollm3_finetune"
|
| 1014 |
-
)
|
| 1015 |
-
experiment_description = gr.Textbox(
|
| 1016 |
-
label="Description",
|
| 1017 |
-
placeholder="Fine-tuning SmolLM3 model on custom dataset",
|
| 1018 |
-
value="SmolLM3 fine-tuning experiment"
|
| 1019 |
-
)
|
| 1020 |
-
create_btn = gr.Button("Create Experiment", variant="primary")
|
| 1021 |
-
|
| 1022 |
-
with gr.Column():
|
| 1023 |
-
create_output = gr.Textbox(
|
| 1024 |
-
label="Result",
|
| 1025 |
-
lines=5,
|
| 1026 |
-
interactive=False
|
| 1027 |
-
)
|
| 1028 |
-
|
| 1029 |
-
create_btn.click(
|
| 1030 |
-
create_experiment_interface,
|
| 1031 |
-
inputs=[experiment_name, experiment_description],
|
| 1032 |
-
outputs=create_output
|
| 1033 |
-
)
|
| 1034 |
-
|
| 1035 |
-
# Log Metrics Tab
|
| 1036 |
-
with gr.Tab("Log Metrics"):
|
| 1037 |
-
gr.Markdown("### Log Training Metrics")
|
| 1038 |
-
with gr.Row():
|
| 1039 |
-
with gr.Column():
|
| 1040 |
-
metrics_exp_id = gr.Textbox(
|
| 1041 |
-
label="Experiment ID",
|
| 1042 |
-
placeholder="exp_20231201_143022"
|
| 1043 |
-
)
|
| 1044 |
-
metrics_json = gr.Textbox(
|
| 1045 |
-
label="Metrics (JSON)",
|
| 1046 |
-
placeholder='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5}',
|
| 1047 |
-
value='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5, "gpu_memory": 22.5}'
|
| 1048 |
-
)
|
| 1049 |
-
metrics_step = gr.Textbox(
|
| 1050 |
-
label="Step (optional)",
|
| 1051 |
-
placeholder="100"
|
| 1052 |
-
)
|
| 1053 |
-
log_metrics_btn = gr.Button("Log Metrics", variant="primary")
|
| 1054 |
-
|
| 1055 |
-
with gr.Column():
|
| 1056 |
-
metrics_output = gr.Textbox(
|
| 1057 |
-
label="Result",
|
| 1058 |
-
lines=5,
|
| 1059 |
-
interactive=False
|
| 1060 |
-
)
|
| 1061 |
-
|
| 1062 |
-
log_metrics_btn.click(
|
| 1063 |
-
log_metrics_interface,
|
| 1064 |
-
inputs=[metrics_exp_id, metrics_json, metrics_step],
|
| 1065 |
-
outputs=metrics_output
|
| 1066 |
-
)
|
| 1067 |
-
|
| 1068 |
-
# Log Parameters Tab
|
| 1069 |
-
with gr.Tab("Log Parameters"):
|
| 1070 |
-
gr.Markdown("### Log Experiment Parameters")
|
| 1071 |
-
with gr.Row():
|
| 1072 |
-
with gr.Column():
|
| 1073 |
-
params_exp_id = gr.Textbox(
|
| 1074 |
-
label="Experiment ID",
|
| 1075 |
-
placeholder="exp_20231201_143022"
|
| 1076 |
-
)
|
| 1077 |
-
parameters_json = gr.Textbox(
|
| 1078 |
-
label="Parameters (JSON)",
|
| 1079 |
-
placeholder='{"learning_rate": 2e-5, "batch_size": 4}',
|
| 1080 |
-
value='{"learning_rate": 3.5e-6, "batch_size": 8, "model_name": "HuggingFaceTB/SmolLM3-3B", "max_iters": 18000, "mixed_precision": "bf16"}'
|
| 1081 |
-
)
|
| 1082 |
-
log_params_btn = gr.Button("Log Parameters", variant="primary")
|
| 1083 |
-
|
| 1084 |
-
with gr.Column():
|
| 1085 |
-
params_output = gr.Textbox(
|
| 1086 |
-
label="Result",
|
| 1087 |
-
lines=5,
|
| 1088 |
-
interactive=False
|
| 1089 |
-
)
|
| 1090 |
-
|
| 1091 |
-
log_params_btn.click(
|
| 1092 |
-
log_parameters_interface,
|
| 1093 |
-
inputs=[params_exp_id, parameters_json],
|
| 1094 |
-
outputs=params_output
|
| 1095 |
-
)
|
| 1096 |
-
|
| 1097 |
-
# View Experiments Tab
|
| 1098 |
-
with gr.Tab("View Experiments"):
|
| 1099 |
-
gr.Markdown("### View Experiment Details")
|
| 1100 |
-
with gr.Row():
|
| 1101 |
-
with gr.Column():
|
| 1102 |
-
view_exp_id = gr.Textbox(
|
| 1103 |
-
label="Experiment ID",
|
| 1104 |
-
placeholder="exp_20231201_143022"
|
| 1105 |
-
)
|
| 1106 |
-
view_btn = gr.Button("View Experiment", variant="primary")
|
| 1107 |
-
list_btn = gr.Button("List All Experiments", variant="secondary")
|
| 1108 |
-
|
| 1109 |
-
with gr.Column():
|
| 1110 |
-
view_output = gr.Textbox(
|
| 1111 |
-
label="Experiment Details",
|
| 1112 |
-
lines=20,
|
| 1113 |
-
interactive=False
|
| 1114 |
-
)
|
| 1115 |
-
|
| 1116 |
-
view_btn.click(
|
| 1117 |
-
get_experiment_details,
|
| 1118 |
-
inputs=[view_exp_id],
|
| 1119 |
-
outputs=view_output
|
| 1120 |
-
)
|
| 1121 |
-
|
| 1122 |
-
list_btn.click(
|
| 1123 |
-
list_experiments_interface,
|
| 1124 |
-
inputs=[],
|
| 1125 |
-
outputs=view_output
|
| 1126 |
-
)
|
| 1127 |
-
|
| 1128 |
-
# Visualization Tab
|
| 1129 |
-
with gr.Tab("π Visualizations"):
|
| 1130 |
-
gr.Markdown("### Training Metrics Visualization")
|
| 1131 |
-
with gr.Row():
|
| 1132 |
-
with gr.Column():
|
| 1133 |
-
plot_exp_id = gr.Textbox(
|
| 1134 |
-
label="Experiment ID",
|
| 1135 |
-
placeholder="exp_20231201_143022"
|
| 1136 |
-
)
|
| 1137 |
-
metric_dropdown = gr.Dropdown(
|
| 1138 |
-
label="Metric to Plot",
|
| 1139 |
-
choices=[
|
| 1140 |
-
"loss", "accuracy", "learning_rate", "gpu_memory", "training_time",
|
| 1141 |
-
"total_tokens", "truncated_tokens", "padding_tokens", "throughput", "step_time",
|
| 1142 |
-
"batch_size", "seq_len", "token_acc", "train/gate_ortho", "train/center"
|
| 1143 |
-
],
|
| 1144 |
-
value="loss"
|
| 1145 |
-
)
|
| 1146 |
-
plot_btn = gr.Button("Create Plot", variant="primary")
|
| 1147 |
-
|
| 1148 |
-
with gr.Column():
|
| 1149 |
-
plot_output = gr.Plot(label="Training Metrics")
|
| 1150 |
-
|
| 1151 |
-
plot_btn.click(
|
| 1152 |
-
create_metrics_plot,
|
| 1153 |
-
inputs=[plot_exp_id, metric_dropdown],
|
| 1154 |
-
outputs=plot_output
|
| 1155 |
-
)
|
| 1156 |
-
|
| 1157 |
-
gr.Markdown("### Experiment Comparison")
|
| 1158 |
-
with gr.Row():
|
| 1159 |
-
with gr.Column():
|
| 1160 |
-
comparison_exp_ids = gr.Textbox(
|
| 1161 |
-
label="Experiment IDs (comma-separated)",
|
| 1162 |
-
placeholder="exp_1,exp_2,exp_3"
|
| 1163 |
-
)
|
| 1164 |
-
comparison_btn = gr.Button("Compare Experiments", variant="primary")
|
| 1165 |
-
|
| 1166 |
-
with gr.Column():
|
| 1167 |
-
comparison_plot = gr.Plot(label="Experiment Comparison")
|
| 1168 |
-
|
| 1169 |
-
comparison_btn.click(
|
| 1170 |
-
create_experiment_comparison,
|
| 1171 |
-
inputs=[comparison_exp_ids],
|
| 1172 |
-
outputs=comparison_plot
|
| 1173 |
-
)
|
| 1174 |
-
|
| 1175 |
-
# Demo Data Tab
|
| 1176 |
-
with gr.Tab("π― Demo Data"):
|
| 1177 |
-
gr.Markdown("### Generate Demo Training Data")
|
| 1178 |
-
gr.Markdown("Use this to simulate training data for testing the interface")
|
| 1179 |
-
with gr.Row():
|
| 1180 |
-
with gr.Column():
|
| 1181 |
-
demo_exp_id = gr.Textbox(
|
| 1182 |
-
label="Experiment ID",
|
| 1183 |
-
placeholder="exp_20231201_143022"
|
| 1184 |
-
)
|
| 1185 |
-
demo_btn = gr.Button("Generate Demo Data", variant="primary")
|
| 1186 |
-
create_demo_btn = gr.Button("Create Demo Experiment", variant="secondary")
|
| 1187 |
-
|
| 1188 |
-
with gr.Column():
|
| 1189 |
-
demo_output = gr.Textbox(
|
| 1190 |
-
label="Result",
|
| 1191 |
-
lines=5,
|
| 1192 |
-
interactive=False
|
| 1193 |
-
)
|
| 1194 |
-
|
| 1195 |
-
demo_btn.click(
|
| 1196 |
-
simulate_training_data,
|
| 1197 |
-
inputs=[demo_exp_id],
|
| 1198 |
-
outputs=demo_output
|
| 1199 |
-
)
|
| 1200 |
-
|
| 1201 |
-
create_demo_btn.click(
|
| 1202 |
-
create_demo_experiment,
|
| 1203 |
-
inputs=[],
|
| 1204 |
-
outputs=demo_output
|
| 1205 |
-
)
|
| 1206 |
-
|
| 1207 |
-
# Update Status Tab
|
| 1208 |
-
with gr.Tab("Update Status"):
|
| 1209 |
-
gr.Markdown("### Update Experiment Status")
|
| 1210 |
-
with gr.Row():
|
| 1211 |
-
with gr.Column():
|
| 1212 |
-
status_exp_id = gr.Textbox(
|
| 1213 |
-
label="Experiment ID",
|
| 1214 |
-
placeholder="exp_20231201_143022"
|
| 1215 |
-
)
|
| 1216 |
-
status_dropdown = gr.Dropdown(
|
| 1217 |
-
label="Status",
|
| 1218 |
-
choices=["running", "completed", "failed", "paused"],
|
| 1219 |
-
value="running"
|
| 1220 |
-
)
|
| 1221 |
-
update_status_btn = gr.Button("Update Status", variant="primary")
|
| 1222 |
-
|
| 1223 |
-
with gr.Column():
|
| 1224 |
-
status_output = gr.Textbox(
|
| 1225 |
-
label="Result",
|
| 1226 |
-
lines=3,
|
| 1227 |
-
interactive=False
|
| 1228 |
-
)
|
| 1229 |
-
|
| 1230 |
-
update_status_btn.click(
|
| 1231 |
-
update_experiment_status_interface,
|
| 1232 |
-
inputs=[status_exp_id, status_dropdown],
|
| 1233 |
-
outputs=status_output
|
| 1234 |
-
)
|
| 1235 |
|
| 1236 |
# Launch the app
|
| 1237 |
if __name__ == "__main__":
|
|
|
|
| 14 |
import plotly.express as px
|
| 15 |
import pandas as pd
|
| 16 |
import numpy as np
|
| 17 |
+
import plotly.io as pio
|
| 18 |
+
pio.templates.default = "plotly_white"
|
| 19 |
|
| 20 |
# Setup logging
|
| 21 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 29 |
self.current_experiment = None
|
| 30 |
|
| 31 |
# Get dataset repository and HF token from parameters or environment variables
|
| 32 |
+
self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', 'Tonic/trackio-experiments')
|
| 33 |
self.hf_token = hf_token or os.environ.get('HF_TOKEN')
|
| 34 |
|
| 35 |
+
# Initialize dataset manager for safe operations
|
| 36 |
+
self.dataset_manager = None
|
| 37 |
+
if self.hf_token and self.dataset_repo:
|
| 38 |
+
try:
|
| 39 |
+
# Import dataset manager
|
| 40 |
+
import sys
|
| 41 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
| 42 |
+
from dataset_utils import TrackioDatasetManager
|
| 43 |
+
self.dataset_manager = TrackioDatasetManager(self.dataset_repo, self.hf_token)
|
| 44 |
+
logger.info("β
Dataset manager initialized for safe operations")
|
| 45 |
+
except ImportError:
|
| 46 |
+
logger.warning("β οΈ Dataset manager not available, using legacy data handling")
|
| 47 |
+
except Exception as e:
|
| 48 |
+
logger.warning(f"β οΈ Failed to initialize dataset manager: {e}")
|
| 49 |
+
|
| 50 |
logger.info(f"π§ Using dataset repository: {self.dataset_repo}")
|
| 51 |
|
| 52 |
if not self.hf_token:
|
|
|
|
| 55 |
self._load_experiments()
|
| 56 |
|
| 57 |
def _load_experiments(self):
|
| 58 |
+
"""Load experiments from HF Dataset with data preservation support"""
|
| 59 |
try:
|
| 60 |
+
# Try using dataset manager first for safe operations
|
| 61 |
+
if self.dataset_manager:
|
| 62 |
+
logger.info("π Loading experiments using dataset manager")
|
| 63 |
+
experiments_list = self.dataset_manager.load_existing_experiments()
|
| 64 |
+
|
| 65 |
+
# Convert list to dict format expected by the interface
|
| 66 |
+
self.experiments = {}
|
| 67 |
+
for exp_data in experiments_list:
|
| 68 |
+
exp_id = exp_data.get('experiment_id')
|
| 69 |
+
if exp_id:
|
| 70 |
+
converted_experiment = self._convert_dataset_row_to_experiment(exp_data)
|
| 71 |
+
if converted_experiment:
|
| 72 |
+
self.experiments[exp_id] = converted_experiment
|
| 73 |
+
|
| 74 |
+
logger.info(f"β
Loaded {len(self.experiments)} experiments using dataset manager")
|
| 75 |
+
|
| 76 |
+
# Sort experiments by creation date (newest first)
|
| 77 |
+
self.experiments = dict(sorted(
|
| 78 |
+
self.experiments.items(),
|
| 79 |
+
key=lambda x: x[1].get('created_at', ''),
|
| 80 |
+
reverse=True
|
| 81 |
+
))
|
| 82 |
+
|
| 83 |
+
# If no experiments found, use backup
|
| 84 |
+
if not self.experiments:
|
| 85 |
+
logger.info("π No experiments found in dataset, using backup data")
|
| 86 |
+
self._load_backup_experiments()
|
| 87 |
+
|
| 88 |
+
return
|
| 89 |
+
|
| 90 |
+
# Fallback to direct dataset loading if dataset manager not available
|
| 91 |
if self.hf_token:
|
| 92 |
+
success = self._load_experiments_direct()
|
| 93 |
+
if success:
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
# Final fallback to backup data
|
| 97 |
+
logger.info("π Using backup data")
|
| 98 |
+
self._load_backup_experiments()
|
| 99 |
|
| 100 |
+
except Exception as e:
|
| 101 |
+
logger.error(f"β Failed to load experiments: {e}")
|
| 102 |
+
self._load_backup_experiments()
|
| 103 |
+
|
| 104 |
+
def _load_experiments_direct(self) -> bool:
|
| 105 |
+
"""Load experiments directly from HF Dataset without dataset manager"""
|
| 106 |
+
try:
|
| 107 |
+
from datasets import load_dataset
|
| 108 |
+
|
| 109 |
+
logger.info(f"π Loading experiments directly from {self.dataset_repo}")
|
| 110 |
+
dataset = load_dataset(self.dataset_repo, token=self.hf_token)
|
| 111 |
+
logger.info(f"β
Successfully loaded dataset from {self.dataset_repo}")
|
| 112 |
+
|
| 113 |
+
# Convert dataset to experiments dict
|
| 114 |
+
self.experiments = {}
|
| 115 |
+
if 'train' in dataset:
|
| 116 |
+
for row in dataset['train']:
|
| 117 |
+
exp_id = row.get('experiment_id')
|
| 118 |
+
if exp_id:
|
| 119 |
+
converted_experiment = self._convert_dataset_row_to_experiment(row)
|
| 120 |
+
if converted_experiment:
|
| 121 |
+
self.experiments[exp_id] = converted_experiment
|
| 122 |
+
|
| 123 |
+
logger.info(f"π Successfully loaded {len(self.experiments)} experiments from dataset")
|
| 124 |
+
|
| 125 |
+
# Sort experiments by creation date (newest first)
|
| 126 |
+
self.experiments = dict(sorted(
|
| 127 |
+
self.experiments.items(),
|
| 128 |
+
key=lambda x: x[1].get('created_at', ''),
|
| 129 |
+
reverse=True
|
| 130 |
+
))
|
| 131 |
+
|
| 132 |
+
return True
|
| 133 |
+
|
| 134 |
+
except Exception as e:
|
| 135 |
+
logger.warning(f"β οΈ Failed to load from dataset directly: {e}")
|
| 136 |
+
return False
|
| 137 |
+
|
| 138 |
+
def _convert_dataset_row_to_experiment(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 139 |
+
"""Convert a dataset row to experiment format, handling JSON parsing safely"""
|
| 140 |
+
try:
|
| 141 |
+
exp_id = row.get('experiment_id')
|
| 142 |
+
if not exp_id:
|
| 143 |
+
return None
|
| 144 |
+
|
| 145 |
+
# Parse JSON fields safely
|
| 146 |
+
try:
|
| 147 |
+
metrics_raw = row.get('metrics', '[]')
|
| 148 |
+
if isinstance(metrics_raw, str):
|
| 149 |
+
metrics = json.loads(metrics_raw) if metrics_raw else []
|
| 150 |
+
else:
|
| 151 |
+
metrics = metrics_raw if metrics_raw else []
|
| 152 |
|
| 153 |
+
parameters_raw = row.get('parameters', '{}')
|
| 154 |
+
if isinstance(parameters_raw, str):
|
| 155 |
+
parameters = json.loads(parameters_raw) if parameters_raw else {}
|
| 156 |
+
else:
|
| 157 |
+
parameters = parameters_raw if parameters_raw else {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
+
artifacts_raw = row.get('artifacts', '[]')
|
| 160 |
+
if isinstance(artifacts_raw, str):
|
| 161 |
+
artifacts = json.loads(artifacts_raw) if artifacts_raw else []
|
| 162 |
+
else:
|
| 163 |
+
artifacts = artifacts_raw if artifacts_raw else []
|
| 164 |
|
| 165 |
+
logs_raw = row.get('logs', '[]')
|
| 166 |
+
if isinstance(logs_raw, str):
|
| 167 |
+
logs = json.loads(logs_raw) if logs_raw else []
|
| 168 |
+
else:
|
| 169 |
+
logs = logs_raw if logs_raw else []
|
| 170 |
+
|
| 171 |
+
except json.JSONDecodeError as json_err:
|
| 172 |
+
logger.warning(f"JSON decode error for experiment {exp_id}: {json_err}")
|
| 173 |
+
metrics, parameters, artifacts, logs = [], {}, [], []
|
| 174 |
+
|
| 175 |
+
return {
|
| 176 |
+
'id': exp_id,
|
| 177 |
+
'name': row.get('name', ''),
|
| 178 |
+
'description': row.get('description', ''),
|
| 179 |
+
'created_at': row.get('created_at', ''),
|
| 180 |
+
'status': row.get('status', 'running'),
|
| 181 |
+
'metrics': metrics,
|
| 182 |
+
'parameters': parameters,
|
| 183 |
+
'artifacts': artifacts,
|
| 184 |
+
'logs': logs,
|
| 185 |
+
'last_updated': row.get('last_updated', '')
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
except Exception as e:
|
| 189 |
+
logger.warning(f"Failed to convert dataset row to experiment: {e}")
|
| 190 |
+
return None
|
| 191 |
|
| 192 |
def _load_backup_experiments(self):
|
| 193 |
"""Load backup experiments when dataset is not available"""
|
|
|
|
| 421 |
logger.info(f"β
Loaded {len(backup_experiments)} backup experiments")
|
| 422 |
|
| 423 |
def _save_experiments(self):
    """Persist all in-memory experiments to the HF Dataset.

    Prefers the dataset manager (which preserves existing rows); falls back
    to the legacy push when the manager is unavailable or fails.
    """
    try:
        if not self.dataset_manager:
            # No manager configured -> legacy path without preservation guarantees.
            self._save_experiments_legacy()
            return

        logger.info("πŸ’Ύ Saving experiments using dataset manager (data preservation)")

        # Serialize each experiment into the flat dataset schema; nested
        # structures are stored as JSON strings.
        rows = []
        for exp_id, exp_data in self.experiments.items():
            rows.append({
                'experiment_id': exp_id,
                'name': exp_data.get('name', ''),
                'description': exp_data.get('description', ''),
                'created_at': exp_data.get('created_at', ''),
                'status': exp_data.get('status', 'running'),
                'metrics': json.dumps(exp_data.get('metrics', []), default=str),
                'parameters': json.dumps(exp_data.get('parameters', {}), default=str),
                'artifacts': json.dumps(exp_data.get('artifacts', []), default=str),
                'logs': json.dumps(exp_data.get('logs', []), default=str),
                'last_updated': datetime.now().isoformat(),
            })

        commit_message = (
            f"Update experiments from Trackio Space ({len(rows)} total experiments)"
        )
        if self.dataset_manager.save_experiments(rows, commit_message):
            logger.info(f"βœ… Successfully saved {len(rows)} experiments with data preservation")
        else:
            logger.error("❌ Failed to save experiments using dataset manager")
            # Manager reported failure -> try the legacy push instead.
            self._save_experiments_legacy()
    except Exception as e:
        logger.error(f"❌ Failed to save experiments: {e}")
        # Any unexpected error also falls back to the legacy path.
        self._save_experiments_legacy()
+
def _save_experiments_legacy(self):
|
| 471 |
+
"""Legacy save method without data preservation (fallback only)"""
|
| 472 |
try:
|
| 473 |
if self.hf_token:
|
| 474 |
from datasets import Dataset
|
| 475 |
from huggingface_hub import HfApi
|
| 476 |
|
| 477 |
+
logger.warning("β οΈ Using legacy save method - data preservation not guaranteed")
|
| 478 |
+
|
| 479 |
# Convert experiments to dataset format
|
| 480 |
dataset_data = []
|
| 481 |
for exp_id, exp_data in self.experiments.items():
|
|
|
|
| 485 |
'description': exp_data.get('description', ''),
|
| 486 |
'created_at': exp_data.get('created_at', ''),
|
| 487 |
'status': exp_data.get('status', 'running'),
|
| 488 |
+
'metrics': json.dumps(exp_data.get('metrics', []), default=str),
|
| 489 |
+
'parameters': json.dumps(exp_data.get('parameters', {}), default=str),
|
| 490 |
+
'artifacts': json.dumps(exp_data.get('artifacts', []), default=str),
|
| 491 |
+
'logs': json.dumps(exp_data.get('logs', []), default=str),
|
| 492 |
'last_updated': datetime.now().isoformat()
|
| 493 |
})
|
| 494 |
|
|
|
|
| 500 |
dataset.push_to_hub(
|
| 501 |
self.dataset_repo,
|
| 502 |
token=self.hf_token,
|
| 503 |
+
private=True,
|
| 504 |
+
commit_message=f"Legacy update: {len(dataset_data)} experiments"
|
| 505 |
)
|
| 506 |
|
| 507 |
+
logger.info(f"β
Saved {len(dataset_data)} experiments to {self.dataset_repo} (legacy method)")
|
| 508 |
|
| 509 |
else:
|
| 510 |
logger.warning("β οΈ No HF_TOKEN available, experiments not saved to dataset")
|
| 511 |
|
| 512 |
except Exception as e:
|
| 513 |
+
logger.error(f"β Failed to save experiments with legacy method: {e}")
|
| 514 |
# Fall back to local file for backup
|
| 515 |
try:
|
| 516 |
data = {
|
|
|
|
| 522 |
json.dump(data, f, indent=2, default=str)
|
| 523 |
logger.info("β
Saved backup to local file")
|
| 524 |
except Exception as backup_e:
|
| 525 |
+
logger.error(f"β Failed to save backup: {backup_e}")
|
| 526 |
|
| 527 |
def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
|
| 528 |
"""Create a new experiment"""
|
|
|
|
| 642 |
# Reload experiments with new configuration
|
| 643 |
trackio_space._load_experiments()
|
| 644 |
|
| 645 |
+
# Check if dataset manager is available
|
| 646 |
+
manager_status = "β
Available (data preservation enabled)" if trackio_space.dataset_manager else "β οΈ Not available (legacy mode)"
|
| 647 |
+
|
| 648 |
+
return f"β
Configuration updated successfully!\nπ Dataset: {trackio_space.dataset_repo}\nπ HF Token: {'Set' if trackio_space.hf_token else 'Not set'}\nπ‘οΈ Data Manager: {manager_status}\nπ Loaded {len(trackio_space.experiments)} experiments"
|
| 649 |
|
| 650 |
except Exception as e:
|
| 651 |
return f"β Failed to update configuration: {str(e)}"
|
|
|
|
| 664 |
# Test loading the dataset
|
| 665 |
dataset = load_dataset(dataset_repo, token=hf_token)
|
| 666 |
|
| 667 |
+
# Count experiments and analyze structure
|
| 668 |
experiment_count = len(dataset['train']) if 'train' in dataset else 0
|
| 669 |
|
| 670 |
+
# Get column information
|
| 671 |
+
columns = list(dataset['train'].column_names) if 'train' in dataset else []
|
| 672 |
+
|
| 673 |
+
# Sample first few experiment IDs
|
| 674 |
+
sample_experiments = []
|
| 675 |
+
if 'train' in dataset and experiment_count > 0:
|
| 676 |
+
for i, row in enumerate(dataset['train']):
|
| 677 |
+
if i >= 3: # Only show first 3
|
| 678 |
+
break
|
| 679 |
+
sample_experiments.append(row.get('experiment_id', 'unknown'))
|
| 680 |
+
|
| 681 |
+
result = f"β
Connection successful!\nπ Dataset: {dataset_repo}\nπ Found {experiment_count} experiments\nπ Dataset URL: https://huggingface.co/datasets/{dataset_repo}\n\n"
|
| 682 |
+
result += f"π Dataset Columns: {', '.join(columns)}\n"
|
| 683 |
+
if sample_experiments:
|
| 684 |
+
result += f"π¬ Sample Experiments: {', '.join(sample_experiments)}\n"
|
| 685 |
+
|
| 686 |
+
# Test parsing one experiment if available
|
| 687 |
+
if 'train' in dataset and experiment_count > 0:
|
| 688 |
+
first_row = dataset['train'][0]
|
| 689 |
+
exp_id = first_row.get('experiment_id', 'unknown')
|
| 690 |
+
metrics_raw = first_row.get('metrics', '[]')
|
| 691 |
+
|
| 692 |
+
try:
|
| 693 |
+
if isinstance(metrics_raw, str):
|
| 694 |
+
metrics = json.loads(metrics_raw)
|
| 695 |
+
metrics_count = len(metrics) if isinstance(metrics, list) else 0
|
| 696 |
+
result += f"π First experiment ({exp_id}) metrics: {metrics_count} entries\n"
|
| 697 |
+
else:
|
| 698 |
+
result += f"π First experiment ({exp_id}) metrics: Non-string format\n"
|
| 699 |
+
except json.JSONDecodeError as e:
|
| 700 |
+
result += f"β οΈ JSON parse error in first experiment: {e}\n"
|
| 701 |
+
|
| 702 |
+
return result
|
| 703 |
|
| 704 |
except Exception as e:
|
| 705 |
return f"β Connection failed: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token is correct\n2. Verify the dataset repository exists\n3. Ensure your token has read access to the dataset"
|
|
|
|
| 728 |
# Check if dataset exists
|
| 729 |
try:
|
| 730 |
api.dataset_info(dataset_repo)
|
| 731 |
+
return f"β
Dataset {dataset_repo} already exists!\nπ‘οΈ Data preservation is enabled for existing datasets\nπ View at: https://huggingface.co/datasets/{dataset_repo}"
|
| 732 |
except:
|
| 733 |
# Dataset doesn't exist, create it
|
| 734 |
pass
|
| 735 |
|
| 736 |
+
# Try to initialize dataset manager to use its repository creation
|
| 737 |
+
try:
|
| 738 |
+
# Import dataset manager
|
| 739 |
+
import sys
|
| 740 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
| 741 |
+
from dataset_utils import TrackioDatasetManager
|
| 742 |
+
|
| 743 |
+
# Create dataset manager instance
|
| 744 |
+
dataset_manager = TrackioDatasetManager(dataset_repo, hf_token)
|
| 745 |
+
|
| 746 |
+
# Check if dataset exists using the manager
|
| 747 |
+
exists = dataset_manager.check_dataset_exists()
|
| 748 |
+
if exists:
|
| 749 |
+
return f"β
Dataset {dataset_repo} already exists!\nπ‘οΈ Data preservation is enabled\nπ View at: https://huggingface.co/datasets/{dataset_repo}"
|
| 750 |
+
|
| 751 |
+
except ImportError:
|
| 752 |
+
# Dataset manager not available, use legacy method
|
| 753 |
+
pass
|
| 754 |
+
except Exception as e:
|
| 755 |
+
# Dataset manager failed, use legacy method
|
| 756 |
+
logger.warning(f"Dataset manager failed: {e}, using legacy method")
|
| 757 |
+
|
| 758 |
+
# Create empty dataset with proper structure
|
| 759 |
empty_dataset = Dataset.from_dict({
|
| 760 |
'experiment_id': [],
|
| 761 |
'name': [],
|
|
|
|
| 773 |
empty_dataset.push_to_hub(
|
| 774 |
dataset_repo,
|
| 775 |
token=hf_token,
|
| 776 |
+
private=True,
|
| 777 |
+
commit_message="Create Trackio experiment dataset with data preservation support"
|
| 778 |
)
|
| 779 |
|
| 780 |
+
return f"β
Dataset {dataset_repo} created successfully!\nπ‘οΈ Data preservation is now enabled\nπ View at: https://huggingface.co/datasets/{dataset_repo}\nπ Ready to store experiments safely"
|
| 781 |
|
| 782 |
except Exception as e:
|
| 783 |
+
return f"β Failed to create dataset: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token has write permissions\n2. Verify the username in the repository name\n3. Ensure the dataset name is valid\n4. Check internet connectivity"
|
| 784 |
|
| 785 |
# Initialize API client for remote data.
# Stays None when the client module is missing or construction fails,
# in which case the app uses local data only.
api_client = None
try:
    from trackio_api_client import TrackioAPIClient

    # Trackio endpoint comes from the environment, with a default Space URL.
    trackio_url = os.environ.get('TRACKIO_URL', 'https://tonic-test-trackio-test.hf.space')

    # Repair accidentally doubled schemes (e.g. "https://https://host").
    for _scheme in ('https://', 'http://'):
        _doubled = _scheme * 2
        if trackio_url.startswith(_doubled):
            trackio_url = trackio_url.replace(_doubled, _scheme)
            break

    api_client = TrackioAPIClient(trackio_url)
    logger.info(f"βœ… API client initialized for remote data access: {trackio_url}")
except ImportError:
    logger.warning("⚠️ API client not available, using local data only")
except Exception as e:
    logger.warning(f"⚠️ Failed to initialize API client: {e}, using local data only")
|
| 805 |
# Add Hugging Face Spaces compatibility
|
| 806 |
def is_huggingface_spaces():
|
|
|
|
| 844 |
lines = experiment_details.split('\n')
|
| 845 |
metrics_data = []
|
| 846 |
|
| 847 |
+
# First try to parse the new format with structured experiment details
|
| 848 |
for line in lines:
|
| 849 |
if 'Step:' in line and 'Metrics:' in line:
|
| 850 |
# Extract step and metrics from the line
|
|
|
|
| 866 |
logger.warning(f"Failed to parse metrics line: {line} - {e}")
|
| 867 |
continue
|
| 868 |
|
| 869 |
+
# If no metrics found in text format, try to parse from the dataset directly
|
| 870 |
+
if not metrics_data:
|
| 871 |
+
logger.info("No metrics found in text format, trying to parse from experiment structure")
|
| 872 |
+
# This will be handled by the updated get_remote_experiment_data function
|
| 873 |
+
|
| 874 |
if metrics_data:
|
| 875 |
return pd.DataFrame(metrics_data)
|
| 876 |
else:
|
|
|
|
| 881 |
return pd.DataFrame()
|
| 882 |
|
| 883 |
def get_metrics_dataframe(experiment_id: str) -> pd.DataFrame:
    """Get metrics as a pandas DataFrame for plotting - tries dataset first, then local backup"""
    try:
        # 1) Preferred source: the HF dataset via the dataset manager.
        manager = trackio_space.dataset_manager
        if manager:
            logger.info(f"Getting metrics for {experiment_id} from dataset")
            record = manager.get_experiment_by_id(experiment_id)
            if not record:
                logger.warning(f"Experiment {experiment_id} not found in dataset")
            else:
                raw_metrics = record.get('metrics', '[]')
                if not isinstance(raw_metrics, str):
                    logger.warning(f"Metrics data is not a JSON string for {experiment_id}")
                else:
                    try:
                        # Each entry is {'step': ..., 'timestamp': ..., 'metrics': {...}};
                        # flatten the inner metrics dict into the row.
                        rows = [
                            {'step': entry.get('step', 0),
                             'timestamp': entry.get('timestamp', ''),
                             **entry.get('metrics', {})}
                            for entry in json.loads(raw_metrics)
                            if isinstance(entry, dict)
                        ]
                        if rows:
                            logger.info(f"Found {len(rows)} metrics entries from dataset for {experiment_id}")
                            return pd.DataFrame(rows)
                        logger.warning(f"No valid metrics found in dataset for {experiment_id}")
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse metrics JSON for {experiment_id}: {e}")

        # 2) Legacy fallback: remote Trackio API.
        remote_data = get_remote_experiment_data(experiment_id)
        if remote_data:
            logger.info(f"Using remote API data for {experiment_id}")
            remote_df = parse_remote_metrics_data(remote_data["data"])
            if not remote_df.empty:
                logger.info(f"Found {len(remote_df)} metrics entries from remote API")
                return remote_df
            logger.warning(f"No metrics found in remote API data for {experiment_id}")

        # 3) Last resort: local backup held by the Space itself.
        logger.info(f"Using local backup data for {experiment_id}")
        return trackio_space.get_metrics_dataframe(experiment_id)

    except Exception as e:
        logger.error(f"Error getting metrics dataframe for {experiment_id}: {e}")
        logger.info(f"Falling back to local data for {experiment_id}")
        return trackio_space.get_metrics_dataframe(experiment_id)
| 944 |
def create_experiment_interface(name: str, description: str) -> str:
|
| 945 |
"""Create a new experiment"""
|
|
|
|
| 1196 |
except Exception as e:
|
| 1197 |
return f"β Error creating demo experiment: {str(e)}"
|
| 1198 |
|
| 1199 |
+
|
| 1200 |
+
# Helper functions for the new interface
|
| 1201 |
+
def get_experiment_dropdown_choices() -> list:
    """Return experiment IDs for the dropdown, or a placeholder when none exist."""
    ids = list(trackio_space.experiments)
    return ids if ids else ["No experiments available"]
|
| 1208 |
+
def refresh_experiment_dropdown() -> "gr.Dropdown":
    """Rebuild the experiment dropdown component.

    Returns:
        A ``gr.Dropdown`` whose choices are the current experiment IDs and
        whose value is the first real experiment, or ``None`` when no
        experiments exist.

    Note: the previous annotation claimed ``tuple`` although a single
    component has always been returned; the annotation is corrected here
    (behavior unchanged).
    """
    choices = get_experiment_dropdown_choices()
    # The placeholder entry must never be pre-selected as a value.
    current_value = choices[0] if choices and choices[0] != "No experiments available" else None
    return gr.Dropdown(choices=choices, value=current_value)
|
| 1214 |
+
def get_available_metrics_for_experiments(experiment_ids: list) -> list:
    """Collect the sorted union of numeric metric names across the given experiments."""
    try:
        discovered = set()
        for exp_id in experiment_ids:
            frame = get_metrics_dataframe(exp_id)
            if frame.empty:
                continue
            # Numeric columns only; 'step' is the x-axis, not a metric.
            numeric = frame.select_dtypes(include=[np.number]).columns
            discovered.update(col for col in numeric if col != 'step')
        return sorted(discovered)
    except Exception as e:
        logger.error(f"Error getting available metrics: {str(e)}")
        # Conservative default so the UI still has something to offer.
        return ["loss", "accuracy"]
|
| 1231 |
+
def create_test_plot() -> go.Figure:
    """Create a simple test plot to verify plotly rendering works"""
    try:
        # Fixed sample points; nothing here depends on experiment data.
        xs = [1, 2, 3, 4, 5]
        ys = [1, 4, 2, 3, 5]

        trace = go.Scatter(
            x=xs,
            y=ys,
            mode='lines+markers',
            name='Test Data',
            line=dict(width=2, color='blue'),
            marker=dict(size=5, color='red'),
            connectgaps=True,
            hovertemplate='<b>X:</b> %{x}<br><b>Y:</b> %{y}<extra></extra>'
        )
        fig = go.Figure(data=[trace])

        fig.update_layout(
            title="Test Plot - If you can see this, plotly is working!",
            xaxis_title="X Axis",
            yaxis_title="Y Axis",
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(size=14),
            margin=dict(l=50, r=50, t=80, b=50)
        )
        grid = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
        fig.update_xaxes(**grid)
        fig.update_yaxes(**grid)

        logger.info("Test plot created successfully")
        return fig

    except Exception as e:
        logger.error(f"Error creating test plot: {str(e)}")
        # Surface the failure inside the plot area itself.
        fallback = go.Figure()
        fallback.add_annotation(
            text=f"Test plot error: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="red")
        )
        return fallback
|
| 1277 |
+
def get_experiment_status_summary(experiment_id: str) -> str:
    """Build a human-readable status/metadata summary for one experiment.

    Args:
        experiment_id: ID of the experiment to summarize.

    Returns:
        A multi-line summary string, a "not found" notice, or an error
        message (this function never raises).

    Fix: field access now uses ``.get`` with defaults, matching the rest of
    the file. Previously a partially populated record (e.g. missing
    'artifacts') raised KeyError, which the except clause turned into a
    useless error string instead of a partial summary.
    """
    try:
        experiment = trackio_space.get_experiment(experiment_id)
        if not experiment:
            return f"Experiment {experiment_id} not found."

        metrics = experiment.get('metrics', [])
        summary = f"πŸ“Š EXPERIMENT STATUS SUMMARY\n{'='*50}\n"
        summary += f"ID: {experiment.get('id', experiment_id)}\n"
        summary += f"Name: {experiment.get('name', '')}\n"
        summary += f"Description: {experiment.get('description', '')}\n"
        summary += f"Status: {experiment.get('status', 'running')}\n"
        summary += f"Created: {experiment.get('created_at', '')}\n"
        summary += f"Metrics entries: {len(metrics)}\n"
        summary += f"Parameters: {len(experiment.get('parameters', {}))}\n"
        summary += f"Artifacts: {len(experiment.get('artifacts', []))}\n"
        summary += f"Logs: {len(experiment.get('logs', []))}\n"

        # Show the most recent metrics snapshot, if any were logged.
        if metrics:
            latest = metrics[-1]
            summary += f"\nπŸ“ˆ LATEST METRICS (Step {latest.get('step', 'N/A')}):\n"
            for k, v in latest.get('metrics', {}).items():
                summary += f"  {k}: {v}\n"

        return summary
    except Exception as e:
        return f"Error generating status summary: {str(e)}"
|
| 1306 |
+
def get_experiment_parameters_summary(experiment_id: str) -> str:
    """Format the logged parameters of an experiment, grouped by rough category."""
    try:
        experiment = trackio_space.get_experiment(experiment_id)
        if not experiment:
            return f"Experiment {experiment_id} not found."

        params = experiment.get('parameters', {})
        if not params:
            return "No parameters logged for this experiment."

        def _subset(pred):
            # Select parameters whose lower-cased key satisfies the predicate.
            return {k: v for k, v in params.items() if pred(k.lower())}

        # NOTE: a key may match several categories (matching the original
        # grouping behavior); 'other' only collects keys claimed by none.
        model_params = _subset(lambda k: 'model' in k or 'name' in k)
        training_params = _subset(
            lambda k: any(t in k for t in ('learning', 'batch', 'epoch', 'step', 'iter', 'optimizer'))
        )
        data_params = _subset(
            lambda k: any(t in k for t in ('data', 'dataset', 'file', 'split'))
        )
        other_params = {
            k: v for k, v in params.items()
            if k not in model_params and k not in training_params and k not in data_params
        }

        summary = f"πŸ”§ PARAMETERS FOR {experiment_id}\n{'='*50}\n"

        def _section(title, group, trailing_blank=True):
            block = title + "\n"
            for k, v in group.items():
                block += f"  {k}: {v}\n"
            return block + ("\n" if trailing_blank else "")

        if model_params:
            summary += _section("πŸ€– MODEL PARAMETERS:", model_params)
        if training_params:
            summary += _section("πŸ“ˆ TRAINING PARAMETERS:", training_params)
        if data_params:
            summary += _section("πŸ“Š DATA PARAMETERS:", data_params)
        if other_params:
            # The final section carries no trailing blank line.
            summary += _section("βš™οΈ OTHER PARAMETERS:", other_params, trailing_blank=False)

        return summary
    except Exception as e:
        return f"Error generating parameters summary: {str(e)}"
| 1352 |
+
def get_experiment_metrics_summary(experiment_id: str) -> str:
    """Summarize min/max/mean/latest for every numeric metric of an experiment."""
    try:
        df = get_metrics_dataframe(experiment_id)
        if df.empty:
            return (
                "No metrics data available for this experiment.\n\n"
                "πŸ’‘ This could mean:\n"
                "β€’ The experiment hasn't started logging metrics yet\n"
                "β€’ The experiment is using a different data format\n"
                "β€’ No training has been performed on this experiment"
            )

        # Numeric columns only; 'step' is the x-axis, not a metric.
        metric_cols = [
            c for c in df.select_dtypes(include=[np.number]).columns
            if c != 'step'
        ]
        if not metric_cols:
            return (
                "No numeric metrics found for this experiment.\n\n"
                "πŸ’‘ This could mean:\n"
                "β€’ Only timestamp data is available\n"
                "β€’ Metrics are stored in a different format\n"
                "β€’ The experiment hasn't logged any numeric metrics yet"
            )

        parts = [
            f"πŸ“Š METRICS SUMMARY FOR {experiment_id}\n{'='*50}\n",
            f"Total data points: {len(df)}\n",
            f"Steps range: {df['step'].min()} - {df['step'].max()}\n",
            f"Available metrics: {', '.join(metric_cols)}\n\n",
        ]
        for name in metric_cols:
            series = df[name].dropna()
            if series.empty:
                continue
            parts.append(
                f"{name}:\n"
                f"  Min: {series.min():.6f}\n"
                f"  Max: {series.max():.6f}\n"
                f"  Mean: {series.mean():.6f}\n"
                f"  Latest: {series.iloc[-1]:.6f}\n\n"
            )
        return "".join(parts)
    except Exception as e:
        return f"Error generating metrics summary: {str(e)}"
| 1385 |
+
def create_combined_metrics_plot(experiment_id: str) -> go.Figure:
    """Render every numeric metric of an experiment as a grid of subplots."""

    def _notice(message: str, title: str, color: str) -> go.Figure:
        # Placeholder figure used for the empty-selection / no-data states.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=16, color=color)
        )
        placeholder.update_layout(
            title=title, plot_bgcolor='white', paper_bgcolor='white'
        )
        return placeholder

    try:
        if not experiment_id:
            return _notice("No experiment selected", "Select an Experiment", "gray")

        df = get_metrics_dataframe(experiment_id)
        if df.empty:
            return _notice("No metrics data available for this experiment",
                           "No Data Available", "red")

        # Numeric columns only; 'step' is the shared x-axis.
        metric_names = [
            c for c in df.select_dtypes(include=[np.number]).columns
            if c != 'step'
        ]
        if not metric_names:
            return _notice("No numeric metrics found for this experiment",
                           "No Metrics Found", "orange")

        from plotly.subplots import make_subplots

        # Lay metrics out in up to three columns.
        n_cols = min(3, len(metric_names))
        n_rows = -(-len(metric_names) // n_cols)  # ceiling division
        fig = make_subplots(
            rows=n_rows, cols=n_cols,
            subplot_titles=metric_names,
            vertical_spacing=0.05,
            horizontal_spacing=0.1
        )

        palette = ['blue', 'red', 'green', 'orange', 'purple',
                   'brown', 'pink', 'gray', 'cyan', 'magenta']
        steps = df['step'].tolist()
        for idx, metric in enumerate(metric_names):
            # Skip all-NaN series to keep the grid clean.
            if metric not in df.columns or df[metric].isna().all():
                continue
            color = palette[idx % len(palette)]
            fig.add_trace(
                go.Scatter(
                    x=steps,
                    y=df[metric].tolist(),
                    mode='lines+markers',
                    name=metric,
                    line=dict(width=2, color=color),
                    marker=dict(size=4, color=color),
                    showlegend=False,
                    connectgaps=True
                ),
                row=idx // n_cols + 1,
                col=idx % n_cols + 1
            )

        fig.update_layout(
            title=f"All Metrics for Experiment {experiment_id}",
            height=350 * n_rows,
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(size=12),
            margin=dict(l=50, r=50, t=80, b=50)
        )

        # Apply the same grid styling to every subplot cell.
        grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray',
                          zeroline=True, zerolinecolor='black')
        for r in range(1, n_rows + 1):
            for c in range(1, n_cols + 1):
                fig.update_xaxes(row=r, col=c, **grid_style)
                fig.update_yaxes(row=r, col=c, **grid_style)

        return fig

    except Exception as e:
        logger.error(f"Error creating combined metrics plot: {str(e)}")
        err_fig = go.Figure()
        err_fig.add_annotation(
            text=f"Error creating combined plot: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="red")
        )
        return err_fig
| 1509 |
+
def update_dashboard(experiment_id: str) -> tuple:
    """Refresh every dashboard panel for the chosen experiment.

    Returns a 5-tuple: (status summary, parameters summary, metrics summary,
    combined metrics plot, combined text summary).
    """
    try:
        if not experiment_id or experiment_id == "No experiments available":
            placeholder = "No experiment selected."
            return (
                "Please select an experiment from the dropdown.",
                placeholder,
                placeholder,
                create_combined_metrics_plot(""),
                placeholder,
            )

        status = get_experiment_status_summary(experiment_id)
        parameters = get_experiment_parameters_summary(experiment_id)
        metrics = get_experiment_metrics_summary(experiment_id)
        plot = create_combined_metrics_plot(experiment_id)

        # One scrollable text block that concatenates all three summaries.
        combined = f"{status}\n\n{parameters}\n\n{metrics}"

        return (status, parameters, metrics, plot, combined)
    except Exception as e:
        error_msg = f"Error updating dashboard: {str(e)}"
        return (error_msg, error_msg, error_msg, create_combined_metrics_plot(""), error_msg)
| 1541 |
+
def update_dashboard_metric_plot(experiment_id: str, metric_name: str = "loss") -> go.Figure:
    """Plot one metric for the selected experiment (blank plot when none is selected)."""
    try:
        # The dropdown placeholder counts as "nothing selected".
        no_selection = not experiment_id or experiment_id == "No experiments available"
        target = "" if no_selection else experiment_id
        return create_metrics_plot(target, metric_name)
    except Exception as e:
        logger.error(f"Error updating dashboard metric plot: {str(e)}")
        return create_metrics_plot("", metric_name)
| 1552 |
+
def create_experiment_comparison_from_selection(selected_experiments: list, selected_metrics: list) -> go.Figure:
    """Compare the checkbox-selected experiments on the selected metrics."""

    def _selection_notice(message: str, title: str) -> go.Figure:
        # Placeholder shown when either selection list is empty.
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=16, color="orange")
        )
        placeholder.update_layout(
            title=title, plot_bgcolor='white', paper_bgcolor='white'
        )
        return placeholder

    try:
        if not selected_experiments:
            return _selection_notice(
                "Please select at least one experiment to compare",
                "No Experiments Selected",
            )
        if not selected_metrics:
            return _selection_notice(
                "Please select at least one metric to compare",
                "No Metrics Selected",
            )

        # Delegate to the comma-separated-ID comparison entry point.
        return create_experiment_comparison(",".join(selected_experiments))

    except Exception as e:
        logger.error(f"Error creating comparison from selection: {str(e)}")
        err_fig = go.Figure()
        err_fig.add_annotation(
            text=f"Error creating comparison: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="red")
        )
        return err_fig
| 1598 |
+
def refresh_comparison_options() -> tuple:
    """Rebuild the experiment and metric checkbox groups for the comparison tab."""
    try:
        # Refresh the selectable experiments; the sentinel entry means
        # there is currently nothing to choose from.
        experiment_choices = get_experiment_dropdown_choices()
        if experiment_choices == ["No experiments available"]:
            experiment_choices = []

        # Collect every metric logged by any known experiment.
        available_metrics = get_available_metrics_for_experiments(
            list(trackio_space.experiments.keys())
        )

        # Pre-select the well-known metrics that are actually present.
        default_metrics = [
            m for m in ("loss", "accuracy", "learning_rate", "gpu_memory")
            if m in available_metrics
        ]

        # Otherwise fall back to the first couple of metrics we do have.
        if not default_metrics and available_metrics:
            default_metrics = available_metrics[:2]

        return (
            gr.CheckboxGroup(choices=experiment_choices, value=[]),
            gr.CheckboxGroup(choices=available_metrics, value=default_metrics),
        )
    except Exception as e:
        logger.error(f"Error refreshing comparison options: {str(e)}")
        return gr.CheckboxGroup(choices=[], value=[]), gr.CheckboxGroup(choices=["loss", "accuracy"], value=[])
|
| 1625 |
+
|
| 1626 |
# Create Gradio interface
|
| 1627 |
with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
|
| 1628 |
gr.Markdown("# π Trackio Experiment Tracking & Monitoring")
|
| 1629 |
gr.Markdown("Monitor and track your ML experiments with real-time visualization!")
|
| 1630 |
|
| 1631 |
with gr.Tabs():
|
| 1632 |
+
# Dashboard Tab (NEW)
|
| 1633 |
+
with gr.Tab("π Dashboard"):
|
| 1634 |
+
gr.Markdown("### Comprehensive Experiment Dashboard")
|
| 1635 |
+
gr.Markdown("Select an experiment to view all its data, plots, and information in one place.")
|
| 1636 |
+
|
| 1637 |
+
# Row 1: Experiment Selection
|
| 1638 |
+
with gr.Row():
|
| 1639 |
+
with gr.Column(scale=3):
|
| 1640 |
+
# Experiment selection dropdown
|
| 1641 |
+
experiment_dropdown = gr.Dropdown(
|
| 1642 |
+
label="Select Experiment",
|
| 1643 |
+
choices=get_experiment_dropdown_choices(),
|
| 1644 |
+
value=get_experiment_dropdown_choices()[0] if get_experiment_dropdown_choices() and get_experiment_dropdown_choices()[0] != "No experiments available" else None,
|
| 1645 |
+
info="Choose an experiment to view its dashboard"
|
| 1646 |
+
)
|
| 1647 |
+
|
| 1648 |
+
with gr.Column(scale=1):
|
| 1649 |
+
with gr.Row():
|
| 1650 |
+
refresh_dropdown_btn = gr.Button("π Refresh List", variant="secondary", size="sm")
|
| 1651 |
+
refresh_dashboard_btn = gr.Button("π Refresh Dashboard", variant="primary", size="sm")
|
| 1652 |
+
|
| 1653 |
+
# Row 2: All Metrics Plots
|
| 1654 |
+
with gr.Row():
|
| 1655 |
+
with gr.Column(scale=3):
|
| 1656 |
+
with gr.Row():
|
| 1657 |
+
gr.Markdown("### π All Metrics Plots")
|
| 1658 |
+
with gr.Row():
|
| 1659 |
+
with gr.Column(scale=3):
|
| 1660 |
+
dashboard_plots = gr.Plot(
|
| 1661 |
+
label="Training Metrics",
|
| 1662 |
+
container=True,
|
| 1663 |
+
show_label=True,
|
| 1664 |
+
elem_classes=["plot-container"]
|
| 1665 |
+
)
|
| 1666 |
+
|
| 1667 |
+
# Row 3: Training Metrics Visualization Accordion
|
| 1668 |
+
with gr.Row():
|
| 1669 |
+
with gr.Accordion("π Training Metrics Visualization", open=False):
|
| 1670 |
+
with gr.Row():
|
| 1671 |
+
with gr.Column():
|
| 1672 |
+
metric_dropdown = gr.Dropdown(
|
| 1673 |
+
label="Metric to Plot",
|
| 1674 |
+
choices=[
|
| 1675 |
+
"loss", "accuracy", "learning_rate", "gpu_memory", "training_time",
|
| 1676 |
+
"total_tokens", "truncated_tokens", "padding_tokens", "throughput", "step_time",
|
| 1677 |
+
"batch_size", "seq_len", "token_acc", "train/gate_ortho", "train/center"
|
| 1678 |
+
],
|
| 1679 |
+
value="loss"
|
| 1680 |
+
)
|
| 1681 |
+
plot_btn = gr.Button("Create Plot", variant="primary")
|
| 1682 |
+
test_plot_btn = gr.Button("Test Plot Rendering", variant="secondary")
|
| 1683 |
+
|
| 1684 |
+
with gr.Row():
|
| 1685 |
+
dashboard_metric_plot = gr.Plot(
|
| 1686 |
+
label="Training Metrics",
|
| 1687 |
+
container=True,
|
| 1688 |
+
show_label=True,
|
| 1689 |
+
elem_classes=["plot-container"]
|
| 1690 |
+
)
|
| 1691 |
+
|
| 1692 |
+
plot_btn.click(
|
| 1693 |
+
create_metrics_plot,
|
| 1694 |
+
inputs=[experiment_dropdown, metric_dropdown],
|
| 1695 |
+
outputs=dashboard_metric_plot
|
| 1696 |
+
)
|
| 1697 |
+
|
| 1698 |
+
test_plot_btn.click(
|
| 1699 |
+
create_test_plot,
|
| 1700 |
+
inputs=[],
|
| 1701 |
+
outputs=dashboard_metric_plot
|
| 1702 |
+
)
|
| 1703 |
+
|
| 1704 |
+
# Row 4: Accordion with Detailed Information
|
| 1705 |
+
with gr.Row():
|
| 1706 |
+
with gr.Accordion("π Experiment Details", open=False):
|
| 1707 |
+
with gr.Tabs():
|
| 1708 |
+
with gr.Tab("π Status"):
|
| 1709 |
+
dashboard_status = gr.Textbox(
|
| 1710 |
+
label="Experiment Status",
|
| 1711 |
+
lines=8,
|
| 1712 |
+
interactive=False
|
| 1713 |
+
)
|
| 1714 |
+
|
| 1715 |
+
with gr.Tab("π§ Parameters"):
|
| 1716 |
+
dashboard_parameters = gr.Textbox(
|
| 1717 |
+
label="Experiment Parameters",
|
| 1718 |
+
lines=12,
|
| 1719 |
+
interactive=False
|
| 1720 |
+
)
|
| 1721 |
+
|
| 1722 |
+
with gr.Tab("π Metrics Summary"):
|
| 1723 |
+
dashboard_metrics = gr.Textbox(
|
| 1724 |
+
label="Metrics Summary",
|
| 1725 |
+
lines=12,
|
| 1726 |
+
interactive=False
|
| 1727 |
+
)
|
| 1728 |
+
|
| 1729 |
+
with gr.Tab("π Complete Summary"):
|
| 1730 |
+
dashboard_summary = gr.Textbox(
|
| 1731 |
+
label="Full Experiment Summary",
|
| 1732 |
+
lines=20,
|
| 1733 |
+
interactive=False
|
| 1734 |
+
)
|
| 1735 |
+
|
| 1736 |
+
# Connect the dashboard update function
|
| 1737 |
+
experiment_dropdown.change(
|
| 1738 |
+
update_dashboard,
|
| 1739 |
+
inputs=[experiment_dropdown],
|
| 1740 |
+
outputs=[dashboard_status, dashboard_parameters, dashboard_metrics, dashboard_plots, dashboard_summary]
|
| 1741 |
+
)
|
| 1742 |
+
|
| 1743 |
+
refresh_dashboard_btn.click(
|
| 1744 |
+
update_dashboard,
|
| 1745 |
+
inputs=[experiment_dropdown],
|
| 1746 |
+
outputs=[dashboard_status, dashboard_parameters, dashboard_metrics, dashboard_plots, dashboard_summary]
|
| 1747 |
+
)
|
| 1748 |
+
|
| 1749 |
+
# Connect the metric plot update function
|
| 1750 |
+
metric_dropdown.change(
|
| 1751 |
+
update_dashboard_metric_plot,
|
| 1752 |
+
inputs=[experiment_dropdown, metric_dropdown],
|
| 1753 |
+
outputs=[dashboard_metric_plot]
|
| 1754 |
+
)
|
| 1755 |
+
|
| 1756 |
+
refresh_dropdown_btn.click(
|
| 1757 |
+
refresh_experiment_dropdown,
|
| 1758 |
+
inputs=[],
|
| 1759 |
+
outputs=[experiment_dropdown]
|
| 1760 |
+
)
|
| 1761 |
+
|
| 1762 |
+
|
| 1763 |
+
# Experiment Comparison Tab
|
| 1764 |
+
with gr.Tab("π Experiment Comparison"):
|
| 1765 |
+
gr.Markdown("### Compare Multiple Experiments")
|
| 1766 |
+
gr.Markdown("Select experiments and metrics to compare from the available options below.")
|
| 1767 |
+
|
| 1768 |
+
# Selection controls
|
| 1769 |
+
with gr.Row():
|
| 1770 |
+
with gr.Column(scale=2):
|
| 1771 |
+
gr.Markdown("### Available Experiments")
|
| 1772 |
+
experiment_checkboxes = gr.CheckboxGroup(
|
| 1773 |
+
label="Select Experiments to Compare",
|
| 1774 |
+
choices=get_experiment_dropdown_choices(),
|
| 1775 |
+
value=[],
|
| 1776 |
+
info="Choose experiments to include in the comparison"
|
| 1777 |
+
)
|
| 1778 |
+
|
| 1779 |
+
gr.Markdown("### Available Metrics")
|
| 1780 |
+
metric_checkboxes = gr.CheckboxGroup(
|
| 1781 |
+
label="Select Metrics to Compare",
|
| 1782 |
+
choices=get_available_metrics_for_experiments(list(trackio_space.experiments.keys())),
|
| 1783 |
+
value=["loss", "accuracy"],
|
| 1784 |
+
info="Choose metrics to include in the comparison"
|
| 1785 |
+
)
|
| 1786 |
+
|
| 1787 |
+
with gr.Row():
|
| 1788 |
+
comparison_btn = gr.Button("Compare Selected", variant="primary")
|
| 1789 |
+
refresh_options_btn = gr.Button("π Refresh Options", variant="secondary")
|
| 1790 |
+
|
| 1791 |
+
with gr.Column(scale=1):
|
| 1792 |
+
gr.Markdown("### Comparison Results")
|
| 1793 |
+
gr.Markdown("The comparison will show subplots for the selected metrics across the selected experiments.")
|
| 1794 |
+
|
| 1795 |
+
# Comparison plots as subplots
|
| 1796 |
+
comparison_plot = gr.Plot(
|
| 1797 |
+
label="Experiment Comparison Dashboard",
|
| 1798 |
+
container=True,
|
| 1799 |
+
show_label=True,
|
| 1800 |
+
elem_classes=["plot-container"]
|
| 1801 |
+
)
|
| 1802 |
+
|
| 1803 |
+
comparison_btn.click(
|
| 1804 |
+
create_experiment_comparison_from_selection,
|
| 1805 |
+
inputs=[experiment_checkboxes, metric_checkboxes],
|
| 1806 |
+
outputs=comparison_plot
|
| 1807 |
+
)
|
| 1808 |
+
|
| 1809 |
+
refresh_options_btn.click(
|
| 1810 |
+
refresh_comparison_options,
|
| 1811 |
+
inputs=[],
|
| 1812 |
+
outputs=[experiment_checkboxes, metric_checkboxes]
|
| 1813 |
+
)
|
| 1814 |
+
|
| 1815 |
# Configuration Tab
|
| 1816 |
with gr.Tab("βοΈ Configuration"):
|
| 1817 |
gr.Markdown("### Configure HF Datasets Connection")
|
|
|
|
| 1828 |
dataset_repo_input = gr.Textbox(
|
| 1829 |
label="Dataset Repository",
|
| 1830 |
placeholder="your-username/your-dataset-name",
|
| 1831 |
+
value="Tonic/trackio-experiments",
|
| 1832 |
info="HF Dataset repository for experiment storage"
|
| 1833 |
)
|
| 1834 |
|
|
|
|
| 1840 |
gr.Markdown("### Current Configuration")
|
| 1841 |
current_config_output = gr.Textbox(
|
| 1842 |
label="Status",
|
| 1843 |
+
lines=10,
|
| 1844 |
interactive=False,
|
| 1845 |
+
value=f"π Dataset: {trackio_space.dataset_repo}\nπ HF Token: {'Set' if trackio_space.hf_token else 'Not set'}\nπ‘οΈ Data Preservation: {'β
Enabled' if trackio_space.dataset_manager else 'β οΈ Legacy Mode'}\nπ Experiments: {len(trackio_space.experiments)}\nπ Available Experiments: {', '.join(list(trackio_space.experiments.keys())[:3])}{'...' if len(trackio_space.experiments) > 3 else ''}"
|
| 1846 |
)
|
| 1847 |
|
| 1848 |
with gr.Column():
|
|
|
|
| 1865 |
- `HF_TOKEN`: Your Hugging Face token
|
| 1866 |
- `TRACKIO_DATASET_REPO`: Dataset repository
|
| 1867 |
|
| 1868 |
+
**Data Preservation:**
|
| 1869 |
+
- β
**Enabled**: All experiment data is preserved when adding/updating experiments
|
| 1870 |
+
- β οΈ **Legacy Mode**: Data preservation not guaranteed (fallback mode)
|
| 1871 |
+
- Data preservation requires the dataset management utilities to be available
|
| 1872 |
+
|
| 1873 |
**Actions:**
|
| 1874 |
- **Update Configuration**: Apply new settings and reload experiments
|
| 1875 |
- **Test Connection**: Verify access to the dataset repository
|
| 1876 |
- **Create Dataset**: Create a new dataset repository if it doesn't exist
|
| 1877 |
""")
|
| 1878 |
|
| 1879 |
+
# Experiment Management Accordion
|
| 1880 |
+
with gr.Accordion("π§ Experiment Management", open=False):
|
| 1881 |
+
with gr.Tabs():
|
| 1882 |
+
# Create Experiment Tab
|
| 1883 |
+
with gr.Tab("Create Experiment"):
|
| 1884 |
+
gr.Markdown("### Create a New Experiment")
|
| 1885 |
+
with gr.Row():
|
| 1886 |
+
with gr.Column():
|
| 1887 |
+
create_exp_name = gr.Textbox(
|
| 1888 |
+
label="Experiment Name",
|
| 1889 |
+
placeholder="my_smollm3_finetune",
|
| 1890 |
+
value="smollm3_finetune"
|
| 1891 |
+
)
|
| 1892 |
+
create_exp_description = gr.Textbox(
|
| 1893 |
+
label="Description",
|
| 1894 |
+
placeholder="Fine-tuning SmolLM3 model on custom dataset",
|
| 1895 |
+
value="SmolLM3 fine-tuning experiment"
|
| 1896 |
+
)
|
| 1897 |
+
create_exp_btn = gr.Button("Create Experiment", variant="primary")
|
| 1898 |
+
|
| 1899 |
+
with gr.Column():
|
| 1900 |
+
create_exp_output = gr.Textbox(
|
| 1901 |
+
label="Result",
|
| 1902 |
+
lines=5,
|
| 1903 |
+
interactive=False
|
| 1904 |
+
)
|
| 1905 |
+
|
| 1906 |
+
create_exp_btn.click(
|
| 1907 |
+
create_experiment_interface,
|
| 1908 |
+
inputs=[create_exp_name, create_exp_description],
|
| 1909 |
+
outputs=[create_exp_output, experiment_dropdown]
|
| 1910 |
+
)
|
| 1911 |
+
|
| 1912 |
+
# Log Metrics Tab
|
| 1913 |
+
with gr.Tab("Log Metrics"):
|
| 1914 |
+
gr.Markdown("### Log Training Metrics")
|
| 1915 |
+
with gr.Row():
|
| 1916 |
+
with gr.Column():
|
| 1917 |
+
log_metrics_exp_id = gr.Textbox(
|
| 1918 |
+
label="Experiment ID",
|
| 1919 |
+
placeholder="exp_20231201_143022"
|
| 1920 |
+
)
|
| 1921 |
+
log_metrics_json = gr.Textbox(
|
| 1922 |
+
label="Metrics (JSON)",
|
| 1923 |
+
placeholder='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5}',
|
| 1924 |
+
value='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5, "gpu_memory": 22.5}'
|
| 1925 |
+
)
|
| 1926 |
+
log_metrics_step = gr.Textbox(
|
| 1927 |
+
label="Step (optional)",
|
| 1928 |
+
placeholder="100"
|
| 1929 |
+
)
|
| 1930 |
+
log_metrics_btn = gr.Button("Log Metrics", variant="primary")
|
| 1931 |
+
|
| 1932 |
+
with gr.Column():
|
| 1933 |
+
log_metrics_output = gr.Textbox(
|
| 1934 |
+
label="Result",
|
| 1935 |
+
lines=5,
|
| 1936 |
+
interactive=False
|
| 1937 |
+
)
|
| 1938 |
+
|
| 1939 |
+
log_metrics_btn.click(
|
| 1940 |
+
log_metrics_interface,
|
| 1941 |
+
inputs=[log_metrics_exp_id, log_metrics_json, log_metrics_step],
|
| 1942 |
+
outputs=log_metrics_output
|
| 1943 |
+
)
|
| 1944 |
+
|
| 1945 |
+
# Log Parameters Tab
|
| 1946 |
+
with gr.Tab("Log Parameters"):
|
| 1947 |
+
gr.Markdown("### Log Experiment Parameters")
|
| 1948 |
+
with gr.Row():
|
| 1949 |
+
with gr.Column():
|
| 1950 |
+
log_params_exp_id = gr.Textbox(
|
| 1951 |
+
label="Experiment ID",
|
| 1952 |
+
placeholder="exp_20231201_143022"
|
| 1953 |
+
)
|
| 1954 |
+
log_params_json = gr.Textbox(
|
| 1955 |
+
label="Parameters (JSON)",
|
| 1956 |
+
placeholder='{"learning_rate": 2e-5, "batch_size": 4}',
|
| 1957 |
+
value='{"learning_rate": 3.5e-6, "batch_size": 8, "model_name": "HuggingFaceTB/SmolLM3-3B", "max_iters": 18000, "mixed_precision": "bf16"}'
|
| 1958 |
+
)
|
| 1959 |
+
log_params_btn = gr.Button("Log Parameters", variant="primary")
|
| 1960 |
+
|
| 1961 |
+
with gr.Column():
|
| 1962 |
+
log_params_output = gr.Textbox(
|
| 1963 |
+
label="Result",
|
| 1964 |
+
lines=5,
|
| 1965 |
+
interactive=False
|
| 1966 |
+
)
|
| 1967 |
+
|
| 1968 |
+
log_params_btn.click(
|
| 1969 |
+
log_parameters_interface,
|
| 1970 |
+
inputs=[log_params_exp_id, log_params_json],
|
| 1971 |
+
outputs=log_params_output
|
| 1972 |
+
)
|
| 1973 |
+
|
| 1974 |
+
# View Experiments Tab
|
| 1975 |
+
with gr.Tab("View Experiments"):
|
| 1976 |
+
gr.Markdown("### View Experiment Details")
|
| 1977 |
+
with gr.Row():
|
| 1978 |
+
with gr.Column():
|
| 1979 |
+
view_exp_id = gr.Textbox(
|
| 1980 |
+
label="Experiment ID",
|
| 1981 |
+
placeholder="exp_20231201_143022"
|
| 1982 |
+
)
|
| 1983 |
+
view_btn = gr.Button("View Experiment", variant="primary")
|
| 1984 |
+
list_btn = gr.Button("List All Experiments", variant="secondary")
|
| 1985 |
+
|
| 1986 |
+
with gr.Column():
|
| 1987 |
+
view_output = gr.Textbox(
|
| 1988 |
+
label="Experiment Details",
|
| 1989 |
+
lines=20,
|
| 1990 |
+
interactive=False
|
| 1991 |
+
)
|
| 1992 |
+
|
| 1993 |
+
view_btn.click(
|
| 1994 |
+
get_experiment_details,
|
| 1995 |
+
inputs=[view_exp_id],
|
| 1996 |
+
outputs=view_output
|
| 1997 |
+
)
|
| 1998 |
+
|
| 1999 |
+
list_btn.click(
|
| 2000 |
+
list_experiments_interface,
|
| 2001 |
+
inputs=[],
|
| 2002 |
+
outputs=view_output
|
| 2003 |
+
)
|
| 2004 |
+
|
| 2005 |
+
# Update Status Tab
|
| 2006 |
+
with gr.Tab("Update Status"):
|
| 2007 |
+
gr.Markdown("### Update Experiment Status")
|
| 2008 |
+
with gr.Row():
|
| 2009 |
+
with gr.Column():
|
| 2010 |
+
status_exp_id = gr.Textbox(
|
| 2011 |
+
label="Experiment ID",
|
| 2012 |
+
placeholder="exp_20231201_143022"
|
| 2013 |
+
)
|
| 2014 |
+
status_dropdown = gr.Dropdown(
|
| 2015 |
+
label="Status",
|
| 2016 |
+
choices=["running", "completed", "failed", "paused"],
|
| 2017 |
+
value="running"
|
| 2018 |
+
)
|
| 2019 |
+
update_status_btn = gr.Button("Update Status", variant="primary")
|
| 2020 |
+
|
| 2021 |
+
with gr.Column():
|
| 2022 |
+
status_output = gr.Textbox(
|
| 2023 |
+
label="Result",
|
| 2024 |
+
lines=3,
|
| 2025 |
+
interactive=False
|
| 2026 |
+
)
|
| 2027 |
+
|
| 2028 |
+
update_status_btn.click(
|
| 2029 |
+
update_experiment_status_interface,
|
| 2030 |
+
inputs=[status_exp_id, status_dropdown],
|
| 2031 |
+
outputs=status_output
|
| 2032 |
+
)
|
| 2033 |
+
|
| 2034 |
+
# Demo Data Tab
|
| 2035 |
+
with gr.Tab("Demo Data"):
|
| 2036 |
+
gr.Markdown("### Generate Demo Training Data")
|
| 2037 |
+
gr.Markdown("Use this to simulate training data for testing the interface")
|
| 2038 |
+
with gr.Row():
|
| 2039 |
+
with gr.Column():
|
| 2040 |
+
demo_exp_id = gr.Textbox(
|
| 2041 |
+
label="Experiment ID",
|
| 2042 |
+
placeholder="exp_20231201_143022"
|
| 2043 |
+
)
|
| 2044 |
+
demo_btn = gr.Button("Generate Demo Data", variant="primary")
|
| 2045 |
+
create_demo_btn = gr.Button("Create Demo Experiment", variant="secondary")
|
| 2046 |
+
|
| 2047 |
+
with gr.Column():
|
| 2048 |
+
demo_output = gr.Textbox(
|
| 2049 |
+
label="Result",
|
| 2050 |
+
lines=5,
|
| 2051 |
+
interactive=False
|
| 2052 |
+
)
|
| 2053 |
+
|
| 2054 |
+
demo_btn.click(
|
| 2055 |
+
simulate_training_data,
|
| 2056 |
+
inputs=[demo_exp_id],
|
| 2057 |
+
outputs=[demo_output, dashboard_status, dashboard_parameters, dashboard_metrics, dashboard_plots, dashboard_summary]
|
| 2058 |
+
)
|
| 2059 |
+
|
| 2060 |
+
create_demo_btn.click(
|
| 2061 |
+
create_demo_experiment,
|
| 2062 |
+
inputs=[],
|
| 2063 |
+
outputs=[demo_output, experiment_dropdown]
|
| 2064 |
+
)
|
| 2065 |
+
|
| 2066 |
update_config_btn.click(
|
| 2067 |
update_trackio_config,
|
| 2068 |
inputs=[hf_token_input, dataset_repo_input],
|
|
|
|
| 2080 |
inputs=[hf_token_input, dataset_repo_input],
|
| 2081 |
outputs=current_config_output
|
| 2082 |
)
|
| 2083 |
+
|
| 2084 |
|
| 2085 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2086 |
|
| 2087 |
# Launch the app
|
| 2088 |
if __name__ == "__main__":
|
templates/spaces/trackio/dataset_utils.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Dataset utilities for Trackio experiment data management
|
| 4 |
+
Provides functions for safe dataset operations with data preservation
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from typing import Dict, Any, List, Optional, Union
|
| 11 |
+
from datasets import Dataset, load_dataset
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
class TrackioDatasetManager:
    """
    Manager class for Trackio experiment datasets with data preservation.

    Every write first merges with the experiments already stored on the Hub,
    so adding or updating one experiment never drops the others.

    Attributes:
        dataset_repo: HF dataset repository ID (e.g. "username/dataset-name").
        hf_token: Hugging Face token used for all hub operations.
    """

    def __init__(self, dataset_repo: str, hf_token: str):
        """
        Initialize the dataset manager.

        Args:
            dataset_repo (str): HF dataset repository ID (e.g., "username/dataset-name")
            hf_token (str): Hugging Face token for authentication

        Raises:
            ValueError: If dataset_repo is not in "namespace/name" form.
        """
        self.dataset_repo = dataset_repo
        self.hf_token = hf_token
        self._validate_repo_format()

    def _validate_repo_format(self):
        """Validate dataset repository format (must be non-empty and contain '/')."""
        if not self.dataset_repo or '/' not in self.dataset_repo:
            raise ValueError(f"Invalid dataset repository format: {self.dataset_repo}")

    def check_dataset_exists(self) -> bool:
        """
        Check if the dataset repository exists and is accessible.

        NOTE: this downloads the dataset just to probe access. If you need
        the data anyway, call load_existing_experiments() directly instead.

        Returns:
            bool: True if dataset exists and is accessible, False otherwise
        """
        try:
            load_dataset(self.dataset_repo, token=self.hf_token)
            logger.info(f"✅ Dataset {self.dataset_repo} exists and is accessible")
            return True
        except Exception as e:
            logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
            return False

    def load_existing_experiments(self) -> List[Dict[str, Any]]:
        """
        Load all existing experiments from the dataset.

        Returns:
            List[Dict[str, Any]]: Structurally valid experiment dictionaries
            (invalid rows are skipped with a warning). Empty list when the
            dataset does not exist or cannot be read.
        """
        try:
            # Load once, directly. The original probed with
            # check_dataset_exists() first, which downloaded the entire
            # dataset a second time just to test accessibility.
            try:
                dataset = load_dataset(self.dataset_repo, token=self.hf_token)
            except Exception as e:
                logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
                return []

            if 'train' not in dataset:
                logger.info("📊 No 'train' split found in dataset")
                return []

            experiments = list(dataset['train'])
            logger.info(f"📊 Loaded {len(experiments)} existing experiments")

            # Keep only structurally valid rows so one corrupt experiment
            # cannot poison later saves.
            valid_experiments = []
            for exp in experiments:
                if self._validate_experiment_structure(exp):
                    valid_experiments.append(exp)
                else:
                    logger.warning(f"⚠️ Skipping invalid experiment: {exp.get('experiment_id', 'unknown')}")

            logger.info(f"📊 {len(valid_experiments)} valid experiments loaded")
            return valid_experiments

        except Exception as e:
            logger.error(f"❌ Failed to load existing experiments: {e}")
            return []

    def _validate_experiment_structure(self, experiment: Dict[str, Any]) -> bool:
        """
        Validate that an experiment has the required structure.

        Required fields must be present, and any of the JSON-carrying fields
        (metrics/parameters/artifacts/logs) that are stored as strings must
        parse as JSON. Non-string values for those fields are accepted as-is.

        Args:
            experiment (Dict[str, Any]): Experiment dictionary to validate

        Returns:
            bool: True if experiment structure is valid
        """
        required_fields = [
            'experiment_id', 'name', 'description', 'created_at',
            'status', 'metrics', 'parameters', 'artifacts', 'logs'
        ]

        for field in required_fields:
            if field not in experiment:
                logger.warning(f"⚠️ Missing required field '{field}' in experiment")
                return False

        # Validate JSON fields
        json_fields = ['metrics', 'parameters', 'artifacts', 'logs']
        for field in json_fields:
            if isinstance(experiment[field], str):
                try:
                    json.loads(experiment[field])
                except json.JSONDecodeError:
                    logger.warning(f"⚠️ Invalid JSON in field '{field}' for experiment {experiment.get('experiment_id')}")
                    return False

        return True

    def save_experiments(self, experiments: List[Dict[str, Any]], commit_message: Optional[str] = None) -> bool:
        """
        Save a list of experiments to the dataset, preserving data integrity.

        All experiments are validated before anything is pushed: one invalid
        entry aborts the whole save so a partial/corrupt dataset is never
        written. The dataset is pushed as private.

        Args:
            experiments (List[Dict[str, Any]]): List of experiment dictionaries
            commit_message (Optional[str]): Custom commit message

        Returns:
            bool: True if save was successful, False otherwise
        """
        try:
            if not experiments:
                logger.warning("⚠️ No experiments to save")
                return False

            # Validate all experiments before saving
            valid_experiments = []
            for exp in experiments:
                if self._validate_experiment_structure(exp):
                    # Ensure last_updated is set
                    if 'last_updated' not in exp:
                        exp['last_updated'] = datetime.now().isoformat()
                    valid_experiments.append(exp)
                else:
                    logger.error(f"❌ Invalid experiment structure: {exp.get('experiment_id', 'unknown')}")
                    return False

            # Create dataset
            dataset = Dataset.from_list(valid_experiments)

            # Generate commit message if not provided
            if not commit_message:
                commit_message = f"Update dataset with {len(valid_experiments)} experiments ({datetime.now().isoformat()})"

            # Push to hub (kept private: experiment data may contain
            # sensitive parameters/paths)
            dataset.push_to_hub(
                self.dataset_repo,
                token=self.hf_token,
                private=True,
                commit_message=commit_message
            )

            logger.info(f"✅ Successfully saved {len(valid_experiments)} experiments to {self.dataset_repo}")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to save experiments to dataset: {e}")
            return False

    def upsert_experiment(self, experiment: Dict[str, Any]) -> bool:
        """
        Insert a new experiment or update an existing one, preserving all other data.

        Args:
            experiment (Dict[str, Any]): Experiment dictionary to upsert

        Returns:
            bool: True if operation was successful, False otherwise
        """
        try:
            # Validate the experiment structure
            if not self._validate_experiment_structure(experiment):
                logger.error(f"❌ Invalid experiment structure for {experiment.get('experiment_id', 'unknown')}")
                return False

            # Load existing experiments
            existing_experiments = self.load_existing_experiments()

            # Replace the matching experiment (if any) while preserving the rest
            experiment_id = experiment['experiment_id']
            experiment_found = False
            updated_experiments = []

            for existing_exp in existing_experiments:
                if existing_exp.get('experiment_id') == experiment_id:
                    # Update existing experiment
                    logger.info(f"🔄 Updating existing experiment: {experiment_id}")
                    experiment['last_updated'] = datetime.now().isoformat()
                    updated_experiments.append(experiment)
                    experiment_found = True
                else:
                    # Preserve existing experiment
                    updated_experiments.append(existing_exp)

            # If experiment doesn't exist, add it
            if not experiment_found:
                logger.info(f"➕ Adding new experiment: {experiment_id}")
                experiment['last_updated'] = datetime.now().isoformat()
                updated_experiments.append(experiment)

            # Save all experiments
            commit_message = f"{'Update' if experiment_found else 'Add'} experiment {experiment_id} (preserving {len(existing_experiments)} existing experiments)"

            return self.save_experiments(updated_experiments, commit_message)

        except Exception as e:
            logger.error(f"❌ Failed to upsert experiment: {e}")
            return False

    def get_experiment_by_id(self, experiment_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve a specific experiment by its ID.

        Args:
            experiment_id (str): The experiment ID to search for

        Returns:
            Optional[Dict[str, Any]]: The experiment dictionary if found, None otherwise
        """
        try:
            experiments = self.load_existing_experiments()

            for exp in experiments:
                if exp.get('experiment_id') == experiment_id:
                    logger.info(f"✅ Found experiment: {experiment_id}")
                    return exp

            logger.info(f"📊 Experiment not found: {experiment_id}")
            return None

        except Exception as e:
            logger.error(f"❌ Failed to get experiment {experiment_id}: {e}")
            return None

    def list_experiments(self, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        List all experiments, optionally filtered by status.

        Args:
            status_filter (Optional[str]): Filter by experiment status (running, completed, failed, paused)

        Returns:
            List[Dict[str, Any]]: List of experiments matching the filter
        """
        try:
            experiments = self.load_existing_experiments()

            if status_filter:
                filtered_experiments = [exp for exp in experiments if exp.get('status') == status_filter]
                logger.info(f"📊 Found {len(filtered_experiments)} experiments with status '{status_filter}'")
                return filtered_experiments

            logger.info(f"📊 Found {len(experiments)} total experiments")
            return experiments

        except Exception as e:
            logger.error(f"❌ Failed to list experiments: {e}")
            return []

    def backup_dataset(self, backup_suffix: Optional[str] = None) -> str:
        """
        Create a backup of the current dataset in a sibling repository.

        Args:
            backup_suffix (Optional[str]): Optional suffix for backup repo name
                (defaults to a YYYYmmdd_HHMMSS timestamp)

        Returns:
            str: Backup repository name if successful, empty string otherwise
        """
        try:
            if not backup_suffix:
                backup_suffix = datetime.now().strftime('%Y%m%d_%H%M%S')

            backup_repo = f"{self.dataset_repo}-backup-{backup_suffix}"

            # Load current experiments
            experiments = self.load_existing_experiments()

            if not experiments:
                logger.warning("⚠️ No experiments to backup")
                return ""

            # Create backup dataset manager
            backup_manager = TrackioDatasetManager(backup_repo, self.hf_token)

            # Save to backup
            success = backup_manager.save_experiments(
                experiments,
                f"Backup of {self.dataset_repo} created on {datetime.now().isoformat()}"
            )

            if success:
                logger.info(f"✅ Backup created: {backup_repo}")
                return backup_repo
            else:
                logger.error("❌ Failed to create backup")
                return ""

        except Exception as e:
            logger.error(f"❌ Failed to create backup: {e}")
            return ""
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def create_dataset_manager(dataset_repo: str, hf_token: str) -> TrackioDatasetManager:
    """
    Factory function to create a TrackioDatasetManager instance.

    Args:
        dataset_repo (str): HF dataset repository ID
        hf_token (str): Hugging Face token

    Returns:
        TrackioDatasetManager: Configured dataset manager instance
    """
    manager = TrackioDatasetManager(dataset_repo, hf_token)
    return manager
|
templates/spaces/{requirements.txt β trackio/requirements.txt}
RENAMED
|
File without changes
|
templates/spaces/trackio/trackio_api_client.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Trackio API Client for Hugging Face Spaces
Uses gradio_client for proper API communication with automatic Space URL resolution
"""

import requests
import json
import time
import logging
from typing import Dict, Any, Optional
from datetime import datetime
import os

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optional dependency: gradio_client performs the actual Space API calls.
# The flag is checked at client-construction time so import failure here
# degrades gracefully instead of crashing the module import.
try:
    from gradio_client import Client
    GRADIO_CLIENT_AVAILABLE = True
except ImportError:
    GRADIO_CLIENT_AVAILABLE = False
    logger.warning("gradio_client not available. Install with: pip install gradio_client")

# Optional dependency: huggingface_hub is used to resolve the Space's real
# host URL; without it the client falls back to the default *.hf.space form.
try:
    from huggingface_hub import HfApi
    HF_HUB_AVAILABLE = True
except ImportError:
    HF_HUB_AVAILABLE = False
    logger.warning("huggingface_hub not available. Install with: pip install huggingface-hub")
|
| 32 |
+
|
| 33 |
+
class TrackioAPIClient:
    """API client for Trackio Space using gradio_client with automatic Space URL resolution.

    Every public logging/query method delegates to :meth:`_logged_call`, which
    centralizes the call + success/error logging pattern that was previously
    duplicated across all endpoints.
    """

    def __init__(self, space_id: str, hf_token: Optional[str] = None):
        """
        Args:
            space_id: HF Space ID ("user/space") or a full Space URL.
            hf_token: Optional HF token for Hub lookups / private Spaces.
        """
        self.space_id = space_id
        self.hf_token = hf_token
        self.client = None

        # Auto-resolve the concrete *.hf.space URL before creating the client
        self.space_url = self._resolve_space_url()

        # Initialize gradio client (stays None when the dependency is missing
        # or the connection fails; callers get an {"error": ...} dict instead)
        if GRADIO_CLIENT_AVAILABLE and self.space_url:
            try:
                self.client = Client(self.space_url)
                logger.info(f"✅ Connected to Trackio Space: {self.space_id}")
            except Exception as e:
                logger.error(f"❌ Failed to connect to Trackio Space: {e}")
                self.client = None
        else:
            logger.error("❌ gradio_client not available. Install with: pip install gradio_client")

    def _resolve_space_url(self) -> Optional[str]:
        """Resolve Space URL using Hugging Face Hub API.

        Returns:
            The Space host URL, or a best-effort fallback of the form
            "https://<user>-<space>.hf.space"; never raises.
        """
        try:
            # Clean the space_id - remove any URL prefixes
            clean_space_id = self.space_id
            if clean_space_id.startswith('http'):
                if '/spaces/' in clean_space_id:
                    clean_space_id = clean_space_id.split('/spaces/')[-1]
                else:
                    # Try to extract from direct *.hf.space URL format.
                    clean_space_id = clean_space_id.replace('https://', '').replace('http://', '')
                    if '.hf.space' in clean_space_id:
                        # NOTE(review): this replaces *every* '-' with '/', which is
                        # wrong for usernames/space names containing hyphens — confirm
                        # before relying on this branch.
                        clean_space_id = clean_space_id.replace('.hf.space', '').replace('-', '/')

            logger.info(f"🔧 Resolving Space URL for ID: {clean_space_id}")

            if not HF_HUB_AVAILABLE:
                logger.warning("⚠️ Hugging Face Hub not available, using default URL format")
                # Fallback to default URL format
                space_name = clean_space_id.replace('/', '-')
                return f"https://{space_name}.hf.space"

            # Use Hugging Face Hub API to get Space info
            api = HfApi(token=self.hf_token)
            space_info = api.space_info(clean_space_id)
            if space_info and hasattr(space_info, 'host'):
                # Use the host directly from space_info
                space_url = space_info.host
                logger.info(f"✅ Resolved Space URL: {space_url}")
                return space_url

            # Fallback to default URL format
            space_name = clean_space_id.replace('/', '-')
            space_url = f"https://{space_name}.hf.space"
            logger.info(f"✅ Using fallback Space URL: {space_url}")
            return space_url

        except Exception as e:
            logger.warning(f"⚠️ Failed to resolve Space URL: {e}")
            # Fallback built from the *raw* space_id (may still contain URL
            # prefixes if the caller passed one — kept for compatibility).
            space_name = self.space_id.replace('/', '-')
            space_url = f"https://{space_name}.hf.space"
            logger.info(f"✅ Using fallback Space URL: {space_url}")
            return space_url

    def _make_api_call(self, api_name: str, *args) -> Dict[str, Any]:
        """Make an API call to the Trackio Space using gradio_client.

        Returns:
            {"success": True, "data": <result>} on success,
            {"error": <message>} on failure.
        """
        if not self.client:
            return {"error": "Client not available"}

        try:
            logger.debug(f"Making API call to {api_name} with args: {args}")

            # Use gradio_client to make the prediction
            result = self.client.predict(*args, api_name=api_name)

            logger.debug(f"API call result: {result}")
            return {"success": True, "data": result}

        except Exception as e:
            logger.error(f"API call failed for {api_name}: {e}")
            return {"error": f"API call failed: {str(e)}"}

    def _logged_call(self, api_name: str, args: tuple,
                     success_msg: str, error_msg: str) -> Dict[str, Any]:
        """Make an API call and apply the uniform success/error logging.

        Args:
            api_name: gradio endpoint name (e.g. "/list_experiments_interface").
            args: positional arguments for the endpoint.
            success_msg: info-log template; may reference "{data}" for the result.
            error_msg: error-log prefix used when the call fails.
        """
        result = self._make_api_call(api_name, *args)
        if "success" in result:
            logger.info(success_msg.format(data=result['data']))
        else:
            logger.error(f"{error_msg}: {result}")
        return result

    def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
        """Create a new experiment"""
        logger.info(f"Creating experiment: {name}")
        return self._logged_call(
            "/create_experiment_interface", (name, description),
            "Experiment created successfully: {data}",
            "Failed to create experiment",
        )

    def log_metrics(self, experiment_id: str, metrics: Dict[str, Any],
                    step: Optional[int] = None) -> Dict[str, Any]:
        """Log metrics for an experiment"""
        metrics_json = json.dumps(metrics)
        # The Space endpoint expects the step as a string ("" means unset).
        step_str = str(step) if step is not None else ""

        logger.info(f"Logging metrics for experiment {experiment_id} at step {step}")
        return self._logged_call(
            "/log_metrics_interface", (experiment_id, metrics_json, step_str),
            "Metrics logged successfully: {data}",
            "Failed to log metrics",
        )

    def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Log parameters for an experiment"""
        parameters_json = json.dumps(parameters)

        logger.info(f"Logging parameters for experiment {experiment_id}")
        return self._logged_call(
            "/log_parameters_interface", (experiment_id, parameters_json),
            "Parameters logged successfully: {data}",
            "Failed to log parameters",
        )

    def get_experiment_details(self, experiment_id: str) -> Dict[str, Any]:
        """Get experiment details"""
        logger.info(f"Getting details for experiment {experiment_id}")
        return self._logged_call(
            "/get_experiment_details", (experiment_id,),
            "Experiment details retrieved: {data}",
            "Failed to get experiment details",
        )

    def list_experiments(self) -> Dict[str, Any]:
        """List all experiments"""
        logger.info("Listing experiments")
        return self._logged_call(
            "/list_experiments_interface", (),
            "Experiments listed successfully: {data}",
            "Failed to list experiments",
        )

    def update_experiment_status(self, experiment_id: str, status: str) -> Dict[str, Any]:
        """Update experiment status"""
        logger.info(f"Updating experiment {experiment_id} status to {status}")
        return self._logged_call(
            "/update_experiment_status_interface", (experiment_id, status),
            "Experiment status updated successfully: {data}",
            "Failed to update experiment status",
        )

    def simulate_training_data(self, experiment_id: str) -> Dict[str, Any]:
        """Simulate training data for testing"""
        logger.info(f"Simulating training data for experiment {experiment_id}")
        return self._logged_call(
            "/simulate_training_data", (experiment_id,),
            "Training data simulated successfully: {data}",
            "Failed to simulate training data",
        )

    def get_training_metrics(self, experiment_id: str) -> Dict[str, Any]:
        """Get training metrics for an experiment.

        NOTE: reuses the "/get_experiment_details" endpoint — metrics are part
        of the details payload.
        """
        logger.info(f"Getting training metrics for experiment {experiment_id}")
        return self._logged_call(
            "/get_experiment_details", (experiment_id,),
            "Training metrics retrieved: {data}",
            "Failed to get training metrics",
        )

    def create_metrics_plot(self, experiment_id: str, metric_name: str = "loss") -> Dict[str, Any]:
        """Create a metrics plot for an experiment"""
        logger.info(f"Creating metrics plot for experiment {experiment_id}, metric: {metric_name}")
        return self._logged_call(
            "/create_metrics_plot", (experiment_id, metric_name),
            "Metrics plot created successfully",
            "Failed to create metrics plot",
        )

    def create_experiment_comparison(self, experiment_ids: str) -> Dict[str, Any]:
        """Compare multiple experiments"""
        logger.info(f"Creating experiment comparison for: {experiment_ids}")
        return self._logged_call(
            "/create_experiment_comparison", (experiment_ids,),
            "Experiment comparison created successfully",
            "Failed to create experiment comparison",
        )

    def test_connection(self) -> Dict[str, Any]:
        """Test connection to the Trackio Space"""
        logger.info("Testing connection to Trackio Space")

        try:
            # Try to list experiments as a connection test
            result = self.list_experiments()
            if "success" in result:
                return {"success": True, "message": "Connection successful"}
            else:
                return {"error": "Connection failed", "details": result}
        except Exception as e:
            return {"error": f"Connection test failed: {str(e)}"}

    def get_space_info(self) -> Dict[str, Any]:
        """Get information about the Space (title, host, stage, visibility)."""
        try:
            if not HF_HUB_AVAILABLE:
                return {"error": "Hugging Face Hub not available"}

            api = HfApi(token=self.hf_token)
            space_info = api.space_info(self.space_id)

            return {
                "success": True,
                "data": {
                    "space_id": self.space_id,
                    "space_url": self.space_url,
                    "space_info": {
                        "title": getattr(space_info, 'title', 'Unknown'),
                        "host": getattr(space_info, 'host', 'Unknown'),
                        "stage": getattr(space_info, 'stage', 'Unknown'),
                        "visibility": getattr(space_info, 'visibility', 'Unknown')
                    }
                }
            }
        except Exception as e:
            return {"error": f"Failed to get Space info: {str(e)}"}
|
| 294 |
+
|
| 295 |
+
# Factory function to create client with dynamic configuration
|
| 296 |
+
def create_trackio_client(space_id: Optional[str] = None,
                          hf_token: Optional[str] = None) -> Optional[TrackioAPIClient]:
    """Create a TrackioAPIClient with dynamic configuration.

    Falls back to environment variables when arguments are omitted:
    TRACKIO_URL (or HF_USERNAME + TRACKIO_SPACE_NAME) for the Space, and
    HF_TOKEN for authentication.

    Returns:
        A configured client, or None when no Space can be determined.
        (Return type fixed to Optional: the original annotation claimed a
        client was always returned, but the no-space path returns None.)
    """
    # Get space_id from environment if not provided
    if not space_id:
        space_id = os.environ.get('TRACKIO_URL')
        if not space_id:
            # Try to construct from username and space name
            username = os.environ.get('HF_USERNAME')
            space_name = os.environ.get('TRACKIO_SPACE_NAME')
            if username and space_name:
                space_id = f"https://huggingface.co/spaces/{username}/{space_name}"
            else:
                logger.warning("⚠️ No space_id provided and could not determine from environment")
                return None

    # Get HF token from environment if not provided
    if not hf_token:
        hf_token = os.environ.get('HF_TOKEN')

    # (Removed an unreachable duplicate "if not space_id" check: every path
    # above either sets space_id or returns None.)
    return TrackioAPIClient(space_id, hf_token)
|
tests/test_data_preservation.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to validate data preservation in Trackio dataset operations
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import json
|
| 9 |
+
import tempfile
|
| 10 |
+
import logging
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from typing import Dict, Any
|
| 13 |
+
|
| 14 |
+
# Add src to path for imports
|
| 15 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
| 16 |
+
|
| 17 |
+
from dataset_utils import TrackioDatasetManager
|
| 18 |
+
|
| 19 |
+
# Setup logging
|
| 20 |
+
logging.basicConfig(level=logging.INFO)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
def create_sample_experiment(experiment_id: str, name: str, status: str = "running") -> Dict[str, Any]:
    """Build a synthetic experiment record for exercising dataset operations.

    The metrics/parameters/artifacts/logs fields are JSON-encoded strings, as
    expected by the Trackio dataset schema.
    """
    sample_metrics = [
        {
            'timestamp': datetime.now().isoformat(),
            'step': 100,
            'metrics': {
                'loss': 1.5,
                'accuracy': 0.85,
                'learning_rate': 5e-6,
            },
        },
    ]
    sample_parameters = {
        'model_name': 'HuggingFaceTB/SmolLM3-3B',
        'batch_size': 8,
        'learning_rate': 5e-6,
    }

    record = {
        'experiment_id': experiment_id,
        'name': name,
        'description': f"Test experiment {name}",
        'created_at': datetime.now().isoformat(),
        'status': status,
        'metrics': json.dumps(sample_metrics),
        'parameters': json.dumps(sample_parameters),
        'artifacts': json.dumps([]),
        'logs': json.dumps([]),
        'last_updated': datetime.now().isoformat(),
    }
    return record
|
| 51 |
+
|
| 52 |
+
def test_data_preservation():
    """Test data preservation functionality.

    Runs a strictly ordered sequence of live operations against a real HF
    dataset repo (requires network + a valid token): create, add, update and
    filter experiments, verifying at each step that previously written
    experiments survive subsequent writes.

    Returns:
        bool: True when every check passes, False on the first failure.
    """
    # Get HF token from environment (either variable name is accepted)
    hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')

    if not hf_token:
        logger.error("❌ HF_TOKEN not found in environment variables")
        logger.info("Please set HF_TOKEN or HUGGING_FACE_HUB_TOKEN environment variable")
        return False

    # Use a test dataset repository
    # NOTE(review): hard-coded repo — the token must have write access to it.
    test_dataset_repo = "tonic/trackio-test-preservation"

    try:
        logger.info("🧪 Starting data preservation test")
        logger.info(f"📊 Test dataset: {test_dataset_repo}")

        # Initialize dataset manager
        dataset_manager = TrackioDatasetManager(test_dataset_repo, hf_token)

        # Test 1: Check if dataset exists (informational only — no assertion)
        logger.info("\n📋 Test 1: Checking dataset existence...")
        exists = dataset_manager.check_dataset_exists()
        logger.info(f"Dataset exists: {exists}")

        # Test 2: Load existing experiments (should handle empty/non-existent gracefully)
        logger.info("\n📋 Test 2: Loading existing experiments...")
        existing_experiments = dataset_manager.load_existing_experiments()
        logger.info(f"Found {len(existing_experiments)} existing experiments")

        # Test 3: Add first experiment
        logger.info("\n📋 Test 3: Adding first experiment...")
        exp1 = create_sample_experiment("test_exp_001", "First Test Experiment")
        success = dataset_manager.upsert_experiment(exp1)
        logger.info(f"First experiment added: {success}")

        if not success:
            logger.error("❌ Failed to add first experiment")
            return False

        # Test 4: Add second experiment (should preserve first)
        logger.info("\n📋 Test 4: Adding second experiment...")
        exp2 = create_sample_experiment("test_exp_002", "Second Test Experiment")
        success = dataset_manager.upsert_experiment(exp2)
        logger.info(f"Second experiment added: {success}")

        if not success:
            logger.error("❌ Failed to add second experiment")
            return False

        # Test 5: Verify both experiments exist
        logger.info("\n📋 Test 5: Verifying both experiments exist...")
        all_experiments = dataset_manager.load_existing_experiments()
        logger.info(f"Total experiments after adding two: {len(all_experiments)}")

        exp_ids = [exp.get('experiment_id') for exp in all_experiments]
        if "test_exp_001" in exp_ids and "test_exp_002" in exp_ids:
            logger.info("✅ Both experiments preserved successfully")
        else:
            logger.error(f"❌ Experiments not preserved. Found IDs: {exp_ids}")
            return False

        # Test 6: Update existing experiment (should preserve others)
        logger.info("\n📋 Test 6: Updating first experiment...")
        exp1_updated = create_sample_experiment("test_exp_001", "Updated First Experiment", "completed")
        success = dataset_manager.upsert_experiment(exp1_updated)
        logger.info(f"First experiment updated: {success}")

        if not success:
            logger.error("❌ Failed to update first experiment")
            return False

        # Test 7: Verify update preserved other experiments
        logger.info("\n📋 Test 7: Verifying update preserved other experiments...")
        final_experiments = dataset_manager.load_existing_experiments()
        logger.info(f"Total experiments after update: {len(final_experiments)}")

        # Check that we still have both experiments (upsert must not duplicate
        # or drop records)
        if len(final_experiments) != 2:
            logger.error(f"❌ Wrong number of experiments after update: {len(final_experiments)}")
            return False

        # Check that first experiment was updated
        exp1_final = dataset_manager.get_experiment_by_id("test_exp_001")
        if exp1_final and exp1_final.get('status') == 'completed':
            logger.info("✅ First experiment successfully updated")
        else:
            logger.error("❌ First experiment update failed")
            return False

        # Check that second experiment was preserved
        exp2_final = dataset_manager.get_experiment_by_id("test_exp_002")
        if exp2_final and exp2_final.get('name') == "Second Test Experiment":
            logger.info("✅ Second experiment successfully preserved")
        else:
            logger.error("❌ Second experiment not preserved")
            return False

        # Test 8: Test filtering functionality (one running + one completed
        # experiment are expected at this point)
        logger.info("\n📋 Test 8: Testing filtering functionality...")
        running_experiments = dataset_manager.list_experiments(status_filter="running")
        completed_experiments = dataset_manager.list_experiments(status_filter="completed")

        logger.info(f"Running experiments: {len(running_experiments)}")
        logger.info(f"Completed experiments: {len(completed_experiments)}")

        if len(running_experiments) == 1 and len(completed_experiments) == 1:
            logger.info("✅ Filtering functionality works correctly")
        else:
            logger.error("❌ Filtering functionality failed")
            return False

        logger.info("\n🎉 All data preservation tests passed!")
        logger.info("✅ Data preservation functionality is working correctly")
        return True

    except Exception as e:
        logger.error(f"❌ Test failed with exception: {e}")
        return False
|
| 171 |
+
|
| 172 |
+
def main():
    """Entry point: run the preservation suite and exit with a status code."""
    logger.info("Data Preservation Test Suite")
    logger.info("=" * 50)

    # Exit 0 on success, 1 on failure so CI can pick up the result.
    if test_data_preservation():
        logger.info("\n✅ All tests passed!")
        sys.exit(0)

    logger.error("\n❌ Some tests failed!")
    sys.exit(1)
|
| 185 |
+
|
| 186 |
+
if __name__ == "__main__":
|
| 187 |
+
main()
|
tests/test_demo_deployment.py
CHANGED
|
@@ -39,14 +39,17 @@ def test_template_files_exist():
|
|
| 39 |
"""Test that template files exist"""
|
| 40 |
print("π§ͺ Testing template files existence...")
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
required_files = ["app.py", "requirements.txt"]
|
| 45 |
|
| 46 |
-
for
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
print("β
Template files test passed")
|
| 52 |
|
|
|
|
| 39 |
"""Test that template files exist"""
|
| 40 |
print("π§ͺ Testing template files existence...")
|
| 41 |
|
| 42 |
+
demo_types = ["demo_smol", "demo_gpt"]
|
|
|
|
| 43 |
required_files = ["app.py", "requirements.txt"]
|
| 44 |
|
| 45 |
+
for demo_type in demo_types:
|
| 46 |
+
template_dir = Path(__file__).parent.parent / "templates" / "spaces" / demo_type
|
| 47 |
+
print(f"Checking {demo_type} templates...")
|
| 48 |
+
|
| 49 |
+
for file_name in required_files:
|
| 50 |
+
file_path = template_dir / file_name
|
| 51 |
+
assert file_path.exists(), f"Required file {file_name} not found in {demo_type} templates"
|
| 52 |
+
print(f"β
Found {demo_type}/{file_name}")
|
| 53 |
|
| 54 |
print("β
Template files test passed")
|
| 55 |
|
tests/test_deployment.py
CHANGED
|
@@ -17,15 +17,19 @@ def test_templates_exist():
|
|
| 17 |
|
| 18 |
# Check spaces templates
|
| 19 |
spaces_dir = project_root / "templates" / "spaces"
|
|
|
|
| 20 |
spaces_files = ["app.py", "requirements.txt", "README.md"]
|
| 21 |
|
| 22 |
-
for
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Check datasets templates
|
| 31 |
datasets_dir = project_root / "templates" / "datasets"
|
|
|
|
| 17 |
|
| 18 |
# Check spaces templates
|
| 19 |
spaces_dir = project_root / "templates" / "spaces"
|
| 20 |
+
demo_types = ["demo_smol", "demo_gpt", "trackio"]
|
| 21 |
spaces_files = ["app.py", "requirements.txt", "README.md"]
|
| 22 |
|
| 23 |
+
for demo_type in demo_types:
|
| 24 |
+
demo_dir = spaces_dir / demo_type
|
| 25 |
+
print(f"Checking {demo_type} templates...")
|
| 26 |
+
for file_name in spaces_files:
|
| 27 |
+
file_path = demo_dir / file_name
|
| 28 |
+
if file_path.exists():
|
| 29 |
+
print(f"β
{file_path}")
|
| 30 |
+
else:
|
| 31 |
+
print(f"β {file_path} not found")
|
| 32 |
+
return False
|
| 33 |
|
| 34 |
# Check datasets templates
|
| 35 |
datasets_dir = project_root / "templates" / "datasets"
|
tests/test_hf_datasets.py
CHANGED
|
@@ -76,7 +76,7 @@ def test_backup_fallback():
|
|
| 76 |
|
| 77 |
try:
|
| 78 |
# Import and test the TrackioSpace class
|
| 79 |
-
from templates.spaces.app import TrackioSpace
|
| 80 |
|
| 81 |
trackio = TrackioSpace()
|
| 82 |
experiments = trackio.experiments
|
|
@@ -105,7 +105,7 @@ def test_metrics_dataframe():
|
|
| 105 |
print("=" * 40)
|
| 106 |
|
| 107 |
try:
|
| 108 |
-
from templates.spaces.app import TrackioSpace
|
| 109 |
|
| 110 |
trackio = TrackioSpace()
|
| 111 |
|
|
|
|
| 76 |
|
| 77 |
try:
|
| 78 |
# Import and test the TrackioSpace class
|
| 79 |
+
from templates.spaces.trackio.app import TrackioSpace
|
| 80 |
|
| 81 |
trackio = TrackioSpace()
|
| 82 |
experiments = trackio.experiments
|
|
|
|
| 105 |
print("=" * 40)
|
| 106 |
|
| 107 |
try:
|
| 108 |
+
from templates.spaces.trackio.app import TrackioSpace
|
| 109 |
|
| 110 |
trackio = TrackioSpace()
|
| 111 |
|
tests/test_latest_deployment.py
CHANGED
|
@@ -158,16 +158,20 @@ def test_template_files():
|
|
| 158 |
"""Test that all required template files exist"""
|
| 159 |
print("\nπ Testing template files...")
|
| 160 |
|
| 161 |
-
|
|
|
|
| 162 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
| 163 |
|
| 164 |
-
for
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
return True
|
| 173 |
|
|
|
|
| 158 |
"""Test that all required template files exist"""
|
| 159 |
print("\nπ Testing template files...")
|
| 160 |
|
| 161 |
+
spaces_dir = project_root / "templates" / "spaces"
|
| 162 |
+
demo_types = ["demo_smol", "demo_gpt", "trackio"]
|
| 163 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
| 164 |
|
| 165 |
+
for demo_type in demo_types:
|
| 166 |
+
demo_dir = spaces_dir / demo_type
|
| 167 |
+
print(f"Checking {demo_type} templates...")
|
| 168 |
+
for file_name in required_files:
|
| 169 |
+
file_path = demo_dir / file_name
|
| 170 |
+
if file_path.exists():
|
| 171 |
+
print(f"β
{demo_type}/{file_name} exists")
|
| 172 |
+
else:
|
| 173 |
+
print(f"β {demo_type}/{file_name} missing")
|
| 174 |
+
return False
|
| 175 |
|
| 176 |
return True
|
| 177 |
|
tests/test_readme_template.py
CHANGED
|
@@ -16,8 +16,8 @@ def test_readme_template():
|
|
| 16 |
print("π Testing README template replacement...")
|
| 17 |
|
| 18 |
try:
|
| 19 |
-
# Get template path
|
| 20 |
-
templates_dir = project_root / "templates" / "spaces"
|
| 21 |
readme_template_path = templates_dir / "README.md"
|
| 22 |
|
| 23 |
if not readme_template_path.exists():
|
|
|
|
| 16 |
print("π Testing README template replacement...")
|
| 17 |
|
| 18 |
try:
|
| 19 |
+
# Get template path (using trackio as example)
|
| 20 |
+
templates_dir = project_root / "templates" / "spaces" / "trackio"
|
| 21 |
readme_template_path = templates_dir / "README.md"
|
| 22 |
|
| 23 |
if not readme_template_path.exists():
|
tests/test_real_dataset_access.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify that the Trackio Space can read from the real Hugging Face dataset
|
| 4 |
+
This test requires an HF_TOKEN environment variable to access the dataset
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
from typing import Dict, Any
|
| 12 |
+
|
| 13 |
+
# Setup logging
|
| 14 |
+
logging.basicConfig(level=logging.INFO)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
def test_direct_dataset_access():
|
| 18 |
+
"""Test direct access to the Hugging Face dataset"""
|
| 19 |
+
try:
|
| 20 |
+
hf_token = os.environ.get('HF_TOKEN')
|
| 21 |
+
|
| 22 |
+
if not hf_token:
|
| 23 |
+
logger.warning("β οΈ No HF_TOKEN found. Skipping real dataset test.")
|
| 24 |
+
logger.info("π‘ Set HF_TOKEN environment variable to test with real dataset")
|
| 25 |
+
return False
|
| 26 |
+
|
| 27 |
+
from datasets import load_dataset
|
| 28 |
+
|
| 29 |
+
dataset_repo = "Tonic/trackio-experiments"
|
| 30 |
+
logger.info(f"π§ Testing direct access to {dataset_repo}")
|
| 31 |
+
|
| 32 |
+
# Load the dataset
|
| 33 |
+
dataset = load_dataset(dataset_repo, token=hf_token)
|
| 34 |
+
|
| 35 |
+
# Check structure
|
| 36 |
+
experiment_count = len(dataset['train']) if 'train' in dataset else 0
|
| 37 |
+
logger.info(f"π Dataset contains {experiment_count} experiments")
|
| 38 |
+
|
| 39 |
+
if experiment_count == 0:
|
| 40 |
+
logger.warning("β οΈ No experiments found in dataset")
|
| 41 |
+
return False
|
| 42 |
+
|
| 43 |
+
# Check columns
|
| 44 |
+
columns = list(dataset['train'].column_names) if 'train' in dataset else []
|
| 45 |
+
logger.info(f"π Dataset columns: {columns}")
|
| 46 |
+
|
| 47 |
+
expected_columns = ['experiment_id', 'name', 'description', 'created_at', 'status', 'metrics', 'parameters', 'artifacts', 'logs', 'last_updated']
|
| 48 |
+
missing_columns = [col for col in expected_columns if col not in columns]
|
| 49 |
+
|
| 50 |
+
if missing_columns:
|
| 51 |
+
logger.warning(f"β οΈ Missing expected columns: {missing_columns}")
|
| 52 |
+
else:
|
| 53 |
+
logger.info("β
All expected columns present")
|
| 54 |
+
|
| 55 |
+
# Test parsing a few experiments
|
| 56 |
+
successful_parses = 0
|
| 57 |
+
for i, row in enumerate(dataset['train']):
|
| 58 |
+
if i >= 3: # Test first 3 experiments
|
| 59 |
+
break
|
| 60 |
+
|
| 61 |
+
exp_id = row.get('experiment_id', 'unknown')
|
| 62 |
+
logger.info(f"\n㪠Testing experiment: {exp_id}")
|
| 63 |
+
|
| 64 |
+
# Test metrics parsing
|
| 65 |
+
metrics_raw = row.get('metrics', '[]')
|
| 66 |
+
try:
|
| 67 |
+
if isinstance(metrics_raw, str):
|
| 68 |
+
metrics = json.loads(metrics_raw)
|
| 69 |
+
if isinstance(metrics, list):
|
| 70 |
+
logger.info(f" β
Metrics parsed: {len(metrics)} entries")
|
| 71 |
+
if metrics:
|
| 72 |
+
first_metric = metrics[0]
|
| 73 |
+
if 'metrics' in first_metric:
|
| 74 |
+
metric_keys = list(first_metric['metrics'].keys())
|
| 75 |
+
logger.info(f" π Sample metrics: {metric_keys[:5]}...")
|
| 76 |
+
successful_parses += 1
|
| 77 |
+
else:
|
| 78 |
+
logger.warning(f" β οΈ Metrics is not a list: {type(metrics)}")
|
| 79 |
+
else:
|
| 80 |
+
logger.warning(f" β οΈ Metrics is not a string: {type(metrics_raw)}")
|
| 81 |
+
except json.JSONDecodeError as e:
|
| 82 |
+
logger.warning(f" β Failed to parse metrics JSON: {e}")
|
| 83 |
+
|
| 84 |
+
# Test parameters parsing
|
| 85 |
+
parameters_raw = row.get('parameters', '{}')
|
| 86 |
+
try:
|
| 87 |
+
if isinstance(parameters_raw, str):
|
| 88 |
+
parameters = json.loads(parameters_raw)
|
| 89 |
+
if isinstance(parameters, dict):
|
| 90 |
+
logger.info(f" β
Parameters parsed: {len(parameters)} entries")
|
| 91 |
+
else:
|
| 92 |
+
logger.warning(f" β οΈ Parameters is not a dict: {type(parameters)}")
|
| 93 |
+
else:
|
| 94 |
+
logger.warning(f" β οΈ Parameters is not a string: {type(parameters_raw)}")
|
| 95 |
+
except json.JSONDecodeError as e:
|
| 96 |
+
logger.warning(f" β Failed to parse parameters JSON: {e}")
|
| 97 |
+
|
| 98 |
+
logger.info(f"\nπ Successfully parsed {successful_parses} out of {min(3, experiment_count)} test experiments")
|
| 99 |
+
|
| 100 |
+
return successful_parses > 0
|
| 101 |
+
|
| 102 |
+
except Exception as e:
|
| 103 |
+
logger.error(f"β Error testing direct dataset access: {e}")
|
| 104 |
+
import traceback
|
| 105 |
+
traceback.print_exc()
|
| 106 |
+
return False
|
| 107 |
+
|
| 108 |
+
def test_trackio_space_with_real_dataset():
|
| 109 |
+
"""Test TrackioSpace class with real dataset"""
|
| 110 |
+
try:
|
| 111 |
+
hf_token = os.environ.get('HF_TOKEN')
|
| 112 |
+
|
| 113 |
+
if not hf_token:
|
| 114 |
+
logger.warning("β οΈ No HF_TOKEN found. Skipping TrackioSpace test with real dataset.")
|
| 115 |
+
return False
|
| 116 |
+
|
| 117 |
+
# Add the templates/spaces/trackio directory to the path
|
| 118 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'templates', 'spaces', 'trackio'))
|
| 119 |
+
|
| 120 |
+
from app import TrackioSpace
|
| 121 |
+
|
| 122 |
+
dataset_repo = "Tonic/trackio-experiments"
|
| 123 |
+
logger.info(f"π§ Testing TrackioSpace with {dataset_repo}")
|
| 124 |
+
|
| 125 |
+
# Create TrackioSpace instance with real credentials
|
| 126 |
+
trackio_space = TrackioSpace(hf_token=hf_token, dataset_repo=dataset_repo)
|
| 127 |
+
|
| 128 |
+
# Check if it loaded experiments from the dataset (not backup)
|
| 129 |
+
experiments_count = len(trackio_space.experiments)
|
| 130 |
+
logger.info(f"π TrackioSpace loaded {experiments_count} experiments")
|
| 131 |
+
|
| 132 |
+
if experiments_count == 0:
|
| 133 |
+
logger.warning("β οΈ TrackioSpace loaded no experiments")
|
| 134 |
+
return False
|
| 135 |
+
|
| 136 |
+
# Check if the dataset manager is available
|
| 137 |
+
if trackio_space.dataset_manager:
|
| 138 |
+
logger.info("β
Dataset manager is available - data preservation enabled")
|
| 139 |
+
else:
|
| 140 |
+
logger.warning("β οΈ Dataset manager not available - using legacy mode")
|
| 141 |
+
|
| 142 |
+
# Test loading a specific experiment
|
| 143 |
+
experiment_ids = list(trackio_space.experiments.keys())
|
| 144 |
+
if experiment_ids:
|
| 145 |
+
test_exp_id = experiment_ids[0]
|
| 146 |
+
logger.info(f"π¬ Testing metrics loading for {test_exp_id}")
|
| 147 |
+
|
| 148 |
+
from app import get_metrics_dataframe
|
| 149 |
+
df = get_metrics_dataframe(test_exp_id)
|
| 150 |
+
|
| 151 |
+
if not df.empty:
|
| 152 |
+
logger.info(f"β
Metrics DataFrame created: {len(df)} rows, {len(df.columns)} columns")
|
| 153 |
+
logger.info(f"π Available metrics: {list(df.columns)}")
|
| 154 |
+
return True
|
| 155 |
+
else:
|
| 156 |
+
logger.warning(f"β οΈ Metrics DataFrame is empty for {test_exp_id}")
|
| 157 |
+
return False
|
| 158 |
+
else:
|
| 159 |
+
logger.warning("β οΈ No experiments available for testing")
|
| 160 |
+
return False
|
| 161 |
+
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.error(f"β Error testing TrackioSpace with real dataset: {e}")
|
| 164 |
+
import traceback
|
| 165 |
+
traceback.print_exc()
|
| 166 |
+
return False
|
| 167 |
+
|
| 168 |
+
if __name__ == "__main__":
|
| 169 |
+
logger.info("π Starting real dataset access test")
|
| 170 |
+
|
| 171 |
+
# Test direct dataset access
|
| 172 |
+
logger.info("\n" + "="*60)
|
| 173 |
+
logger.info("TEST 1: Direct Dataset Access")
|
| 174 |
+
logger.info("="*60)
|
| 175 |
+
|
| 176 |
+
direct_test_passed = test_direct_dataset_access()
|
| 177 |
+
|
| 178 |
+
# Test TrackioSpace with real dataset
|
| 179 |
+
logger.info("\n" + "="*60)
|
| 180 |
+
logger.info("TEST 2: TrackioSpace with Real Dataset")
|
| 181 |
+
logger.info("="*60)
|
| 182 |
+
|
| 183 |
+
trackio_test_passed = test_trackio_space_with_real_dataset()
|
| 184 |
+
|
| 185 |
+
# Summary
|
| 186 |
+
logger.info("\n" + "="*60)
|
| 187 |
+
logger.info("TEST SUMMARY")
|
| 188 |
+
logger.info("="*60)
|
| 189 |
+
|
| 190 |
+
logger.info(f"Direct Dataset Access: {'β
PASSED' if direct_test_passed else 'β FAILED/SKIPPED'}")
|
| 191 |
+
logger.info(f"TrackioSpace Integration: {'β
PASSED' if trackio_test_passed else 'β FAILED/SKIPPED'}")
|
| 192 |
+
|
| 193 |
+
if direct_test_passed and trackio_test_passed:
|
| 194 |
+
logger.info("π All tests passed! The dataset integration is working correctly.")
|
| 195 |
+
sys.exit(0)
|
| 196 |
+
elif not os.environ.get('HF_TOKEN'):
|
| 197 |
+
logger.info("βΉοΈ Tests skipped due to missing HF_TOKEN. Set the token to test with real dataset.")
|
| 198 |
+
sys.exit(0)
|
| 199 |
+
else:
|
| 200 |
+
logger.error("β Some tests failed. Please check the implementation.")
|
| 201 |
+
sys.exit(1)
|
tests/test_trackio_dataset_fix.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify that the Trackio Space can properly read from the actual dataset
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Dict, Any
|
| 11 |
+
|
| 12 |
+
# Add the templates/spaces/trackio directory to the path
|
| 13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'templates', 'spaces', 'trackio'))
|
| 14 |
+
|
| 15 |
+
# Setup logging
|
| 16 |
+
logging.basicConfig(level=logging.INFO)
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
def test_dataset_loading():
|
| 20 |
+
"""Test loading experiments from the actual dataset"""
|
| 21 |
+
try:
|
| 22 |
+
# Import the TrackioSpace class
|
| 23 |
+
from app import TrackioSpace
|
| 24 |
+
|
| 25 |
+
# Create a TrackioSpace instance pointing to the real dataset
|
| 26 |
+
dataset_repo = "Tonic/trackio-experiments"
|
| 27 |
+
hf_token = os.environ.get('HF_TOKEN')
|
| 28 |
+
|
| 29 |
+
if not hf_token:
|
| 30 |
+
logger.warning("β οΈ No HF_TOKEN found in environment. Testing with public access.")
|
| 31 |
+
|
| 32 |
+
logger.info(f"π§ Testing dataset loading from {dataset_repo}")
|
| 33 |
+
|
| 34 |
+
# Create TrackioSpace instance
|
| 35 |
+
trackio_space = TrackioSpace(hf_token=hf_token, dataset_repo=dataset_repo)
|
| 36 |
+
|
| 37 |
+
# Check how many experiments were loaded
|
| 38 |
+
experiments_count = len(trackio_space.experiments)
|
| 39 |
+
logger.info(f"π Loaded {experiments_count} experiments")
|
| 40 |
+
|
| 41 |
+
if experiments_count == 0:
|
| 42 |
+
logger.warning("β οΈ No experiments loaded - this might indicate a problem")
|
| 43 |
+
return False
|
| 44 |
+
|
| 45 |
+
# Test specific experiment IDs from the logs
|
| 46 |
+
test_experiment_ids = [
|
| 47 |
+
'exp_20250720_130853',
|
| 48 |
+
'exp_20250720_134319',
|
| 49 |
+
'exp_20250727_172507',
|
| 50 |
+
'exp_20250727_172526'
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
found_experiments = []
|
| 54 |
+
for exp_id in test_experiment_ids:
|
| 55 |
+
if exp_id in trackio_space.experiments:
|
| 56 |
+
found_experiments.append(exp_id)
|
| 57 |
+
experiment = trackio_space.experiments[exp_id]
|
| 58 |
+
|
| 59 |
+
logger.info(f"β
Found experiment: {exp_id}")
|
| 60 |
+
logger.info(f" Name: {experiment.get('name', 'N/A')}")
|
| 61 |
+
logger.info(f" Status: {experiment.get('status', 'N/A')}")
|
| 62 |
+
logger.info(f" Metrics count: {len(experiment.get('metrics', []))}")
|
| 63 |
+
logger.info(f" Parameters count: {len(experiment.get('parameters', {}))}")
|
| 64 |
+
|
| 65 |
+
# Test metrics parsing specifically
|
| 66 |
+
metrics = experiment.get('metrics', [])
|
| 67 |
+
if metrics:
|
| 68 |
+
logger.info(f" First metric entry: {metrics[0] if metrics else 'None'}")
|
| 69 |
+
|
| 70 |
+
# Test if we can get a DataFrame for this experiment
|
| 71 |
+
from app import get_metrics_dataframe
|
| 72 |
+
df = get_metrics_dataframe(exp_id)
|
| 73 |
+
if not df.empty:
|
| 74 |
+
logger.info(f" β
DataFrame created successfully: {len(df)} rows, {len(df.columns)} columns")
|
| 75 |
+
logger.info(f" Available metrics: {list(df.columns)}")
|
| 76 |
+
else:
|
| 77 |
+
logger.warning(f" β οΈ DataFrame is empty for {exp_id}")
|
| 78 |
+
else:
|
| 79 |
+
logger.warning(f" β οΈ No metrics found for {exp_id}")
|
| 80 |
+
|
| 81 |
+
logger.info(f"π Found {len(found_experiments)} out of {len(test_experiment_ids)} test experiments")
|
| 82 |
+
|
| 83 |
+
if found_experiments:
|
| 84 |
+
logger.info("β
Dataset loading appears to be working correctly!")
|
| 85 |
+
return True
|
| 86 |
+
else:
|
| 87 |
+
logger.warning("β οΈ No test experiments found - dataset loading may have issues")
|
| 88 |
+
return False
|
| 89 |
+
|
| 90 |
+
except Exception as e:
|
| 91 |
+
logger.error(f"β Error testing dataset loading: {e}")
|
| 92 |
+
import traceback
|
| 93 |
+
traceback.print_exc()
|
| 94 |
+
return False
|
| 95 |
+
|
| 96 |
+
def test_metrics_parsing():
|
| 97 |
+
"""Test parsing metrics from the actual dataset format"""
|
| 98 |
+
try:
|
| 99 |
+
# Test with actual data structure from the dataset
|
| 100 |
+
sample_metrics_json = '''[{"timestamp": "2025-07-20T11:20:01.780908", "step": 25, "metrics": {"loss": 1.1659, "grad_norm": 10.3125, "learning_rate": 7e-08, "num_tokens": 1642080.0, "mean_token_accuracy": 0.75923578992486, "epoch": 0.004851130919895701}}, {"timestamp": "2025-07-20T11:26:39.042155", "step": 50, "metrics": {"loss": 1.165, "grad_norm": 10.75, "learning_rate": 1.4291666666666667e-07, "num_tokens": 3324682.0, "mean_token_accuracy": 0.7577659255266189, "epoch": 0.009702261839791402}}]'''
|
| 101 |
+
|
| 102 |
+
logger.info("π§ Testing metrics parsing")
|
| 103 |
+
|
| 104 |
+
# Parse the JSON
|
| 105 |
+
metrics_list = json.loads(sample_metrics_json)
|
| 106 |
+
logger.info(f"π Parsed {len(metrics_list)} metric entries")
|
| 107 |
+
|
| 108 |
+
# Convert to DataFrame format (like the app does)
|
| 109 |
+
import pandas as pd
|
| 110 |
+
df_data = []
|
| 111 |
+
for metric_entry in metrics_list:
|
| 112 |
+
if isinstance(metric_entry, dict):
|
| 113 |
+
step = metric_entry.get('step', 0)
|
| 114 |
+
timestamp = metric_entry.get('timestamp', '')
|
| 115 |
+
metrics = metric_entry.get('metrics', {})
|
| 116 |
+
|
| 117 |
+
row = {'step': step, 'timestamp': timestamp}
|
| 118 |
+
row.update(metrics)
|
| 119 |
+
df_data.append(row)
|
| 120 |
+
|
| 121 |
+
if df_data:
|
| 122 |
+
df = pd.DataFrame(df_data)
|
| 123 |
+
logger.info(f"β
DataFrame created: {len(df)} rows, {len(df.columns)} columns")
|
| 124 |
+
logger.info(f"π Columns: {list(df.columns)}")
|
| 125 |
+
logger.info(f"π Sample data:\n{df.head()}")
|
| 126 |
+
return True
|
| 127 |
+
else:
|
| 128 |
+
logger.warning("β οΈ No data converted to DataFrame format")
|
| 129 |
+
return False
|
| 130 |
+
|
| 131 |
+
except Exception as e:
|
| 132 |
+
logger.error(f"β Error testing metrics parsing: {e}")
|
| 133 |
+
import traceback
|
| 134 |
+
traceback.print_exc()
|
| 135 |
+
return False
|
| 136 |
+
|
| 137 |
+
if __name__ == "__main__":
|
| 138 |
+
logger.info("π Starting Trackio dataset fix verification")
|
| 139 |
+
|
| 140 |
+
# Test metrics parsing first
|
| 141 |
+
logger.info("\n" + "="*50)
|
| 142 |
+
logger.info("TEST 1: Metrics Parsing")
|
| 143 |
+
logger.info("="*50)
|
| 144 |
+
|
| 145 |
+
metrics_test_passed = test_metrics_parsing()
|
| 146 |
+
|
| 147 |
+
# Test dataset loading
|
| 148 |
+
logger.info("\n" + "="*50)
|
| 149 |
+
logger.info("TEST 2: Dataset Loading")
|
| 150 |
+
logger.info("="*50)
|
| 151 |
+
|
| 152 |
+
dataset_test_passed = test_dataset_loading()
|
| 153 |
+
|
| 154 |
+
# Summary
|
| 155 |
+
logger.info("\n" + "="*50)
|
| 156 |
+
logger.info("TEST SUMMARY")
|
| 157 |
+
logger.info("="*50)
|
| 158 |
+
|
| 159 |
+
logger.info(f"Metrics Parsing: {'β
PASSED' if metrics_test_passed else 'β FAILED'}")
|
| 160 |
+
logger.info(f"Dataset Loading: {'β
PASSED' if dataset_test_passed else 'β FAILED'}")
|
| 161 |
+
|
| 162 |
+
if metrics_test_passed and dataset_test_passed:
|
| 163 |
+
logger.info("π All tests passed! The dataset fix should work correctly.")
|
| 164 |
+
sys.exit(0)
|
| 165 |
+
else:
|
| 166 |
+
logger.error("β Some tests failed. Please check the implementation.")
|
| 167 |
+
sys.exit(1)
|
tests/test_trackio_deployment.py
CHANGED
|
@@ -17,16 +17,16 @@ def test_templates_structure():
|
|
| 17 |
"""Test that the templates structure is correct"""
|
| 18 |
print("π Testing templates structure...")
|
| 19 |
|
| 20 |
-
|
| 21 |
|
| 22 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
| 23 |
|
| 24 |
for file_name in required_files:
|
| 25 |
-
file_path =
|
| 26 |
if file_path.exists():
|
| 27 |
-
print(f"β
{file_name} exists")
|
| 28 |
else:
|
| 29 |
-
print(f"β {file_name} missing")
|
| 30 |
return False
|
| 31 |
|
| 32 |
return True
|
|
@@ -35,7 +35,7 @@ def test_app_py_content():
|
|
| 35 |
"""Test that app.py has the required structure"""
|
| 36 |
print("\nπ Testing app.py content...")
|
| 37 |
|
| 38 |
-
app_path = project_root / "templates" / "spaces" / "app.py"
|
| 39 |
|
| 40 |
try:
|
| 41 |
with open(app_path, 'r', encoding='utf-8') as f:
|
|
|
|
| 17 |
"""Test that the templates structure is correct"""
|
| 18 |
print("π Testing templates structure...")
|
| 19 |
|
| 20 |
+
trackio_dir = project_root / "templates" / "spaces" / "trackio"
|
| 21 |
|
| 22 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
| 23 |
|
| 24 |
for file_name in required_files:
|
| 25 |
+
file_path = trackio_dir / file_name
|
| 26 |
if file_path.exists():
|
| 27 |
+
print(f"β
trackio/{file_name} exists")
|
| 28 |
else:
|
| 29 |
+
print(f"β trackio/{file_name} missing")
|
| 30 |
return False
|
| 31 |
|
| 32 |
return True
|
|
|
|
| 35 |
"""Test that app.py has the required structure"""
|
| 36 |
print("\nπ Testing app.py content...")
|
| 37 |
|
| 38 |
+
app_path = project_root / "templates" / "spaces" / "trackio" / "app.py"
|
| 39 |
|
| 40 |
try:
|
| 41 |
with open(app_path, 'r', encoding='utf-8') as f:
|
tests/test_trackio_space_diagnostics.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic script for Trackio Space issues
|
| 4 |
+
Helps debug dataset loading and API client issues
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
# Add src directory to path
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
| 13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'templates', 'spaces', 'trackio'))
|
| 14 |
+
|
| 15 |
+
# Setup logging
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
def test_dataset_manager():
|
| 20 |
+
"""Test dataset manager functionality"""
|
| 21 |
+
try:
|
| 22 |
+
from dataset_utils import TrackioDatasetManager
|
| 23 |
+
|
| 24 |
+
# Test with environment variables
|
| 25 |
+
hf_token = os.environ.get('HF_TOKEN')
|
| 26 |
+
dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
|
| 27 |
+
|
| 28 |
+
if not hf_token:
|
| 29 |
+
logger.warning("β οΈ HF_TOKEN not found in environment")
|
| 30 |
+
return False
|
| 31 |
+
|
| 32 |
+
logger.info(f"π§ Testing dataset manager with repo: {dataset_repo}")
|
| 33 |
+
|
| 34 |
+
# Initialize dataset manager
|
| 35 |
+
manager = TrackioDatasetManager(dataset_repo, hf_token)
|
| 36 |
+
|
| 37 |
+
# Test loading experiments
|
| 38 |
+
experiments = manager.load_existing_experiments()
|
| 39 |
+
logger.info(f"π Loaded {len(experiments)} experiments from dataset")
|
| 40 |
+
|
| 41 |
+
# Test creating a sample experiment
|
| 42 |
+
sample_experiment = {
|
| 43 |
+
'experiment_id': f'test_diagnostic_{int(os.urandom(4).hex(), 16)}',
|
| 44 |
+
'name': 'Diagnostic Test Experiment',
|
| 45 |
+
'description': 'Test experiment created by diagnostic script',
|
| 46 |
+
'created_at': '2025-01-27T12:00:00',
|
| 47 |
+
'status': 'completed',
|
| 48 |
+
'metrics': '[]',
|
| 49 |
+
'parameters': '{"test": true}',
|
| 50 |
+
'artifacts': '[]',
|
| 51 |
+
'logs': '[]',
|
| 52 |
+
'last_updated': '2025-01-27T12:00:00'
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
# Test upsert functionality
|
| 56 |
+
logger.info("π§ͺ Testing experiment upsert...")
|
| 57 |
+
success = manager.upsert_experiment(sample_experiment)
|
| 58 |
+
|
| 59 |
+
if success:
|
| 60 |
+
logger.info("β
Dataset manager working correctly")
|
| 61 |
+
|
| 62 |
+
# Verify the experiment was saved
|
| 63 |
+
experiments_after = manager.load_existing_experiments()
|
| 64 |
+
logger.info(f"π After upsert: {len(experiments_after)} experiments")
|
| 65 |
+
|
| 66 |
+
return True
|
| 67 |
+
else:
|
| 68 |
+
logger.error("β Failed to upsert test experiment")
|
| 69 |
+
return False
|
| 70 |
+
|
| 71 |
+
except ImportError as e:
|
| 72 |
+
logger.error(f"β Failed to import dataset_utils: {e}")
|
| 73 |
+
return False
|
| 74 |
+
except Exception as e:
|
| 75 |
+
logger.error(f"β Dataset manager test failed: {e}")
|
| 76 |
+
return False
|
| 77 |
+
|
| 78 |
+
def test_trackio_space():
|
| 79 |
+
"""Test TrackioSpace initialization"""
|
| 80 |
+
try:
|
| 81 |
+
# Import the TrackioSpace class
|
| 82 |
+
from app import TrackioSpace
|
| 83 |
+
|
| 84 |
+
logger.info("π§ͺ Testing TrackioSpace initialization...")
|
| 85 |
+
|
| 86 |
+
# Initialize TrackioSpace
|
| 87 |
+
space = TrackioSpace()
|
| 88 |
+
|
| 89 |
+
logger.info(f"π TrackioSpace initialized with {len(space.experiments)} experiments")
|
| 90 |
+
logger.info(f"π‘οΈ Dataset manager available: {'Yes' if space.dataset_manager else 'No'}")
|
| 91 |
+
logger.info(f"π HF Token available: {'Yes' if space.hf_token else 'No'}")
|
| 92 |
+
logger.info(f"π Dataset repo: {space.dataset_repo}")
|
| 93 |
+
|
| 94 |
+
return True
|
| 95 |
+
|
| 96 |
+
except ImportError as e:
|
| 97 |
+
logger.error(f"β Failed to import TrackioSpace: {e}")
|
| 98 |
+
return False
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.error(f"β TrackioSpace test failed: {e}")
|
| 101 |
+
return False
|
| 102 |
+
|
| 103 |
+
def test_environment():
|
| 104 |
+
"""Test environment configuration"""
|
| 105 |
+
logger.info("π Checking environment configuration...")
|
| 106 |
+
|
| 107 |
+
# Check required environment variables
|
| 108 |
+
env_vars = {
|
| 109 |
+
'HF_TOKEN': os.environ.get('HF_TOKEN'),
|
| 110 |
+
'TRACKIO_DATASET_REPO': os.environ.get('TRACKIO_DATASET_REPO'),
|
| 111 |
+
'TRACKIO_URL': os.environ.get('TRACKIO_URL'),
|
| 112 |
+
'SPACE_ID': os.environ.get('SPACE_ID')
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
for var, value in env_vars.items():
|
| 116 |
+
if value:
|
| 117 |
+
masked_value = value[:8] + '...' if len(value) > 8 and 'TOKEN' in var else value
|
| 118 |
+
logger.info(f"β
{var}: {masked_value}")
|
| 119 |
+
else:
|
| 120 |
+
logger.warning(f"β οΈ {var}: Not set")
|
| 121 |
+
|
| 122 |
+
# Check if running on HF Spaces
|
| 123 |
+
is_hf_spaces = bool(os.environ.get('SPACE_ID'))
|
| 124 |
+
logger.info(f"π Running on HF Spaces: {'Yes' if is_hf_spaces else 'No'}")
|
| 125 |
+
|
| 126 |
+
return True
|
| 127 |
+
|
| 128 |
+
def fix_common_issues():
|
| 129 |
+
"""Suggest fixes for common issues"""
|
| 130 |
+
logger.info("π‘ Common issue fixes:")
|
| 131 |
+
|
| 132 |
+
# Check dataset repository format
|
| 133 |
+
dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
|
| 134 |
+
if '/' not in dataset_repo:
|
| 135 |
+
logger.warning(f"β οΈ Dataset repo format issue: {dataset_repo} should be 'username/dataset-name'")
|
| 136 |
+
else:
|
| 137 |
+
logger.info(f"β
Dataset repo format looks good: {dataset_repo}")
|
| 138 |
+
|
| 139 |
+
# Check for URL issues
|
| 140 |
+
trackio_url = os.environ.get('TRACKIO_URL', 'https://tonic-test-trackio-test.hf.space')
|
| 141 |
+
if trackio_url.startswith('https://https://') or trackio_url.startswith('http://http://'):
|
| 142 |
+
logger.warning(f"β οΈ URL format issue detected: {trackio_url}")
|
| 143 |
+
fixed_url = trackio_url.replace('https://https://', 'https://').replace('http://http://', 'http://')
|
| 144 |
+
logger.info(f"π‘ Fixed URL should be: {fixed_url}")
|
| 145 |
+
else:
|
| 146 |
+
logger.info(f"β
Trackio URL format looks good: {trackio_url}")
|
| 147 |
+
|
| 148 |
+
def main():
|
| 149 |
+
"""Run all diagnostic tests"""
|
| 150 |
+
logger.info("π§ Starting Trackio Space diagnostics...")
|
| 151 |
+
logger.info("=" * 60)
|
| 152 |
+
|
| 153 |
+
try:
|
| 154 |
+
# Test environment
|
| 155 |
+
test_environment()
|
| 156 |
+
logger.info("-" * 40)
|
| 157 |
+
|
| 158 |
+
# Test dataset manager
|
| 159 |
+
dataset_manager_ok = test_dataset_manager()
|
| 160 |
+
logger.info("-" * 40)
|
| 161 |
+
|
| 162 |
+
# Test TrackioSpace
|
| 163 |
+
trackio_space_ok = test_trackio_space()
|
| 164 |
+
logger.info("-" * 40)
|
| 165 |
+
|
| 166 |
+
# Suggest fixes
|
| 167 |
+
fix_common_issues()
|
| 168 |
+
logger.info("-" * 40)
|
| 169 |
+
|
| 170 |
+
# Summary
|
| 171 |
+
logger.info("π DIAGNOSTIC SUMMARY:")
|
| 172 |
+
logger.info(f"Dataset Manager: {'β
OK' if dataset_manager_ok else 'β Issues'}")
|
| 173 |
+
logger.info(f"TrackioSpace: {'β
OK' if trackio_space_ok else 'β Issues'}")
|
| 174 |
+
|
| 175 |
+
if dataset_manager_ok and trackio_space_ok:
|
| 176 |
+
logger.info("π All systems appear to be working correctly!")
|
| 177 |
+
logger.info("π‘ The issues in the logs might be related to:")
|
| 178 |
+
logger.info(" - Empty dataset (expected for new setup)")
|
| 179 |
+
logger.info(" - API client URL formatting (being auto-fixed)")
|
| 180 |
+
logger.info(" - Remote data access (falling back to local data)")
|
| 181 |
+
else:
|
| 182 |
+
logger.warning("β οΈ Some issues detected. Check the logs above for details.")
|
| 183 |
+
|
| 184 |
+
except Exception as e:
|
| 185 |
+
logger.error(f"β Diagnostic script failed: {e}")
|
| 186 |
+
return False
|
| 187 |
+
|
| 188 |
+
return True
|
| 189 |
+
|
| 190 |
+
if __name__ == "__main__":
|
| 191 |
+
main()
|