Deeptanshuu committed
Commit d187b57 · verified · 1 Parent(s): 120843d

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .dockerignore +48 -0
  2. .env.template +9 -0
  3. .gitattributes +27 -35
  4. .gitignore +83 -0
  5. Dockerfile +29 -0
  6. analysis/analysis.txt +264 -0
  7. analysis/analyze_lang_distribution.py +336 -0
  8. analysis/compute_class_weights.py +499 -0
  9. analysis/plot_loss_curves.py +374 -0
  10. analysis/plot_roc_curves.py +163 -0
  11. app.py +262 -0
  12. augmentation/balance_english.py +237 -0
  13. augmentation/threat_augment.py +379 -0
  14. augmentation/toxic_augment.py +439 -0
  15. datacard.md +39 -0
  16. docker-compose.yml +13 -0
  17. evaluation_results/eval_20250208_161149/confusion_matrices/cm_identity_hate.png +0 -0
  18. evaluation_results/eval_20250208_161149/confusion_matrices/cm_insult.png +0 -0
  19. evaluation_results/eval_20250208_161149/confusion_matrices/cm_obscene.png +0 -0
  20. evaluation_results/eval_20250208_161149/confusion_matrices/cm_severe_toxic.png +0 -0
  21. evaluation_results/eval_20250208_161149/confusion_matrices/cm_threat.png +0 -0
  22. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic.png +0 -0
  23. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_0.png +0 -0
  24. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_1.png +0 -0
  25. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_2.png +0 -0
  26. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_3.png +0 -0
  27. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_4.png +0 -0
  28. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_5.png +0 -0
  29. evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_6.png +0 -0
  30. evaluation_results/eval_20250208_161149/eval_params.json +7 -0
  31. evaluation_results/eval_20250208_161149/evaluation_results.json +2020 -0
  32. evaluation_results/eval_20250208_161149/plots/calibration_0.png +3 -0
  33. evaluation_results/eval_20250208_161149/plots/calibration_1.png +3 -0
  34. evaluation_results/eval_20250208_161149/plots/calibration_2.png +3 -0
  35. evaluation_results/eval_20250208_161149/plots/calibration_3.png +3 -0
  36. evaluation_results/eval_20250208_161149/plots/calibration_4.png +3 -0
  37. evaluation_results/eval_20250208_161149/plots/calibration_5.png +3 -0
  38. evaluation_results/eval_20250208_161149/plots/calibration_6.png +3 -0
  39. evaluation_results/eval_20250208_161149/plots/class_calibration.png +3 -0
  40. evaluation_results/eval_20250208_161149/plots/language_performance.png +0 -0
  41. evaluation_results/eval_20250208_161149/plots/metric_correlations.png +0 -0
  42. evaluation_results/eval_20250208_161149/plots/overall_calibration.png +0 -0
  43. evaluation_results/eval_20250208_161149/plots/performance_distributions.png +0 -0
  44. evaluation_results/eval_20250208_161149/predictions.npz +3 -0
  45. evaluation_results/eval_20250208_161149/thresholds.json +58 -0
  46. evaluation_results/eval_20250401_143401/eval_params.json +21 -0
  47. evaluation_results/eval_20250401_143401/evaluation_results.json +684 -0
  48. evaluation_results/eval_20250401_143401/plots/per_class_comparison.png +0 -0
  49. evaluation_results/eval_20250401_143401/plots/roc_all_classes.png +3 -0
  50. evaluation_results/eval_20250401_143401/plots/roc_by_language.png +3 -0
.dockerignore ADDED
@@ -0,0 +1,48 @@
+ # Git
+ .git
+ .gitignore
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Project specific
+ dataset/
+ weights/
+ wandb/
+ *.pt
+ *.pth
+ *.ckpt
+
+ # Logs
+ *.log
+ logs/
.env.template ADDED
@@ -0,0 +1,9 @@
+ # Weights & Biases API Key
+ WANDB_API_KEY=
+
+ # Model Configuration
+ BATCH_SIZE=16
+ GRAD_ACCUM_STEPS=4
+ EPOCHS=5
+ LEARNING_RATE=2e-5
+ MODEL_NAME=xlm-roberta-large
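Note: the template above only declares the expected keys; how they are consumed is not visible in this commit. A minimal sketch, assuming the values are loaded into the process environment with python-dotenv (an assumption, this loader is not part of the diff), might look like:

import os
from dotenv import load_dotenv  # python-dotenv; assumed helper, not shown in this commit

load_dotenv()  # copies KEY=VALUE pairs from a local .env file into os.environ

train_config = {
    "wandb_api_key": os.getenv("WANDB_API_KEY", ""),
    "batch_size": int(os.getenv("BATCH_SIZE", "16")),
    "grad_accum_steps": int(os.getenv("GRAD_ACCUM_STEPS", "4")),
    "epochs": int(os.getenv("EPOCHS", "5")),
    "learning_rate": float(os.getenv("LEARNING_RATE", "2e-5")),
    "model_name": os.getenv("MODEL_NAME", "xlm-roberta-large"),
}

The fallback values simply mirror the defaults written in the template.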
.gitattributes CHANGED
@@ -1,35 +1,27 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG.csv filter=lfs diff=lfs merge=lfs -text
+ dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary.csv filter=lfs diff=lfs merge=lfs -text
+ dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv filter=lfs diff=lfs merge=lfs -text
+ dataset/split/train.csv filter=lfs diff=lfs merge=lfs -text
+ dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_0.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_1.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_2.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_3.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_4.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_5.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/calibration_6.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/plots/class_calibration.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250208_161149/predictions.npz filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_all_classes.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_by_language.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_identity_hate.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_insult.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_obscene.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_severe_toxic.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_threat.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/plots/roc_toxic.png filter=lfs diff=lfs merge=lfs -text
+ evaluation_results/eval_20250401_143401/predictions.npz filter=lfs diff=lfs merge=lfs -text
+ images/class_distribution.png filter=lfs diff=lfs merge=lfs -text
+ images/language_distribution.png filter=lfs diff=lfs merge=lfs -text
+ images/toxicity_by_language.png filter=lfs diff=lfs merge=lfs -text
+ images/toxicity_correlation.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,83 @@
+ # Python cache files
+ __pycache__/
+ *.py[cod]
+
+ # Virtual environment
+ venv/
+ ENV/
+ env/
+ env.bak/
+ venv.bak/
+
+ # Gradio
+ .gradio/*
+
+ # Weights and Biases
+ weights/*
+ dataset/*
+ cache/*
+ wandb/*
+
+ # IDE and editor files
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Jupyter Notebook checkpoints
+ .ipynb_checkpoints/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Pytest
+ .cache/
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ Pipfile.lock
+
+ # pyre type checker
+ .pyre/
+
+ # C extensions
+ *.so
+
+ # Backup files
+ *~
+ *.bak
+ *.tmp
+
+ #Logging
+ *.log
+ logs/
+
+ *.csv
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ # Use CUDA-enabled PyTorch base image
+ FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements file
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy project files
+ COPY . .
+
+ # Create directories for data and models
+ RUN mkdir -p dataset/final_balanced weights
+
+ # Set environment variables
+ ENV PYTHONPATH=/app
+ ENV WANDB_API_KEY=""
+
+ # Default command to run training
+ CMD ["python", "model/train.py"]
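A rough usage sketch for the image above, assuming it is driven directly with the Docker CLI rather than the docker-compose.yml added in this commit. The image tag is hypothetical, and WANDB_API_KEY is injected at run time because the Dockerfile deliberately leaves it empty:

import os
import subprocess

IMAGE = "toxic-comment-train"  # hypothetical tag; not defined anywhere in this commit

subprocess.run(["docker", "build", "-t", IMAGE, "."], check=True)
subprocess.run(
    [
        "docker", "run", "--rm",
        "--gpus", "all",  # the CUDA base image needs the NVIDIA container toolkit
        "-e", f"WANDB_API_KEY={os.environ.get('WANDB_API_KEY', '')}",
        IMAGE,  # runs the default CMD: python model/train.py
    ],
    check=True,
)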
analysis/analysis.txt ADDED
@@ -0,0 +1,264 @@
1
+ (venv) PS V:\Deeptanshu Lal\PROJECTS\Toxic Comment Classification> python .\analysis\analyze_lang_distribution.py
2
+ Reading dataset...
3
+
4
+ Dataset Overview:
5
+ --------------------------------------------------
6
+ Total number of comments: 361,228
7
+ Number of languages: 7
8
+
9
+ Language Distribution:
10
+ --------------------------------------------------
11
+ ru: 52,632 comments (14.57%)
12
+ tr: 52,558 comments (14.55%)
13
+ pt: 52,440 comments (14.52%)
14
+ es: 52,412 comments (14.51%)
15
+ fr: 52,368 comments (14.50%)
16
+ it: 52,340 comments (14.49%)
17
+ en: 46,478 comments (12.87%)
18
+
19
+ Class Distribution by Language:
20
+ --------------------------------------------------
21
+
22
+ RU (Total: 52,632 comments)
23
+ 0 toxic classes: 26,316 (50.00%)
24
+ 1 toxic classes: 7,688 (14.61%)
25
+ 2 toxic classes: 8,010 (15.22%)
26
+ 3 toxic classes: 7,103 (13.50%)
27
+ 4 toxic classes: 2,740 (5.21%)
28
+ 5 toxic classes: 706 (1.34%)
29
+ 6 toxic classes: 69 (0.13%)
30
+
31
+ TR (Total: 52,558 comments)
32
+ 0 toxic classes: 26,279 (50.00%)
33
+ 1 toxic classes: 7,677 (14.61%)
34
+ 2 toxic classes: 8,004 (15.23%)
35
+ 3 toxic classes: 7,088 (13.49%)
36
+ 4 toxic classes: 2,736 (5.21%)
37
+ 5 toxic classes: 705 (1.34%)
38
+ 6 toxic classes: 69 (0.13%)
39
+
40
+ PT (Total: 52,440 comments)
41
+ 0 toxic classes: 26,220 (50.00%)
42
+ 1 toxic classes: 7,668 (14.62%)
43
+ 2 toxic classes: 7,977 (15.21%)
44
+ 3 toxic classes: 7,071 (13.48%)
45
+ 4 toxic classes: 2,732 (5.21%)
46
+ 5 toxic classes: 703 (1.34%)
47
+ 6 toxic classes: 69 (0.13%)
48
+
49
+ ES (Total: 52,412 comments)
50
+ 0 toxic classes: 26,206 (50.00%)
51
+ 1 toxic classes: 7,647 (14.59%)
52
+ 2 toxic classes: 7,982 (15.23%)
53
+ 3 toxic classes: 7,069 (13.49%)
54
+ 4 toxic classes: 2,737 (5.22%)
55
+ 5 toxic classes: 702 (1.34%)
56
+ 6 toxic classes: 69 (0.13%)
57
+
58
+ FR (Total: 52,368 comments)
59
+ 0 toxic classes: 26,184 (50.00%)
60
+ 1 toxic classes: 7,626 (14.56%)
61
+ 2 toxic classes: 7,990 (15.26%)
62
+ 3 toxic classes: 7,066 (13.49%)
63
+ 4 toxic classes: 2,728 (5.21%)
64
+ 5 toxic classes: 705 (1.35%)
65
+ 6 toxic classes: 69 (0.13%)
66
+
67
+ IT (Total: 52,340 comments)
68
+ 0 toxic classes: 26,170 (50.00%)
69
+ 1 toxic classes: 7,652 (14.62%)
70
+ 2 toxic classes: 7,967 (15.22%)
71
+ 3 toxic classes: 7,057 (13.48%)
72
+ 4 toxic classes: 2,722 (5.20%)
73
+ 5 toxic classes: 703 (1.34%)
74
+ 6 toxic classes: 69 (0.13%)
75
+
76
+ EN (Total: 46,478 comments)
77
+ 0 toxic classes: 22,989 (49.46%)
78
+ 1 toxic classes: 8,499 (18.29%)
79
+ 2 toxic classes: 5,604 (12.06%)
80
+ 3 toxic classes: 6,391 (13.75%)
81
+ 4 toxic classes: 2,395 (5.15%)
82
+ 5 toxic classes: 553 (1.19%)
83
+ 6 toxic classes: 47 (0.10%)
84
+
85
+ Detailed Toxicity Analysis by Language:
86
+ --------------------------------------------------
87
+
88
+ RU (Total: 52,632 comments)
89
+ - Toxic:
90
+ Count: 25,954 (49.31%)
91
+ 95% CI: [48.89%, 49.74%]
92
+ - Severe Toxic:
93
+ Count: 2,441 (4.64%)
94
+ 95% CI: [4.46%, 4.82%]
95
+ - Obscene:
96
+ Count: 12,432 (23.62%)
97
+ 95% CI: [23.26%, 23.98%]
98
+ - Threat:
99
+ Count: 1,075 (2.04%)
100
+ 95% CI: [1.92%, 2.16%]
101
+ - Insult:
102
+ Count: 15,207 (28.89%)
103
+ 95% CI: [28.51%, 29.28%]
104
+ - Identity Hate:
105
+ Count: 2,812 (5.34%)
106
+ 95% CI: [5.15%, 5.53%]
107
+
108
+ TR (Total: 52,558 comments)
109
+ - Toxic:
110
+ Count: 25,908 (49.29%)
111
+ 95% CI: [48.87%, 49.72%]
112
+ - Severe Toxic:
113
+ Count: 2,439 (4.64%)
114
+ 95% CI: [4.46%, 4.82%]
115
+ - Obscene:
116
+ Count: 12,411 (23.61%)
117
+ 95% CI: [23.25%, 23.98%]
118
+ - Threat:
119
+ Count: 1,077 (2.05%)
120
+ 95% CI: [1.93%, 2.17%]
121
+ - Insult:
122
+ Count: 15,170 (28.86%)
123
+ 95% CI: [28.48%, 29.25%]
124
+ - Identity Hate:
125
+ Count: 2,827 (5.38%)
126
+ 95% CI: [5.19%, 5.57%]
127
+
128
+ PT (Total: 52,440 comments)
129
+ - Toxic:
130
+ Count: 25,841 (49.28%)
131
+ 95% CI: [48.85%, 49.71%]
132
+ - Severe Toxic:
133
+ Count: 2,432 (4.64%)
134
+ 95% CI: [4.46%, 4.82%]
135
+ - Obscene:
136
+ Count: 12,395 (23.64%)
137
+ 95% CI: [23.27%, 24.00%]
138
+ - Threat:
139
+ Count: 1,080 (2.06%)
140
+ 95% CI: [1.94%, 2.18%]
141
+ - Insult:
142
+ Count: 15,143 (28.88%)
143
+ 95% CI: [28.49%, 29.26%]
144
+ - Identity Hate:
145
+ Count: 2,801 (5.34%)
146
+ 95% CI: [5.15%, 5.53%]
147
+
148
+ ES (Total: 52,412 comments)
149
+ - Toxic:
150
+ Count: 25,874 (49.37%)
151
+ 95% CI: [48.94%, 49.79%]
152
+ - Severe Toxic:
153
+ Count: 2,432 (4.64%)
154
+ 95% CI: [4.46%, 4.82%]
155
+ - Obscene:
156
+ Count: 12,388 (23.64%)
157
+ 95% CI: [23.27%, 24.00%]
158
+ - Threat:
159
+ Count: 1,073 (2.05%)
160
+ 95% CI: [1.93%, 2.17%]
161
+ - Insult:
162
+ Count: 15,140 (28.89%)
163
+ 95% CI: [28.50%, 29.27%]
164
+ - Identity Hate:
165
+ Count: 2,783 (5.31%)
166
+ 95% CI: [5.12%, 5.50%]
167
+
168
+ FR (Total: 52,368 comments)
169
+ - Toxic:
170
+ Count: 25,877 (49.41%)
171
+ 95% CI: [48.99%, 49.84%]
172
+ - Severe Toxic:
173
+ Count: 2,428 (4.64%)
174
+ 95% CI: [4.46%, 4.82%]
175
+ - Obscene:
176
+ Count: 12,379 (23.64%)
177
+ 95% CI: [23.27%, 24.00%]
178
+ - Threat:
179
+ Count: 1,066 (2.04%)
180
+ 95% CI: [1.91%, 2.16%]
181
+ - Insult:
182
+ Count: 15,131 (28.89%)
183
+ 95% CI: [28.51%, 29.28%]
184
+ - Identity Hate:
185
+ Count: 2,774 (5.30%)
186
+ 95% CI: [5.11%, 5.49%]
187
+
188
+ IT (Total: 52,340 comments)
189
+ - Toxic:
190
+ Count: 25,827 (49.34%)
191
+ 95% CI: [48.92%, 49.77%]
192
+ - Severe Toxic:
193
+ Count: 2,429 (4.64%)
194
+ 95% CI: [4.46%, 4.82%]
195
+ - Obscene:
196
+ Count: 12,341 (23.58%)
197
+ 95% CI: [23.21%, 23.94%]
198
+ - Threat:
199
+ Count: 1,077 (2.06%)
200
+ 95% CI: [1.94%, 2.18%]
201
+ - Insult:
202
+ Count: 15,118 (28.88%)
203
+ 95% CI: [28.50%, 29.27%]
204
+ - Identity Hate:
205
+ Count: 2,782 (5.32%)
206
+ 95% CI: [5.12%, 5.51%]
207
+
208
+ EN (Total: 46,478 comments)
209
+ - Toxic:
210
+ Count: 22,343 (48.07%)
211
+ 95% CI: [47.62%, 48.53%]
212
+ - Severe Toxic:
213
+ Count: 1,986 (4.27%)
214
+ 95% CI: [4.09%, 4.46%]
215
+ - Obscene:
216
+ Count: 12,356 (26.58%)
217
+ 95% CI: [26.18%, 26.99%]
218
+ - Threat:
219
+ Count: 1,204 (2.59%)
220
+ 95% CI: [2.45%, 2.73%]
221
+ - Insult:
222
+ Count: 11,475 (24.69%)
223
+ 95% CI: [24.30%, 25.08%]
224
+ - Identity Hate:
225
+ Count: 2,143 (4.61%)
226
+ 95% CI: [4.42%, 4.80%]
227
+
228
+ Statistical Analysis:
229
+ --------------------------------------------------
230
+
231
+ Chi-square test for number of toxic classes by language:
232
+ Chi-square statistic: 654.28
233
+ p-value: 0.0000000000
234
+ Significant at α=0.05: Yes
235
+
236
+ Chi-square test for Toxic:
237
+ Chi-square statistic: 26.10
238
+ p-value: 0.0002136602
239
+ Significant at α=0.05: Yes
240
+
241
+ Chi-square test for Severe Toxic:
242
+ Chi-square statistic: 12.38
243
+ p-value: 0.0540052211
244
+ Significant at α=0.05: No
245
+
246
+ Chi-square test for Obscene:
247
+ Chi-square statistic: 195.12
248
+ p-value: 0.0000000000
249
+ Significant at α=0.05: Yes
250
+
251
+ Chi-square test for Threat:
252
+ Chi-square statistic: 57.45
253
+ p-value: 0.0000000001
254
+ Significant at α=0.05: Yes
255
+
256
+ Chi-square test for Insult:
257
+ Chi-square statistic: 350.72
258
+ p-value: 0.0000000000
259
+ Significant at α=0.05: Yes
260
+
261
+ Chi-square test for Identity Hate:
262
+ Chi-square statistic: 42.77
263
+ p-value: 0.0000001295
264
+ Significant at α=0.05: Yes
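The 95% confidence intervals in the log above are normal-approximation (Wald) intervals on each label's prevalence, matching the stats.norm.interval call in analysis/analyze_lang_distribution.py further down in this commit:

\hat{p} \pm z_{0.975}\sqrt{\hat{p}(1-\hat{p})/n}, \qquad z_{0.975} \approx 1.96

For example, RU "Toxic" has \hat{p} = 0.4931 with n = 52{,}632, so the half-width is about 1.96\sqrt{0.4931 \cdot 0.5069 / 52632} \approx 0.0043, i.e. roughly ±0.43 percentage points, consistent with the reported [48.89%, 49.74%].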
analysis/analyze_lang_distribution.py ADDED
@@ -0,0 +1,336 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ import numpy as np
5
+ from scipy import stats
6
+ import os
7
+
8
+ def set_style():
9
+ """Set the style for all plots"""
10
+ # Use a basic style instead of seaborn
11
+ plt.style.use('default')
12
+
13
+ # Custom style settings
14
+ plt.rcParams['figure.figsize'] = (12, 6)
15
+ plt.rcParams['font.size'] = 10
16
+ plt.rcParams['axes.titlesize'] = 14
17
+ plt.rcParams['axes.labelsize'] = 12
18
+ plt.rcParams['axes.grid'] = True
19
+ plt.rcParams['grid.alpha'] = 0.3
20
+
21
+ # Custom color palette
22
+ colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFCC99', '#FF99CC', '#99FFCC', '#FFB366']
23
+ return colors
24
+
25
+ def create_language_distribution_plot(df, lang_dist, lang_percent, colors, image_dir):
26
+ """Create and save language distribution plot"""
27
+ plt.figure(figsize=(14, 8))
28
+
29
+ # Create bar positions
30
+ x = np.arange(len(lang_dist))
31
+
32
+ # Create bars with language names as x-ticks
33
+ bars = plt.bar(x, lang_dist.values, color=colors)
34
+ plt.title('Language Distribution in Multilingual Toxic Comment Dataset', pad=20)
35
+ plt.xlabel('Language', labelpad=10)
36
+ plt.ylabel('Number of Comments', labelpad=10)
37
+
38
+ # Set x-ticks to language names
39
+ plt.xticks(x, lang_dist.index, rotation=45)
40
+
41
+ # Add value labels on top of each bar with increased spacing
42
+ for i, bar in enumerate(bars):
43
+ height = bar.get_height()
44
+ plt.text(bar.get_x() + bar.get_width()/2., height + (max(lang_dist.values) * 0.01),
45
+ f'{int(height):,}\n({lang_percent.values[i]:.1f}%)',
46
+ ha='center', va='bottom', fontsize=10)
47
+
48
+ # Add some padding to the top of the plot
49
+ plt.margins(y=0.2)
50
+
51
+ plt.tight_layout()
52
+ plt.savefig(os.path.join(image_dir, 'language_distribution.png'), dpi=300, bbox_inches='tight')
53
+ plt.close()
54
+
55
+ def create_toxicity_heatmap(df, toxicity_cols, image_dir):
56
+ """Create and save toxicity correlation heatmap"""
57
+ plt.figure(figsize=(12, 10))
58
+
59
+ # Calculate correlation and sort
60
+ correlation = df[toxicity_cols].corr()
61
+
62
+ # Sort correlation matrix by mean correlation value
63
+ mean_corr = correlation.mean()
64
+ sorted_cols = mean_corr.sort_values(ascending=False).index
65
+ correlation = correlation.loc[sorted_cols, sorted_cols]
66
+
67
+ # Create heatmap with better styling
68
+ im = plt.imshow(correlation, cmap='RdYlBu_r', aspect='equal', vmin=0, vmax=1)
69
+ plt.colorbar(im, label='Correlation Coefficient')
70
+
71
+ # Add text annotations with conditional formatting
72
+ for i in range(len(correlation)):
73
+ for j in range(len(correlation)):
74
+ corr_value = correlation.iloc[i, j]
75
+ # Choose text color based on background
76
+ text_color = 'white' if abs(corr_value) > 0.7 else 'black'
77
+ # Make diagonal elements bold
78
+ fontweight = 'bold' if i == j else 'normal'
79
+ plt.text(j, i, f'{corr_value:.2f}',
80
+ ha='center', va='center',
81
+ color=text_color,
82
+ fontweight=fontweight,
83
+ fontsize=10)
84
+
85
+ # Improve title and labels
86
+ plt.title('Correlation between Different Types of Toxicity\n(Sorted by Average Correlation)',
87
+ pad=20, fontsize=14)
88
+
89
+ # Format axis labels
90
+ formatted_labels = [col.replace('_', ' ').title() for col in correlation.columns]
91
+ plt.xticks(range(len(formatted_labels)), formatted_labels, rotation=45, ha='right')
92
+ plt.yticks(range(len(formatted_labels)), formatted_labels)
93
+
94
+ # Add gridlines
95
+ plt.grid(False)
96
+
97
+ # Adjust layout
98
+ plt.tight_layout()
99
+ plt.savefig(os.path.join(image_dir, 'toxicity_correlation.png'), dpi=300, bbox_inches='tight')
100
+ plt.close()
101
+
102
+ def create_toxicity_by_language_plot(df, lang_dist, toxicity_cols, colors, image_dir):
103
+ """Create and save toxicity distribution by language plot"""
104
+ plt.figure(figsize=(15, 8))
105
+
106
+ x = np.arange(len(lang_dist.index))
107
+ width = 0.15
108
+ multiplier = 0
109
+
110
+ for attribute, color in zip(toxicity_cols, colors):
111
+ # Calculate percentage of toxic comments (any value > 0)
112
+ attribute_means = [(df[df['lang'] == lang][attribute] > 0).mean() * 100
113
+ for lang in lang_dist.index]
114
+
115
+ offset = width * multiplier
116
+ rects = plt.bar(x + offset, attribute_means, width,
117
+ label=attribute.replace('_', ' ').title(),
118
+ color=color, alpha=0.8)
119
+
120
+ # Add value labels on the bars
121
+ for rect in rects:
122
+ height = rect.get_height()
123
+ plt.text(rect.get_x() + rect.get_width()/2., height,
124
+ f'{height:.1f}%', ha='center', va='bottom', fontsize=8)
125
+
126
+ multiplier += 1
127
+
128
+ plt.xlabel('Language')
129
+ plt.ylabel('Percentage of Toxic Comments (%)')
130
+ plt.title('Distribution of Toxicity Types by Language')
131
+ plt.xticks(x + width * 2.5, lang_dist.index, rotation=45)
132
+ plt.legend(loc='upper right', bbox_to_anchor=(1, 1))
133
+ plt.grid(True, alpha=0.3)
134
+
135
+ plt.tight_layout()
136
+ plt.savefig(os.path.join(image_dir, 'toxicity_by_language.png'), dpi=300, bbox_inches='tight')
137
+ plt.close()
138
+
139
+ def create_class_distribution_plot(df, lang_dist, image_dir):
140
+ """Create and save class distribution across languages plot"""
141
+ plt.figure(figsize=(16, 10))
142
+
143
+ # Define toxicity columns and their display names
144
+ toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
145
+ display_names = [col.replace('_', ' ').title() for col in toxicity_cols]
146
+
147
+ # Calculate class distribution for each language
148
+ class_dist = {}
149
+ non_toxic_dist = {} # Store non-toxic percentages
150
+ for lang in lang_dist.index:
151
+ lang_df = df[df['lang'] == lang]
152
+ total = len(lang_df)
153
+
154
+ # Create a binary matrix of toxicity flags
155
+ toxic_matrix = lang_df[toxicity_cols].astype(bool)
156
+
157
+ # Calculate non-toxic percentage (comments with no toxic flags)
158
+ non_toxic_mask = ~toxic_matrix.any(axis=1)
159
+ non_toxic_percent = (non_toxic_mask.sum() / total) * 100
160
+ non_toxic_dist[lang] = non_toxic_percent
161
+
162
+ # Calculate percentages for each toxicity type
163
+ class_dist[lang] = [(toxic_matrix[col].sum() / total) * 100 for col in toxicity_cols]
164
+
165
+ # Create stacked bar chart
166
+ x = np.arange(len(lang_dist.index))
167
+
168
+ # Use a color scheme with an additional color for non-toxic
169
+ colors = plt.cm.Set3(np.linspace(0, 1, len(toxicity_cols) + 1))
170
+
171
+ # First, plot non-toxic comments
172
+ non_toxic_values = [non_toxic_dist[lang] for lang in lang_dist.index]
173
+ non_toxic_bar = plt.bar(x, non_toxic_values, label='Non-Toxic', color=colors[0], alpha=0.9)
174
+
175
+ # Add percentage labels for non-toxic
176
+ for j, v in enumerate(non_toxic_values):
177
+ if v > 1: # Show all values above 1%
178
+ plt.text(x[j], v/2, f'{v:.1f}%',
179
+ ha='center', va='center',
180
+ color='black',
181
+ fontweight='bold',
182
+ fontsize=9)
183
+
184
+ # Initialize bottom array with non-toxic values
185
+ bottom = np.array(non_toxic_values)
186
+
187
+ # Then plot toxic categories
188
+ bars = [non_toxic_bar]
189
+ for i, (col, display_name) in enumerate(zip(toxicity_cols, display_names)):
190
+ values = [class_dist[lang][i] for lang in lang_dist.index]
191
+ bar = plt.bar(x, values, bottom=bottom, label=display_name, color=colors[i+1], alpha=0.9)
192
+ bars.append(bar)
193
+
194
+ # Add percentage labels for all values > 1%
195
+ for j, v in enumerate(values):
196
+ if v > 1: # Show all values above 1%
197
+ center = bottom[j] + v/2
198
+ text_color = 'black' if v > 10 else 'black'
199
+ plt.text(x[j], center, f'{v:.1f}%',
200
+ ha='center', va='center',
201
+ color=text_color,
202
+ fontweight='bold',
203
+ fontsize=9)
204
+ bottom = bottom + np.array(values) # Update bottom array correctly
205
+
206
+ plt.xlabel('Language', labelpad=10, fontsize=12)
207
+ plt.ylabel('Percentage of Comments', labelpad=10, fontsize=12)
208
+ plt.title('Distribution of Non-Toxic and Toxic Comments by Language', pad=20, fontsize=14)
209
+ plt.xticks(x, lang_dist.index, rotation=45, fontsize=10)
210
+
211
+ # Adjust legend
212
+ plt.legend(title='Comment Types',
213
+ bbox_to_anchor=(1.15, 1),
214
+ loc='upper left',
215
+ fontsize=10,
216
+ title_fontsize=12)
217
+
218
+ # Add grid for better readability
219
+ plt.grid(True, axis='y', alpha=0.3)
220
+
221
+ # Adjust layout to prevent label cutoff
222
+ plt.margins(y=0.1)
223
+ plt.tight_layout()
224
+ plt.savefig(os.path.join(image_dir, 'class_distribution.png'), dpi=300, bbox_inches='tight')
225
+ plt.close()
226
+
227
+ def analyze_language_distribution():
228
+ """Analyze language distribution and toxicity patterns in the dataset"""
229
+ # Create images directory if it doesn't exist
230
+ image_dir = 'images'
231
+ os.makedirs(image_dir, exist_ok=True)
232
+
233
+ # Set style and get color palette
234
+ colors = set_style()
235
+
236
+ # Read the dataset
237
+ print("Reading dataset...")
238
+ input_file = 'dataset/split/train.csv'
239
+ df = pd.read_csv(input_file)
240
+
241
+ # Get language distribution
242
+ lang_dist = df['lang'].value_counts()
243
+ lang_percent = df['lang'].value_counts(normalize=True) * 100
244
+
245
+ # Print basic statistics
246
+ print("\nDataset Overview:")
247
+ print("-" * 50)
248
+ print("Input file: ", input_file)
249
+ print(f"Total number of comments: {len(df):,}")
250
+ print(f"Number of languages: {df['lang'].nunique()}")
251
+
252
+ print("\nLanguage Distribution:")
253
+ print("-" * 50)
254
+ for lang, count in lang_dist.items():
255
+ print(f"{lang}: {count:,} comments ({lang_percent[lang]:.2f}%)")
256
+
257
+ # Create language distribution plot
258
+ create_language_distribution_plot(df, lang_dist, lang_percent, colors, image_dir)
259
+
260
+ # Analyze toxicity
261
+ toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
262
+
263
+ # Create correlation heatmap
264
+ create_toxicity_heatmap(df, toxicity_cols, image_dir)
265
+
266
+ # Create toxicity by language plot
267
+ create_toxicity_by_language_plot(df, lang_dist, toxicity_cols, colors, image_dir)
268
+
269
+ # Create class distribution plot
270
+ create_class_distribution_plot(df, lang_dist, image_dir)
271
+
272
+ # Print class distribution statistics
273
+ print("\nClass Distribution by Language:")
274
+ print("-" * 50)
275
+
276
+ for lang in lang_dist.index:
277
+ lang_df = df[df['lang'] == lang]
278
+ total = len(lang_df)
279
+
280
+ print(f"\n{lang.upper()} (Total: {total:,} comments)")
281
+
282
+ # Count comments by number of toxic classes
283
+ toxic_counts = lang_df[toxicity_cols].astype(bool).sum(axis=1)
284
+ class_dist = toxic_counts.value_counts().sort_index()
285
+
286
+ for n_classes, count in class_dist.items():
287
+ percentage = (count / total) * 100
288
+ print(f"{n_classes} toxic classes: {count:,} ({percentage:.2f}%)")
289
+
290
+ # Detailed toxicity analysis by language
291
+ print("\nDetailed Toxicity Analysis by Language:")
292
+ print("-" * 50)
293
+
294
+ for lang in lang_dist.index:
295
+ lang_df = df[df['lang'] == lang]
296
+ print(f"\n{lang.upper()} (Total: {len(lang_df):,} comments)")
297
+
298
+ # Calculate toxicity statistics
299
+ for col in toxicity_cols:
300
+ toxic_count = (lang_df[col] > 0).sum()
301
+ toxic_percent = (toxic_count / len(lang_df)) * 100
302
+
303
+ # Calculate confidence interval
304
+ ci = stats.norm.interval(0.95,
305
+ loc=toxic_percent/100,
306
+ scale=np.sqrt((toxic_percent/100 * (1-toxic_percent/100)) / len(lang_df)))
307
+ ci_lower, ci_upper = ci[0] * 100, ci[1] * 100
308
+
309
+ print(f"- {col.replace('_', ' ').title()}:")
310
+ print(f" Count: {toxic_count:,} ({toxic_percent:.2f}%)")
311
+ print(f" 95% CI: [{ci_lower:.2f}%, {ci_upper:.2f}%]")
312
+
313
+ # Statistical tests
314
+ print("\nStatistical Analysis:")
315
+ print("-" * 50)
316
+
317
+ # Chi-square test for independence between language and number of toxic classes
318
+ toxic_class_counts = pd.crosstab(df['lang'], df[toxicity_cols].astype(bool).sum(axis=1))
319
+ chi2, p_value, _, _ = stats.chi2_contingency(toxic_class_counts)
320
+ print("\nChi-square test for number of toxic classes by language:")
321
+ print(f"Chi-square statistic: {chi2:.2f}")
322
+ print(f"p-value: {p_value:.10f}")
323
+ print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")
324
+
325
+ # Chi-square test for each toxicity type
326
+ for col in toxicity_cols:
327
+ binary_col = (df[col] > 0).astype(int)
328
+ contingency_table = pd.crosstab(df['lang'], binary_col)
329
+ chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
330
+ print(f"\nChi-square test for {col.replace('_', ' ').title()}:")
331
+ print(f"Chi-square statistic: {chi2:.2f}")
332
+ print(f"p-value: {p_value:.10f}")
333
+ print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")
334
+
335
+ if __name__ == "__main__":
336
+ analyze_language_distribution()
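As a self-contained illustration of the contingency-table test used in the script above (the counts below are invented purely for demonstration; the real script builds the table from dataset/split/train.csv):

import pandas as pd
from scipy import stats

# Toy language-by-label data: three languages with very different toxic rates.
toy = pd.DataFrame({
    "lang": ["en", "en", "fr", "fr", "ru", "ru"] * 100,
    "toxic": [1, 0, 0, 0, 1, 1] * 100,
})
table = pd.crosstab(toy["lang"], toy["toxic"])
chi2, p_value, dof, expected = stats.chi2_contingency(table)
print(f"Chi-square statistic: {chi2:.2f}")
print(f"p-value: {p_value:.10f}")
print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")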
analysis/compute_class_weights.py ADDED
@@ -0,0 +1,499 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import json
4
+ from typing import Dict, List
5
+ import logging
6
+
7
+ # Configure logging
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s - %(levelname)s - %(message)s'
11
+ )
12
+
13
+ def validate_parameters(params: Dict) -> Dict:
14
+ """
15
+ Validate weight calculation parameters to prevent dangerous combinations.
16
+ Includes validation for focal loss parameters.
17
+ """
18
+ # Check for dangerous weight scaling
19
+ if params['boost_factor'] * params['max_weight'] > 30:
20
+ raise ValueError(f"Dangerous weight scaling detected: boost_factor * max_weight = {params['boost_factor'] * params['max_weight']}")
21
+
22
+ # Validate focal loss parameters
23
+ if not 0 < params['gamma'] <= 5.0:
24
+ raise ValueError(f"Invalid gamma value: {params['gamma']}. Must be in (0, 5.0]")
25
+
26
+ if not 0 < params['alpha'] < 1:
27
+ raise ValueError(f"Invalid alpha value: {params['alpha']}. Must be in (0, 1)")
28
+
29
+ # Check for potentially unstable combinations
30
+ if params['gamma'] > 3.0 and params['boost_factor'] > 1.5:
31
+ logging.warning(f"Potentially unstable combination: high gamma ({params['gamma']}) with high boost factor ({params['boost_factor']})")
32
+
33
+ if params['alpha'] > 0.4 and params['boost_factor'] > 1.5:
34
+ logging.warning(f"Potentially unstable combination: high alpha ({params['alpha']}) with high boost factor ({params['boost_factor']})")
35
+
36
+ return params
37
+
38
+ def calculate_safe_weights(
39
+ support_0: int,
40
+ support_1: int,
41
+ max_weight: float = 15.0,
42
+ min_weight: float = 0.5,
43
+ gamma: float = 2.0,
44
+ alpha: float = 0.25,
45
+ boost_factor: float = 1.0,
46
+ num_classes: int = 6,
47
+ lang: str = None,
48
+ toxicity_type: str = None
49
+ ) -> Dict[str, float]:
50
+ """
51
+ Calculate class weights with focal loss and adaptive scaling.
52
+ Uses focal loss components for better handling of imbalanced classes
53
+ while preserving language-specific adjustments.
54
+
55
+ Args:
56
+ support_0: Number of negative samples
57
+ support_1: Number of positive samples
58
+ max_weight: Maximum allowed weight
59
+ min_weight: Minimum allowed weight
60
+ gamma: Focal loss gamma parameter for down-weighting easy examples
61
+ alpha: Focal loss alpha parameter for balancing positive/negative classes
62
+ boost_factor: Optional boost for specific classes
63
+ num_classes: Number of toxicity classes (default=6)
64
+ lang: Language code for language-specific constraints
65
+ toxicity_type: Type of toxicity for class-specific constraints
66
+ """
67
+ # Input validation with detailed error messages
68
+ if support_0 < 0 or support_1 < 0:
69
+ raise ValueError(f"Negative sample counts: support_0={support_0}, support_1={support_1}")
70
+
71
+ eps = 1e-7 # Small epsilon for numerical stability
72
+ total = support_0 + support_1 + eps
73
+
74
+ # Handle empty dataset case
75
+ if total <= eps:
76
+ logging.warning(f"Empty dataset for {toxicity_type} in {lang}")
77
+ return {
78
+ "0": 1.0,
79
+ "1": 1.0,
80
+ "support_0": support_0,
81
+ "support_1": support_1,
82
+ "raw_weight_1": 1.0,
83
+ "calculation_metadata": {
84
+ "formula": "default_weights_empty_dataset",
85
+ "constraints_applied": ["empty_dataset_fallback"]
86
+ }
87
+ }
88
+
89
+ # Handle zero support cases safely
90
+ if support_1 == 0:
91
+ logging.warning(f"No positive samples for {toxicity_type} in {lang}")
92
+ return {
93
+ "0": 1.0,
94
+ "1": max_weight,
95
+ "support_0": support_0,
96
+ "support_1": support_1,
97
+ "raw_weight_1": max_weight,
98
+ "calculation_metadata": {
99
+ "formula": "max_weight_no_positives",
100
+ "constraints_applied": ["no_positives_fallback"]
101
+ }
102
+ }
103
+
104
+ # Determine effective maximum weight based on class and language
105
+ if lang == 'en' and toxicity_type == 'threat':
106
+ effective_max = min(max_weight, 15.0) # Absolute cap for EN threat
107
+ elif toxicity_type == 'identity_hate':
108
+ effective_max = min(max_weight, 10.0) # Cap for identity hate
109
+ else:
110
+ effective_max = max_weight
111
+
112
+ try:
113
+ # Calculate class frequencies
114
+ freq_1 = support_1 / total
115
+ freq_0 = support_0 / total
116
+
117
+ # Focal loss components
118
+ pt = freq_1 + eps # Probability of target class
119
+ modulating_factor = (1 - pt) ** gamma
120
+ balanced_alpha = alpha / (alpha + (1 - alpha) * (1 - pt))
121
+
122
+ # Base weight calculation with focal loss
123
+ raw_weight_1 = balanced_alpha * modulating_factor / (pt + eps)
124
+
125
+ # Apply adaptive scaling for severe classes
126
+ if toxicity_type in ['threat', 'identity_hate']:
127
+ severity_factor = (1 + np.log1p(total) / np.log1p(support_1)) / 2
128
+ raw_weight_1 *= severity_factor
129
+
130
+ # Apply boost factor
131
+ raw_weight_1 *= boost_factor
132
+
133
+ # Detect potential numerical instability
134
+ if not np.isfinite(raw_weight_1):
135
+ logging.error(f"Numerical instability detected for {toxicity_type} in {lang}")
136
+ raw_weight_1 = effective_max
137
+
138
+ except Exception as e:
139
+ logging.error(f"Weight calculation error: {str(e)}")
140
+ raw_weight_1 = effective_max
141
+
142
+ # Apply safety limits with effective maximum
143
+ weight_1 = min(effective_max, max(min_weight, raw_weight_1))
144
+ weight_0 = 1.0 # Reference weight for majority class
145
+
146
+ # Round weights for consistency and to prevent floating point issues
147
+ weight_1 = round(float(weight_1), 3)
148
+ weight_0 = round(float(weight_0), 3)
149
+
150
+ return {
151
+ "0": weight_0,
152
+ "1": weight_1,
153
+ "support_0": support_0,
154
+ "support_1": support_1,
155
+ "raw_weight_1": round(float(raw_weight_1), 3),
156
+ "calculation_metadata": {
157
+ "formula": "focal_loss_with_adaptive_scaling",
158
+ "gamma": round(float(gamma), 3),
159
+ "alpha": round(float(alpha), 3),
160
+ "final_pt": round(float(pt), 4),
161
+ "effective_max": round(float(effective_max), 3),
162
+ "modulating_factor": round(float(modulating_factor), 4),
163
+ "balanced_alpha": round(float(balanced_alpha), 4),
164
+ "severity_adjusted": toxicity_type in ['threat', 'identity_hate'],
165
+ "boost_factor": round(float(boost_factor), 3),
166
+ "constraints_applied": [
167
+ f"max_weight={effective_max}",
168
+ f"boost={boost_factor}",
169
+ f"numerical_stability=enforced",
170
+ f"adaptive_scaling={'enabled' if toxicity_type in ['threat', 'identity_hate'] else 'disabled'}"
171
+ ]
172
+ }
173
+ }
174
+
175
+ def get_language_specific_params(lang: str, toxicity_type: str) -> Dict:
176
+ """
177
+ Get language and class specific parameters for weight calculation.
178
+ Includes focal loss parameters and their adjustments per language/class.
179
+ """
180
+ # Default parameters
181
+ default_params = {
182
+ "max_weight": 15.0,
183
+ "min_weight": 0.5,
184
+ "boost_factor": 1.0,
185
+ "gamma": 2.0, # Default focal loss gamma
186
+ "alpha": 0.25 # Default focal loss alpha
187
+ }
188
+
189
+ # Updated language-specific adjustments based on analysis
190
+ lang_adjustments = {
191
+ "en": {
192
+ "toxic": {
193
+ "boost_factor": 1.67, # To achieve ~3.5x weight
194
+ "gamma": 2.5 # More focus on hard examples for main class
195
+ },
196
+ "threat": {
197
+ "max_weight": 15.0, # Absolute maximum cap
198
+ "gamma": 3.0, # Higher gamma for severe class
199
+ "alpha": 0.3 # Slightly higher alpha for better recall
200
+ },
201
+ "identity_hate": {
202
+ "max_weight": 5.0, # Reduced from 8.4
203
+ "gamma": 3.0, # Higher gamma for severe class
204
+ "alpha": 0.3 # Slightly higher alpha for better recall
205
+ },
206
+ "severe_toxic": {
207
+ "max_weight": 3.9, # Corrected weight
208
+ "gamma": 2.5 # Moderate gamma for balance
209
+ }
210
+ },
211
+ "tr": {
212
+ "threat": {
213
+ "max_weight": 12.8, # Aligned with cross-lingual ratio
214
+ "gamma": 2.8 # Slightly lower than EN for stability
215
+ },
216
+ "identity_hate": {
217
+ "max_weight": 6.2, # Adjusted for balance
218
+ "gamma": 2.8 # Slightly lower than EN for stability
219
+ }
220
+ },
221
+ "ru": {
222
+ "threat": {
223
+ "max_weight": 12.8, # Aligned with cross-lingual ratio
224
+ "gamma": 2.8 # Slightly lower than EN for stability
225
+ },
226
+ "identity_hate": {
227
+ "max_weight": 7.0, # Adjusted for balance
228
+ "gamma": 2.8 # Slightly lower than EN for stability
229
+ }
230
+ },
231
+ "fr": {
232
+ "toxic": {
233
+ "boost_factor": 1.2, # To achieve ~2.2x weight
234
+ "gamma": 2.2 # Lower gamma for better stability
235
+ }
236
+ }
237
+ }
238
+
239
+ # Get language-specific params and validate
240
+ lang_params = lang_adjustments.get(lang, {})
241
+ class_params = lang_params.get(toxicity_type, {})
242
+ merged_params = {**default_params, **class_params}
243
+
244
+ return validate_parameters(merged_params)
245
+
246
+ def check_cross_language_consistency(lang_weights: Dict) -> List[str]:
247
+ """
248
+ Check for consistency of weights across languages.
249
+ Returns a list of warnings for significant disparities.
250
+ """
251
+ warnings = []
252
+ baseline = lang_weights['en']
253
+
254
+ for lang in lang_weights:
255
+ if lang == 'en':
256
+ continue
257
+
258
+ for cls in ['threat', 'identity_hate']:
259
+ if cls in lang_weights[lang] and cls in baseline:
260
+ ratio = lang_weights[lang][cls]['1'] / baseline[cls]['1']
261
+ if ratio > 1.5 or ratio < 0.67:
262
+ warning = f"Large {cls} weight disparity: {lang} vs en ({ratio:.2f}x)"
263
+ warnings.append(warning)
264
+ logging.warning(warning)
265
+
266
+ return warnings
267
+
268
+ def validate_dataset_balance(df: pd.DataFrame) -> bool:
269
+ """
270
+ Validate dataset balance across languages.
271
+ Returns False if imbalance exceeds threshold.
272
+ """
273
+ sample_counts = df.groupby('lang').size()
274
+ cv = sample_counts.std() / sample_counts.mean()
275
+
276
+ if cv > 0.15: # 15% threshold for coefficient of variation
277
+ logging.error(f"Dataset language imbalance exceeds 15% (CV={cv:.2%})")
278
+ for lang, count in sample_counts.items():
279
+ logging.warning(f"{lang}: {count:,} samples ({count/len(df):.1%})")
280
+ return False
281
+ return True
282
+
283
+ def validate_weights(lang_weights: Dict) -> List[str]:
284
+ """
285
+ Ensure weights meet multilingual safety criteria.
286
+ Validates weight ratios and focal loss parameters across languages.
287
+
288
+ Args:
289
+ lang_weights: Dictionary of weights per language and class
290
+
291
+ Returns:
292
+ List of validation warnings
293
+
294
+ Raises:
295
+ ValueError: If weights violate safety constraints
296
+ """
297
+ warnings = []
298
+
299
+ for lang in lang_weights:
300
+ for cls in lang_weights[lang]:
301
+ w1 = lang_weights[lang][cls]['1']
302
+ w0 = lang_weights[lang][cls]['0']
303
+
304
+ # Check weight ratio sanity
305
+ ratio = w1 / w0
306
+ if ratio > 30:
307
+ raise ValueError(
308
+ f"Dangerous weight ratio {ratio:.1f}x for {lang} {cls}. "
309
+ f"Weight_1={w1:.3f}, Weight_0={w0:.3f}"
310
+ )
311
+ elif ratio > 20:
312
+ warnings.append(
313
+ f"High weight ratio {ratio:.1f}x for {lang} {cls}"
314
+ )
315
+
316
+ # Check focal parameter boundaries
317
+ metadata = lang_weights[lang][cls]['calculation_metadata']
318
+ gamma = metadata.get('gamma', 0.0)
319
+ alpha = metadata.get('alpha', 0.0)
320
+
321
+ if gamma > 5.0:
322
+ raise ValueError(
323
+ f"Unsafe gamma={gamma:.1f} for {lang} {cls}. "
324
+ f"Must be <= 5.0"
325
+ )
326
+ elif gamma > 4.0:
327
+ warnings.append(
328
+ f"High gamma={gamma:.1f} for {lang} {cls}"
329
+ )
330
+
331
+ if alpha > 0.9:
332
+ raise ValueError(
333
+ f"Unsafe alpha={alpha:.2f} for {lang} {cls}. "
334
+ f"Must be < 0.9"
335
+ )
336
+ elif alpha > 0.7:
337
+ warnings.append(
338
+ f"High alpha={alpha:.2f} for {lang} {cls}"
339
+ )
340
+
341
+ # Check for combined risk factors
342
+ if gamma > 3.0 and ratio > 15:
343
+ warnings.append(
344
+ f"Risky combination for {lang} {cls}: "
345
+ f"gamma={gamma:.1f}, ratio={ratio:.1f}x"
346
+ )
347
+
348
+ return warnings
349
+
350
+ def compute_language_weights(df: pd.DataFrame) -> Dict:
351
+ """
352
+ Compute weights with inter-language normalization to ensure consistent
353
+ weighting across languages while preserving relative class relationships.
354
+ """
355
+ # Validate dataset balance first
356
+ if not validate_dataset_balance(df):
357
+ logging.warning("Proceeding with imbalanced dataset - weights may need manual adjustment")
358
+
359
+ lang_weights = {}
360
+ toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
361
+
362
+ # First pass: calculate raw weights for each language and class
363
+ logging.info("\nFirst pass: Calculating raw weights")
364
+ for lang in df['lang'].unique():
365
+ logging.info(f"\nProcessing language: {lang}")
366
+ lang_df = df[df['lang'] == lang]
367
+ lang_weights[lang] = {}
368
+
369
+ for col in toxicity_columns:
370
+ y = lang_df[col].values.astype(np.int32)
371
+ support_0 = int((y == 0).sum())
372
+ support_1 = int((y == 1).sum())
373
+
374
+ params = get_language_specific_params(lang, col)
375
+ weights = calculate_safe_weights(
376
+ support_0=support_0,
377
+ support_1=support_1,
378
+ max_weight=params['max_weight'],
379
+ min_weight=params['min_weight'],
380
+ gamma=params['gamma'],
381
+ alpha=params['alpha'],
382
+ boost_factor=params['boost_factor'],
383
+ lang=lang,
384
+ toxicity_type=col
385
+ )
386
+ lang_weights[lang][col] = weights
387
+
388
+ # Log initial weights
389
+ logging.info(f" {col} - Initial weights:")
390
+ logging.info(f" Class 0: {weights['0']:.3f}, samples: {support_0:,}")
391
+ logging.info(f" Class 1: {weights['1']:.3f}, samples: {support_1:,}")
392
+
393
+ # Second pass: normalize weights across languages
394
+ logging.info("\nSecond pass: Normalizing weights across languages")
395
+ for col in toxicity_columns:
396
+ # Find maximum weight for this toxicity type across all languages
397
+ max_weight = max(
398
+ lang_weights[lang][col]['1']
399
+ for lang in lang_weights
400
+ )
401
+
402
+ if max_weight > 0: # Prevent division by zero
403
+ logging.info(f"\nNormalizing {col}:")
404
+ logging.info(f" Maximum weight across languages: {max_weight:.3f}")
405
+
406
+ # Normalize weights for each language
407
+ for lang in lang_weights:
408
+ original_weight = lang_weights[lang][col]['1']
409
+
410
+ # Normalize and rescale
411
+ normalized_weight = (original_weight / max_weight) * 15.0
412
+
413
+ # Update weight while preserving metadata
414
+ lang_weights[lang][col]['raw_weight_1'] = original_weight
415
+ lang_weights[lang][col]['1'] = round(normalized_weight, 3)
416
+
417
+ # Add normalization info to metadata
418
+ lang_weights[lang][col]['calculation_metadata'].update({
419
+ 'normalization': {
420
+ 'original_weight': round(float(original_weight), 3),
421
+ 'max_weight_across_langs': round(float(max_weight), 3),
422
+ 'normalization_factor': round(float(15.0 / max_weight), 3)
423
+ }
424
+ })
425
+
426
+ # Log normalization results
427
+ logging.info(f" {lang}: {original_weight:.3f} → {normalized_weight:.3f}")
428
+
429
+ # Validate final weights
430
+ logging.info("\nValidating final weights:")
431
+ for col in toxicity_columns:
432
+ weights_range = [
433
+ lang_weights[lang][col]['1']
434
+ for lang in lang_weights
435
+ ]
436
+ logging.info(f" {col}: range [{min(weights_range):.3f}, {max(weights_range):.3f}]")
437
+
438
+ # Validate weights meet safety criteria
439
+ validation_warnings = validate_weights(lang_weights)
440
+ if validation_warnings:
441
+ logging.warning("\nWeight validation warnings:")
442
+ for warning in validation_warnings:
443
+ logging.warning(f" {warning}")
444
+
445
+ # Check cross-language consistency
446
+ consistency_warnings = check_cross_language_consistency(lang_weights)
447
+ if consistency_warnings:
448
+ logging.warning("\nCross-language consistency warnings:")
449
+ for warning in consistency_warnings:
450
+ logging.warning(f" {warning}")
451
+
452
+ return lang_weights
453
+
454
+ def main():
455
+ # Load dataset
456
+ input_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv'
457
+ logging.info(f"Loading dataset from {input_file}")
458
+ df = pd.read_csv(input_file)
459
+
460
+ # Compute weights
461
+ lang_weights = compute_language_weights(df)
462
+
463
+ # Add metadata
464
+ weights_data = {
465
+ "metadata": {
466
+ "total_samples": len(df),
467
+ "language_distribution": df['lang'].value_counts().to_dict(),
468
+ "weight_calculation": {
469
+ "method": "focal_loss_with_adaptive_scaling",
470
+ "parameters": {
471
+ "default_max_weight": 15.0,
472
+ "default_min_weight": 0.5,
473
+ "language_specific_adjustments": True
474
+ }
475
+ }
476
+ },
477
+ "weights": lang_weights
478
+ }
479
+
480
+ # Save weights
481
+ output_file = 'weights/language_class_weights.json'
482
+ logging.info(f"\nSaving weights to {output_file}")
483
+ with open(output_file, 'w', encoding='utf-8') as f:
484
+ json.dump(weights_data, f, indent=2, ensure_ascii=False)
485
+
486
+ logging.info("\nWeight calculation complete!")
487
+
488
+ # Print summary statistics
489
+ logging.info("\nSummary of adjustments made:")
490
+ for lang in lang_weights:
491
+ for col in ['threat', 'identity_hate']:
492
+ if col in lang_weights[lang]:
493
+ weight = lang_weights[lang][col]['1']
494
+ raw = lang_weights[lang][col]['raw_weight_1']
495
+ if raw != weight:
496
+ logging.info(f"{lang} {col}: Adjusted from {raw:.2f}× to {weight:.2f}×")
497
+
498
+ if __name__ == "__main__":
499
+ main()
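Stripped of the guards and logging, the positive-class weight produced by calculate_safe_weights above reduces to the following, with n_1 positive and n_0 negative samples and N = n_0 + n_1:

p_t = n_1 / N, \qquad \alpha_{\text{bal}} = \frac{\alpha}{\alpha + (1-\alpha)(1-p_t)}

w_1 = \operatorname{clip}\!\Big(\text{boost} \cdot s \cdot \alpha_{\text{bal}} \, \frac{(1-p_t)^{\gamma}}{p_t},\; w_{\min},\; w_{\max}\Big), \qquad w_0 = 1

where s = \tfrac{1}{2}\big(1 + \ln(1+N)/\ln(1+n_1)\big) for the threat and identity_hate labels and s = 1 otherwise. The second pass in compute_language_weights then rescales each label's weights so that the largest value across languages maps to the 15.0 ceiling.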
analysis/plot_loss_curves.py ADDED
@@ -0,0 +1,374 @@
1
+ import pandas as pd
2
+ import torch
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ from datetime import datetime
6
+ import logging
7
+ from pathlib import Path
8
+ from torch.utils.data import DataLoader
9
+ import sys
10
+ import os
11
+ import wandb
12
+ from transformers import get_linear_schedule_with_warmup
13
+
14
+ # Add project root to path
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from model.training_config import TrainingConfig
18
+ from model.language_aware_transformer import LanguageAwareTransformer
19
+ from model.train import ToxicDataset
20
+ from transformers import XLMRobertaTokenizer
21
+
22
+ # Set up logging
23
+ logging.basicConfig(
24
+ level=logging.INFO,
25
+ format='%(asctime)s - %(levelname)s - %(message)s'
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+ def setup_plot_style():
30
+ """Configure plot styling"""
31
+ plt.style.use('seaborn-darkgrid')
32
+ plt.rcParams['figure.figsize'] = (12, 12)
33
+ plt.rcParams['font.size'] = 12
34
+
35
+ def setup_wandb():
36
+ """Initialize wandb for validation tracking"""
37
+ try:
38
+ wandb.init(
39
+ project="toxic-comment-classification",
40
+ name=f"validation-analysis-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
41
+ config={
42
+ "analysis_type": "validation_loss",
43
+ "timestamp": datetime.now().strftime('%Y%m%d-%H%M%S')
44
+ }
45
+ )
46
+ logger.info("Initialized wandb logging")
47
+ except Exception as e:
48
+ logger.error(f"Error initializing wandb: {str(e)}")
49
+ raise
50
+
51
+ def load_model_and_data():
52
+ """Load the model and validation data"""
53
+ try:
54
+ # Initialize config with training settings
55
+ config = TrainingConfig(
56
+ batch_size=16,
57
+ num_workers=16,
58
+ lr=2e-5,
59
+ weight_decay=0.01,
60
+ max_grad_norm=1.0,
61
+ warmup_ratio=0.1,
62
+ label_smoothing=0.01,
63
+
64
+ mixed_precision="fp16",
65
+ activation_checkpointing=True,
66
+ epochs=1 # Number of validation epochs
67
+
68
+ )
69
+
70
+ # Load validation data
71
+ logger.info("Loading validation and test data...")
72
+ val_df = pd.read_csv("dataset/split/val.csv")
73
+ test_df = pd.read_csv("dataset/split/test.csv")
74
+ combined_df = pd.concat([val_df, test_df])
75
+ tokenizer = XLMRobertaTokenizer.from_pretrained(config.model_name)
76
+ combined_dataset = ToxicDataset(combined_df, tokenizer, config, mode='combined')
77
+
78
+
79
+ # Create combined dataloader
80
+ combined_loader = DataLoader(
81
+ combined_dataset,
82
+ batch_size=config.batch_size,
83
+ shuffle=True, # Enable shuffling
84
+ num_workers=config.num_workers,
85
+ pin_memory=True,
86
+ drop_last=False # Keep all samples
87
+ )
88
+
89
+ # Log dataloader config to wandb
90
+ if wandb.run is not None:
91
+ wandb.config.update({
92
+ 'shuffle': True,
93
+ 'drop_last': False,
94
+ 'total_validation_steps': len(combined_loader),
95
+ 'total_validation_samples': len(combined_dataset)
96
+ })
97
+
98
+
99
+ # Load model
100
+ logger.info("Loading model...")
101
+ model = LanguageAwareTransformer(
102
+ num_labels=len(config.toxicity_labels),
103
+ model_name=config.model_name
104
+ )
105
+
106
+ # Load latest checkpoint
107
+ checkpoint_path = Path('weights/toxic_classifier_xlm-roberta-large/pytorch_model.bin')
108
+ if checkpoint_path.exists():
109
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
110
+ model.load_state_dict(checkpoint)
111
+ logger.info("Loaded model checkpoint")
112
+ else:
113
+ raise FileNotFoundError("No checkpoint found")
114
+
115
+ # Move model to GPU if available
116
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
117
+ model = model.to(device)
118
+
119
+ # Setup optimizer
120
+ param_groups = config.get_param_groups(model)
121
+ optimizer = torch.optim.AdamW(param_groups)
122
+
123
+ # Setup scheduler
124
+ total_steps = len(combined_loader) * config.epochs
125
+ warmup_steps = int(total_steps * config.warmup_ratio)
126
+
127
+ scheduler = get_linear_schedule_with_warmup(
128
+ optimizer,
129
+ num_warmup_steps=warmup_steps,
130
+ num_training_steps=total_steps
131
+ )
132
+
133
+ # Initialize gradient scaler for mixed precision
134
+ scaler = torch.cuda.amp.GradScaler(enabled=config.mixed_precision == "fp16")
135
+
136
+ # Log model configuration to wandb
137
+ if wandb.run is not None:
138
+ wandb.config.update({
139
+ 'model_name': config.model_name,
140
+ 'batch_size': config.batch_size,
141
+ 'learning_rate': config.lr,
142
+ 'weight_decay': config.weight_decay,
143
+ 'max_grad_norm': config.max_grad_norm,
144
+ 'warmup_ratio': config.warmup_ratio,
145
+ 'label_smoothing': config.label_smoothing,
146
+ 'mixed_precision': config.mixed_precision,
147
+ 'num_workers': config.num_workers,
148
+ 'activation_checkpointing': config.activation_checkpointing,
149
+ 'validation_epochs': config.epochs
150
+ })
151
+
152
+ return model, combined_loader, device, optimizer, scheduler, scaler, config
153
+
154
+
155
+ except Exception as e:
156
+ logger.error(f"Error loading model and data: {str(e)}")
157
+ raise
158
+
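For reference, the warmup/decay configured above (warmup_ratio = 0.1 of the total steps) corresponds to the following multiplier on the base learning rate. This is a minimal sketch of the behaviour of get_linear_schedule_with_warmup, not the transformers implementation itself; the step counts are invented.

def linear_warmup_decay(step: int, warmup_steps: int, total_steps: int) -> float:
    """Multiplier applied to the base LR at a given optimizer step."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)  # linear ramp 0 -> 1
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))  # linear decay 1 -> 0

# e.g. 1000 total steps with warmup_ratio = 0.1 -> 100 warmup steps
print([linear_warmup_decay(s, 100, 1000) for s in (0, 50, 100, 550, 1000)])
# [0.0, 0.5, 1.0, 0.5, 0.0]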
159
+ def collect_validation_losses(model, combined_loader, device, optimizer, scheduler, scaler, config):
160
+ """Run validation and collect step losses across multiple epochs"""
161
+ try:
162
+ logger.warning("This is an analysis run on combined val+test data - model will not be saved or updated")
163
+ # Ensure we're in eval mode and no gradients are computed
164
+ model.eval()
165
+ for param in model.parameters():
166
+ param.requires_grad = False
167
+
168
+ all_losses = []
169
+ epoch_losses = []
170
+
171
+ for epoch in range(config.epochs):
172
+ logger.info(f"\nStarting validation epoch {epoch+1}/{config.epochs}")
173
+ total_loss = 0
174
+ num_batches = len(combined_loader)
175
+ epoch_start_time = datetime.now()
176
+
177
+ with torch.no_grad(): # Extra safety to ensure no gradients
178
+ for step, batch in enumerate(combined_loader):
179
+ # Move batch to device
180
+ batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v
181
+ for k, v in batch.items()}
182
+
183
+ # Forward pass with mixed precision
184
+ with torch.cuda.amp.autocast(enabled=config.mixed_precision != "no"):
185
+ outputs = model(**batch)
186
+ loss = outputs['loss'].item()
187
+
188
+ total_loss += loss
189
+
190
+ # Calculate running averages
191
+ avg_loss = total_loss / (step + 1)
192
+
193
+ # Get learning rates
194
+ lrs = [group['lr'] for group in optimizer.param_groups]
195
+
196
+ # Log to wandb
197
+ wandb.log({
198
+ 'val/step_loss': loss,
199
+ 'val/running_avg_loss': avg_loss,
200
+ 'val/progress': (step + 1) / num_batches * 100,
201
+ 'val/learning_rate': lrs[0], # Base learning rate
202
+ 'val/batch_size': config.batch_size,
203
+ 'val/epoch': epoch + 1,
204
+ 'val/global_step': epoch * num_batches + step
205
+ })
206
+
207
+ # Log progress
208
+ if step % 10 == 0:
209
+ elapsed_time = datetime.now() - epoch_start_time
210
+ steps_per_sec = (step + 1) / elapsed_time.total_seconds()
211
+ remaining_steps = num_batches - (step + 1)
212
+ eta_seconds = remaining_steps / steps_per_sec if steps_per_sec > 0 else 0
213
+
214
+ logger.info(
215
+ f"Epoch [{epoch+1}/{config.epochs}] "
216
+ f"Step [{step+1}/{num_batches}] "
217
+ f"Loss: {loss:.4f} "
218
+ f"Avg Loss: {avg_loss:.4f} "
219
+ f"LR: {lrs[0]:.2e} "
220
+ f"ETA: {int(eta_seconds)}s"
221
+ )
222
+
223
+ # Calculate epoch statistics
224
+ epoch_avg_loss = total_loss / num_batches
225
+ epoch_losses.append({
226
+ 'epoch': epoch + 1,
227
+ 'avg_loss': epoch_avg_loss,
228
+ 'elapsed_time': (datetime.now() - epoch_start_time).total_seconds()
229
+ })
230
+
231
+ # Log epoch metrics to wandb
232
+ wandb.log({
233
+ 'val/epoch_avg_loss': epoch_avg_loss,
234
+ 'val/epoch_number': epoch + 1,
235
+ 'val/epoch_time': epoch_losses[-1]['elapsed_time']
236
+ })
237
+
238
+ # Clear GPU memory after each epoch
239
+ torch.cuda.empty_cache()
240
+
241
+ return epoch_losses
242
+
243
+ except Exception as e:
244
+ logger.error(f"Error collecting validation losses: {str(e)}")
245
+ raise
246
+
247
+ def plot_validation_losses(epoch_losses):
248
+ """Plot validation epoch losses"""
249
+ try:
250
+ setup_plot_style()
251
+
252
+ # Create figure
253
+ fig, ax = plt.subplots()
254
+
255
+ # Extract data
256
+ epochs = [d['epoch'] for d in epoch_losses]
257
+ losses = [d['avg_loss'] for d in epoch_losses]
258
+
259
+ # Plot epoch losses
260
+ ax.plot(epochs, losses, 'go-', label='Epoch Average Loss', linewidth=2, markersize=8)
261
+
262
+ # Add trend line
263
+ z = np.polyfit(epochs, losses, 1)
264
+ p = np.poly1d(z)
265
+ ax.plot(epochs, p(epochs), "r--", alpha=0.8, label='Loss Trend')
266
+
267
+ # Customize plot
268
+ ax.set_title('Validation Epoch Losses')
269
+ ax.set_xlabel('Epoch')
270
+ ax.set_ylabel('Average Loss')
271
+ ax.legend()
272
+ ax.grid(True, linestyle='--', alpha=0.7)
273
+
274
+ # Adjust layout
275
+ plt.tight_layout()
276
+
277
+ # Create output directory if it doesn't exist
278
+ output_dir = Path('analysis/plots')
279
+ output_dir.mkdir(parents=True, exist_ok=True)
280
+
281
+ # Save plot
282
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
283
+ output_path = output_dir / f'validation_losses_{timestamp}.png'
284
+ plt.savefig(output_path, dpi=300, bbox_inches='tight')
285
+ logger.info(f"Plot saved to {output_path}")
286
+
287
+ # Log plot to wandb
288
+ wandb.log({
289
+ "val/loss_plot": wandb.Image(str(output_path))
290
+ })
291
+
292
+ # Show plot
293
+ plt.show()
294
+
295
+ except Exception as e:
296
+ logger.error(f"Error plotting validation losses: {str(e)}")
297
+ raise
298
+
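The dashed trend line above is a plain degree-1 least-squares fit. A standalone sketch of the np.polyfit/np.poly1d step, with made-up epoch losses:

import numpy as np

epochs = np.array([1, 2, 3, 4])                    # illustrative values only
losses = np.array([0.420, 0.395, 0.388, 0.381])

slope, intercept = np.polyfit(epochs, losses, 1)   # least-squares line
trend = np.poly1d((slope, intercept))              # callable polynomial
fitted = trend(epochs)                             # points drawn as the dashed line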
299
+ def calculate_loss_statistics(epoch_losses):
300
+ """Calculate and print loss statistics"""
301
+ try:
302
+ losses = [d['avg_loss'] for d in epoch_losses]
303
+
304
+ stats = {
305
+ 'Mean Loss': np.mean(losses),
306
+ 'Std Loss': np.std(losses),
307
+ 'Min Loss': np.min(losses),
308
+ 'Max Loss': np.max(losses),
309
+ 'Best Epoch': epoch_losses[np.argmin(losses)]['epoch']
310
+ }
311
+
312
+ # Log statistics to wandb
313
+ wandb.log({
314
+ 'val/mean_loss': stats['Mean Loss'],
315
+ 'val/std_loss': stats['Std Loss'],
316
+ 'val/min_loss': stats['Min Loss'],
317
+ 'val/max_loss': stats['Max Loss'],
318
+ 'val/best_epoch': stats['Best Epoch']
319
+ })
320
+
321
+ # Print statistics
322
+ print("\nValidation Loss Statistics:")
323
+ for metric_name, value in stats.items():
324
+ if metric_name == 'Best Epoch':
325
+ print(f"{metric_name}: {int(value)}")
326
+ else:
327
+ print(f"{metric_name}: {value:.4f}")
328
+
329
+ return stats
330
+
331
+ except Exception as e:
332
+ logger.error(f"Error calculating statistics: {str(e)}")
333
+ raise
334
+
335
+ def main():
336
+ try:
337
+ # Initialize wandb
338
+ setup_wandb()
339
+
340
+ # Load model and data
341
+ logger.info("Loading model and data...")
342
+ model, combined_loader, device, optimizer, scheduler, scaler, config = load_model_and_data()
343
+
344
+
345
+ # Collect validation losses
346
+ logger.info("Collecting validation losses...")
347
+ epoch_losses = collect_validation_losses(
348
+ model, combined_loader, device, optimizer, scheduler, scaler, config
349
+ )
350
+
351
+
352
+ # Plot losses
353
+ logger.info("Plotting validation losses...")
354
+ plot_validation_losses(epoch_losses)
355
+
356
+ # Calculate and print statistics
357
+ logger.info("Calculating statistics...")
358
+ calculate_loss_statistics(epoch_losses)
359
+
360
+ except Exception as e:
361
+ logger.error(f"Error in main: {str(e)}")
362
+ raise
363
+ finally:
364
+ # Clean up
365
+ torch.cuda.empty_cache()
366
+ # Finish wandb run
367
+ wandb.finish()
368
+
369
+ if __name__ == "__main__":
370
+ try:
371
+ main()
372
+ except Exception as e:
373
+ logger.error(f"Script failed: {str(e)}")
374
+ raise
analysis/plot_roc_curves.py ADDED
@@ -0,0 +1,163 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from sklearn.metrics import roc_curve, auc
4
+ import os
5
+ import json
6
+ from pathlib import Path
7
+
8
+ def plot_roc_curves(predictions_path, output_dir=None):
9
+ """
10
+ Plot ROC curves from model predictions
11
+
12
+ Args:
13
+ predictions_path (str): Path to the .npz file containing predictions
14
+ output_dir (str, optional): Directory to save plots. If None, will use same directory as predictions
15
+ """
16
+ # Load predictions
17
+ data = np.load(predictions_path)
18
+ predictions = data['predictions']
19
+ labels = data['labels']
20
+ langs = data['langs']
21
+
22
+ # Create output directory
23
+ if output_dir is None:
24
+ output_dir = os.path.dirname(predictions_path)
25
+ plots_dir = os.path.join(output_dir, 'plots')
26
+ os.makedirs(plots_dir, exist_ok=True)
27
+
28
+ # Define toxicity types
29
+ toxicity_types = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
30
+
31
+ # Define language mapping
32
+ id_to_lang = {
33
+ 0: 'English (en)',
34
+ 1: 'Russian (ru)',
35
+ 2: 'Turkish (tr)',
36
+ 3: 'Spanish (es)',
37
+ 4: 'French (fr)',
38
+ 5: 'Italian (it)',
39
+ 6: 'Portuguese (pt)'
40
+ }
41
+
42
+ # Plot overall ROC curves (one per class)
43
+ plt.figure(figsize=(10, 8))
44
+ for i, class_name in enumerate(toxicity_types):
45
+ fpr, tpr, _ = roc_curve(labels[:, i], predictions[:, i])
46
+ roc_auc = auc(fpr, tpr)
47
+
48
+ plt.plot(fpr, tpr, label=f'{class_name} (AUC = {roc_auc:.3f})')
49
+
50
+ plt.plot([0, 1], [0, 1], 'k--', label='Random')
51
+ plt.xlabel('False Positive Rate')
52
+ plt.ylabel('True Positive Rate')
53
+ plt.title('ROC Curves - All Classes')
54
+ plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
55
+ plt.grid(True)
56
+ plt.tight_layout()
57
+ plt.savefig(os.path.join(plots_dir, 'roc_all_classes.png'), dpi=300, bbox_inches='tight')
58
+ plt.close()
59
+
60
+ # Plot per-class ROC curves with confidence intervals
61
+ n_bootstrap = 1000
62
+ n_classes = len(toxicity_types)
63
+
64
+ for i, class_name in enumerate(toxicity_types):
65
+ plt.figure(figsize=(8, 6))
66
+
67
+ # Calculate main ROC curve
68
+ fpr, tpr, _ = roc_curve(labels[:, i], predictions[:, i])
69
+ roc_auc = auc(fpr, tpr)
70
+
71
+ # Plot main curve
72
+ plt.plot(fpr, tpr, 'b-', label=f'ROC (AUC = {roc_auc:.3f})')
73
+
74
+ # Bootstrap for confidence intervals
75
+ tprs = []
76
+ aucs = []
77
+ mean_fpr = np.linspace(0, 1, 100)
78
+
79
+ for _ in range(n_bootstrap):
80
+ # Bootstrap sample indices
81
+ indices = np.random.randint(0, len(labels), len(labels))
82
+ if len(np.unique(labels[indices, i])) < 2:
83
+ continue
84
+
85
+ # Calculate ROC curve
86
+ fpr, tpr, _ = roc_curve(labels[indices, i], predictions[indices, i])
87
+
88
+ # Interpolate TPR at mean FPR points
89
+ interp_tpr = np.interp(mean_fpr, fpr, tpr)
90
+ interp_tpr[0] = 0.0
91
+ tprs.append(interp_tpr)
92
+ aucs.append(auc(fpr, tpr))
93
+
94
+ # Calculate confidence intervals
95
+ tprs = np.array(tprs)
96
+ mean_tpr = np.mean(tprs, axis=0)
97
+ std_tpr = np.std(tprs, axis=0)
98
+
99
+ tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
100
+ tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
101
+
102
+ # Plot confidence interval
103
+ plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
104
+ label='±1 std. dev.')
105
+
106
+ # Calculate AUC confidence interval
107
+ auc_mean = np.mean(aucs)
108
+ auc_std = np.std(aucs)
109
+ plt.plot([], [], ' ', label=f'AUC = {auc_mean:.3f} ± {auc_std:.3f}')
110
+
111
+ plt.plot([0, 1], [0, 1], 'k--', label='Random')
112
+ plt.xlabel('False Positive Rate')
113
+ plt.ylabel('True Positive Rate')
114
+ plt.title(f'ROC Curve - {class_name}')
115
+ plt.legend(loc='lower right')
116
+ plt.grid(True)
117
+ plt.tight_layout()
118
+ plt.savefig(os.path.join(plots_dir, f'roc_{class_name}.png'), dpi=300)
119
+ plt.close()
120
+
121
+ # Plot per-language ROC curves (for toxic class)
122
+ plt.figure(figsize=(10, 8))
123
+ for lang_id, lang_name in id_to_lang.items():
124
+ # Get samples for this language
125
+ lang_mask = langs == lang_id
126
+ if lang_mask.sum() > 0 and len(np.unique(labels[lang_mask, 0])) > 1:
127
+ fpr, tpr, _ = roc_curve(labels[lang_mask, 0], predictions[lang_mask, 0])
128
+ roc_auc = auc(fpr, tpr)
129
+ plt.plot(fpr, tpr, label=f'{lang_name} (AUC = {roc_auc:.3f})')
130
+
131
+ plt.plot([0, 1], [0, 1], 'k--', label='Random')
132
+ plt.xlabel('False Positive Rate')
133
+ plt.ylabel('True Positive Rate')
134
+ plt.title('ROC Curves by Language - Toxic Class')
135
+ plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
136
+ plt.grid(True)
137
+ plt.tight_layout()
138
+ plt.savefig(os.path.join(plots_dir, 'roc_by_language.png'), dpi=300, bbox_inches='tight')
139
+ plt.close()
140
+
141
+ print(f"\nROC curves have been saved to {plots_dir}")
142
+ print("\nGenerated plots:")
143
+ print("1. roc_all_classes.png - ROC curves for all toxicity classes")
144
+ print("2. roc_[class_name].png - Individual ROC curves with confidence intervals for each class")
145
+ print("3. roc_by_language.png - ROC curves for each language (toxic class)")
146
+
147
+ if __name__ == '__main__':
148
+ # Use the latest evaluation results
149
+ eval_dir = 'evaluation_results'
150
+ if os.path.exists(eval_dir):
151
+ # Find most recent evaluation directory
152
+ eval_dirs = sorted([d for d in os.listdir(eval_dir) if d.startswith('eval_')], reverse=True)
153
+ if eval_dirs:
154
+ latest_eval = os.path.join(eval_dir, eval_dirs[0])
155
+ predictions_path = os.path.join(latest_eval, 'predictions.npz')
156
+ if os.path.exists(predictions_path):
157
+ plot_roc_curves(predictions_path)
158
+ else:
159
+ print(f"No predictions file found in {latest_eval}")
160
+ else:
161
+ print(f"No evaluation directories found in {eval_dir}")
162
+ else:
163
+ print(f"Evaluation directory {eval_dir} not found")
app.py ADDED
@@ -0,0 +1,262 @@
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import os
5
+ import json
6
+ from model.inference_optimized import OptimizedToxicityClassifier
7
+ import matplotlib.pyplot as plt
8
+ from typing import List, Dict
9
+ import langid
10
+ import pandas as pd
11
+
12
+ # Configure paths
13
+ ONNX_MODEL_PATH = os.environ.get("ONNX_MODEL_PATH", "weights/toxic_classifier.onnx")
14
+ PYTORCH_MODEL_PATH = os.environ.get("PYTORCH_MODEL_PATH", "weights/toxic_classifier_xlm-roberta-large/pytorch_model.bin")
15
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
+
17
+ # Supported languages
18
+ SUPPORTED_LANGUAGES = {
19
+ 'en': 'English',
20
+ 'ru': 'Russian',
21
+ 'tr': 'Turkish',
22
+ 'es': 'Spanish',
23
+ 'fr': 'French',
24
+ 'it': 'Italian',
25
+ 'pt': 'Portuguese'
26
+ }
27
+
28
+ # Initialize classifier
29
+ try:
30
+ if os.path.exists(ONNX_MODEL_PATH):
31
+ classifier = OptimizedToxicityClassifier(onnx_path=ONNX_MODEL_PATH, device=DEVICE)
32
+ print(f"Loaded ONNX model from {ONNX_MODEL_PATH}")
33
+ else:
34
+ classifier = OptimizedToxicityClassifier(pytorch_path=PYTORCH_MODEL_PATH, device=DEVICE)
35
+ print(f"Loaded PyTorch model from {PYTORCH_MODEL_PATH}")
36
+ except Exception as e:
37
+ print(f"Error loading model: {str(e)}")
38
+ classifier = None
39
+
40
+ def detect_language(text: str) -> str:
41
+ """Detect language of input text"""
42
+ try:
43
+ lang, _ = langid.classify(text)
44
+ return lang if lang in SUPPORTED_LANGUAGES else 'en'
45
+ except Exception:
46
+ return 'en'
47
+
48
+ def predict_toxicity(text: str, selected_language: str = None):
+ """Predict toxicity of the input text; returns (html_result, figure) for the Gradio outputs"""
50
+ if not text or not text.strip():
51
+ # Return (html, figure) so the result matches outputs=[result_html, plot_output]
+ return "<div class='error'>Please enter some text to analyze.</div>", None
55
+
56
+ if classifier is None:
57
+ return "<div class='error'>Model not loaded. Please check logs.</div>", None
61
+
62
+ # Detect language if not specified
63
+ if not selected_language or selected_language == "Auto-detect":
64
+ lang_code = detect_language(text)
65
+ detected = True
66
+ else:
67
+ # Convert from display name to code
68
+ lang_code = next((code for code, name in SUPPORTED_LANGUAGES.items()
69
+ if name == selected_language), 'en')
70
+ detected = False
71
+
72
+ # Run prediction
73
+ try:
74
+ results = classifier.predict([text], langs=[lang_code])[0]
75
+
76
+ # Format probabilities for display
77
+ probs = results["probabilities"]
78
+ sorted_categories = sorted(
79
+ [(label, probs[label]) for label in probs],
80
+ key=lambda x: x[1],
81
+ reverse=True
82
+ )
83
+
84
+ # Create bar chart
85
+ fig, ax = plt.subplots(figsize=(10, 6))
86
+ labels = [label.replace('_', ' ').title() for label, _ in sorted_categories]
87
+ values = [prob * 100 for _, prob in sorted_categories]
88
+ colors = ['#ff6b6b' if val >= 50 else '#74c0fc' for val in values]
89
+
90
+ ax.barh(labels, values, color=colors)
91
+ ax.set_xlim(0, 100)
92
+ ax.set_xlabel('Probability (%)')
93
+ ax.set_title('Toxicity Analysis')
94
+ ax.grid(axis='x', linestyle='--', alpha=0.7)
95
+
96
+ # Annotate values
97
+ for i, v in enumerate(values):
98
+ ax.text(v + 1, i, f'{v:.1f}%', va='center')
99
+
100
+ # Create HTML result
101
+ lang_display = SUPPORTED_LANGUAGES.get(lang_code, lang_code)
102
+ overall_result = "TOXIC" if results["is_toxic"] else "NON-TOXIC"
103
+ result_color = "#ff6b6b" if results["is_toxic"] else "#66d9e8"
104
+
105
+ html_result = f"""
106
+ <div style='margin-bottom: 20px;'>
107
+ <h2>Analysis Result: <span style='color: {result_color};'>{overall_result}</span></h2>
108
+ <h3>Language: {lang_display} {'(detected)' if detected else ''}</h3>
109
+ </div>
110
+ <div style='margin-bottom: 10px;'>
111
+ <table width='100%' style='border-collapse: collapse;'>
112
+ <tr style='background-color: #e9ecef; font-weight: bold;'>
113
+ <th style='padding: 8px; text-align: left; border: 1px solid #dee2e6;'>Category</th>
114
+ <th style='padding: 8px; text-align: right; border: 1px solid #dee2e6;'>Probability</th>
115
+ <th style='padding: 8px; text-align: center; border: 1px solid #dee2e6;'>Status</th>
116
+ </tr>
117
+ """
118
+
119
+ # Add rows for each toxicity category
120
+ for label, prob in sorted_categories:
121
+ formatted_label = label.replace('_', ' ').title()
122
+ status = "DETECTED" if prob >= 0.5 else "Not Detected"
123
+ status_color = "#ff6b6b" if prob >= 0.5 else "#66d9e8"
124
+ prob_percent = f"{prob * 100:.1f}%"
125
+
126
+ html_result += f"""
127
+ <tr>
128
+ <td style='padding: 8px; border: 1px solid #dee2e6;'>{formatted_label}</td>
129
+ <td style='padding: 8px; text-align: right; border: 1px solid #dee2e6;'>{prob_percent}</td>
130
+ <td style='padding: 8px; text-align: center; border: 1px solid #dee2e6; color: {status_color}; font-weight: bold;'>{status}</td>
131
+ </tr>
132
+ """
133
+
134
+ html_result += "</table></div>"
135
+
136
+ # Add detected categories if toxic
137
+ if results["is_toxic"]:
138
+ toxic_categories = [cat.replace('_', ' ').title() for cat in results["toxic_categories"]]
139
+ categories_list = ", ".join(toxic_categories)
140
+ html_result += f"""
141
+ <div style='margin-top: 10px;'>
142
+ <p><strong>Detected toxic categories:</strong> {categories_list}</p>
143
+ </div>
144
+ """
145
+
146
+ # Return the rendered HTML and the matplotlib figure for the two Gradio outputs
+ return html_result, fig
151
+
152
+ except Exception as e:
153
+ import traceback
154
+ traceback.print_exc()
155
+ return f"<div class='error'>Error processing text: {str(e)}</div>", None
159
+
160
+ def create_app():
161
+ """Create and configure the Gradio interface"""
162
+ # Create language dropdown options
163
+ language_options = ["Auto-detect"] + list(SUPPORTED_LANGUAGES.values())
164
+
165
+ # Define the interface
166
+ with gr.Blocks(css="""
167
+ .error { color: #ff6b6b; font-weight: bold; padding: 10px; border: 1px solid #ff6b6b; }
168
+ .container { margin: 0 auto; max-width: 900px; }
169
+ .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
170
+ .example-text { font-style: italic; color: #666; }
171
+ """) as app:
172
+ gr.Markdown("""
173
+ # Multilingual Toxic Comment Classifier
174
+ This app analyzes text for different types of toxicity across multiple languages.
175
+ Enter your text, select a language (or let it auto-detect), and click 'Analyze'.
176
+
177
+ Supported languages: English, Russian, Turkish, Spanish, French, Italian, Portuguese
178
+ """)
179
+
180
+ with gr.Row():
181
+ with gr.Column(scale=3):
182
+ text_input = gr.Textbox(
183
+ label="Enter text to analyze",
184
+ placeholder="Type or paste text here...",
185
+ lines=5
186
+ )
187
+ lang_dropdown = gr.Dropdown(
188
+ choices=language_options,
189
+ value="Auto-detect",
190
+ label="Language"
191
+ )
192
+ analyze_btn = gr.Button("Analyze", variant="primary")
193
+
194
+ with gr.Column(scale=2):
195
+ gr.Markdown("### Example texts:")
196
+ with gr.Accordion("English example"):
197
+ en_example_btn = gr.Button("Use English example")
198
+ with gr.Accordion("Spanish example"):
199
+ es_example_btn = gr.Button("Use Spanish example")
200
+ with gr.Accordion("French example"):
201
+ fr_example_btn = gr.Button("Use French example")
202
+
203
+ # Examples
204
+ en_example_text = "You are such an idiot, nobody likes your stupid content."
205
+ es_example_text = "Eres un completo idiota y nadie te quiere."
206
+ fr_example_text = "Tu es tellement stupide, personne n'aime ton contenu minable."
207
+
208
+ en_example_btn.click(
209
+ lambda: en_example_text,
210
+ outputs=text_input
211
+ )
212
+ es_example_btn.click(
213
+ lambda: es_example_text,
214
+ outputs=text_input
215
+ )
216
+ fr_example_btn.click(
217
+ lambda: fr_example_text,
218
+ outputs=text_input
219
+ )
220
+
221
+ # Output components
222
+ result_html = gr.HTML(label="Analysis Result")
223
+ plot_output = gr.Plot(label="Toxicity Probabilities")
224
+
225
+ # Set up event handling
226
+ analyze_btn.click(
227
+ predict_toxicity,
228
+ inputs=[text_input, lang_dropdown],
229
+ outputs=[result_html, plot_output]
230
+ )
231
+
232
+ # Also analyze on pressing Enter in the text box
233
+ text_input.submit(
234
+ predict_toxicity,
235
+ inputs=[text_input, lang_dropdown],
236
+ outputs=[result_html, plot_output]
237
+ )
238
+
239
+ gr.Markdown("""
240
+ ### About this model
241
+ This model classifies text into six toxicity categories:
242
+ - **Toxic**: General toxicity
243
+ - **Severe Toxic**: Extreme toxicity
244
+ - **Obscene**: Obscene content
245
+ - **Threat**: Threatening content
246
+ - **Insult**: Insulting content
247
+ - **Identity Hate**: Identity-based hate
248
+
249
+ Built using XLM-RoBERTa with language-aware fine-tuning.
250
+ """)
251
+
252
+ return app
253
+
254
+ # Launch the app when script is run directly
255
+ if __name__ == "__main__":
256
+ # Create and launch the app
257
+ app = create_app()
258
+ app.launch(
259
+ server_name="0.0.0.0", # Bind to all interfaces
260
+ server_port=7860, # Default Gradio port
261
+ share=True # Generate public link
262
+ )
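For reference, predict_toxicity only relies on three keys of the classifier output. A mock of that structure (all values invented) is enough to exercise the HTML table and bar chart without loading the model:

mock_result = {
    "probabilities": {
        "toxic": 0.91, "severe_toxic": 0.12, "obscene": 0.64,
        "threat": 0.05, "insult": 0.78, "identity_hate": 0.03,
    },
    "is_toxic": True,
    "toxic_categories": ["toxic", "obscene", "insult"],
}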
augmentation/balance_english.py ADDED
@@ -0,0 +1,237 @@
1
+ import os
2
+ import torch
3
+
4
+ # Configure CPU and thread settings FIRST, before any other imports
5
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'
6
+ os.environ['TF_CPU_ENABLE_AVX2'] = '1'
7
+ os.environ['TF_CPU_ENABLE_AVX512F'] = '1'
8
+ os.environ['TF_CPU_ENABLE_AVX512_VNNI'] = '1'
9
+ os.environ['TF_CPU_ENABLE_FMA'] = '1'
10
+ os.environ['MKL_NUM_THREADS'] = '80'
11
+ os.environ['OMP_NUM_THREADS'] = '80'
12
+
13
+ # Set PyTorch thread configurations once
14
+ torch.set_num_threads(80)
15
+ torch.set_num_interop_threads(10)
16
+
17
+ # Now import everything else
18
+ import pandas as pd
19
+ import numpy as np
20
+ from pathlib import Path
21
+ import logging
22
+ from datetime import datetime
23
+ import sys
24
+ from toxic_augment import ToxicAugmenter
25
+ import json
26
+
27
+ # Configure logging
28
+ log_dir = Path("logs")
29
+ log_dir.mkdir(exist_ok=True)
30
+
31
+ timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
32
+ log_file = log_dir / f"balance_english_{timestamp}.log"
33
+
34
+ logging.basicConfig(
35
+ level=logging.INFO,
36
+ format='%(asctime)s | %(message)s',
37
+ handlers=[
38
+ logging.StreamHandler(sys.stdout),
39
+ logging.FileHandler(log_file)
40
+ ]
41
+ )
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ def analyze_label_distribution(df, lang='en'):
46
+ """Analyze label distribution for a specific language"""
47
+ labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
48
+ lang_df = df[df['lang'] == lang]
49
+ total = len(lang_df)
50
+
51
+ if total == 0:
52
+ logger.warning(f"No samples found for language {lang.upper()}.")
53
+ return {}
54
+
55
+ logger.info(f"\nLabel Distribution for {lang.upper()}:")
56
+ logger.info("-" * 50)
57
+ dist = {}
58
+ for label in labels:
59
+ count = lang_df[label].sum()
60
+ percentage = (count / total) * 100
61
+ dist[label] = {'count': int(count), 'percentage': percentage}
62
+ logger.info(f"{label}: {count:,} ({percentage:.2f}%)")
63
+ return dist
64
+
65
+ def analyze_language_distribution(df):
66
+ """Analyze current language distribution"""
67
+ lang_dist = df['lang'].value_counts()
68
+ logger.info("\nCurrent Language Distribution:")
69
+ logger.info("-" * 50)
70
+ for lang, count in lang_dist.items():
71
+ logger.info(f"{lang}: {count:,} comments ({count/len(df)*100:.2f}%)")
72
+ return lang_dist
73
+
74
+ def calculate_required_samples(df):
75
+ """Calculate how many English samples we need to generate"""
76
+ lang_counts = df['lang'].value_counts()
77
+ target_count = lang_counts.max() # Use the largest language count as target
78
+ en_count = lang_counts.get('en', 0)
79
+ required_samples = target_count - en_count
80
+
81
+ logger.info(f"\nTarget count per language: {target_count:,}")
82
+ logger.info(f"Current English count: {en_count:,}")
83
+ logger.info(f"Required additional English samples: {required_samples:,}")
84
+
85
+ return required_samples
86
+
87
+ def generate_balanced_samples(df, required_samples):
88
+ """Generate samples maintaining original class distribution ratios"""
89
+ logger.info("\nGenerating balanced samples...")
90
+
91
+ # Get English samples
92
+ en_df = df[df['lang'] == 'en']
93
+ labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
94
+
95
+ # Calculate target counts for each label
96
+ target_counts = {}
97
+ for label in labels:
98
+ count = en_df[label].sum()
99
+ ratio = count / len(en_df)
100
+ target_count = int(ratio * required_samples)
101
+ target_counts[label] = target_count
102
+ logger.info(f"Target count for {label}: {target_count:,}")
103
+
104
+ augmented_samples = []
105
+ augmenter = ToxicAugmenter()
106
+ total_generated = 0
107
+
108
+ # Generate samples for each label
109
+ for label, target_count in target_counts.items():
110
+ if target_count == 0:
111
+ continue
112
+
113
+ logger.info(f"\nGenerating {target_count:,} samples for {label}")
114
+
115
+ # Get seed texts with this label
116
+ seed_texts = en_df[en_df[label] == 1]['comment_text'].tolist()
117
+
118
+ if not seed_texts:
119
+ logger.warning(f"No seed texts found for {label}, skipping...")
120
+ continue
121
+
122
+ # Generate samples with 5-minute timeout
123
+ new_samples = augmenter.augment_dataset(
124
+ target_samples=target_count,
125
+ label=label, # Using single label instead of label_combo
126
+ seed_texts=seed_texts,
127
+ timeout_minutes=5
128
+ )
129
+
130
+ if new_samples is not None and not new_samples.empty:
131
+ augmented_samples.append(new_samples)
132
+ total_generated += len(new_samples)
133
+
134
+ # Log progress
135
+ logger.info(f"✓ Generated {len(new_samples):,} samples")
136
+ logger.info(f"Progress: {total_generated:,}/{required_samples:,}")
137
+
138
+ # Check if we have reached our global required samples
139
+ if total_generated >= required_samples:
140
+ logger.info("Reached required sample count, stopping generation")
141
+ break
142
+
143
+ # Combine all generated samples
144
+ if augmented_samples:
145
+ augmented_df = pd.concat(augmented_samples, ignore_index=True)
146
+ augmented_df['lang'] = 'en'
147
+
148
+ # Ensure we don't exceed the required sample count
149
+ if len(augmented_df) > required_samples:
150
+ logger.info(f"Trimming excess samples from {len(augmented_df):,} to {required_samples:,}")
151
+ augmented_df = augmented_df.head(required_samples)
152
+
153
+ # Log final class distribution
154
+ logger.info("\nFinal class distribution in generated samples:")
155
+ for label in labels:
156
+ count = augmented_df[label].sum()
157
+ percentage = (count / len(augmented_df)) * 100
158
+ logger.info(f"{label}: {count:,} ({percentage:.2f}%)")
159
+
160
+ # Also log clean samples
161
+ clean_count = len(augmented_df[augmented_df[labels].sum(axis=1) == 0])
162
+ clean_percentage = (clean_count / len(augmented_df)) * 100
163
+ logger.info(f"Clean samples: {clean_count:,} ({clean_percentage:.2f}%)")
164
+
165
+ return augmented_df
166
+ else:
167
+ raise Exception("Failed to generate any valid samples")
168
+
169
+ def balance_english_data():
170
+ """Main function to balance English data with other languages"""
171
+ try:
172
+ # Load dataset
173
+ input_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv'
174
+ logger.info(f"Loading dataset from {input_file}")
175
+ df = pd.read_csv(input_file)
176
+
177
+ # Analyze current distribution
178
+ logger.info("\nAnalyzing current distribution...")
179
+ initial_dist = analyze_language_distribution(df)
180
+ initial_label_dist = analyze_label_distribution(df, 'en')
181
+
182
+ # Calculate required samples
183
+ required_samples = calculate_required_samples(df)
184
+
185
+ if required_samples <= 0:
186
+ logger.info("English data is already balanced. No augmentation needed.")
187
+ return
188
+
189
+ # Generate balanced samples
190
+ augmented_df = generate_balanced_samples(df, required_samples)
191
+
192
+ # Merge with original dataset
193
+ logger.info("\nMerging datasets...")
194
+ output_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_BALANCED.csv'
195
+
196
+ # Combine datasets
197
+ combined_df = pd.concat([df, augmented_df], ignore_index=True)
198
+
199
+ # Save balanced dataset
200
+ combined_df.to_csv(output_file, index=False)
201
+ logger.info(f"\nSaved balanced dataset to {output_file}")
202
+
203
+ # Final distribution check
204
+ logger.info("\nFinal distribution after balancing:")
205
+ final_dist = analyze_language_distribution(combined_df)
206
+ final_label_dist = analyze_label_distribution(combined_df, 'en')
207
+
208
+ # Save distribution statistics
209
+ stats = {
210
+ 'timestamp': timestamp,
211
+ 'initial_distribution': {
212
+ 'languages': initial_dist.to_dict(),
213
+ 'english_labels': initial_label_dist
214
+ },
215
+ 'final_distribution': {
216
+ 'languages': final_dist.to_dict(),
217
+ 'english_labels': final_label_dist
218
+ },
219
+ 'samples_generated': len(augmented_df),
220
+ 'total_samples': len(combined_df)
221
+ }
222
+
223
+ stats_file = f'logs/balance_stats_{timestamp}.json'
224
+ with open(stats_file, 'w') as f:
225
+ json.dump(stats, f, indent=2)
226
+ logger.info(f"\nSaved balancing statistics to {stats_file}")
227
+
228
+ except Exception as e:
229
+ logger.error(f"Error during balancing: {str(e)}")
230
+ raise
231
+
232
+ def main():
233
+ balance_english_data()
234
+
235
+ if __name__ == "__main__":
236
+ logger.info("Starting English data balancing process...")
237
+ main()
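The per-label targets in generate_balanced_samples are simply the English label ratios scaled to the number of samples still needed. A small sketch with assumed counts:

en_total = 50_000                                   # illustrative numbers only
label_counts = {"toxic": 5_000, "obscene": 2_500, "threat": 150}
required_samples = 10_000

target_counts = {
    label: int((count / en_total) * required_samples)
    for label, count in label_counts.items()
}
# -> {'toxic': 1000, 'obscene': 500, 'threat': 30}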
augmentation/threat_augment.py ADDED
@@ -0,0 +1,379 @@
1
+ import torch
2
+ from transformers import (
3
+ AutoModelForCausalLM,
4
+ AutoTokenizer,
5
+ BitsAndBytesConfig
6
+ )
7
+ from langdetect import detect
8
+ import pandas as pd
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+ from pathlib import Path
12
+ import logging
13
+ import gc
14
+ from typing import List
15
+ import json
16
+ from datetime import datetime, timedelta
17
+ import time
18
+ import sys
19
+ from sklearn.feature_extraction.text import TfidfVectorizer
20
+ from sklearn.linear_model import LogisticRegression
21
+ import joblib
22
+
23
+ # Create log directories
24
+ log_dir = Path("logs")
25
+ log_dir.mkdir(exist_ok=True)
26
+
27
+ # Get timestamp for log file
28
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
29
+ log_file = log_dir / f"generation_{timestamp}.log"
30
+
31
+ # Configure logging once at the start
32
+ logging.basicConfig(
33
+ level=logging.INFO,
34
+ format='%(asctime)s | %(message)s',
35
+ handlers=[
36
+ logging.StreamHandler(sys.stdout),
37
+ logging.FileHandler(log_file)
38
+ ]
39
+ )
40
+
41
+ logger = logging.getLogger(__name__)
42
+ logger.info(f"Starting new run. Log file: {log_file}")
43
+
44
+ def log_separator(message: str = ""):
45
+ """Print a separator line with optional message"""
46
+ if message:
47
+ logger.info("\n" + "="*40 + f" {message} " + "="*40)
48
+ else:
49
+ logger.info("\n" + "="*100)
50
+
51
+ class FastThreatValidator:
52
+ """Fast threat validation using logistic regression"""
53
+ def __init__(self, model_path: str = "weights/threat_validator.joblib"):
54
+ self.model_path = model_path
55
+ if Path(model_path).exists():
56
+ logger.info("Loading fast threat validator...")
57
+ model_data = joblib.load(model_path)
58
+ self.vectorizer = model_data['vectorizer']
59
+ self.model = model_data['model']
60
+ logger.info("✓ Fast validator loaded")
61
+ else:
62
+ logger.info("Training fast threat validator...")
63
+ self._train_validator()
64
+ logger.info("✓ Fast validator trained and saved")
65
+
66
+ def _train_validator(self):
67
+ """Train a simple logistic regression model for threat detection"""
68
+ # Load training data
69
+ train_df = pd.read_csv("dataset/split/train.csv")
70
+
71
+ # Prepare data
72
+ X = train_df['comment_text'].fillna('')
73
+ y = train_df['threat']
74
+
75
+ # Create and fit vectorizer
76
+ self.vectorizer = TfidfVectorizer(
77
+ max_features=10000,
78
+ ngram_range=(1, 2),
79
+ strip_accents='unicode',
80
+ min_df=2
81
+ )
82
+ X_vec = self.vectorizer.fit_transform(X)
83
+
84
+ # Train model
85
+ self.model = LogisticRegression(
86
+ C=1.0,
87
+ class_weight='balanced',
88
+ max_iter=200,
89
+ n_jobs=-1
90
+ )
91
+ self.model.fit(X_vec, y)
92
+
93
+ # Save model
94
+ joblib.dump({
95
+ 'vectorizer': self.vectorizer,
96
+ 'model': self.model
97
+ }, self.model_path)
98
+
99
+ def validate(self, texts: List[str], threshold: float = 0.6) -> List[bool]:
100
+ """Validate texts using the fast model"""
101
+ # Vectorize texts
102
+ X = self.vectorizer.transform(texts)
103
+
104
+ # Get probabilities
105
+ probs = self.model.predict_proba(X)[:, 1]
106
+
107
+ # Return boolean mask
108
+ return probs >= threshold
109
+
110
+ class ThreatAugmenter:
111
+ def __init__(self, seed_samples_path: str = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv"):
112
+ log_separator("INITIALIZATION")
113
+
114
+ # Use global log file
115
+ self.log_file = log_file
116
+
117
+ # Initialize generation buffer
118
+ self.generation_buffer = []
119
+ self.buffer_size = 100 # Flush buffer every 100 entries
120
+
121
+ # Multi-GPU setup
122
+ self.num_gpus = torch.cuda.device_count()
123
+ if self.num_gpus > 0:
124
+ torch.backends.cuda.matmul.allow_tf32 = True
125
+ torch.backends.cudnn.allow_tf32 = True
126
+ logger.info(f"Found {self.num_gpus} GPUs:")
127
+ for i in range(self.num_gpus):
128
+ mem = torch.cuda.get_device_properties(i).total_memory / 1024**3
129
+ logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)} ({mem:.1f}GB)")
130
+
131
+ # Load models
132
+ log_separator("LOADING MODELS")
133
+ logger.info("Loading Mistral-7B...")
134
+
135
+ # Configure model for multi-GPU
136
+ quantization_config = BitsAndBytesConfig(
137
+ load_in_4bit=True,
138
+ bnb_4bit_compute_dtype=torch.float16,
139
+ bnb_4bit_quant_type="nf4",
140
+ bnb_4bit_use_double_quant=True
141
+ )
142
+
143
+ self.llm = AutoModelForCausalLM.from_pretrained(
144
+ "mistralai/Mistral-7B-Instruct-v0.3",
145
+ device_map="balanced", # Ensures proper dual GPU usage
146
+ torch_dtype=torch.float16,
147
+ quantization_config=quantization_config,
148
+ max_memory={0: "22GB", 1: "22GB"} # Explicitly set memory limits for each GPU
149
+ )
150
+
151
+ self.llm_tokenizer = AutoTokenizer.from_pretrained(
152
+ "mistralai/Mistral-7B-Instruct-v0.3",
153
+ padding_side="left",
154
+ use_fast=True
155
+ )
156
+ self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
157
+ logger.info("✓ Mistral-7B loaded")
158
+
159
+ # Initialize fast validator
160
+ self.validator = FastThreatValidator()
161
+ logger.info("✓ Fast validator initialized")
162
+
163
+ # Load and preprocess data
164
+ log_separator("LOADING DATA")
165
+ logger.info(f"Loading samples from {seed_samples_path}")
166
+ self.seed_df = pd.read_csv(seed_samples_path)
167
+ self.en_threat_samples = self.seed_df[
168
+ (self.seed_df['lang'] == 'en') &
169
+ (self.seed_df['threat'] == 1)
170
+ ]
171
+ logger.info(f"Found {len(self.en_threat_samples)} EN threat samples")
172
+
173
+ # Optimize batch processing
174
+ self.max_batch_size = 48 # Increased batch size
175
+ self.prefetch_factor = 4
176
+ self.num_workers = 8
177
+
178
+ # Memory cleanup
179
+ torch.cuda.empty_cache()
180
+ gc.collect()
181
+ log_separator()
182
+
183
+ def generate_prompt(self, seed_text: str) -> str:
184
+ """Generate a prompt for realistic threatening content"""
185
+ return f"""<s>[INST] Generate a threatening social media comment that is realistic and intimidating.
186
+
187
+ Reference example (generate something different): {seed_text}
188
+
189
+ Requirements:
190
+ - Make it genuinely threatening
191
+ - Use authentic social media language
192
+ - Keep it under 50 words
193
+ - Must be different from example
194
+
195
+ Generate ONLY the comment: [/INST]"""
196
+
197
+ def flush_buffer(self):
198
+ """Flush the generation buffer to disk"""
199
+ if self.generation_buffer:
200
+ try:
201
+ with open(self.log_file, 'a', encoding='utf-8') as f:
202
+ for entry in self.generation_buffer:
203
+ f.write(json.dumps(entry, ensure_ascii=False) + '\n')
204
+ self.generation_buffer = []
205
+ except Exception as e:
206
+ logger.error(f"Failed to flush buffer: {str(e)}")
207
+
208
+ def log_generation(self, seed_text: str, prompt: str, generated_text: str, is_valid: bool):
209
+ """Buffer log generation details"""
210
+ log_entry = {
211
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
212
+ "seed_text": seed_text,
213
+ "prompt": prompt,
214
+ "generated_text": generated_text,
215
+ "is_valid": is_valid
216
+ }
217
+
218
+ self.generation_buffer.append(log_entry)
219
+
220
+ # Flush buffer if it reaches the size limit
221
+ if len(self.generation_buffer) >= self.buffer_size:
222
+ self.flush_buffer()
223
+
224
+ def generate_samples(self, prompts: List[str], seed_texts: List[str]) -> List[str]:
225
+ try:
226
+ with torch.amp.autocast('cuda', dtype=torch.float16):
227
+ inputs = self.llm_tokenizer(prompts, return_tensors="pt", padding=True,
228
+ truncation=True, max_length=256).to(self.llm.device)
229
+
230
+ outputs = self.llm.generate(
231
+ **inputs,
232
+ max_new_tokens=32,
233
+ temperature=0.95,
234
+ do_sample=True,
235
+ top_p=0.92,
236
+ top_k=50,
237
+ num_return_sequences=1,
238
+ repetition_penalty=1.15,
239
+ pad_token_id=self.llm_tokenizer.pad_token_id,
240
+ eos_token_id=self.llm_tokenizer.eos_token_id
241
+ )
242
+
243
+ texts = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=False)
244
+ cleaned_texts = []
245
+ valid_count = 0
246
+
247
+ # Process responses with minimal logging
248
+ for idx, text in enumerate(texts):
249
+ if "[/INST]" in text and "</s>" in text:
250
+ response = text.split("[/INST]")[1].split("</s>")[0].strip()
251
+ response = response.strip().strip('"').strip("'")
252
+
253
+ word_count = len(response.split())
254
+ if (word_count >= 3 and word_count <= 50 and
255
+ not any(x in response.lower() for x in [
256
+ "generate", "requirements:", "reference",
257
+ "[inst]", "example"
258
+ ])):
259
+ cleaned_texts.append(response)
260
+ valid_count += 1
261
+
262
+ # Log only summary statistics
263
+ if valid_count > 0:
264
+ logger.info(f"\nBatch Success: {valid_count}/{len(texts)} ({valid_count/len(texts)*100:.1f}%)")
265
+
266
+ return cleaned_texts
267
+
268
+ except Exception as e:
269
+ logger.error(f"Generation error: {str(e)}")
270
+ return []
271
+
272
+ def validate_toxicity(self, texts: List[str]) -> torch.Tensor:
273
+ """Validate texts using fast logistic regression"""
274
+ if not texts:
275
+ return torch.zeros(0, dtype=torch.bool)
276
+
277
+ # Get validation mask from fast validator
278
+ validation_mask = self.validator.validate(texts)
279
+
280
+ # Convert to torch tensor
281
+ return torch.tensor(validation_mask, dtype=torch.bool, device=self.llm.device)
282
+
283
+ def validate_language(self, texts: List[str]) -> List[bool]:
284
+ """Simple language validation"""
285
+ return [detect(text) == 'en' for text in texts]
286
+
287
+ def augment_dataset(self, target_samples: int = 500, batch_size: int = 32):
288
+ """Main augmentation loop with progress bar and CSV saving"""
289
+ try:
290
+ start_time = time.time()
291
+ logger.info(f"Starting generation: target={target_samples}, batch_size={batch_size}")
292
+ generated_samples = []
293
+ stats = {
294
+ "total_attempts": 0,
295
+ "valid_samples": 0,
296
+ "batch_times": []
297
+ }
298
+
299
+ # Create output directory if it doesn't exist
300
+ output_dir = Path("dataset/augmented")
301
+ output_dir.mkdir(parents=True, exist_ok=True)
302
+
303
+ # Generate timestamp for the filename
304
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
305
+ output_file = output_dir / f"threat_augmented_{timestamp}.csv"
306
+
307
+ # Initialize progress bar
308
+ pbar = tqdm(total=target_samples,
309
+ desc="Generating samples",
310
+ unit="samples",
311
+ ncols=100,
312
+ bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
313
+
314
+ while len(generated_samples) < target_samples:
315
+ batch_start = time.time()
316
+
317
+ seed_texts = self.en_threat_samples['comment_text'].sample(batch_size).tolist()
318
+ prompts = [self.generate_prompt(text) for text in seed_texts]
319
+ new_samples = self.generate_samples(prompts, seed_texts)
320
+
321
+ if not new_samples:
322
+ continue
323
+
324
+ # Update statistics
325
+ batch_time = time.time() - batch_start
326
+ stats["batch_times"].append(batch_time)
327
+ stats["total_attempts"] += len(new_samples)
328
+ prev_len = len(generated_samples)
329
+ generated_samples.extend(new_samples)
330
+ stats["valid_samples"] = len(generated_samples)
331
+
332
+ # Update progress bar
333
+ pbar.update(len(generated_samples) - prev_len)
334
+
335
+ # Calculate and display success rate periodically
336
+ if len(stats["batch_times"]) % 10 == 0: # Every 10 batches
337
+ success_rate = (stats["valid_samples"] / stats["total_attempts"]) * 100
338
+ avg_batch_time = sum(stats["batch_times"][-20:]) / min(len(stats["batch_times"]), 20)
339
+ pbar.set_postfix({
340
+ 'Success Rate': f'{success_rate:.1f}%',
341
+ 'Batch Time': f'{avg_batch_time:.2f}s'
342
+ })
343
+
344
+ # Cleanup
345
+ if len(generated_samples) % (batch_size * 5) == 0:
346
+ torch.cuda.empty_cache()
347
+ gc.collect()
348
+
349
+ # Close progress bar
350
+ pbar.close()
351
+
352
+ # Create DataFrame and save to CSV
353
+ df = pd.DataFrame({
354
+ 'text': generated_samples[:target_samples],
355
+ 'label': 1, # These are all threat samples
356
+ 'source': 'augmented',
357
+ 'timestamp': timestamp
358
+ })
359
+
360
+ # Save to CSV
361
+ df.to_csv(output_file, index=False)
362
+ logger.info(f"\nSaved {len(df)} samples to {output_file}")
363
+
364
+ # Final stats
365
+ total_time = str(timedelta(seconds=int(time.time() - start_time)))
366
+ logger.info(f"Generation complete: {len(generated_samples)} samples generated in {total_time}")
367
+
368
+ return df
369
+
370
+ except Exception as e:
371
+ logger.error(f"Generation failed: {str(e)}")
372
+ raise
373
+
374
+ if __name__ == "__main__":
375
+ torch.cuda.empty_cache()
376
+ gc.collect()
377
+
378
+ augmenter = ThreatAugmenter()
379
+ augmented_df = augmenter.augment_dataset(target_samples=500)
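The post-processing inside generate_samples reduces to slicing the decoded text between the instruction tag and the end-of-sequence token and applying two cheap filters. A standalone sketch of that cleanup step:

def extract_response(decoded: str, min_words: int = 3, max_words: int = 50):
    """Return the generated comment, or None if the output is rejected."""
    if "[/INST]" not in decoded or "</s>" not in decoded:
        return None
    response = decoded.split("[/INST]")[1].split("</s>")[0].strip().strip('"').strip("'")
    if not (min_words <= len(response.split()) <= max_words):
        return None
    banned = ("generate", "requirements:", "reference", "[inst]", "example")
    if any(token in response.lower() for token in banned):
        return None
    return response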
augmentation/toxic_augment.py ADDED
@@ -0,0 +1,439 @@
1
+ import torch
2
+ from transformers import (
3
+ AutoModelForCausalLM,
4
+ AutoTokenizer,
5
+ BitsAndBytesConfig
6
+ )
7
+ import pandas as pd
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ from pathlib import Path
11
+ import logging
12
+ import gc
13
+ from typing import List, Dict
14
+ import json
15
+ from datetime import datetime
16
+ import time
17
+ import sys
18
+ from sklearn.feature_extraction.text import TfidfVectorizer
19
+ from sklearn.linear_model import LogisticRegression
20
+ import joblib
21
+ import random
22
+
23
+ # Create log directories
24
+ log_dir = Path("logs")
25
+ log_dir.mkdir(exist_ok=True)
26
+
27
+ # Get timestamp for log file
28
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
29
+ log_file = log_dir / f"generation_{timestamp}.log"
30
+
31
+ # Configure logging
32
+ logging.basicConfig(
33
+ level=logging.INFO,
34
+ format='%(asctime)s | %(message)s',
35
+ handlers=[
36
+ logging.StreamHandler(sys.stdout),
37
+ logging.FileHandler(log_file)
38
+ ]
39
+ )
40
+
41
+ logger = logging.getLogger(__name__)
42
+ logger.info(f"Starting new run. Log file: {log_file}")
43
+
44
+ class FastToxicValidator:
45
+ """Fast toxicity validation using logistic regression"""
46
+ def __init__(self, model_path: str = "weights/toxic_validator.joblib"):
47
+ self.model_path = model_path
48
+ if Path(model_path).exists():
49
+ logger.info("Loading fast toxic validator...")
50
+ model_data = joblib.load(model_path)
51
+ self.vectorizers = model_data['vectorizers']
52
+ self.models = model_data['models']
53
+ logger.info("✓ Fast validator loaded")
54
+ else:
55
+ logger.info("Training fast toxic validator...")
56
+ self._train_validator()
57
+ logger.info("✓ Fast validator trained and saved")
58
+
59
+ def _train_validator(self):
60
+ """Train logistic regression models for each toxicity type"""
61
+ # Load training data
62
+ train_df = pd.read_csv("dataset/split/train.csv")
63
+
64
+ # Labels to validate
65
+ labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
66
+
67
+ self.vectorizers = {}
68
+ self.models = {}
69
+
70
+ # Train a model for each label
71
+ for label in labels:
72
+ # Create and fit vectorizer
73
+ vectorizer = TfidfVectorizer(
74
+ max_features=10000,
75
+ ngram_range=(1, 2),
76
+ strip_accents='unicode',
77
+ min_df=2
78
+ )
79
+ X = vectorizer.fit_transform(train_df['comment_text'].fillna(''))
80
+ y = train_df[label]
81
+
82
+ # Train model
83
+ model = LogisticRegression(
84
+ C=1.0,
85
+ class_weight='balanced',
86
+ max_iter=200,
87
+ n_jobs=-1
88
+ )
89
+ model.fit(X, y)
90
+
91
+ self.vectorizers[label] = vectorizer
92
+ self.models[label] = model
93
+
94
+ # Save models
95
+ joblib.dump({
96
+ 'vectorizers': self.vectorizers,
97
+ 'models': self.models
98
+ }, self.model_path)
99
+
100
+ def get_probabilities(self, texts: List[str], label: str) -> np.ndarray:
101
+ """Get raw probabilities for a specific label"""
102
+ X = self.vectorizers[label].transform(texts)
103
+ return self.models[label].predict_proba(X)[:, 1]
104
+
105
+ def validate(self, texts: List[str], label: str, threshold: float = 0.5) -> List[bool]:
106
+ """Validate texts using the fast model with a lower threshold of 0.5"""
107
+ # Vectorize texts
108
+ X = self.vectorizers[label].transform(texts)
109
+
110
+ # Get probabilities
111
+ probs = self.models[label].predict_proba(X)[:, 1]
112
+
113
+ # Return boolean mask with lower threshold
114
+ return probs >= threshold
115
+
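A minimal usage sketch of the validator defined above (the example text is invented; constructing the validator assumes either the saved joblib weights or dataset/split/train.csv are available):

validator = FastToxicValidator()
texts = ["example generated comment"]
probs = validator.get_probabilities(texts, label="toxic")        # raw probabilities
keep = validator.validate(texts, label="toxic", threshold=0.5)   # boolean keep/reject mask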
116
+ class ToxicAugmenter:
117
+ def __init__(self):
118
+ logger.info("Initializing ToxicAugmenter...")
119
+
120
+ # Initialize generation buffer
121
+ self.generation_buffer = []
122
+ self.buffer_size = 100
123
+
124
+ # Multi-GPU setup
125
+ self.num_gpus = torch.cuda.device_count()
126
+ if self.num_gpus > 0:
127
+ torch.backends.cuda.matmul.allow_tf32 = True
128
+ torch.backends.cudnn.allow_tf32 = True
129
+ logger.info(f"Found {self.num_gpus} GPUs:")
130
+ for i in range(self.num_gpus):
131
+ mem = torch.cuda.get_device_properties(i).total_memory / 1024**3
132
+ logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)} ({mem:.1f}GB)")
133
+
134
+ # Load models with optimized settings
135
+ logger.info("Loading Mistral-7B...")
136
+
137
+ # Configure model for multi-GPU with optimized settings
138
+ quantization_config = BitsAndBytesConfig(
139
+ load_in_4bit=True,
+ bnb_4bit_compute_dtype=torch.float16,
140
+ bnb_4bit_quant_type="nf4",
141
+ bnb_4bit_use_double_quant=True
142
+ )
143
+
144
+ # Initialize tokenizer first
145
+ self.llm_tokenizer = AutoTokenizer.from_pretrained(
146
+ "mistralai/Mistral-7B-Instruct-v0.3",
147
+ padding_side="left",
148
+ use_fast=True,
149
+ model_max_length=512, # Limit context size for faster processing
150
+ pad_token="</s>", # Set pad token explicitly
151
+ add_eos_token=True,
152
+ add_bos_token=True
153
+ )
154
+
155
+ # Set pad token id explicitly
156
+ self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
157
+ self.llm_tokenizer.pad_token_id = self.llm_tokenizer.eos_token_id
158
+
159
+ # Now initialize model with tokenizer's pad token
160
+ self.llm = AutoModelForCausalLM.from_pretrained(
161
+ "mistralai/Mistral-7B-Instruct-v0.3",
162
+ device_map="balanced",
163
+ torch_dtype=torch.float16,
164
+ quantization_config=quantization_config,
165
+ max_memory={0: "22GB", 1: "22GB"},
166
+ use_cache=True, # Enable KV cache for faster generation
167
+ pad_token_id=self.llm_tokenizer.pad_token_id
168
+ )
169
+
170
+ logger.info("✓ Mistral-7B loaded")
171
+
172
+ # Initialize validator
173
+ self.validator = FastToxicValidator()
174
+ logger.info("✓ Fast validator initialized")
175
+
176
+ def generate_prompt(self, seed_text: str, label: str) -> str:
177
+ """Generate a prompt for a single toxicity type"""
178
+ base_prompt = """<s>[INST] You are tasked with generating a highly {type} comment. Your goal is to create content that would definitely be flagged as {type}.
179
+
180
+ Example for reference (generate something different): "{seed_text}"
181
+
182
+ Requirements:
183
+ - Must be extremely {type} in nature
184
+ - Use very strong language
185
+ - Must be different from the example
186
+
187
+ Generate ONLY the {type} comment (no explanations): [/INST]"""
188
+
189
+ type_mapping = {
190
+ 'toxic': 'toxic and offensive',
191
+ 'severe_toxic': 'toxic and hostile',
192
+ 'obscene': 'obscene and vulgar',
193
+ 'threat': 'threatening and intimidating',
194
+ 'insult': 'insulting and demeaning',
195
+ 'identity_hate': 'discriminatory and hateful'
196
+ }
197
+
198
+ return base_prompt.format(type=type_mapping[label], seed_text=seed_text)
199
+
200
+ def flush_buffer(self):
201
+ """Flush the generation buffer to disk"""
202
+ if self.generation_buffer:
203
+ try:
204
+ with open(log_file, 'a', encoding='utf-8') as f:
205
+ for entry in self.generation_buffer:
206
+ f.write(json.dumps(entry, ensure_ascii=False) + '\n')
207
+ self.generation_buffer = []
208
+ except Exception as e:
209
+ logger.error(f"Failed to flush buffer: {str(e)}")
210
+
211
+ def log_generation(self, seed_text: str, prompt: str, generated_text: str, validation_results: Dict[str, bool]):
212
+ """Buffer log generation details with proper JSON serialization"""
213
+ # Convert numpy/torch boolean values to Python booleans
214
+ serializable_results = {
215
+ k: bool(v) for k, v in validation_results.items()
216
+ }
217
+
218
+ log_entry = {
219
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
220
+ "seed_text": seed_text,
221
+ "prompt": prompt,
222
+ "generated_text": generated_text,
223
+ "validation_results": serializable_results
224
+ }
225
+
226
+ self.generation_buffer.append(log_entry)
227
+
228
+ # Flush buffer if it reaches the size limit
229
+ if len(self.generation_buffer) >= self.buffer_size:
230
+ self.flush_buffer()
231
+
232
+ def validate_sample(self, text: str, label: str, attempts: int) -> bool:
233
+ """Validate a generated sample for a single label"""
234
+ try:
235
+ # Get probability for the label
236
+ prob = self.validator.get_probabilities([text], label)[0]
237
+
238
+ # Super lenient thresholds to get samples
239
+ thresholds = {
240
+ 'toxic': 0.3, # Very lenient to get started
241
+ 'severe_toxic': 0.2,
242
+ 'obscene': 0.2,
243
+ 'threat': 0.2,
244
+ 'insult': 0.2,
245
+ 'identity_hate': 0.2
246
+ }
247
+ threshold = thresholds.get(label, 0.2)
248
+
249
+ # Log validation result
250
+ passed = prob >= threshold
251
+ if passed or (attempts % 5 == 0): # More frequent logging
252
+ logger.info(f"\nValidation - Label: {label}, Text: {text}")
253
+ logger.info(f"Probability: {prob:.3f}, Threshold: {threshold:.2f}, Passed: {passed}")
254
+
255
+ return passed
256
+
257
+ except Exception as e:
258
+ logger.error(f"Validation error: {str(e)}")
259
+ return False
260
+
261
+ def generate_samples(self, target_samples: int, label: str,
262
+ seed_texts: List[str], total_timeout: int = 300) -> pd.DataFrame:
263
+ """Generate samples for a single label with timeouts"""
264
+ start_time = time.time()
265
+ generated_samples = []
266
+ attempts = 0
267
+ max_attempts = target_samples * 50 # Much more attempts allowed
268
+ batch_size = min(16, target_samples) # Smaller batch size for better control
269
+
270
+ pbar = tqdm(total=target_samples, desc=f"Generating {label} samples")
271
+
272
+ try:
273
+ while len(generated_samples) < target_samples and attempts < max_attempts:
274
+ # Check timeout
275
+ if time.time() - start_time > total_timeout:
276
+ logger.warning(f"Generation timed out after {total_timeout} seconds")
277
+ break
278
+
279
+ attempts += 1
280
+
281
+ # Select random seed text and generate prompt
282
+ seed_text = random.choice(seed_texts)
283
+ prompt = self.generate_prompt(seed_text, label)
284
+
285
+ try:
286
+ # Generate text with optimized parameters
287
+ inputs = self.llm_tokenizer(prompt, return_tensors="pt", padding=True,
288
+ truncation=True, max_length=512).to(self.llm.device)
289
+
290
+ with torch.no_grad():
291
+ outputs = self.llm.generate(
292
+ **inputs,
293
+ max_new_tokens=200, # Doubled for longer content
294
+ num_beams=4, # Added beam search
295
+ temperature=1.35, # Higher temperature for more randomness
296
+ do_sample=True,
297
+ top_p=0.99, # Almost no filtering
298
+ top_k=200, # More options
299
+ num_return_sequences=1,
300
+ repetition_penalty=1.0, # No repetition penalty
301
+ no_repeat_ngram_size=0, # No ngram blocking
302
+ early_stopping=True, # Stop when complete
303
+ pad_token_id=self.llm_tokenizer.pad_token_id,
304
+ bos_token_id=self.llm_tokenizer.bos_token_id,
305
+ eos_token_id=self.llm_tokenizer.eos_token_id,
306
+ use_cache=True
307
+ )
308
+
309
+ text = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
310
+
311
+ # Extract the generated text after [/INST]
312
+ if "[/INST]" in text:
313
+ output = text.split("[/INST]")[1].strip()
314
+ output = output.strip().strip('"').strip("'")
315
+
316
+ # Only check minimum length
317
+ if len(output) >= 10:
318
+ # Log generation attempt
319
+ if attempts % 5 == 0: # More frequent logging
320
+ logger.info(f"\nAttempt {attempts}: Generated text: {output}")
321
+
322
+ # Validate sample
323
+ if self.validate_sample(output, label, attempts):
324
+ sample_dict = {'comment_text': output}
325
+ for l in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
326
+ sample_dict[l] = 1 if l == label else 0
327
+ generated_samples.append(sample_dict)
328
+ pbar.update(1)
329
+ logger.info(f"✓ Valid {label} sample generated ({len(generated_samples)}/{target_samples})")
330
+
331
+ except Exception as e:
332
+ logger.error(f"Generation error on attempt {attempts}: {str(e)}")
333
+ continue
334
+
335
+ # Clear cache less frequently
336
+ if attempts % 200 == 0:
337
+ torch.cuda.empty_cache()
338
+ gc.collect()
339
+
340
+ finally:
341
+ pbar.close()
342
+ logger.info(f"Generation finished: {len(generated_samples)}/{target_samples} samples in {attempts} attempts")
343
+
344
+ # Return results even if partial
345
+ if generated_samples:
346
+ return pd.DataFrame(generated_samples)
347
+ return None
348
+
349
+ def augment_dataset(self, target_samples: int, label: str, seed_texts: List[str], timeout_minutes: int = 5) -> pd.DataFrame:
350
+ """Generate a target number of samples for a single label"""
351
+ logger.info(f"\nGenerating {target_samples} samples with label: {label}")
352
+
353
+ generated_samples = []
354
+ batch_size = min(32, target_samples)
355
+ start_time = time.time()
356
+ timeout_seconds = min(timeout_minutes * 60, 300) # Hard limit of 5 minutes
357
+ total_generated = 0
358
+ pbar = None
359
+
360
+ try:
361
+ # Create progress bar
362
+ pbar = tqdm(
363
+ total=target_samples,
364
+ desc="Generating",
365
+ unit="samples",
366
+ ncols=100,
367
+ bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'
368
+ )
369
+
370
+ while total_generated < target_samples:
371
+ # Check timeout
372
+ elapsed_time = time.time() - start_time
373
+ if elapsed_time > timeout_seconds:
374
+ logger.warning(f"Time limit reached after {elapsed_time/60:.1f} minutes")
375
+ break
376
+
377
+ # Calculate remaining samples needed
378
+ remaining = target_samples - total_generated
379
+ current_batch_size = min(batch_size, remaining)
380
+
381
+ # Select batch of seed texts
382
+ batch_seeds = np.random.choice(seed_texts, size=current_batch_size)
383
+ prompts = [self.generate_prompt(seed, label) for seed in batch_seeds]
384
+
385
+ # Generate and validate samples
386
+ batch_start = time.time()
387
+ new_samples = self.generate_samples(
388
+ target_samples=current_batch_size,
389
+ label=label,
390
+ seed_texts=batch_seeds,
391
+ total_timeout=timeout_seconds - elapsed_time
392
+ )
393
+
394
+ if new_samples is not None and not new_samples.empty:
395
+ if len(new_samples) > remaining:
396
+ new_samples = new_samples.head(remaining)
397
+
398
+ generated_samples.append(new_samples)
399
+ num_new = len(new_samples)
400
+ total_generated += num_new
401
+
402
+ # Update progress bar
403
+ pbar.update(num_new)
404
+
405
+ # Calculate and display metrics
406
+ elapsed_minutes = elapsed_time / 60
407
+ rate = total_generated / elapsed_minutes if elapsed_minutes > 0 else 0
408
+ batch_time = time.time() - batch_start
409
+ time_remaining = max(0, timeout_seconds - elapsed_time)
410
+
411
+ pbar.set_postfix({
412
+ 'rate': f'{rate:.1f}/min',
413
+ 'batch': f'{batch_time:.1f}s',
414
+ 'remain': f'{time_remaining:.0f}s'
415
+ }, refresh=True)
416
+
417
+ # Memory management every few batches
418
+ if total_generated % (batch_size * 4) == 0:
419
+ torch.cuda.empty_cache()
420
+
421
+ # Combine all generated samples
422
+ if generated_samples:
423
+ final_df = pd.concat(generated_samples, ignore_index=True)
424
+ if len(final_df) > target_samples:
425
+ final_df = final_df.head(target_samples)
426
+ logger.info(f"Successfully generated {len(final_df)} samples in {elapsed_time/60:.1f} minutes")
427
+ return final_df
428
+
429
+ return None
430
+
431
+ except Exception as e:
432
+ logger.error(f"Generation error: {str(e)}")
433
+ return None
434
+ finally:
435
+ if pbar is not None:
436
+ pbar.close()
437
+ # Final cleanup
438
+ self.flush_buffer()
439
+ torch.cuda.empty_cache()
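For quick reference, here is a minimal, self-contained sketch of the per-label acceptance check that `validate_sample` performs in the diff above. The threshold values are copied verbatim from the code; the function name and the bare-probability interface are illustrative only (the real method obtains the probability from `self.validator`).

```python
from typing import Dict

# Per-label acceptance thresholds, copied from validate_sample above.
THRESHOLDS: Dict[str, float] = {
    'toxic': 0.3,
    'severe_toxic': 0.2,
    'obscene': 0.2,
    'threat': 0.2,
    'insult': 0.2,
    'identity_hate': 0.2,
}

def passes_threshold(prob: float, label: str, default: float = 0.2) -> bool:
    """Return True when a classifier probability clears the per-label threshold."""
    return prob >= THRESHOLDS.get(label, default)

if __name__ == "__main__":
    # A generated 'threat' sample scored at 0.27 is accepted (0.27 >= 0.2).
    print(passes_threshold(0.27, 'threat'))   # True
    # A 'toxic' sample scored at 0.25 is rejected (0.25 < 0.3).
    print(passes_threshold(0.25, 'toxic'))    # False
```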
datacard.md ADDED
@@ -0,0 +1,39 @@
1
+ # Jigsaw Toxic Comment Classification Dataset
2
+
3
+ ## Overview
4
+ Version: 1.0
5
+ Date Created: 2025-02-03
6
+
7
+ ### Description
8
+
9
+ The Jigsaw Toxic Comment Classification Dataset is designed to help identify and classify toxic online comments.
10
+ It contains text comments with multiple toxicity-related labels including general toxicity, severe toxicity,
11
+ obscenity, threats, insults, and identity-based hate speech.
12
+
13
+ The dataset includes:
14
+ 1. Main training data with binary toxicity labels
15
+ 2. Unintended bias training data with additional identity attributes
16
+ 3. Processed versions with sequence length 128 for direct model input
17
+ 4. Test and validation sets for model evaluation
18
+
19
+ This dataset was created by Jigsaw and Google's Conversation AI team to help improve online conversation quality
20
+ by identifying and classifying various forms of toxic comments.
21
+
22
+
23
+ ## Column Descriptions
24
+
25
+ - **id**: Unique identifier for each comment
26
+ - **comment_text**: The text content of the comment to be classified
27
+ - **toxic**: Binary label indicating if the comment is toxic
28
+ - **severe_toxic**: Binary label for extremely toxic comments
29
+ - **obscene**: Binary label for obscene content
30
+ - **threat**: Binary label for threatening content
31
+ - **insult**: Binary label for insulting content
32
+ - **identity_hate**: Binary label for identity-based hate speech
33
+ - **target**: Overall toxicity score (in bias dataset)
34
+ - **identity_attack**: Binary label for identity-based attacks
35
+ - **identity_***: Various identity-related attributes in the bias dataset
36
+ - **lang**: Language of the comment
37
+
38
+ ## Files
39
+
docker-compose.yml ADDED
@@ -0,0 +1,13 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ toxic-classifier:
5
+ build: .
6
+ runtime: nvidia # Enable NVIDIA runtime for GPU support
7
+ environment:
8
+ - NVIDIA_VISIBLE_DEVICES=all
9
+ - WANDB_API_KEY=${WANDB_API_KEY} # Set this in .env file
10
+ volumes:
11
+ - ./dataset:/app/dataset # Mount dataset directory
12
+ - ./weights:/app/weights # Mount weights directory
13
+ command: python model/train.py # Default command, can be overridden
evaluation_results/eval_20250208_161149/confusion_matrices/cm_identity_hate.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_insult.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_obscene.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_severe_toxic.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_threat.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_0.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_1.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_2.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_3.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_4.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_5.png ADDED
evaluation_results/eval_20250208_161149/confusion_matrices/cm_toxic_6.png ADDED
evaluation_results/eval_20250208_161149/eval_params.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "timestamp": "20250208_161149",
3
+ "model_path": "weights/toxic_classifier_xlm-roberta-large",
4
+ "test_file": "dataset/split/test.csv",
5
+ "batch_size": 32,
6
+ "num_workers": null
7
+ }
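The parameters above record the checkpoint and test split used for the run whose metrics follow in evaluation_results.json. Below is a minimal sketch of loading that results file and pulling out the headline numbers; the key names are taken from the JSON added in this commit, while the printing itself is illustrative.

```python
import json

# Run directory as added in this commit.
run_dir = "evaluation_results/eval_20250208_161149"

with open(f"{run_dir}/eval_params.json") as f:
    params = json.load(f)
with open(f"{run_dir}/evaluation_results.json") as f:
    results = json.load(f)

print("model:", params["model_path"])

overall = results["overall"]
print(f"AUC  macro={overall['auc_macro']:.4f}  weighted={overall['auc_weighted']:.4f}")
print(f"F1   macro={overall['f1_macro']:.4f}  weighted={overall['f1_weighted']:.4f}")
print(f"exact match={overall['exact_match']:.4f}  hamming={overall['hamming_loss']:.4f}")

# Per-language blocks are keyed by integer-valued language ids ("0", "1", ...).
for lang_id, m in results["per_language"].items():
    print(f"lang {lang_id}: AUC={m['auc']:.3f}  F1={m['f1']:.3f}  n={m['sample_count']}")
```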
evaluation_results/eval_20250208_161149/evaluation_results.json ADDED
@@ -0,0 +1,2020 @@
1
+ {
2
+ "overall": {
3
+ "loss": 0.18776385083473274,
4
+ "auc_macro": 0.9259171799699759,
5
+ "auc_weighted": 0.9442696333538418,
6
+ "precision_macro": 0.4388604553772207,
7
+ "precision_weighted": 0.7008073672218381,
8
+ "recall_macro": 0.8836014181101747,
9
+ "recall_weighted": 0.9051010634378761,
10
+ "f1_macro": 0.530782857064369,
11
+ "f1_weighted": 0.7669279374035199,
12
+ "class_support": {
13
+ "toxic": 17646,
14
+ "severe_toxic": 1649,
15
+ "obscene": 8625,
16
+ "threat": 714,
17
+ "insult": 10201,
18
+ "identity_hate": 1882
19
+ },
20
+ "per_class_metrics": {
21
+ "toxic": {
22
+ "precision": 0.9115322083309974,
23
+ "recall": 0.9213986172503683,
24
+ "f1": 0.9164388580446975,
25
+ "support": 17646,
26
+ "specificity": 0.9121478677207437
27
+ },
28
+ "severe_toxic": {
29
+ "precision": 0.15755900489049543,
30
+ "recall": 0.8987265009096422,
31
+ "f1": 0.26811397557666217,
32
+ "support": 1649,
33
+ "specificity": 0.7666597956359139
34
+ },
35
+ "obscene": {
36
+ "precision": 0.6238325281803543,
37
+ "recall": 0.8983188405797101,
38
+ "f1": 0.7363269185079592,
39
+ "support": 8625,
40
+ "specificity": 0.8268539450765297
41
+ },
42
+ "threat": {
43
+ "precision": 0.10505486598309048,
44
+ "recall": 0.8179271708683473,
45
+ "f1": 0.18619480312450185,
46
+ "support": 714,
47
+ "specificity": 0.8574253453315757
48
+ },
49
+ "insult": {
50
+ "precision": 0.6205890336590663,
51
+ "recall": 0.8964807371826291,
52
+ "f1": 0.7334482896900189,
53
+ "support": 10201,
54
+ "specificity": 0.7799425355217067
55
+ },
56
+ "identity_hate": {
57
+ "precision": 0.21459509121932013,
58
+ "recall": 0.8687566418703507,
59
+ "f1": 0.3441742974423745,
60
+ "support": 1882,
61
+ "specificity": 0.822570123939987
62
+ }
63
+ },
64
+ "class_weights": {
65
+ "toxic": 0.43338163420684234,
66
+ "severe_toxic": 0.04049905444900165,
67
+ "obscene": 0.21182798339759806,
68
+ "threat": 0.017535673060392463,
69
+ "insult": 0.2505341749146548,
70
+ "identity_hate": 0.04622147997151067
71
+ },
72
+ "hamming_loss": 0.1618924586235303,
73
+ "exact_match": 0.499747247809481,
74
+ "specificity_macro": 0.8275999355377427,
75
+ "specificity_weighted": 0.8275999355377428,
76
+ "summary": {
77
+ "auc": {
78
+ "macro": 0.9259171799699759,
79
+ "weighted": 0.9442696333538418
80
+ },
81
+ "f1": {
82
+ "macro": 0.530782857064369,
83
+ "weighted": 0.7669279374035199
84
+ },
85
+ "precision": {
86
+ "macro": 0.4388604553772207,
87
+ "weighted": 0.7008073672218381
88
+ },
89
+ "recall": {
90
+ "macro": 0.8836014181101747,
91
+ "weighted": 0.9051010634378761
92
+ },
93
+ "specificity": {
94
+ "macro": 0.8275999355377427,
95
+ "weighted": 0.8275999355377428
96
+ },
97
+ "other_metrics": {
98
+ "hamming_loss": 0.1618924586235303,
99
+ "exact_match": 0.499747247809481
100
+ },
101
+ "class_support": {
102
+ "toxic": 17646,
103
+ "severe_toxic": 1649,
104
+ "obscene": 8625,
105
+ "threat": 714,
106
+ "insult": 10201,
107
+ "identity_hate": 1882
108
+ }
109
+ }
110
+ },
111
+ "per_language": {
112
+ "0": {
113
+ "auc": 0.9546775894690953,
114
+ "precision": 0.714413481020392,
115
+ "recall": 0.9246670642019479,
116
+ "f1": 0.7877150106257862,
117
+ "hamming_loss": 0.12826939843068874,
118
+ "exact_match": 0.5564516129032258,
119
+ "specificity": 0.8596476657420098,
120
+ "class_metrics": {
121
+ "toxic": {
122
+ "auc": 0.9621138334064959,
123
+ "threshold": 0.46047261357307434,
124
+ "precision": 0.8825137733163603,
125
+ "recall": 0.9342830882352909,
126
+ "f1": 0.9076608519017388,
127
+ "specificity": 0.8756218905472631,
128
+ "npv": 0.9301878222768437,
129
+ "positive_samples": 2176,
130
+ "true_positives": 2143,
131
+ "false_positives": 285,
132
+ "true_negatives": 2008,
133
+ "false_negatives": 150,
134
+ "auc_ci": [
135
+ 0.9621138334064959,
136
+ 0.9621138334064959
137
+ ],
138
+ "precision_ci": [
139
+ 0.8825137733163603,
140
+ 0.8825137733163603
141
+ ],
142
+ "recall_ci": [
143
+ 0.9342830882352909,
144
+ 0.9342830882352909
145
+ ],
146
+ "f1_ci": [
147
+ 0.9076608519017388,
148
+ 0.9076608519017388
149
+ ],
150
+ "specificity_ci": [
151
+ 0.8756218905472631,
152
+ 0.8756218905472631
153
+ ],
154
+ "npv_ci": [
155
+ 0.9301878222768437,
156
+ 0.9301878222768437
157
+ ],
158
+ "class_weights": {
159
+ "0.0": 0.951077943615257,
160
+ "1.0": 1.0542279411764706
161
+ }
162
+ },
163
+ "severe_toxic": {
164
+ "auc": 0.9499761279127715,
165
+ "threshold": 0.03537772223353386,
166
+ "precision": 0.8608043862269837,
167
+ "recall": 0.9492385786802037,
168
+ "f1": 0.9028611452277716,
169
+ "specificity": 0.8465042131632855,
170
+ "npv": 0.9434265401805545,
171
+ "positive_samples": 197,
172
+ "true_positives": 2177,
173
+ "false_positives": 352,
174
+ "true_negatives": 1941,
175
+ "false_negatives": 116,
176
+ "auc_ci": [
177
+ 0.9499761279127715,
178
+ 0.9499761279127715
179
+ ],
180
+ "precision_ci": [
181
+ 0.8608043862269837,
182
+ 0.8608043862269837
183
+ ],
184
+ "recall_ci": [
185
+ 0.9492385786802037,
186
+ 0.9492385786802037
187
+ ],
188
+ "f1_ci": [
189
+ 0.9028611452277716,
190
+ 0.9028611452277716
191
+ ],
192
+ "specificity_ci": [
193
+ 0.8465042131632855,
194
+ 0.8465042131632855
195
+ ],
196
+ "npv_ci": [
197
+ 0.9434265401805545,
198
+ 0.9434265401805545
199
+ ],
200
+ "class_weights": {
201
+ "0.0": 0.5224322477795491,
202
+ "1.0": 11.644670050761421
203
+ }
204
+ },
205
+ "obscene": {
206
+ "auc": 0.9572805958351019,
207
+ "threshold": 0.2777131497859955,
208
+ "precision": 0.8724828332798461,
209
+ "recall": 0.9115977291159771,
210
+ "f1": 0.8916114958872817,
211
+ "specificity": 0.8667660208643849,
212
+ "npv": 0.9074484866722257,
213
+ "positive_samples": 1233,
214
+ "true_positives": 2091,
215
+ "false_positives": 305,
216
+ "true_negatives": 1988,
217
+ "false_negatives": 202,
218
+ "auc_ci": [
219
+ 0.9572805958351019,
220
+ 0.9572805958351019
221
+ ],
222
+ "precision_ci": [
223
+ 0.8724828332798461,
224
+ 0.8724828332798461
225
+ ],
226
+ "recall_ci": [
227
+ 0.9115977291159771,
228
+ 0.9115977291159771
229
+ ],
230
+ "f1_ci": [
231
+ 0.8916114958872817,
232
+ 0.8916114958872817
233
+ ],
234
+ "specificity_ci": [
235
+ 0.8667660208643849,
236
+ 0.8667660208643849
237
+ ],
238
+ "npv_ci": [
239
+ 0.9074484866722257,
240
+ 0.9074484866722257
241
+ ],
242
+ "class_weights": {
243
+ "0.0": 0.6837555886736214,
244
+ "1.0": 1.8605028386050284
245
+ }
246
+ },
247
+ "threat": {
248
+ "auc": 0.9697358146798531,
249
+ "threshold": 0.016539234668016434,
250
+ "precision": 0.9045252081854022,
251
+ "recall": 0.9117647058823535,
252
+ "f1": 0.9081305291811165,
253
+ "specificity": 0.9037610619468958,
254
+ "npv": 0.9110528041980915,
255
+ "positive_samples": 68,
256
+ "true_positives": 2091,
257
+ "false_positives": 220,
258
+ "true_negatives": 2073,
259
+ "false_negatives": 202,
260
+ "auc_ci": [
261
+ 0.9697358146798531,
262
+ 0.9697358146798531
263
+ ],
264
+ "precision_ci": [
265
+ 0.9045252081854022,
266
+ 0.9045252081854022
267
+ ],
268
+ "recall_ci": [
269
+ 0.9117647058823535,
270
+ 0.9117647058823535
271
+ ],
272
+ "f1_ci": [
273
+ 0.9081305291811165,
274
+ 0.9081305291811165
275
+ ],
276
+ "specificity_ci": [
277
+ 0.9037610619468958,
278
+ 0.9037610619468958
279
+ ],
280
+ "npv_ci": [
281
+ 0.9110528041980915,
282
+ 0.9110528041980915
283
+ ],
284
+ "class_weights": {
285
+ "0.0": 0.5075221238938054,
286
+ "1.0": 33.73529411764706
287
+ }
288
+ },
289
+ "insult": {
290
+ "auc": 0.935014291573492,
291
+ "threshold": 0.25907590985298157,
292
+ "precision": 0.833978890287596,
293
+ "recall": 0.9098862642169729,
294
+ "f1": 0.8702805202104968,
295
+ "specificity": 0.8188679245282912,
296
+ "npv": 0.900862976980011,
297
+ "positive_samples": 1143,
298
+ "true_positives": 2087,
299
+ "false_positives": 415,
300
+ "true_negatives": 1878,
301
+ "false_negatives": 206,
302
+ "auc_ci": [
303
+ 0.935014291573492,
304
+ 0.935014291573492
305
+ ],
306
+ "precision_ci": [
307
+ 0.833978890287596,
308
+ 0.833978890287596
309
+ ],
310
+ "recall_ci": [
311
+ 0.9098862642169729,
312
+ 0.9098862642169729
313
+ ],
314
+ "f1_ci": [
315
+ 0.8702805202104968,
316
+ 0.8702805202104968
317
+ ],
318
+ "specificity_ci": [
319
+ 0.8188679245282912,
320
+ 0.8188679245282912
321
+ ],
322
+ "npv_ci": [
323
+ 0.900862976980011,
324
+ 0.900862976980011
325
+ ],
326
+ "class_weights": {
327
+ "0.0": 0.6658925979680697,
328
+ "1.0": 2.0069991251093615
329
+ }
330
+ },
331
+ "identity_hate": {
332
+ "auc": 0.9686336850292078,
333
+ "threshold": 0.026042653247714043,
334
+ "precision": 0.8623651962191886,
335
+ "recall": 0.9626168224299065,
336
+ "f1": 0.909737451082551,
337
+ "specificity": 0.8463648834019236,
338
+ "npv": 0.9576992819322562,
339
+ "positive_samples": 214,
340
+ "true_positives": 2208,
341
+ "false_positives": 352,
342
+ "true_negatives": 1941,
343
+ "false_negatives": 85,
344
+ "auc_ci": [
345
+ 0.9686336850292078,
346
+ 0.9686336850292078
347
+ ],
348
+ "precision_ci": [
349
+ 0.8623651962191886,
350
+ 0.8623651962191886
351
+ ],
352
+ "recall_ci": [
353
+ 0.9626168224299065,
354
+ 0.9626168224299065
355
+ ],
356
+ "f1_ci": [
357
+ 0.909737451082551,
358
+ 0.909737451082551
359
+ ],
360
+ "specificity_ci": [
361
+ 0.8463648834019236,
362
+ 0.8463648834019236
363
+ ],
364
+ "npv_ci": [
365
+ 0.9576992819322562,
366
+ 0.9576992819322562
367
+ ],
368
+ "class_weights": {
369
+ "0.0": 0.5244627343392776,
370
+ "1.0": 10.719626168224298
371
+ }
372
+ }
373
+ },
374
+ "sample_count": 4588
375
+ },
376
+ "1": {
377
+ "auc": 0.9420109561343032,
378
+ "precision": 0.7054445371054338,
379
+ "recall": 0.8937771830043493,
380
+ "f1": 0.7655260008199765,
381
+ "hamming_loss": 0.16467680852429553,
382
+ "exact_match": 0.49354900828037745,
383
+ "specificity": 0.8275039240639036,
384
+ "class_metrics": {
385
+ "toxic": {
386
+ "auc": 0.970066021237747,
387
+ "threshold": 0.44148319959640503,
388
+ "precision": 0.9051201281749973,
389
+ "recall": 0.916216216216217,
390
+ "f1": 0.910634371966946,
391
+ "specificity": 0.903956972723781,
392
+ "npv": 0.9151763423430814,
393
+ "positive_samples": 2590,
394
+ "true_positives": 2378,
395
+ "false_positives": 249,
396
+ "true_negatives": 2347,
397
+ "false_negatives": 217,
398
+ "auc_ci": [
399
+ 0.970066021237747,
400
+ 0.970066021237747
401
+ ],
402
+ "precision_ci": [
403
+ 0.9051201281749973,
404
+ 0.9051201281749973
405
+ ],
406
+ "recall_ci": [
407
+ 0.916216216216217,
408
+ 0.916216216216217
409
+ ],
410
+ "f1_ci": [
411
+ 0.910634371966946,
412
+ 0.910634371966946
413
+ ],
414
+ "specificity_ci": [
415
+ 0.903956972723781,
416
+ 0.903956972723781
417
+ ],
418
+ "npv_ci": [
419
+ 0.9151763423430814,
420
+ 0.9151763423430814
421
+ ],
422
+ "class_weights": {
423
+ "0.0": 0.9975028812908183,
424
+ "1.0": 1.0025096525096524
425
+ }
426
+ },
427
+ "severe_toxic": {
428
+ "auc": 0.9032119421376688,
429
+ "threshold": 0.03648429363965988,
430
+ "precision": 0.8147008122253235,
431
+ "recall": 0.8688524590163955,
432
+ "f1": 0.8409057392553343,
433
+ "specificity": 0.8023843200646473,
434
+ "npv": 0.8595146599106457,
435
+ "positive_samples": 244,
436
+ "true_positives": 2255,
437
+ "false_positives": 513,
438
+ "true_negatives": 2083,
439
+ "false_negatives": 340,
440
+ "auc_ci": [
441
+ 0.9032119421376688,
442
+ 0.9032119421376688
443
+ ],
444
+ "precision_ci": [
445
+ 0.8147008122253235,
446
+ 0.8147008122253235
447
+ ],
448
+ "recall_ci": [
449
+ 0.8688524590163955,
450
+ 0.8688524590163955
451
+ ],
452
+ "f1_ci": [
453
+ 0.8409057392553343,
454
+ 0.8409057392553343
455
+ ],
456
+ "specificity_ci": [
457
+ 0.8023843200646473,
458
+ 0.8023843200646473
459
+ ],
460
+ "npv_ci": [
461
+ 0.8595146599106457,
462
+ 0.8595146599106457
463
+ ],
464
+ "class_weights": {
465
+ "0.0": 0.5246514447363103,
466
+ "1.0": 10.64139344262295
467
+ }
468
+ },
469
+ "obscene": {
470
+ "auc": 0.9387485218400086,
471
+ "threshold": 0.1990610957145691,
472
+ "precision": 0.8573644543610149,
473
+ "recall": 0.8723747980614001,
474
+ "f1": 0.8648044977770555,
475
+ "specificity": 0.8548672566371623,
476
+ "npv": 0.8701005785595336,
477
+ "positive_samples": 1238,
478
+ "true_positives": 2265,
479
+ "false_positives": 376,
480
+ "true_negatives": 2219,
481
+ "false_negatives": 331,
482
+ "auc_ci": [
483
+ 0.9387485218400086,
484
+ 0.9387485218400086
485
+ ],
486
+ "precision_ci": [
487
+ 0.8573644543610149,
488
+ 0.8573644543610149
489
+ ],
490
+ "recall_ci": [
491
+ 0.8723747980614001,
492
+ 0.8723747980614001
493
+ ],
494
+ "f1_ci": [
495
+ 0.8648044977770555,
496
+ 0.8648044977770555
497
+ ],
498
+ "specificity_ci": [
499
+ 0.8548672566371623,
500
+ 0.8548672566371623
501
+ ],
502
+ "npv_ci": [
503
+ 0.8701005785595336,
504
+ 0.8701005785595336
505
+ ],
506
+ "class_weights": {
507
+ "0.0": 0.6565107458912769,
508
+ "1.0": 2.097334410339257
509
+ }
510
+ },
511
+ "threat": {
512
+ "auc": 0.930141945247047,
513
+ "threshold": 0.012619060464203358,
514
+ "precision": 0.8505847769217403,
515
+ "recall": 0.8773584905660369,
516
+ "f1": 0.8637642103418028,
517
+ "specificity": 0.8458816591311225,
518
+ "npv": 0.8733726632315268,
519
+ "positive_samples": 106,
520
+ "true_positives": 2278,
521
+ "false_positives": 400,
522
+ "true_negatives": 2196,
523
+ "false_negatives": 318,
524
+ "auc_ci": [
525
+ 0.930141945247047,
526
+ 0.930141945247047
527
+ ],
528
+ "precision_ci": [
529
+ 0.8505847769217403,
530
+ 0.8505847769217403
531
+ ],
532
+ "recall_ci": [
533
+ 0.8773584905660369,
534
+ 0.8773584905660369
535
+ ],
536
+ "f1_ci": [
537
+ 0.8637642103418028,
538
+ 0.8637642103418028
539
+ ],
540
+ "specificity_ci": [
541
+ 0.8458816591311225,
542
+ 0.8458816591311225
543
+ ],
544
+ "npv_ci": [
545
+ 0.8733726632315268,
546
+ 0.8733726632315268
547
+ ],
548
+ "class_weights": {
549
+ "0.0": 0.5104187143699627,
550
+ "1.0": 24.495283018867923
551
+ }
552
+ },
553
+ "insult": {
554
+ "auc": 0.9116567628368878,
555
+ "threshold": 0.24214455485343933,
556
+ "precision": 0.8063856025869378,
557
+ "recall": 0.8794466403162026,
558
+ "f1": 0.8413329522908936,
559
+ "specificity": 0.7888435374149729,
560
+ "npv": 0.8674359236672227,
561
+ "positive_samples": 1518,
562
+ "true_positives": 2283,
563
+ "false_positives": 548,
564
+ "true_negatives": 2048,
565
+ "false_negatives": 313,
566
+ "auc_ci": [
567
+ 0.9116567628368878,
568
+ 0.9116567628368878
569
+ ],
570
+ "precision_ci": [
571
+ 0.8063856025869378,
572
+ 0.8063856025869378
573
+ ],
574
+ "recall_ci": [
575
+ 0.8794466403162026,
576
+ 0.8794466403162026
577
+ ],
578
+ "f1_ci": [
579
+ 0.8413329522908936,
580
+ 0.8413329522908936
581
+ ],
582
+ "specificity_ci": [
583
+ 0.7888435374149729,
584
+ 0.7888435374149729
585
+ ],
586
+ "npv_ci": [
587
+ 0.8674359236672227,
588
+ 0.8674359236672227
589
+ ],
590
+ "class_weights": {
591
+ "0.0": 0.706530612244898,
592
+ "1.0": 1.7104743083003953
593
+ }
594
+ },
595
+ "identity_hate": {
596
+ "auc": 0.9000925697269513,
597
+ "threshold": 0.03167847916483879,
598
+ "precision": 0.7933569321076599,
599
+ "recall": 0.8865248226950354,
600
+ "f1": 0.8373572860825882,
601
+ "specificity": 0.7690897984117396,
602
+ "npv": 0.8714256962068888,
603
+ "positive_samples": 282,
604
+ "true_positives": 2301,
605
+ "false_positives": 599,
606
+ "true_negatives": 1996,
607
+ "false_negatives": 294,
608
+ "auc_ci": [
609
+ 0.9000925697269513,
610
+ 0.9000925697269513
611
+ ],
612
+ "precision_ci": [
613
+ 0.7933569321076599,
614
+ 0.7933569321076599
615
+ ],
616
+ "recall_ci": [
617
+ 0.8865248226950354,
618
+ 0.8865248226950354
619
+ ],
620
+ "f1_ci": [
621
+ 0.8373572860825882,
622
+ 0.8373572860825882
623
+ ],
624
+ "specificity_ci": [
625
+ 0.7690897984117396,
626
+ 0.7690897984117396
627
+ ],
628
+ "npv_ci": [
629
+ 0.8714256962068888,
630
+ 0.8714256962068888
631
+ ],
632
+ "class_weights": {
633
+ "0.0": 0.5287110568112401,
634
+ "1.0": 9.207446808510639
635
+ }
636
+ }
637
+ },
638
+ "sample_count": 5193
639
+ },
640
+ "2": {
641
+ "auc": 0.9291857688264461,
642
+ "precision": 0.6563281876729908,
643
+ "recall": 0.9071871335232032,
644
+ "f1": 0.7348671832220326,
645
+ "hamming_loss": 0.20595261153076377,
646
+ "exact_match": 0.4263025372845245,
647
+ "specificity": 0.7733622212755961,
648
+ "class_metrics": {
649
+ "toxic": {
650
+ "auc": 0.962186696069825,
651
+ "threshold": 0.3978160321712494,
652
+ "precision": 0.8937958373522624,
653
+ "recall": 0.9136996904024615,
654
+ "f1": 0.9036381748465286,
655
+ "specificity": 0.8914307871267977,
656
+ "npv": 0.9117341057406776,
657
+ "positive_samples": 2584,
658
+ "true_positives": 2358,
659
+ "false_positives": 280,
660
+ "true_negatives": 2301,
661
+ "false_negatives": 222,
662
+ "auc_ci": [
663
+ 0.962186696069825,
664
+ 0.962186696069825
665
+ ],
666
+ "precision_ci": [
667
+ 0.8937958373522624,
668
+ 0.8937958373522624
669
+ ],
670
+ "recall_ci": [
671
+ 0.9136996904024615,
672
+ 0.9136996904024615
673
+ ],
674
+ "f1_ci": [
675
+ 0.9036381748465286,
676
+ 0.9036381748465286
677
+ ],
678
+ "specificity_ci": [
679
+ 0.8914307871267977,
680
+ 0.8914307871267977
681
+ ],
682
+ "npv_ci": [
683
+ 0.9117341057406776,
684
+ 0.9117341057406776
685
+ ],
686
+ "class_weights": {
687
+ "0.0": 1.0009693679720821,
688
+ "1.0": 0.9990325077399381
689
+ }
690
+ },
691
+ "severe_toxic": {
692
+ "auc": 0.890519864426667,
693
+ "threshold": 0.015000982210040092,
694
+ "precision": 0.7460680730510791,
695
+ "recall": 0.918032786885247,
696
+ "f1": 0.8231651924456013,
697
+ "specificity": 0.6875381175035498,
698
+ "npv": 0.8934806428840502,
699
+ "positive_samples": 244,
700
+ "true_positives": 2369,
701
+ "false_positives": 806,
702
+ "true_negatives": 1774,
703
+ "false_negatives": 211,
704
+ "auc_ci": [
705
+ 0.890519864426667,
706
+ 0.890519864426667
707
+ ],
708
+ "precision_ci": [
709
+ 0.7460680730510791,
710
+ 0.7460680730510791
711
+ ],
712
+ "recall_ci": [
713
+ 0.918032786885247,
714
+ 0.918032786885247
715
+ ],
716
+ "f1_ci": [
717
+ 0.8231651924456013,
718
+ 0.8231651924456013
719
+ ],
720
+ "specificity_ci": [
721
+ 0.6875381175035498,
722
+ 0.6875381175035498
723
+ ],
724
+ "npv_ci": [
725
+ 0.8934806428840502,
726
+ 0.8934806428840502
727
+ ],
728
+ "class_weights": {
729
+ "0.0": 0.5248017889815003,
730
+ "1.0": 10.579918032786885
731
+ }
732
+ },
733
+ "obscene": {
734
+ "auc": 0.9233059279915251,
735
+ "threshold": 0.11362762749195099,
736
+ "precision": 0.7873800414823968,
737
+ "recall": 0.9095315024232634,
738
+ "f1": 0.8440592612850891,
739
+ "specificity": 0.7543949044586057,
740
+ "npv": 0.892919379205219,
741
+ "positive_samples": 1238,
742
+ "true_positives": 2347,
743
+ "false_positives": 634,
744
+ "true_negatives": 1947,
745
+ "false_negatives": 233,
746
+ "auc_ci": [
747
+ 0.9233059279915251,
748
+ 0.9233059279915251
749
+ ],
750
+ "precision_ci": [
751
+ 0.7873800414823968,
752
+ 0.7873800414823968
753
+ ],
754
+ "recall_ci": [
755
+ 0.9095315024232634,
756
+ 0.9095315024232634
757
+ ],
758
+ "f1_ci": [
759
+ 0.8440592612850891,
760
+ 0.8440592612850891
761
+ ],
762
+ "specificity_ci": [
763
+ 0.7543949044586057,
764
+ 0.7543949044586057
765
+ ],
766
+ "npv_ci": [
767
+ 0.892919379205219,
768
+ 0.892919379205219
769
+ ],
770
+ "class_weights": {
771
+ "0.0": 0.6577070063694268,
772
+ "1.0": 2.0852180936995155
773
+ }
774
+ },
775
+ "threat": {
776
+ "auc": 0.848578598380765,
777
+ "threshold": 0.008195769973099232,
778
+ "precision": 0.7785886139481758,
779
+ "recall": 0.8055555555555555,
780
+ "f1": 0.791842555156752,
781
+ "specificity": 0.7709198813056214,
782
+ "npv": 0.7985792107105536,
783
+ "positive_samples": 108,
784
+ "true_positives": 2079,
785
+ "false_positives": 591,
786
+ "true_negatives": 1990,
787
+ "false_negatives": 501,
788
+ "auc_ci": [
789
+ 0.848578598380765,
790
+ 0.848578598380765
791
+ ],
792
+ "precision_ci": [
793
+ 0.7785886139481758,
794
+ 0.7785886139481758
795
+ ],
796
+ "recall_ci": [
797
+ 0.8055555555555555,
798
+ 0.8055555555555555
799
+ ],
800
+ "f1_ci": [
801
+ 0.791842555156752,
802
+ 0.791842555156752
803
+ ],
804
+ "specificity_ci": [
805
+ 0.7709198813056214,
806
+ 0.7709198813056214
807
+ ],
808
+ "npv_ci": [
809
+ 0.7985792107105536,
810
+ 0.7985792107105536
811
+ ],
812
+ "class_weights": {
813
+ "0.0": 0.5106824925816024,
814
+ "1.0": 23.90277777777778
815
+ }
816
+ },
817
+ "insult": {
818
+ "auc": 0.8943137096607889,
819
+ "threshold": 0.1587354838848114,
820
+ "precision": 0.7484673378377763,
821
+ "recall": 0.9141347424042362,
822
+ "f1": 0.8230472043830551,
823
+ "specificity": 0.6927925459029957,
824
+ "npv": 0.889726581805318,
825
+ "positive_samples": 1514,
826
+ "true_positives": 2359,
827
+ "false_positives": 793,
828
+ "true_negatives": 1788,
829
+ "false_negatives": 221,
830
+ "auc_ci": [
831
+ 0.8943137096607889,
832
+ 0.8943137096607889
833
+ ],
834
+ "precision_ci": [
835
+ 0.7484673378377763,
836
+ 0.7484673378377763
837
+ ],
838
+ "recall_ci": [
839
+ 0.9141347424042362,
840
+ 0.9141347424042362
841
+ ],
842
+ "f1_ci": [
843
+ 0.8230472043830551,
844
+ 0.8230472043830551
845
+ ],
846
+ "specificity_ci": [
847
+ 0.6927925459029957,
848
+ 0.6927925459029957
849
+ ],
850
+ "npv_ci": [
851
+ 0.889726581805318,
852
+ 0.889726581805318
853
+ ],
854
+ "class_weights": {
855
+ "0.0": 0.7074540970128802,
856
+ "1.0": 1.7050858652575958
857
+ }
858
+ },
859
+ "identity_hate": {
860
+ "auc": 0.9040654827596841,
861
+ "threshold": 0.0467526838183403,
862
+ "precision": 0.8408828817107497,
863
+ "recall": 0.8291814946619218,
864
+ "f1": 0.8349911950184066,
865
+ "specificity": 0.8430970913560043,
866
+ "npv": 0.8315259121222329,
867
+ "positive_samples": 281,
868
+ "true_positives": 2140,
869
+ "false_positives": 405,
870
+ "true_negatives": 2176,
871
+ "false_negatives": 440,
872
+ "auc_ci": [
873
+ 0.9040654827596841,
874
+ 0.9040654827596841
875
+ ],
876
+ "precision_ci": [
877
+ 0.8408828817107497,
878
+ 0.8408828817107497
879
+ ],
880
+ "recall_ci": [
881
+ 0.8291814946619218,
882
+ 0.8291814946619218
883
+ ],
884
+ "f1_ci": [
885
+ 0.8349911950184066,
886
+ 0.8349911950184066
887
+ ],
888
+ "specificity_ci": [
889
+ 0.8430970913560043,
890
+ 0.8430970913560043
891
+ ],
892
+ "npv_ci": [
893
+ 0.8315259121222329,
894
+ 0.8315259121222329
895
+ ],
896
+ "class_weights": {
897
+ "0.0": 0.5287791888570258,
898
+ "1.0": 9.186832740213523
899
+ }
900
+ }
901
+ },
902
+ "sample_count": 5163
903
+ },
904
+ "3": {
905
+ "auc": 0.9472472410532857,
906
+ "precision": 0.6982701786686969,
907
+ "recall": 0.9152656355077337,
908
+ "f1": 0.7674148586410611,
909
+ "hamming_loss": 0.1731811145510836,
910
+ "exact_match": 0.48471362229102166,
911
+ "specificity": 0.8133241121366614,
912
+ "class_metrics": {
913
+ "toxic": {
914
+ "auc": 0.9747483574660619,
915
+ "threshold": 0.5033379793167114,
916
+ "precision": 0.9204374197691823,
917
+ "recall": 0.9294300116324036,
918
+ "f1": 0.9249118582673775,
919
+ "specificity": 0.9196601004248757,
920
+ "npv": 0.9287337466652424,
921
+ "positive_samples": 2579,
922
+ "true_positives": 2401,
923
+ "false_positives": 207,
924
+ "true_negatives": 2376,
925
+ "false_negatives": 182,
926
+ "auc_ci": [
927
+ 0.9747483574660619,
928
+ 0.9747483574660619
929
+ ],
930
+ "precision_ci": [
931
+ 0.9204374197691823,
932
+ 0.9204374197691823
933
+ ],
934
+ "recall_ci": [
935
+ 0.9294300116324036,
936
+ 0.9294300116324036
937
+ ],
938
+ "f1_ci": [
939
+ 0.9249118582673775,
940
+ 0.9249118582673775
941
+ ],
942
+ "specificity_ci": [
943
+ 0.9196601004248757,
944
+ 0.9196601004248757
945
+ ],
946
+ "npv_ci": [
947
+ 0.9287337466652424,
948
+ 0.9287337466652424
949
+ ],
950
+ "class_weights": {
951
+ "0.0": 0.9980687524140595,
952
+ "1.0": 1.0019387359441645
953
+ }
954
+ },
955
+ "severe_toxic": {
956
+ "auc": 0.9073687265747961,
957
+ "threshold": 0.021415209397673607,
958
+ "precision": 0.7618540559183846,
959
+ "recall": 0.93388429752066,
960
+ "f1": 0.8391430651806406,
961
+ "specificity": 0.7080795777506993,
962
+ "npv": 0.9146007419992344,
963
+ "positive_samples": 242,
964
+ "true_positives": 2413,
965
+ "false_positives": 754,
966
+ "true_negatives": 1829,
967
+ "false_negatives": 170,
968
+ "auc_ci": [
969
+ 0.9073687265747961,
970
+ 0.9073687265747961
971
+ ],
972
+ "precision_ci": [
973
+ 0.7618540559183846,
974
+ 0.7618540559183846
975
+ ],
976
+ "recall_ci": [
977
+ 0.93388429752066,
978
+ 0.93388429752066
979
+ ],
980
+ "f1_ci": [
981
+ 0.8391430651806406,
982
+ 0.8391430651806406
983
+ ],
984
+ "specificity_ci": [
985
+ 0.7080795777506993,
986
+ 0.7080795777506993
987
+ ],
988
+ "npv_ci": [
989
+ 0.9146007419992344,
990
+ 0.9146007419992344
991
+ ],
992
+ "class_weights": {
993
+ "0.0": 0.5245635403978888,
994
+ "1.0": 10.677685950413224
995
+ }
996
+ },
997
+ "obscene": {
998
+ "auc": 0.9429228614622618,
999
+ "threshold": 0.14896434545516968,
1000
+ "precision": 0.822101549733319,
1001
+ "recall": 0.9148418491484125,
1002
+ "f1": 0.8659958665665364,
1003
+ "specificity": 0.8020330368488026,
1004
+ "npv": 0.9040137548341648,
1005
+ "positive_samples": 1233,
1006
+ "true_positives": 2363,
1007
+ "false_positives": 511,
1008
+ "true_negatives": 2072,
1009
+ "false_negatives": 220,
1010
+ "auc_ci": [
1011
+ 0.9429228614622618,
1012
+ 0.9429228614622618
1013
+ ],
1014
+ "precision_ci": [
1015
+ 0.822101549733319,
1016
+ 0.822101549733319
1017
+ ],
1018
+ "recall_ci": [
1019
+ 0.9148418491484125,
1020
+ 0.9148418491484125
1021
+ ],
1022
+ "f1_ci": [
1023
+ 0.8659958665665364,
1024
+ 0.8659958665665364
1025
+ ],
1026
+ "specificity_ci": [
1027
+ 0.8020330368488026,
1028
+ 0.8020330368488026
1029
+ ],
1030
+ "npv_ci": [
1031
+ 0.9040137548341648,
1032
+ 0.9040137548341648
1033
+ ],
1034
+ "class_weights": {
1035
+ "0.0": 0.6566709021601016,
1036
+ "1.0": 2.095701540957015
1037
+ }
1038
+ },
1039
+ "threat": {
1040
+ "auc": 0.8985232762406729,
1041
+ "threshold": 0.013273251242935658,
1042
+ "precision": 0.8299773755655987,
1043
+ "recall": 0.8055555555555544,
1044
+ "f1": 0.8175841319366995,
1045
+ "specificity": 0.8349802371541444,
1046
+ "npv": 0.8111134812286639,
1047
+ "positive_samples": 108,
1048
+ "true_positives": 2081,
1049
+ "false_positives": 426,
1050
+ "true_negatives": 2157,
1051
+ "false_negatives": 502,
1052
+ "auc_ci": [
1053
+ 0.8985232762406729,
1054
+ 0.8985232762406729
1055
+ ],
1056
+ "precision_ci": [
1057
+ 0.8299773755655987,
1058
+ 0.8299773755655987
1059
+ ],
1060
+ "recall_ci": [
1061
+ 0.8055555555555544,
1062
+ 0.8055555555555544
1063
+ ],
1064
+ "f1_ci": [
1065
+ 0.8175841319366995,
1066
+ 0.8175841319366995
1067
+ ],
1068
+ "specificity_ci": [
1069
+ 0.8349802371541444,
1070
+ 0.8349802371541444
1071
+ ],
1072
+ "npv_ci": [
1073
+ 0.8111134812286639,
1074
+ 0.8111134812286639
1075
+ ],
1076
+ "class_weights": {
1077
+ "0.0": 0.5106719367588933,
1078
+ "1.0": 23.925925925925927
1079
+ }
1080
+ },
1081
+ "insult": {
1082
+ "auc": 0.9178884966596437,
1083
+ "threshold": 0.22368550300598145,
1084
+ "precision": 0.8017937840347082,
1085
+ "recall": 0.9065606361828928,
1086
+ "f1": 0.8509647346472855,
1087
+ "specificity": 0.7758950532932412,
1088
+ "npv": 0.8925162032262658,
1089
+ "positive_samples": 1509,
1090
+ "true_positives": 2342,
1091
+ "false_positives": 579,
1092
+ "true_negatives": 2004,
1093
+ "false_negatives": 241,
1094
+ "auc_ci": [
1095
+ 0.9178884966596437,
1096
+ 0.9178884966596437
1097
+ ],
1098
+ "precision_ci": [
1099
+ 0.8017937840347082,
1100
+ 0.8017937840347082
1101
+ ],
1102
+ "recall_ci": [
1103
+ 0.9065606361828928,
1104
+ 0.9065606361828928
1105
+ ],
1106
+ "f1_ci": [
1107
+ 0.8509647346472855,
1108
+ 0.8509647346472855
1109
+ ],
1110
+ "specificity_ci": [
1111
+ 0.7758950532932412,
1112
+ 0.7758950532932412
1113
+ ],
1114
+ "npv_ci": [
1115
+ 0.8925162032262658,
1116
+ 0.8925162032262658
1117
+ ],
1118
+ "class_weights": {
1119
+ "0.0": 0.70620388084176,
1120
+ "1.0": 1.7123923127899272
1121
+ }
1122
+ },
1123
+ "identity_hate": {
1124
+ "auc": 0.9242209406948756,
1125
+ "threshold": 0.042373284697532654,
1126
+ "precision": 0.8424336725093711,
1127
+ "recall": 0.8592057761732879,
1128
+ "f1": 0.8507370677416805,
1129
+ "specificity": 0.839296667348186,
1130
+ "npv": 0.8563457480377756,
1131
+ "positive_samples": 277,
1132
+ "true_positives": 2220,
1133
+ "false_positives": 415,
1134
+ "true_negatives": 2168,
1135
+ "false_negatives": 363,
1136
+ "auc_ci": [
1137
+ 0.9242209406948756,
1138
+ 0.9242209406948756
1139
+ ],
1140
+ "precision_ci": [
1141
+ 0.8424336725093711,
1142
+ 0.8424336725093711
1143
+ ],
1144
+ "recall_ci": [
1145
+ 0.8592057761732879,
1146
+ 0.8592057761732879
1147
+ ],
1148
+ "f1_ci": [
1149
+ 0.8507370677416805,
1150
+ 0.8507370677416805
1151
+ ],
1152
+ "specificity_ci": [
1153
+ 0.839296667348186,
1154
+ 0.839296667348186
1155
+ ],
1156
+ "npv_ci": [
1157
+ 0.8563457480377756,
1158
+ 0.8563457480377756
1159
+ ],
1160
+ "class_weights": {
1161
+ "0.0": 0.5283173175219792,
1162
+ "1.0": 9.328519855595667
1163
+ }
1164
+ }
1165
+ },
1166
+ "sample_count": 5168
1167
+ },
1168
+ "4": {
1169
+ "auc": 0.9418392933687934,
1170
+ "precision": 0.7019672150256779,
1171
+ "recall": 0.9036673990197736,
1172
+ "f1": 0.766375554274002,
1173
+ "hamming_loss": 0.1651803024428073,
1174
+ "exact_match": 0.4955409073284219,
1175
+ "specificity": 0.8245338509682739,
1176
+ "class_metrics": {
1177
+ "toxic": {
1178
+ "auc": 0.9718317503718501,
1179
+ "threshold": 0.4544762372970581,
1180
+ "precision": 0.9205380327767301,
1181
+ "recall": 0.9217594394705978,
1182
+ "f1": 0.9211483312394544,
1183
+ "specificity": 0.9204325994592514,
1184
+ "npv": 0.9216554888385321,
1185
+ "positive_samples": 2569,
1186
+ "true_positives": 2377,
1187
+ "false_positives": 205,
1188
+ "true_negatives": 2373,
1189
+ "false_negatives": 201,
1190
+ "auc_ci": [
1191
+ 0.9718317503718501,
1192
+ 0.9718317503718501
1193
+ ],
1194
+ "precision_ci": [
1195
+ 0.9205380327767301,
1196
+ 0.9205380327767301
1197
+ ],
1198
+ "recall_ci": [
1199
+ 0.9217594394705978,
1200
+ 0.9217594394705978
1201
+ ],
1202
+ "f1_ci": [
1203
+ 0.9211483312394544,
1204
+ 0.9211483312394544
1205
+ ],
1206
+ "specificity_ci": [
1207
+ 0.9204325994592514,
1208
+ 0.9204325994592514
1209
+ ],
1210
+ "npv_ci": [
1211
+ 0.9216554888385321,
1212
+ 0.9216554888385321
1213
+ ],
1214
+ "class_weights": {
1215
+ "0.0": 0.9961375048281189,
1216
+ "1.0": 1.003892565200467
1217
+ }
1218
+ },
1219
+ "severe_toxic": {
1220
+ "auc": 0.8962662667751142,
1221
+ "threshold": 0.0307308342307806,
1222
+ "precision": 0.7913182428501319,
1223
+ "recall": 0.8458333333333329,
1224
+ "f1": 0.8176681460830066,
1225
+ "specificity": 0.7769418462789687,
1226
+ "npv": 0.834426745622858,
1227
+ "positive_samples": 240,
1228
+ "true_positives": 2181,
1229
+ "false_positives": 575,
1230
+ "true_negatives": 2003,
1231
+ "false_negatives": 397,
1232
+ "auc_ci": [
1233
+ 0.8962662667751142,
1234
+ 0.8962662667751142
1235
+ ],
1236
+ "precision_ci": [
1237
+ 0.7913182428501319,
1238
+ 0.7913182428501319
1239
+ ],
1240
+ "recall_ci": [
1241
+ 0.8458333333333329,
1242
+ 0.8458333333333329
1243
+ ],
1244
+ "f1_ci": [
1245
+ 0.8176681460830066,
1246
+ 0.8176681460830066
1247
+ ],
1248
+ "specificity_ci": [
1249
+ 0.7769418462789687,
1250
+ 0.7769418462789687
1251
+ ],
1252
+ "npv_ci": [
1253
+ 0.834426745622858,
1254
+ 0.834426745622858
1255
+ ],
1256
+ "class_weights": {
1257
+ "0.0": 0.5244001626677511,
1258
+ "1.0": 10.745833333333334
1259
+ }
1260
+ },
1261
+ "obscene": {
1262
+ "auc": 0.9401245966951454,
1263
+ "threshold": 0.1775909662246704,
1264
+ "precision": 0.8495468615216861,
1265
+ "recall": 0.8913398692810475,
1266
+ "f1": 0.8699417085541208,
1267
+ "specificity": 0.8421453990848948,
1268
+ "npv": 0.8857178178787266,
1269
+ "positive_samples": 1224,
1270
+ "true_positives": 2298,
1271
+ "false_positives": 407,
1272
+ "true_negatives": 2171,
1273
+ "false_negatives": 280,
1274
+ "auc_ci": [
1275
+ 0.9401245966951454,
1276
+ 0.9401245966951454
1277
+ ],
1278
+ "precision_ci": [
1279
+ 0.8495468615216861,
1280
+ 0.8495468615216861
1281
+ ],
1282
+ "recall_ci": [
1283
+ 0.8913398692810475,
1284
+ 0.8913398692810475
1285
+ ],
1286
+ "f1_ci": [
1287
+ 0.8699417085541208,
1288
+ 0.8699417085541208
1289
+ ],
1290
+ "specificity_ci": [
1291
+ 0.8421453990848948,
1292
+ 0.8421453990848948
1293
+ ],
1294
+ "npv_ci": [
1295
+ 0.8857178178787266,
1296
+ 0.8857178178787266
1297
+ ],
1298
+ "class_weights": {
1299
+ "0.0": 0.6555668530757499,
1300
+ "1.0": 2.1070261437908497
1301
+ }
1302
+ },
1303
+ "threat": {
1304
+ "auc": 0.8861722579224652,
1305
+ "threshold": 0.014509523287415504,
1306
+ "precision": 0.841106024006686,
1307
+ "recall": 0.7943925233644874,
1308
+ "f1": 0.81708215259711,
1309
+ "specificity": 0.8499307067907416,
1310
+ "npv": 0.8052107636996033,
1311
+ "positive_samples": 107,
1312
+ "true_positives": 2048,
1313
+ "false_positives": 387,
1314
+ "true_negatives": 2191,
1315
+ "false_negatives": 530,
1316
+ "auc_ci": [
1317
+ 0.8861722579224652,
1318
+ 0.8861722579224652
1319
+ ],
1320
+ "precision_ci": [
1321
+ 0.841106024006686,
1322
+ 0.841106024006686
1323
+ ],
1324
+ "recall_ci": [
1325
+ 0.7943925233644874,
1326
+ 0.7943925233644874
1327
+ ],
1328
+ "f1_ci": [
1329
+ 0.81708215259711,
1330
+ 0.81708215259711
1331
+ ],
1332
+ "specificity_ci": [
1333
+ 0.8499307067907416,
1334
+ 0.8499307067907416
1335
+ ],
1336
+ "npv_ci": [
1337
+ 0.8052107636996033,
1338
+ 0.8052107636996033
1339
+ ],
1340
+ "class_weights": {
1341
+ "0.0": 0.5105919619877252,
1342
+ "1.0": 24.102803738317757
1343
+ }
1344
+ },
1345
+ "insult": {
1346
+ "auc": 0.908347099690273,
1347
+ "threshold": 0.19917058944702148,
1348
+ "precision": 0.787211545222267,
1349
+ "recall": 0.9028609447771131,
1350
+ "f1": 0.8410793781503274,
1351
+ "specificity": 0.755950752393989,
1352
+ "npv": 0.8861326740097348,
1353
+ "positive_samples": 1503,
1354
+ "true_positives": 2328,
1355
+ "false_positives": 629,
1356
+ "true_negatives": 1949,
1357
+ "false_negatives": 250,
1358
+ "auc_ci": [
1359
+ 0.908347099690273,
1360
+ 0.908347099690273
1361
+ ],
1362
+ "precision_ci": [
1363
+ 0.787211545222267,
1364
+ 0.787211545222267
1365
+ ],
1366
+ "recall_ci": [
1367
+ 0.9028609447771131,
1368
+ 0.9028609447771131
1369
+ ],
1370
+ "f1_ci": [
1371
+ 0.8410793781503274,
1372
+ 0.8410793781503274
1373
+ ],
1374
+ "specificity_ci": [
1375
+ 0.755950752393989,
1376
+ 0.755950752393989
1377
+ ],
1378
+ "npv_ci": [
1379
+ 0.8861326740097348,
1380
+ 0.8861326740097348
1381
+ ],
1382
+ "class_weights": {
1383
+ "0.0": 0.7056087551299589,
1384
+ "1.0": 1.7159015302727878
1385
+ }
1386
+ },
1387
+ "identity_hate": {
1388
+ "auc": 0.9136671508934288,
1389
+ "threshold": 0.031982019543647766,
1390
+ "precision": 0.8173388685191341,
1391
+ "recall": 0.8868613138686137,
1392
+ "f1": 0.8506820152960648,
1393
+ "specificity": 0.801801801801802,
1394
+ "npv": 0.8763431199913764,
1395
+ "positive_samples": 274,
1396
+ "true_positives": 2287,
1397
+ "false_positives": 511,
1398
+ "true_negatives": 2067,
1399
+ "false_negatives": 291,
1400
+ "auc_ci": [
1401
+ 0.9136671508934288,
1402
+ 0.9136671508934288
1403
+ ],
1404
+ "precision_ci": [
1405
+ 0.8173388685191341,
1406
+ 0.8173388685191341
1407
+ ],
1408
+ "recall_ci": [
1409
+ 0.8868613138686137,
1410
+ 0.8868613138686137
1411
+ ],
1412
+ "f1_ci": [
1413
+ 0.8506820152960648,
1414
+ 0.8506820152960648
1415
+ ],
1416
+ "specificity_ci": [
1417
+ 0.801801801801802,
1418
+ 0.801801801801802
1419
+ ],
1420
+ "npv_ci": [
1421
+ 0.8763431199913764,
1422
+ 0.8763431199913764
1423
+ ],
1424
+ "class_weights": {
1425
+ "0.0": 0.528050778050778,
1426
+ "1.0": 9.412408759124087
1427
+ }
1428
+ }
1429
+ },
1430
+ "sample_count": 5158
1431
+ },
1432
+ "5": {
1433
+ "auc": 0.9460152147041221,
1434
+ "precision": 0.7347347983801011,
1435
+ "recall": 0.8867510548523206,
1436
+ "f1": 0.7840490209789418,
1437
+ "hamming_loss": 0.13677289804378806,
1438
+ "exact_match": 0.5347842984842596,
1439
+ "specificity": 0.8623489178772902,
1440
+ "class_metrics": {
1441
+ "toxic": {
1442
+ "auc": 0.9757415342563065,
1443
+ "threshold": 0.5313886404037476,
1444
+ "precision": 0.9310023292772915,
1445
+ "recall": 0.9121306376360682,
1446
+ "f1": 0.9214698705828952,
1447
+ "specificity": 0.9324009324009348,
1448
+ "npv": 0.9138763886248709,
1449
+ "positive_samples": 2572,
1450
+ "true_positives": 2346,
1451
+ "false_positives": 173,
1452
+ "true_negatives": 2399,
1453
+ "false_negatives": 226,
1454
+ "auc_ci": [
1455
+ 0.9757415342563065,
1456
+ 0.9757415342563065
1457
+ ],
1458
+ "precision_ci": [
1459
+ 0.9310023292772915,
1460
+ 0.9310023292772915
1461
+ ],
1462
+ "recall_ci": [
1463
+ 0.9121306376360682,
1464
+ 0.9121306376360682
1465
+ ],
1466
+ "f1_ci": [
1467
+ 0.9214698705828952,
1468
+ 0.9214698705828952
1469
+ ],
1470
+ "specificity_ci": [
1471
+ 0.9324009324009348,
1472
+ 0.9324009324009348
1473
+ ],
1474
+ "npv_ci": [
1475
+ 0.9138763886248709,
1476
+ 0.9138763886248709
1477
+ ],
1478
+ "class_weights": {
1479
+ "0.0": 0.9996114996114996,
1480
+ "1.0": 1.0003888024883358
1481
+ }
1482
+ },
1483
+ "severe_toxic": {
1484
+ "auc": 0.9032281899714669,
1485
+ "threshold": 0.05001964047551155,
1486
+ "precision": 0.8240547826417868,
1487
+ "recall": 0.8458333333333334,
1488
+ "f1": 0.8348020409069885,
1489
+ "specificity": 0.8194048104362093,
1490
+ "npv": 0.8416483326674401,
1491
+ "positive_samples": 240,
1492
+ "true_positives": 2176,
1493
+ "false_positives": 464,
1494
+ "true_negatives": 2108,
1495
+ "false_negatives": 396,
1496
+ "auc_ci": [
1497
+ 0.9032281899714669,
1498
+ 0.9032281899714669
1499
+ ],
1500
+ "precision_ci": [
1501
+ 0.8240547826417868,
1502
+ 0.8240547826417868
1503
+ ],
1504
+ "recall_ci": [
1505
+ 0.8458333333333334,
1506
+ 0.8458333333333334
1507
+ ],
1508
+ "f1_ci": [
1509
+ 0.8348020409069885,
1510
+ 0.8348020409069885
1511
+ ],
1512
+ "specificity_ci": [
1513
+ 0.8194048104362093,
1514
+ 0.8194048104362093
1515
+ ],
1516
+ "npv_ci": [
1517
+ 0.8416483326674401,
1518
+ 0.8416483326674401
1519
+ ],
1520
+ "class_weights": {
1521
+ "0.0": 0.5244598450876478,
1522
+ "1.0": 10.720833333333333
1523
+ }
1524
+ },
1525
+ "obscene": {
1526
+ "auc": 0.9399297347094935,
1527
+ "threshold": 0.20134443044662476,
1528
+ "precision": 0.8638120606436712,
1529
+ "recall": 0.8799999999999917,
1530
+ "f1": 0.8718308933886383,
1531
+ "specificity": 0.8612598826829971,
1532
+ "npv": 0.8777082380338568,
1533
+ "positive_samples": 1225,
1534
+ "true_positives": 2264,
1535
+ "false_positives": 356,
1536
+ "true_negatives": 2216,
1537
+ "false_negatives": 308,
1538
+ "auc_ci": [
1539
+ 0.9399297347094935,
1540
+ 0.9399297347094935
1541
+ ],
1542
+ "precision_ci": [
1543
+ 0.8638120606436712,
1544
+ 0.8638120606436712
1545
+ ],
1546
+ "recall_ci": [
1547
+ 0.8799999999999917,
1548
+ 0.8799999999999917
1549
+ ],
1550
+ "f1_ci": [
1551
+ 0.8718308933886383,
1552
+ 0.8718308933886383
1553
+ ],
1554
+ "specificity_ci": [
1555
+ 0.8612598826829971,
1556
+ 0.8612598826829971
1557
+ ],
1558
+ "npv_ci": [
1559
+ 0.8777082380338568,
1560
+ 0.8777082380338568
1561
+ ],
1562
+ "class_weights": {
1563
+ "0.0": 0.6562101504718184,
1564
+ "1.0": 2.100408163265306
1565
+ }
1566
+ },
1567
+ "threat": {
1568
+ "auc": 0.8786647405643102,
1569
+ "threshold": 0.018557138741016388,
1570
+ "precision": 0.8659949024954022,
1571
+ "recall": 0.8055555555555568,
1572
+ "f1": 0.834682556458845,
1573
+ "specificity": 0.8753473600635171,
1574
+ "npv": 0.8182408543184921,
1575
+ "positive_samples": 108,
1576
+ "true_positives": 2072,
1577
+ "false_positives": 320,
1578
+ "true_negatives": 2252,
1579
+ "false_negatives": 500,
1580
+ "auc_ci": [
1581
+ 0.8786647405643102,
1582
+ 0.8786647405643102
1583
+ ],
1584
+ "precision_ci": [
1585
+ 0.8659949024954022,
1586
+ 0.8659949024954022
1587
+ ],
1588
+ "recall_ci": [
1589
+ 0.8055555555555568,
1590
+ 0.8055555555555568
1591
+ ],
1592
+ "f1_ci": [
1593
+ 0.834682556458845,
1594
+ 0.834682556458845
1595
+ ],
1596
+ "specificity_ci": [
1597
+ 0.8753473600635171,
1598
+ 0.8753473600635171
1599
+ ],
1600
+ "npv_ci": [
1601
+ 0.8182408543184921,
1602
+ 0.8182408543184921
1603
+ ],
1604
+ "class_weights": {
1605
+ "0.0": 0.5107185391028186,
1606
+ "1.0": 23.824074074074073
1607
+ }
1608
+ },
1609
+ "insult": {
1610
+ "auc": 0.9170891169219639,
1611
+ "threshold": 0.32249945402145386,
1612
+ "precision": 0.8355108316117581,
1613
+ "recall": 0.8716755319149065,
1614
+ "f1": 0.8532101288125946,
1615
+ "specificity": 0.8283909939593549,
1616
+ "npv": 0.8658697667424693,
1617
+ "positive_samples": 1504,
1618
+ "true_positives": 2242,
1619
+ "false_positives": 441,
1620
+ "true_negatives": 2131,
1621
+ "false_negatives": 330,
1622
+ "auc_ci": [
1623
+ 0.9170891169219639,
1624
+ 0.9170891169219639
1625
+ ],
1626
+ "precision_ci": [
1627
+ 0.8355108316117581,
1628
+ 0.8355108316117581
1629
+ ],
+ "recall_ci": [
+ 0.8716755319149065,
+ 0.8716755319149065
+ ],
+ "f1_ci": [
+ 0.8532101288125946,
+ 0.8532101288125946
+ ],
+ "specificity_ci": [
+ 0.8283909939593549,
+ 0.8283909939593549
+ ],
+ "npv_ci": [
+ 0.8658697667424693,
+ 0.8658697667424693
+ ],
+ "class_weights": {
+ "0.0": 0.7064799560680944,
+ "1.0": 1.7107712765957446
+ }
+ },
+ "identity_hate": {
+ "auc": 0.9171971252566641,
+ "threshold": 0.055891502648591995,
+ "precision": 0.8532420335871026,
+ "recall": 0.829710144927536,
+ "f1": 0.8413115718720496,
+ "specificity": 0.8572895277207252,
+ "npv": 0.8342805841339561,
+ "positive_samples": 276,
+ "true_positives": 2134,
+ "false_positives": 367,
+ "true_negatives": 2205,
+ "false_negatives": 438,
+ "auc_ci": [
+ 0.9171971252566641,
+ 0.9171971252566641
+ ],
+ "precision_ci": [
+ 0.8532420335871026,
+ 0.8532420335871026
+ ],
+ "recall_ci": [
+ 0.829710144927536,
+ 0.829710144927536
+ ],
+ "f1_ci": [
+ 0.8413115718720496,
+ 0.8413115718720496
+ ],
+ "specificity_ci": [
+ 0.8572895277207252,
+ 0.8572895277207252
+ ],
+ "npv_ci": [
+ 0.8342805841339561,
+ 0.8342805841339561
+ ],
+ "class_weights": {
+ "0.0": 0.5283367556468173,
+ "1.0": 9.322463768115941
+ }
+ }
+ },
+ "sample_count": 5146
+ },
+ "6": {
+ "auc": 0.9462815482574403,
+ "precision": 0.7134961462135606,
+ "recall": 0.9073793914943687,
+ "f1": 0.7744642816056855,
+ "hamming_loss": 0.15539933230611197,
+ "exact_match": 0.5132896764252697,
+ "specificity": 0.8360743701752594,
+ "class_metrics": {
+ "toxic": {
+ "auc": 0.9780732995232411,
+ "threshold": 0.5710838437080383,
+ "precision": 0.9379357119021944,
+ "recall": 0.9243012422360248,
+ "f1": 0.9310685643115885,
+ "specificity": 0.9388379204893005,
+ "npv": 0.9253858836387251,
+ "positive_samples": 2576,
+ "true_positives": 2399,
+ "false_positives": 158,
+ "true_negatives": 2437,
+ "false_negatives": 196,
+ "auc_ci": [
+ 0.9780732995232411,
+ 0.9780732995232411
+ ],
+ "precision_ci": [
+ 0.9379357119021944,
+ 0.9379357119021944
+ ],
+ "recall_ci": [
+ 0.9243012422360248,
+ 0.9243012422360248
+ ],
+ "f1_ci": [
+ 0.9310685643115885,
+ 0.9310685643115885
+ ],
+ "specificity_ci": [
+ 0.9388379204893005,
+ 0.9388379204893005
+ ],
+ "npv_ci": [
+ 0.9253858836387251,
+ 0.9253858836387251
+ ],
+ "class_weights": {
+ "0.0": 0.9923547400611621,
+ "1.0": 1.0077639751552796
+ }
+ },
+ "severe_toxic": {
+ "auc": 0.9067576592369966,
+ "threshold": 0.023807251825928688,
+ "precision": 0.7794259030353159,
+ "recall": 0.9380165289256208,
+ "f1": 0.8513989948241057,
+ "specificity": 0.7345454545454645,
+ "npv": 0.9221830255239729,
+ "positive_samples": 242,
+ "true_positives": 2435,
+ "false_positives": 689,
+ "true_negatives": 1906,
+ "false_negatives": 160,
+ "auc_ci": [
+ 0.9067576592369966,
+ 0.9067576592369966
+ ],
+ "precision_ci": [
+ 0.7794259030353159,
+ 0.7794259030353159
+ ],
+ "recall_ci": [
+ 0.9380165289256208,
+ 0.9380165289256208
+ ],
+ "f1_ci": [
+ 0.8513989948241057,
+ 0.8513989948241057
+ ],
+ "specificity_ci": [
+ 0.7345454545454645,
+ 0.7345454545454645
+ ],
+ "npv_ci": [
+ 0.9221830255239729,
+ 0.9221830255239729
+ ],
+ "class_weights": {
+ "0.0": 0.5244444444444445,
+ "1.0": 10.727272727272727
+ }
+ },
+ "obscene": {
+ "auc": 0.9375048626461102,
+ "threshold": 0.14760328829288483,
+ "precision": 0.8287449241470627,
+ "recall": 0.9084278768233371,
+ "f1": 0.8667588986547364,
+ "specificity": 0.8122789287518954,
+ "npv": 0.8986867106241987,
+ "positive_samples": 1234,
+ "true_positives": 2358,
+ "false_positives": 487,
+ "true_negatives": 2108,
+ "false_negatives": 237,
+ "auc_ci": [
+ 0.9375048626461102,
+ 0.9375048626461102
+ ],
+ "precision_ci": [
+ 0.8287449241470627,
+ 0.8287449241470627
+ ],
+ "recall_ci": [
+ 0.9084278768233371,
+ 0.9084278768233371
+ ],
+ "f1_ci": [
+ 0.8667588986547364,
+ 0.8667588986547364
+ ],
+ "specificity_ci": [
+ 0.8122789287518954,
+ 0.8122789287518954
+ ],
+ "npv_ci": [
+ 0.8986867106241987,
+ 0.8986867106241987
+ ],
+ "class_weights": {
+ "0.0": 0.6558868115209702,
+ "1.0": 2.1037277147487843
+ }
+ },
+ "threat": {
+ "auc": 0.9031869137455802,
+ "threshold": 0.026773449033498764,
+ "precision": 0.9112427696973145,
+ "recall": 0.761467889908257,
+ "f1": 0.8296498919893159,
+ "specificity": 0.9258312020460328,
+ "npv": 0.7951394486538688,
+ "positive_samples": 109,
+ "true_positives": 1976,
+ "false_positives": 192,
+ "true_negatives": 2403,
+ "false_negatives": 619,
+ "auc_ci": [
+ 0.9031869137455802,
+ 0.9031869137455802
+ ],
+ "precision_ci": [
+ 0.9112427696973145,
+ 0.9112427696973145
+ ],
+ "recall_ci": [
+ 0.761467889908257,
+ 0.761467889908257
+ ],
+ "f1_ci": [
+ 0.8296498919893159,
+ 0.8296498919893159
+ ],
+ "specificity_ci": [
+ 0.9258312020460328,
+ 0.9258312020460328
+ ],
+ "npv_ci": [
+ 0.7951394486538688,
+ 0.7951394486538688
+ ],
+ "class_weights": {
+ "0.0": 0.5107220145583317,
+ "1.0": 23.81651376146789
+ }
+ },
+ "insult": {
+ "auc": 0.9164838070297321,
+ "threshold": 0.2600024938583374,
+ "precision": 0.8178816065079044,
+ "recall": 0.8940397350993466,
+ "f1": 0.8542666500534941,
+ "specificity": 0.8009234111895767,
+ "npv": 0.8831600262588531,
+ "positive_samples": 1510,
+ "true_positives": 2320,
+ "false_positives": 516,
+ "true_negatives": 2079,
+ "false_negatives": 275,
+ "auc_ci": [
+ 0.9164838070297321,
+ 0.9164838070297321
+ ],
+ "precision_ci": [
+ 0.8178816065079044,
+ 0.8178816065079044
+ ],
+ "recall_ci": [
+ 0.8940397350993466,
+ 0.8940397350993466
+ ],
+ "f1_ci": [
+ 0.8542666500534941,
+ 0.8542666500534941
+ ],
+ "specificity_ci": [
+ 0.8009234111895767,
+ 0.8009234111895767
+ ],
+ "npv_ci": [
+ 0.8831600262588531,
+ 0.8831600262588531
+ ],
+ "class_weights": {
+ "0.0": 0.7050516023900054,
+ "1.0": 1.719205298013245
+ }
+ },
+ "identity_hate": {
+ "auc": 0.9038051609994096,
+ "threshold": 0.03315547853708267,
+ "precision": 0.8124487711378064,
+ "recall": 0.8489208633093526,
+ "f1": 0.8302844808144539,
+ "specificity": 0.804029304029316,
+ "npv": 0.8418199125360486,
+ "positive_samples": 278,
+ "true_positives": 2203,
+ "false_positives": 508,
+ "true_negatives": 2087,
+ "false_negatives": 392,
+ "auc_ci": [
+ 0.9038051609994096,
+ 0.9038051609994096
+ ],
+ "precision_ci": [
+ 0.8124487711378064,
+ 0.8124487711378064
+ ],
+ "recall_ci": [
+ 0.8489208633093526,
+ 0.8489208633093526
+ ],
+ "f1_ci": [
+ 0.8302844808144539,
+ 0.8302844808144539
+ ],
+ "specificity_ci": [
+ 0.804029304029316,
+ 0.804029304029316
+ ],
+ "npv_ci": [
+ 0.8418199125360486,
+ 0.8418199125360486
+ ],
+ "class_weights": {
+ "0.0": 0.5282865282865283,
+ "1.0": 9.338129496402878
+ }
+ }
+ },
+ "sample_count": 5192
+ }
+ },
+ "per_class": {},
+ "thresholds": {
+ "0": {
+ "toxic": 0.46047261357307434,
+ "severe_toxic": 0.03537772223353386,
+ "obscene": 0.2777131497859955,
+ "threat": 0.016539234668016434,
+ "insult": 0.25907590985298157,
+ "identity_hate": 0.026042653247714043
+ },
+ "1": {
+ "toxic": 0.44148319959640503,
+ "severe_toxic": 0.03648429363965988,
+ "obscene": 0.1990610957145691,
+ "threat": 0.012619060464203358,
+ "insult": 0.24214455485343933,
+ "identity_hate": 0.03167847916483879
+ },
+ "2": {
+ "toxic": 0.3978160321712494,
+ "severe_toxic": 0.015000982210040092,
+ "obscene": 0.11362762749195099,
+ "threat": 0.008195769973099232,
+ "insult": 0.1587354838848114,
+ "identity_hate": 0.0467526838183403
+ },
+ "3": {
+ "toxic": 0.5033379793167114,
+ "severe_toxic": 0.021415209397673607,
+ "obscene": 0.14896434545516968,
+ "threat": 0.013273251242935658,
+ "insult": 0.22368550300598145,
+ "identity_hate": 0.042373284697532654
+ },
+ "4": {
+ "toxic": 0.4544762372970581,
+ "severe_toxic": 0.0307308342307806,
+ "obscene": 0.1775909662246704,
+ "threat": 0.014509523287415504,
+ "insult": 0.19917058944702148,
+ "identity_hate": 0.031982019543647766
+ },
+ "5": {
+ "toxic": 0.5313886404037476,
+ "severe_toxic": 0.05001964047551155,
+ "obscene": 0.20134443044662476,
+ "threat": 0.018557138741016388,
+ "insult": 0.32249945402145386,
+ "identity_hate": 0.055891502648591995
+ },
+ "6": {
+ "toxic": 0.5710838437080383,
+ "severe_toxic": 0.023807251825928688,
+ "obscene": 0.14760328829288483,
+ "threat": 0.026773449033498764,
+ "insult": 0.2600024938583374,
+ "identity_hate": 0.03315547853708267
+ }
+ }
+ }
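
A note on the "class_weights" entries throughout this file: they match the usual inverse-frequency ("balanced") weighting, w_c = n_samples / (2 * n_c) for each label. The sketch below is illustrative only (it is not code from this repository); it re-derives one entry from the counts reported above.

```python
# Illustrative sketch (not repository code): re-derive a "class_weights" entry
# using the balanced weighting w_c = n_samples / (n_classes * n_c), n_classes = 2.

def balanced_weights(n_samples: int, n_positive: int) -> dict:
    n_negative = n_samples - n_positive
    return {
        "0.0": n_samples / (2 * n_negative),
        "1.0": n_samples / (2 * n_positive),
    }

# severe_toxic under language "6": 242 positives out of 5192 samples
print(balanced_weights(5192, 242))
# -> {'0.0': 0.5244444444444445, '1.0': 10.727272727272727}, matching the JSON.
```

The same formula reproduces the other entries, e.g. toxic under language "6" gives 5192 / (2 * 2576) ≈ 1.0078 for the positive class.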
evaluation_results/eval_20250208_161149/plots/calibration_0.png ADDED

Git LFS Details

  • SHA256: 1e520af6af852f9edeef0bc12c53741ec9028a81d0d7fc7105e7abe02c1121d7
  • Pointer size: 131 Bytes
  • Size of remote file: 112 kB
evaluation_results/eval_20250208_161149/plots/calibration_1.png ADDED

Git LFS Details

  • SHA256: d6bbf38e6f262d27f5209c1bb0b8174259b6183978e8844fb84ccd2b43810be0
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
evaluation_results/eval_20250208_161149/plots/calibration_2.png ADDED

Git LFS Details

  • SHA256: 617690b2d238fcd53726552b1b979612f943976b2652013a078c6ce4d2496060
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
evaluation_results/eval_20250208_161149/plots/calibration_3.png ADDED

Git LFS Details

  • SHA256: 78fe5b71ba88524f96205ba367b7b643d864a55bcd702e15be7c9a27e2a43007
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
evaluation_results/eval_20250208_161149/plots/calibration_4.png ADDED

Git LFS Details

  • SHA256: e8e2df03bc3e34ccdf6c2a7c5fe305b0da7f4d6185948464ddc67f1ec4618b2f
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
evaluation_results/eval_20250208_161149/plots/calibration_5.png ADDED

Git LFS Details

  • SHA256: f4c1e2d2529ebc23a3c1d07daf36507ea66208f49396dce354fc0ab6c8baa14a
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
evaluation_results/eval_20250208_161149/plots/calibration_6.png ADDED

Git LFS Details

  • SHA256: d912dedb7a79ea521921afb948696787b2ba6206137d3359b619936e24455101
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
evaluation_results/eval_20250208_161149/plots/class_calibration.png ADDED

Git LFS Details

  • SHA256: 6f0fed51177ba858d2fa386a5198020a00dc58255cb3940526072c2866f71212
  • Pointer size: 131 Bytes
  • Size of remote file: 112 kB
evaluation_results/eval_20250208_161149/plots/language_performance.png ADDED
evaluation_results/eval_20250208_161149/plots/metric_correlations.png ADDED
evaluation_results/eval_20250208_161149/plots/overall_calibration.png ADDED
evaluation_results/eval_20250208_161149/plots/performance_distributions.png ADDED
evaluation_results/eval_20250208_161149/predictions.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d562e6c02fc268d01464f9716846556a75e863ec9cc03d582f39e14191cbd496
+ size 809713
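
The three lines above are only the Git LFS pointer for predictions.npz (spec version, SHA-256 object id, and size in bytes), not the arrays themselves. Assuming the real file has been fetched with `git lfs pull`, a minimal way to inspect it with NumPy is sketched below; the archive's array names are not recorded in this diff, so the sketch just enumerates whatever keys are present rather than assuming specific ones.

```python
# Illustrative sketch: inspect the predictions archive after `git lfs pull`.
import numpy as np

with np.load("evaluation_results/eval_20250208_161149/predictions.npz") as data:
    for name in data.files:
        # Print each stored array's name, shape, and dtype.
        print(name, data[name].shape, data[name].dtype)
```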
evaluation_results/eval_20250208_161149/thresholds.json ADDED
@@ -0,0 +1,58 @@
+ {
+ "0": {
+ "toxic": 0.46047261357307434,
+ "severe_toxic": 0.03537772223353386,
+ "obscene": 0.2777131497859955,
+ "threat": 0.016539234668016434,
+ "insult": 0.25907590985298157,
+ "identity_hate": 0.026042653247714043
+ },
+ "1": {
+ "toxic": 0.44148319959640503,
+ "severe_toxic": 0.03648429363965988,
+ "obscene": 0.1990610957145691,
+ "threat": 0.012619060464203358,
+ "insult": 0.24214455485343933,
+ "identity_hate": 0.03167847916483879
+ },
+ "2": {
+ "toxic": 0.3978160321712494,
+ "severe_toxic": 0.015000982210040092,
+ "obscene": 0.11362762749195099,
+ "threat": 0.008195769973099232,
+ "insult": 0.1587354838848114,
+ "identity_hate": 0.0467526838183403
+ },
+ "3": {
+ "toxic": 0.5033379793167114,
+ "severe_toxic": 0.021415209397673607,
+ "obscene": 0.14896434545516968,
+ "threat": 0.013273251242935658,
+ "insult": 0.22368550300598145,
+ "identity_hate": 0.042373284697532654
+ },
+ "4": {
+ "toxic": 0.4544762372970581,
+ "severe_toxic": 0.0307308342307806,
+ "obscene": 0.1775909662246704,
+ "threat": 0.014509523287415504,
+ "insult": 0.19917058944702148,
+ "identity_hate": 0.031982019543647766
+ },
+ "5": {
+ "toxic": 0.5313886404037476,
+ "severe_toxic": 0.05001964047551155,
+ "obscene": 0.20134443044662476,
+ "threat": 0.018557138741016388,
+ "insult": 0.32249945402145386,
+ "identity_hate": 0.055891502648591995
+ },
+ "6": {
+ "toxic": 0.5710838437080383,
+ "severe_toxic": 0.023807251825928688,
+ "obscene": 0.14760328829288483,
+ "threat": 0.026773449033498764,
+ "insult": 0.2600024938583374,
+ "identity_hate": 0.03315547853708267
+ }
+ }
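
These thresholds are keyed by language index ("0"–"6") and then by label. A minimal sketch of how such a file could be applied to per-label sigmoid probabilities follows; the function and variable names (apply_thresholds, probs, lang_id) are illustrative and not taken from this repository's code, and the probability columns are assumed to be in the same label order as eval_params.json.

```python
# Illustrative sketch: binarise per-label sigmoid scores with the per-language
# cutoffs stored in thresholds.json.
import json

import numpy as np

LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

with open("evaluation_results/eval_20250208_161149/thresholds.json") as f:
    thresholds = json.load(f)

def apply_thresholds(probs: np.ndarray, lang_id: int) -> np.ndarray:
    """probs: (n_samples, 6) sigmoid scores in LABELS order."""
    cuts = np.array([thresholds[str(lang_id)][label] for label in LABELS])
    return (probs >= cuts).astype(int)

# A single sample from language "0": only `toxic` clears its 0.46 cutoff here.
print(apply_thresholds(np.array([[0.55, 0.01, 0.10, 0.002, 0.20, 0.01]]), 0))
# -> [[1 0 0 0 0 0]]
```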
evaluation_results/eval_20250401_143401/eval_params.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "timestamp": "20250401_143401",
+ "model_path": "weights/toxic_classifier_xlm-roberta-large",
+ "checkpoint": null,
+ "test_file": "dataset/split/val.csv",
+ "batch_size": 64,
+ "num_workers": 16,
+ "cache_dir": "cached_data",
+ "force_retokenize": false,
+ "prefetch_factor": 2,
+ "max_length": 128,
+ "gc_frequency": 500,
+ "label_columns": [
+ "toxic",
+ "severe_toxic",
+ "obscene",
+ "threat",
+ "insult",
+ "identity_hate"
+ ]
+ }
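
eval_params.json records the configuration of this evaluation run. A minimal sketch of reading it back (paths and keys exactly as shown above; nothing else assumed):

```python
# Illustrative sketch: reload the recorded evaluation configuration and reuse
# its label order when scoring predictions.
import json

with open("evaluation_results/eval_20250401_143401/eval_params.json") as f:
    params = json.load(f)

print(params["model_path"])     # weights/toxic_classifier_xlm-roberta-large
print(params["label_columns"])  # label/column order used by the evaluation
batch_size = params["batch_size"]   # 64
max_length = params["max_length"]   # 128 tokens per comment
```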
evaluation_results/eval_20250401_143401/evaluation_results.json ADDED
@@ -0,0 +1,684 @@
+ {
+ "default_thresholds": {
+ "overall": {
+ "auc_macro": 0.9116120481007194,
+ "auc_weighted": 0.9305869103434485,
+ "precision_macro": 0.7017348731216243,
+ "precision_weighted": 0.7941268867549155,
+ "recall_macro": 0.4685972374699909,
+ "recall_weighted": 0.7276981501898812,
+ "f1_macro": 0.5228946160541719,
+ "f1_weighted": 0.7469638283202927,
+ "hamming_loss": 0.08497391889618038,
+ "exact_match": 0.6461383139828369
+ },
+ "per_language": {
+ "0": {
+ "auc_macro": 0.9445681226397739,
+ "auc_weighted": 0.9465404082666297,
+ "precision_macro": 0.7219326082283263,
+ "precision_weighted": 0.7908382685179838,
+ "recall_macro": 0.5535398284592582,
+ "recall_weighted": 0.7833787465940054,
+ "f1_macro": 0.6000668677340134,
+ "f1_weighted": 0.7786737821480415,
+ "hamming_loss": 0.07650567773465575,
+ "exact_match": 0.6601983613626563,
+ "sample_count": 4638
+ },
+ "1": {
+ "auc_macro": 0.9064189306891727,
+ "auc_weighted": 0.9274078123911156,
+ "precision_macro": 0.6864158919056594,
+ "precision_weighted": 0.7852581089086744,
+ "recall_macro": 0.44366116589032245,
+ "recall_weighted": 0.7238780977896851,
+ "f1_macro": 0.48488161881757197,
+ "f1_weighted": 0.737051270947713,
+ "hamming_loss": 0.08752166377816291,
+ "exact_match": 0.6402849990371654,
+ "sample_count": 5193
+ },
+ "2": {
+ "auc_macro": 0.8945135400492461,
+ "auc_weighted": 0.9120120071881025,
+ "precision_macro": 0.7178271955012184,
+ "precision_weighted": 0.7982113173628885,
+ "recall_macro": 0.4043111379749362,
+ "recall_weighted": 0.6535947712418301,
+ "f1_macro": 0.4738257066120983,
+ "f1_weighted": 0.7027905834489889,
+ "hamming_loss": 0.09504905757810483,
+ "exact_match": 0.6229666924864447,
+ "sample_count": 5164
+ },
+ "3": {
+ "auc_macro": 0.9135727964673032,
+ "auc_weighted": 0.9339502655719858,
+ "precision_macro": 0.7093511783545062,
+ "precision_weighted": 0.7989932896421867,
+ "recall_macro": 0.4814045378504133,
+ "recall_weighted": 0.7405478070912451,
+ "f1_macro": 0.5327086132158053,
+ "f1_weighted": 0.7545000455696493,
+ "hamming_loss": 0.08359133126934984,
+ "exact_match": 0.6480263157894737,
+ "sample_count": 5168
+ },
+ "4": {
+ "auc_macro": 0.9050160058685811,
+ "auc_weighted": 0.9286663336151794,
+ "precision_macro": 0.6819384343494851,
+ "precision_weighted": 0.7945304496145832,
+ "recall_macro": 0.4656370270227365,
+ "recall_weighted": 0.7256427604871448,
+ "f1_macro": 0.5189060171591118,
+ "f1_weighted": 0.7474398480273773,
+ "hamming_loss": 0.08477150798267727,
+ "exact_match": 0.6509598603839442,
+ "sample_count": 5157
+ },
+ "5": {
+ "auc_macro": 0.9115535221829411,
+ "auc_weighted": 0.9337271942250184,
+ "precision_macro": 0.6927437323462047,
+ "precision_weighted": 0.7984424245250574,
+ "recall_macro": 0.4695924180409275,
+ "recall_weighted": 0.739629005059022,
+ "f1_macro": 0.5191221600663896,
+ "f1_weighted": 0.7554966948679994,
+ "hamming_loss": 0.08252364295893251,
+ "exact_match": 0.6525456665371162,
+ "sample_count": 5146
+ },
+ "6": {
+ "auc_macro": 0.9045493247421005,
+ "auc_weighted": 0.9308415576648513,
+ "precision_macro": 0.6958021612757893,
+ "precision_weighted": 0.7925797967619269,
+ "recall_macro": 0.4680867128534896,
+ "recall_weighted": 0.735071488645921,
+ "f1_macro": 0.5184729138243417,
+ "f1_weighted": 0.7510735996739993,
+ "hamming_loss": 0.0839753466872111,
+ "exact_match": 0.6494607087827426,
+ "sample_count": 5192
+ }
+ },
+ "per_class": {
+ "toxic": {
+ "auc": 0.9619106577495796,
+ "threshold": 0.5,
+ "precision": 0.9067127628925382,
+ "recall": 0.8891902582358592,
+ "f1": 0.8978660276161132,
+ "support": 17697,
+ "brier": 0.09342169378057544,
+ "true_positives": 15736,
+ "false_positives": 1619,
+ "true_negatives": 16342,
+ "false_negatives": 1961
+ },
+ "severe_toxic": {
+ "auc": 0.9017555053121755,
+ "threshold": 0.5,
+ "precision": 0.5620915032679739,
+ "recall": 0.15589123867069488,
+ "f1": 0.24408703878902555,
+ "support": 1655,
+ "brier": 0.05564494143865772,
+ "true_positives": 258,
+ "false_positives": 201,
+ "true_negatives": 33802,
+ "false_negatives": 1397
+ },
+ "obscene": {
+ "auc": 0.9247491461802884,
+ "threshold": 0.5,
+ "precision": 0.7636434008515031,
+ "recall": 0.686181312311616,
+ "f1": 0.7228430115405752,
+ "support": 8626,
+ "brier": 0.1102165916686836,
+ "true_positives": 5919,
+ "false_positives": 1832,
+ "true_negatives": 25200,
+ "false_negatives": 2707
+ },
+ "threat": {
+ "auc": 0.8978719938708597,
+ "threshold": 0.5,
+ "precision": 0.6042553191489362,
+ "recall": 0.1868421052631579,
+ "f1": 0.28542713567839195,
+ "support": 760,
+ "brier": 0.03694216309848939,
+ "true_positives": 142,
+ "false_positives": 93,
+ "true_negatives": 34805,
+ "false_negatives": 618
+ },
+ "insult": {
+ "auc": 0.8962985964590791,
+ "threshold": 0.5,
+ "precision": 0.6981960484871623,
+ "recall": 0.7172271791352093,
+ "f1": 0.7075836718901142,
+ "support": 10199,
+ "brier": 0.1366709113756841,
+ "true_positives": 7315,
+ "false_positives": 3162,
+ "true_negatives": 22297,
+ "false_negatives": 2884
+ },
+ "identity_hate": {
+ "auc": 0.887086389032334,
+ "threshold": 0.5,
+ "precision": 0.6755102040816326,
+ "recall": 0.17625133120340788,
+ "f1": 0.2795608108108108,
+ "support": 1878,
+ "brier": 0.06076370760519854,
+ "true_positives": 331,
+ "false_positives": 159,
+ "true_negatives": 33621,
+ "false_negatives": 1547
+ }
+ }
+ },
+ "optimized_thresholds": {
+ "overall": {
+ "auc_macro": 0.9116120481007194,
+ "auc_weighted": 0.9305869103434485,
+ "precision_macro": 0.5775888380947196,
+ "precision_weighted": 0.7443465124836487,
+ "recall_macro": 0.639900823721825,
+ "recall_weighted": 0.798186941075585,
+ "f1_macro": 0.6040131510667749,
+ "f1_weighted": 0.7686775463209056,
+ "hamming_loss": 0.09459775272496121,
+ "exact_match": 0.6191317516405855
+ },
+ "per_language": {
+ "0": {
+ "auc_macro": 0.9445681226397739,
+ "auc_weighted": 0.9465404082666297,
+ "precision_macro": 0.5885969911405202,
+ "precision_weighted": 0.7416734521846035,
+ "recall_macro": 0.7381385425477333,
+ "recall_weighted": 0.8514986376021798,
+ "f1_macro": 0.6497623010487168,
+ "f1_weighted": 0.7903759805291908,
+ "hamming_loss": 0.08746586172200661,
+ "exact_match": 0.6282880551962052,
+ "sample_count": 4638
+ },
+ "1": {
+ "auc_macro": 0.9064189306891727,
+ "auc_weighted": 0.9274078123911156,
+ "precision_macro": 0.5769491938694048,
+ "precision_weighted": 0.7372462490399235,
+ "recall_macro": 0.6223651765807731,
+ "recall_weighted": 0.7957133288680509,
+ "f1_macro": 0.5940383621467368,
+ "f1_weighted": 0.7630519259035966,
+ "hamming_loss": 0.09734257654534952,
+ "exact_match": 0.6112073945696129,
+ "sample_count": 5193
+ },
+ "2": {
+ "auc_macro": 0.8945135400492461,
+ "auc_weighted": 0.9120120071881025,
+ "precision_macro": 0.5883546567568967,
+ "precision_weighted": 0.7471472711374241,
+ "recall_macro": 0.5741089328356292,
+ "recall_weighted": 0.7323613205966147,
+ "f1_macro": 0.579910490554519,
+ "f1_weighted": 0.7393192722268676,
+ "hamming_loss": 0.10030983733539892,
+ "exact_match": 0.6094113090627421,
+ "sample_count": 5164
+ },
+ "3": {
+ "auc_macro": 0.9135727964673032,
+ "auc_weighted": 0.9339502655719858,
+ "precision_macro": 0.5674300764951785,
+ "precision_weighted": 0.7452385794349706,
+ "recall_macro": 0.6585754182827804,
+ "recall_weighted": 0.8117963367501261,
+ "f1_macro": 0.6075512335059755,
+ "f1_weighted": 0.7751847838928642,
+ "hamming_loss": 0.09404024767801858,
+ "exact_match": 0.6234520123839009,
+ "sample_count": 5168
+ },
+ "4": {
+ "auc_macro": 0.9050160058685811,
+ "auc_weighted": 0.9286663336151794,
+ "precision_macro": 0.5635774868138544,
+ "precision_weighted": 0.7453012013072762,
+ "recall_macro": 0.6307198572670079,
+ "recall_weighted": 0.793640054127199,
+ "f1_macro": 0.5906173214394316,
+ "f1_weighted": 0.7663604150980545,
+ "hamming_loss": 0.0963415422403206,
+ "exact_match": 0.6162497576110142,
+ "sample_count": 5157
+ },
+ "5": {
+ "auc_macro": 0.9115535221829411,
+ "auc_weighted": 0.9337271942250184,
+ "precision_macro": 0.577007586897046,
+ "precision_weighted": 0.7468873881119108,
+ "recall_macro": 0.635638229939968,
+ "recall_weighted": 0.8080944350758853,
+ "f1_macro": 0.5988862551226474,
+ "f1_weighted": 0.7742215916662522,
+ "hamming_loss": 0.09350304443580774,
+ "exact_match": 0.6195102992615624,
+ "sample_count": 5146
+ },
+ "6": {
+ "auc_macro": 0.9045493247421005,
+ "auc_weighted": 0.9308415576648513,
+ "precision_macro": 0.591572349044604,
+ "precision_weighted": 0.749047954356656,
+ "recall_macro": 0.6294384348455582,
+ "recall_weighted": 0.8016820857863751,
+ "f1_macro": 0.6039252504591597,
+ "f1_weighted": 0.772582192067038,
+ "hamming_loss": 0.09244992295839753,
+ "exact_match": 0.6267334360554699,
+ "sample_count": 5192
+ }
+ },
+ "per_class": {
+ "toxic": {
+ "auc": 0.9619106577495796,
+ "threshold": 0.4877551020408163,
+ "precision": 0.8999716472923164,
+ "recall": 0.8968186698310449,
+ "f1": 0.8983923921657421,
+ "support": 17697,
+ "brier": 0.09342169378057544,
+ "true_positives": 15871,
+ "false_positives": 1764,
+ "true_negatives": 16197,
+ "false_negatives": 1826
+ },
+ "severe_toxic": {
+ "auc": 0.9017555053121755,
+ "threshold": 0.373469387755102,
+ "precision": 0.34626149540183926,
+ "recall": 0.5232628398791541,
+ "f1": 0.4167468719923003,
+ "support": 1655,
+ "brier": 0.05564494143865772,
+ "true_positives": 866,
+ "false_positives": 1635,
+ "true_negatives": 32368,
+ "false_negatives": 789
+ },
+ "obscene": {
+ "auc": 0.9247491461802884,
+ "threshold": 0.4551020408163265,
+ "precision": 0.7017099430018999,
+ "recall": 0.770693252956179,
+ "f1": 0.734585635359116,
+ "support": 8626,
+ "brier": 0.1102165916686836,
+ "true_positives": 6648,
+ "false_positives": 2826,
+ "true_negatives": 24206,
+ "false_negatives": 1978
+ },
+ "threat": {
+ "auc": 0.8978719938708597,
+ "threshold": 0.38979591836734695,
+ "precision": 0.43684992570579495,
+ "recall": 0.3868421052631579,
+ "f1": 0.41032798325191905,
+ "support": 760,
+ "brier": 0.03694216309848939,
+ "true_positives": 294,
+ "false_positives": 379,
+ "true_negatives": 34519,
+ "false_negatives": 466
+ },
+ "insult": {
+ "auc": 0.8962985964590791,
+ "threshold": 0.463265306122449,
+ "precision": 0.6568989575638184,
+ "recall": 0.7846847730169625,
+ "f1": 0.7151282280403896,
+ "support": 10199,
+ "brier": 0.1366709113756841,
+ "true_positives": 8003,
+ "false_positives": 4180,
+ "true_negatives": 21279,
+ "false_negatives": 2196
+ },
+ "identity_hate": {
+ "auc": 0.887086389032334,
+ "threshold": 0.373469387755102,
+ "precision": 0.423841059602649,
+ "recall": 0.47710330138445156,
+ "f1": 0.44889779559118237,
+ "support": 1878,
+ "brier": 0.06076370760519854,
+ "true_positives": 896,
+ "false_positives": 1218,
+ "true_negatives": 32562,
+ "false_negatives": 982
+ }
+ }
+ },
+ "thresholds": {
+ "global": {
+ "toxic": {
+ "threshold": 0.4877551020408163,
+ "f1_score": 0.8926184748925591,
+ "support": 17697,
+ "total_samples": 35658
+ },
+ "severe_toxic": {
+ "threshold": 0.373469387755102,
+ "f1_score": 0.41132469871513055,
+ "support": 1655,
+ "total_samples": 35658
+ },
+ "obscene": {
+ "threshold": 0.4551020408163265,
+ "f1_score": 0.726924984126118,
+ "support": 8626,
+ "total_samples": 35658
+ },
+ "threat": {
+ "threshold": 0.38979591836734695,
+ "f1_score": 0.41018044345470683,
+ "support": 760,
+ "total_samples": 35658
+ },
+ "insult": {
+ "threshold": 0.463265306122449,
+ "f1_score": 0.7104171976414078,
+ "support": 10199,
+ "total_samples": 35658
+ },
+ "identity_hate": {
+ "threshold": 0.373469387755102,
+ "f1_score": 0.4444212159518569,
+ "support": 1878,
+ "total_samples": 35658
+ }
+ },
+ "per_language": {
+ "0": {
+ "toxic": {
+ "threshold": 0.4379310344827586,
+ "f1_score": 0.6362062357467935,
+ "support": 2228,
+ "total_samples": 4638
+ },
+ "severe_toxic": {
+ "threshold": 0.4241379310344827,
+ "f1_score": 0.6836346572759443,
+ "support": 199,
+ "total_samples": 4638
+ },
+ "obscene": {
+ "threshold": 0.4655172413793103,
+ "f1_score": 0.4812423489705398,
+ "support": 1235,
+ "total_samples": 4638
+ },
+ "threat": {
+ "threshold": 0.4655172413793103,
+ "f1_score": 0.560716193430073,
+ "support": 118,
+ "total_samples": 4638
+ },
+ "insult": {
+ "threshold": 0.6586206896551723,
+ "f1_score": 0.6797683196093679,
+ "support": 1144,
+ "total_samples": 4638
+ },
+ "identity_hate": {
+ "threshold": 0.6310344827586206,
+ "f1_score": 0.4653856089660791,
+ "support": 214,
+ "total_samples": 4638
+ }
+ },
+ "1": {
+ "toxic": {
+ "threshold": 0.38275862068965516,
+ "f1_score": 0.5653885349662379,
+ "support": 2589,
+ "total_samples": 5193
+ },
+ "severe_toxic": {
+ "threshold": 0.36896551724137927,
+ "f1_score": 0.6303988062940857,
+ "support": 245,
+ "total_samples": 5193
+ },
+ "obscene": {
+ "threshold": 0.6724137931034482,
+ "f1_score": 0.69776888519452,
+ "support": 1239,
+ "total_samples": 5193
+ },
+ "threat": {
+ "threshold": 0.5482758620689655,
+ "f1_score": 0.49444444444444446,
+ "support": 106,
+ "total_samples": 5193
+ },
+ "insult": {
+ "threshold": 0.45172413793103444,
+ "f1_score": 0.43592427815977264,
+ "support": 1514,
+ "total_samples": 5193
+ },
+ "identity_hate": {
+ "threshold": 0.603448275862069,
+ "f1_score": 0.437278850182076,
+ "support": 279,
+ "total_samples": 5193
+ }
+ },
+ "2": {
+ "toxic": {
+ "threshold": 0.36896551724137927,
+ "f1_score": 0.5636259188109024,
+ "support": 2585,
+ "total_samples": 5164
+ },
+ "severe_toxic": {
+ "threshold": 0.396551724137931,
+ "f1_score": 0.6242565552619788,
+ "support": 243,
+ "total_samples": 5164
+ },
+ "obscene": {
+ "threshold": 0.6310344827586206,
+ "f1_score": 0.609064783177638,
+ "support": 1233,
+ "total_samples": 5164
+ },
+ "threat": {
+ "threshold": 0.6862068965517241,
+ "f1_score": 0.4331632653061225,
+ "support": 110,
+ "total_samples": 5164
+ },
+ "insult": {
+ "threshold": 0.6586206896551723,
+ "f1_score": 0.5919194590653671,
+ "support": 1514,
+ "total_samples": 5164
+ },
+ "identity_hate": {
+ "threshold": 0.5896551724137931,
+ "f1_score": 0.44181963497241983,
+ "support": 282,
+ "total_samples": 5164
+ }
+ },
+ "3": {
+ "toxic": {
+ "threshold": 0.35517241379310344,
+ "f1_score": 0.5733103161693534,
+ "support": 2579,
+ "total_samples": 5168
+ },
+ "severe_toxic": {
+ "threshold": 0.38275862068965516,
+ "f1_score": 0.6597492750378473,
+ "support": 243,
+ "total_samples": 5168
+ },
+ "obscene": {
+ "threshold": 0.5896551724137931,
+ "f1_score": 0.5803338639295222,
+ "support": 1234,
+ "total_samples": 5168
+ },
+ "threat": {
+ "threshold": 0.5896551724137931,
+ "f1_score": 0.5531975271105706,
+ "support": 108,
+ "total_samples": 5168
+ },
+ "insult": {
+ "threshold": 0.4103448275862069,
+ "f1_score": 0.43932768516388326,
+ "support": 1511,
+ "total_samples": 5168
+ },
+ "identity_hate": {
+ "threshold": 0.5482758620689655,
+ "f1_score": 0.5223443223443224,
+ "support": 276,
+ "total_samples": 5168
+ }
+ },
+ "4": {
+ "toxic": {
+ "threshold": 0.36896551724137927,
+ "f1_score": 0.5671790360963849,
+ "support": 2568,
+ "total_samples": 5157
+ },
+ "severe_toxic": {
+ "threshold": 0.4241379310344827,
+ "f1_score": 0.6449236298292902,
+ "support": 240,
+ "total_samples": 5157
+ },
+ "obscene": {
+ "threshold": 0.5896551724137931,
+ "f1_score": 0.5763915317957939,
+ "support": 1225,
+ "total_samples": 5157
+ },
+ "threat": {
+ "threshold": 0.5482758620689655,
+ "f1_score": 0.5202898550724637,
+ "support": 105,
+ "total_samples": 5157
+ },
+ "insult": {
+ "threshold": 0.45172413793103444,
+ "f1_score": 0.44168323420099964,
+ "support": 1501,
+ "total_samples": 5157
+ },
+ "identity_hate": {
+ "threshold": 0.5344827586206896,
+ "f1_score": 0.3050612442147916,
+ "support": 273,
+ "total_samples": 5157
+ }
+ },
+ "5": {
+ "toxic": {
+ "threshold": 0.38275862068965516,
+ "f1_score": 0.5689208863252881,
+ "support": 2572,
+ "total_samples": 5146
+ },
+ "severe_toxic": {
+ "threshold": 0.38275862068965516,
+ "f1_score": 0.6483406115143644,
+ "support": 242,
+ "total_samples": 5146
+ },
+ "obscene": {
+ "threshold": 0.6172413793103448,
+ "f1_score": 0.7591744574190955,
+ "support": 1227,
+ "total_samples": 5146
+ },
+ "threat": {
+ "threshold": 0.5896551724137931,
+ "f1_score": 0.48909813468905516,
+ "support": 106,
+ "total_samples": 5146
+ },
+ "insult": {
+ "threshold": 0.4655172413793103,
+ "f1_score": 0.4438765689644482,
+ "support": 1506,
+ "total_samples": 5146
+ },
+ "identity_hate": {
+ "threshold": 0.4655172413793103,
+ "f1_score": 0.57592394533571,
+ "support": 277,
+ "total_samples": 5146
+ }
+ },
+ "6": {
+ "toxic": {
+ "threshold": 0.396551724137931,
+ "f1_score": 0.5707684299142913,
+ "support": 2576,
+ "total_samples": 5192
+ },
+ "severe_toxic": {
+ "threshold": 0.38275862068965516,
+ "f1_score": 0.6300280234278585,
+ "support": 243,
+ "total_samples": 5192
+ },
+ "obscene": {
+ "threshold": 0.603448275862069,
+ "f1_score": 0.5508854395728676,
+ "support": 1233,
+ "total_samples": 5192
+ },
+ "threat": {
+ "threshold": 0.4655172413793103,
+ "f1_score": 0.6029992790194665,
+ "support": 107,
+ "total_samples": 5192
+ },
+ "insult": {
+ "threshold": 0.4241379310344827,
+ "f1_score": 0.4434943555473952,
+ "support": 1509,
+ "total_samples": 5192
+ },
+ "identity_hate": {
+ "threshold": 0.6586206896551723,
+ "f1_score": 0.4569864410513042,
+ "support": 277,
+ "total_samples": 5192
+ }
+ }
+ }
+ }
+ }
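
For reference, the headline "f1_macro" and "f1_weighted" figures in this file are the unweighted and support-weighted means of the per-class F1 scores listed under "per_class". The quick cross-check below is an illustrative script, not repository code, using the default-threshold values from this file.

```python
# Illustrative sketch: recompute the overall macro / weighted F1 from the
# per-class F1 scores and supports under "default_thresholds".
import numpy as np

f1 = np.array([0.8978660276, 0.2440870388, 0.7228430115,
               0.2854271357, 0.7075836719, 0.2795608108])
support = np.array([17697, 1655, 8626, 760, 10199, 1878])

print(f1.mean())                        # ≈ 0.5229  -> matches "f1_macro"
print(np.average(f1, weights=support))  # ≈ 0.7470  -> matches "f1_weighted"
```

The same aggregation applies to the precision and recall macro/weighted figures, which is why tuning per-class thresholds lifts the macro scores (driven by the rare labels) much more than the weighted ones.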
evaluation_results/eval_20250401_143401/plots/per_class_comparison.png ADDED
evaluation_results/eval_20250401_143401/plots/roc_all_classes.png ADDED

Git LFS Details

  • SHA256: cc99cf8a318efe9bde206d2e875905d037044c43b6d67f4a44cce849f30d2d0b
  • Pointer size: 131 Bytes
  • Size of remote file: 324 kB
evaluation_results/eval_20250401_143401/plots/roc_by_language.png ADDED

Git LFS Details

  • SHA256: 26176df08c42f1841e5cafec0f988b05fe54f53b820a028a3bf574f48bf52839
  • Pointer size: 131 Bytes
  • Size of remote file: 286 kB