import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer
import matplotlib.pyplot as plt
import os


class ToxicityOrdinalEncoder:
    """Bin continuous toxicity scores into ordinal severity levels per category."""

    def __init__(self, n_bins=4, strategy='quantile'):
        self.n_bins = n_bins
        self.strategy = strategy
        self.bin_edges = {}
        self.ordinal_mapping = {}
        self.label_mapping = {}

    def _get_optimal_bins(self, values):
        """Dynamically determine bins using statistical analysis"""
        unique_vals = np.unique(values)
        if len(unique_vals) <= self.n_bins:
            return sorted(unique_vals)

        # Handle 1D data properly and check sample size
        if len(values) < 2:
            return np.linspace(0, 1, self.n_bins + 1)

        try:
            # Transpose for correct KDE dimensions (d, N) = (1, samples)
            kde = stats.gaussian_kde(values.T)
            x = np.linspace(0, 1, 100)
            minima = []
            for i in range(1, len(x) - 1):
                if (kde(x[i]) < kde(x[i - 1])) and (kde(x[i]) < kde(x[i + 1])):
                    minima.append(x[i])
            if minima:
                return [0] + sorted(minima) + [1]
        except np.linalg.LinAlgError:
            pass

        # Fallback to KBinsDiscretizer
        est = KBinsDiscretizer(n_bins=self.n_bins,
                               encode='ordinal',
                               strategy=self.strategy)
        est.fit(values)
        return est.bin_edges_[0]
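
    # Illustrative note on _get_optimal_bins (hypothetical numbers): for a column
    # whose non-zero scores cluster around ~0.3 and ~0.8, the KDE density has a
    # local minimum in the gap between the clusters, so the returned edges might
    # look roughly like [0, 0.55, 1]. If the KDE cannot be built (singular
    # covariance) or finds no interior minima, the quantile-based
    # KBinsDiscretizer fallback supplies self.n_bins bins instead.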

    def fit(self, df, columns):
        """Learn optimal binning for each toxicity category"""
        for col in columns:
            # Create label mapping for interpretability (set for every column,
            # including the empty/small-sample fallbacks below)
            self.label_mapping[col] = {
                0: 'Non-toxic',
                1: 'Low',
                2: 'Medium',
                3: 'High',
                4: 'Severe'
            }

            # Filter and validate non-zero values
            non_zero = df[col][df[col] > 0].values.reshape(-1, 1)

            # Handle empty columns
            if len(non_zero) == 0:
                self.bin_edges[col] = [0, 1]
                self.ordinal_mapping[col] = {0: 0}
                continue

            # Handle small sample sizes
            if len(non_zero) < 2:
                self.bin_edges[col] = np.linspace(0, 1, self.n_bins + 1)
                continue

            bins = self._get_optimal_bins(non_zero)
            self.bin_edges[col] = bins

            # Create ordinal mapping
            self.ordinal_mapping[col] = {
                val: i for i, val in enumerate(sorted(np.unique(bins)))
            }
        return self

    def transform(self, df, columns):
        """Apply learned ordinal mapping with safety checks"""
        transformed = df.copy()
        for col in columns:
            if col not in self.bin_edges:
                raise ValueError(f"Column {col} not fitted")

            bins = self.bin_edges[col]
            transformed[col] = pd.cut(df[col], bins=bins,
                                      labels=False, include_lowest=True)

            # Preserve zero as separate class (non-zero bins shift up by one)
            transformed[col] = np.where(df[col] == 0, 0, transformed[col] + 1)
            transformed[col] = transformed[col].astype(int)  # Ensure integer type
        return transformed
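

# Minimal usage sketch (illustrative only, not called by main()): shows how the
# encoder above is fitted and applied to a tiny made-up DataFrame. The toy
# column values below are hypothetical; the real pipeline operates on the
# multilingual toxicity CSV loaded in main().
def _example_encoder_usage():
    toy = pd.DataFrame({
        'toxic':  [0.0, 0.1, 0.3, 0.6, 0.9, 0.0, 0.5, 0.8],
        'insult': [0.0, 0.0, 0.2, 0.4, 0.7, 0.95, 0.0, 0.1],
    })
    enc = ToxicityOrdinalEncoder(n_bins=4)
    enc.fit(toy, ['toxic', 'insult'])
    binned = enc.transform(toy, ['toxic', 'insult'])
    print(enc.bin_edges)   # learned bin edges per column
    print(binned)          # integer levels: 0 = non-toxic, higher = more severe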


def plot_toxicity_distribution(df, transformed_df, column, bin_edges, save_dir='images'):
    """Plot original vs binned distribution for a toxicity column"""
    plt.figure(figsize=(15, 6))

    # Original distribution
    plt.subplot(1, 2, 1)
    non_zero_vals = df[column][df[column] > 0]
    if len(non_zero_vals) > 0:
        plt.hist(non_zero_vals, bins=50, alpha=0.7)
        plt.title(f'Original {column.replace("_", " ").title()} Distribution\n(Non-zero values)')
        plt.xlabel('Toxicity Score')
        plt.ylabel('Count')
        # Add bin edges as vertical lines
        for edge in bin_edges[column]:
            plt.axvline(x=edge, color='r', linestyle='--', alpha=0.5)
    else:
        plt.text(0.5, 0.5, 'No non-zero values', ha='center', va='center')

    # Binned distribution
    plt.subplot(1, 2, 2)
    unique_bins = sorted(transformed_df[column].unique())
    plt.hist(transformed_df[column], bins=len(unique_bins),
             range=(min(unique_bins) - 0.5, max(unique_bins) + 0.5),
             alpha=0.7, rwidth=0.8)
    plt.title(f'Binned {column.replace("_", " ").title()} Distribution')
    plt.xlabel('Toxicity Level')
    plt.ylabel('Count')

    # Add labels for toxicity levels
    plt.xticks(range(5), ['Non-toxic', 'Low', 'Medium', 'High', 'Severe'])

    plt.tight_layout()
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(os.path.join(save_dir, f'{column}_distribution.png'))
    plt.close()


def main():
    # Load dataset
    print("Loading dataset...")
    input_file = 'dataset/raw/MULTILINGUAL_TOXIC_DATASET_367k_7LANG_cleaned.csv'
    df = pd.read_csv(input_file)

    # Define toxicity columns
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Print initial value distributions
    print("\nInitial value distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        print(df[col].value_counts().sort_index())

    # Initialize and fit encoder
    print("\nFitting toxicity encoder...")
    encoder = ToxicityOrdinalEncoder(n_bins=4)
    encoder.fit(df, toxicity_cols)

    # Transform data
    print("Transforming toxicity values...")
    transformed_df = encoder.transform(df, toxicity_cols)

    # Plot distributions
    print("\nGenerating distribution plots...")
    for col in toxicity_cols:
        plot_toxicity_distribution(df, transformed_df, col, encoder.bin_edges)

    # Print binning information
    print("\nBin edges for each toxicity type:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        edges = encoder.bin_edges[col]
        for i in range(len(edges) - 1):
            print(f"Level {encoder.label_mapping[col][i + 1]}: {edges[i]:.3f} to {edges[i + 1]:.3f}")

    # Save transformed dataset (ensure the output directory exists first)
    output_file = 'dataset/processed/MULTILINGUAL_TOXIC_DATASET_binned.csv'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    print(f"\nSaving binned dataset to: {output_file}")
    transformed_df.to_csv(output_file, index=False)

    # Print final value distributions
    print("\nFinal binned distributions:")
    for col in toxicity_cols:
        print(f"\n{col.replace('_', ' ').title()}:")
        dist = transformed_df[col].value_counts().sort_index()
        for level, count in dist.items():
            print(f"{encoder.label_mapping[col][level]}: {count:,} ({count / len(df) * 100:.1f}%)")


if __name__ == "__main__":
    main()