import pandas as pd
import numpy as np
from pathlib import Path
import os


def clean_toxicity_labels(input_file, output_file=None):
    """Clean toxicity labels by converting fractional values to binary using ceiling."""
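    # Illustrative effect of the ceiling-based binarization applied below (example values only,
    # not taken from the dataset): 0.0 -> 0, 0.25 -> 1, 0.5 -> 1, 1.0 -> 1. If the fractional
    # labels are annotator-agreement scores, any nonzero agreement becomes a positive (1) label.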
print(f"\nReading dataset: {input_file}") |
|
df = pd.read_csv(input_file) |
|
|
|
|
|
total_rows = len(df) |
|
print(f"\nInitial dataset size: {total_rows:,} comments") |
|
|
|
|
|
toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] |
|
|
|
|
|
print("\nInitial value distribution:") |
|
print("-" * 50) |
|
for col in toxicity_cols: |
|
unique_vals = df[col].value_counts().sort_index() |
|
print(f"\n{col.replace('_', ' ').title()}:") |
|
for val, count in unique_vals.items(): |
|
print(f" {val}: {count:,} comments") |
|
|
|
|
|
print("\nCleaning labels...") |
|
for col in toxicity_cols: |
|
|
|
unique_before = df[col].nunique() |
|
non_binary = df[~df[col].isin([0, 1])][col].unique() |
|
|
|
if len(non_binary) > 0: |
|
print(f"\n{col.replace('_', ' ').title()}:") |
|
print(f" Found {len(non_binary)} non-binary values: {sorted(non_binary)}") |
|
|
|
|
|
df[col] = np.ceil(df[col]).clip(0, 1).astype(int) |
|
|
|
|
|
unique_after = df[col].nunique() |
|
print(f" Unique values before: {unique_before}") |
|
print(f" Unique values after: {unique_after}") |
|
|
|
|
|
print("\nFinal value distribution:") |
|
print("-" * 50) |
|
for col in toxicity_cols: |
|
value_counts = df[col].value_counts().sort_index() |
|
total = len(df) |
|
print(f"\n{col.replace('_', ' ').title()}:") |
|
for val, count in value_counts.items(): |
|
percentage = (count / total) * 100 |
|
print(f" {val}: {count:,} comments ({percentage:.2f}%)") |
|
|
|
|
|

    if output_file is None:
        base, ext = os.path.splitext(input_file)
        output_file = f"{base}_cleaned{ext}"

    # os.path.dirname() returns '' for a bare filename, and os.makedirs('') raises an error,
    # so only create the output directory when a directory component is present.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"\nSaving cleaned dataset to: {output_file}")
    df.to_csv(output_file, index=False)
    print(f"File size: {Path(output_file).stat().st_size / (1024*1024):.1f} MB")

    return df


if __name__ == "__main__":
    input_file = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG.csv"
    output_file = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary.csv"

    cleaned_df = clean_toxicity_labels(input_file, output_file)
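    # Optional sanity check (sketch; assumes the same six label columns defined in
    # toxicity_cols above): after cleaning, every label should be strictly 0 or 1.
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    assert cleaned_df[label_cols].isin([0, 1]).all().all(), "Non-binary label values remain"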