|  | import pandas as pd | 
					
						
						|  | import numpy as np | 
					
						
						|  | from pathlib import Path | 
					
						
						|  | import os | 
					
						
						|  |  | 
					
						
						|  | def clean_toxicity_labels(input_file, output_file=None): | 
					
						
						|  | """Clean toxicity labels by converting fractional values to binary using ceiling""" | 
					
						
						|  | print(f"\nReading dataset: {input_file}") | 
					
						
						|  | df = pd.read_csv(input_file) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | total_rows = len(df) | 
					
						
						|  | print(f"\nInitial dataset size: {total_rows:,} comments") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nInitial value distribution:") | 
					
						
						|  | print("-" * 50) | 
					
						
						|  | for col in toxicity_cols: | 
					
						
						|  | unique_vals = df[col].value_counts().sort_index() | 
					
						
						|  | print(f"\n{col.replace('_', ' ').title()}:") | 
					
						
						|  | for val, count in unique_vals.items(): | 
					
						
						|  | print(f"  {val}: {count:,} comments") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nCleaning labels...") | 
					
						
						|  | for col in toxicity_cols: | 
					
						
						|  |  | 
					
						
						|  | unique_before = df[col].nunique() | 
					
						
						|  | non_binary = df[~df[col].isin([0, 1])][col].unique() | 
					
						
						|  |  | 
					
						
						|  | if len(non_binary) > 0: | 
					
						
						|  | print(f"\n{col.replace('_', ' ').title()}:") | 
					
						
						|  | print(f"  Found {len(non_binary)} non-binary values: {sorted(non_binary)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df[col] = np.ceil(df[col]).clip(0, 1).astype(int) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | unique_after = df[col].nunique() | 
					
						
						|  | print(f"  Unique values before: {unique_before}") | 
					
						
						|  | print(f"  Unique values after: {unique_after}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nFinal value distribution:") | 
					
						
						|  | print("-" * 50) | 
					
						
						|  | for col in toxicity_cols: | 
					
						
						|  | value_counts = df[col].value_counts().sort_index() | 
					
						
						|  | total = len(df) | 
					
						
						|  | print(f"\n{col.replace('_', ' ').title()}:") | 
					
						
						|  | for val, count in value_counts.items(): | 
					
						
						|  | percentage = (count / total) * 100 | 
					
						
						|  | print(f"  {val}: {count:,} comments ({percentage:.2f}%)") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if output_file is None: | 
					
						
						|  | base, ext = os.path.splitext(input_file) | 
					
						
						|  | output_file = f"{base}_cleaned{ext}" | 
					
						
						|  |  | 
					
						
						|  | os.makedirs(os.path.dirname(output_file), exist_ok=True) | 
					
						
						|  | print(f"\nSaving cleaned dataset to: {output_file}") | 
					
						
						|  | df.to_csv(output_file, index=False) | 
					
						
						|  | print(f"File size: {Path(output_file).stat().st_size / (1024*1024):.1f} MB") | 
					
						
						|  |  | 
					
						
						|  | return df | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | input_file = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG.csv" | 
					
						
						|  | output_file = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary.csv" | 
					
						
						|  |  | 
					
						
						|  | cleaned_df = clean_toxicity_labels(input_file, output_file) |