|  | import pandas as pd | 
					
						
						|  | import numpy as np | 
					
						
						|  | from pathlib import Path | 
					
						
						|  | import json | 
					
						
						|  | import os | 
					
						
						|  | from googletrans import Translator | 
					
						
						|  | from tqdm import tqdm | 
					
						
						|  | import time | 
					
						
						|  |  | 
					
						
						|  | def get_class_stats(df, lang, column): | 
					
						
						|  | """Calculate statistics for a specific class and language""" | 
					
						
						|  | lang_df = df[df['lang'] == lang] | 
					
						
						|  | total = int(len(lang_df)) | 
					
						
						|  | positive_count = int(lang_df[column].sum()) | 
					
						
						|  | return { | 
					
						
						|  | 'total': total, | 
					
						
						|  | 'positive_count': positive_count, | 
					
						
						|  | 'positive_ratio': float(positive_count / total if total > 0 else 0) | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | def backtranslate_text(text, translator, intermediate_lang='fr'): | 
					
						
						|  | """Backtranslate text using an intermediate language""" | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  | time.sleep(1) | 
					
						
						|  |  | 
					
						
						|  | intermediate = translator.translate(text, dest=intermediate_lang).text | 
					
						
						|  |  | 
					
						
						|  | time.sleep(1) | 
					
						
						|  | back_to_en = translator.translate(intermediate, dest='en').text | 
					
						
						|  | return back_to_en | 
					
						
						|  | except Exception as e: | 
					
						
						|  | print(f"Translation error: {str(e)}") | 
					
						
						|  | return text | 
					
						
						|  |  | 
					
						
						|  | def balance_dataset_distributions(input_dir='dataset/balanced', output_dir='dataset/final_balanced'): | 
					
						
						|  | """Balance Turkish toxic class and augment English identity hate samples""" | 
					
						
						|  | print("\n=== Balancing Dataset Distributions ===\n") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | Path(output_dir).mkdir(parents=True, exist_ok=True) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("Loading datasets...") | 
					
						
						|  | train_df = pd.read_csv(os.path.join(input_dir, 'train_balanced.csv')) | 
					
						
						|  | val_df = pd.read_csv(os.path.join(input_dir, 'val_balanced.csv')) | 
					
						
						|  | test_df = pd.read_csv(os.path.join(input_dir, 'test_balanced.csv')) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nInitial Turkish Toxic Distribution:") | 
					
						
						|  | print("-" * 50) | 
					
						
						|  | for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]: | 
					
						
						|  | stats = get_class_stats(df, 'tr', 'toxic') | 
					
						
						|  | print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | tr_test = test_df[test_df['lang'] == 'tr'] | 
					
						
						|  | target_ratio = get_class_stats(train_df, 'tr', 'toxic')['positive_ratio'] | 
					
						
						|  | current_ratio = get_class_stats(test_df, 'tr', 'toxic')['positive_ratio'] | 
					
						
						|  |  | 
					
						
						|  | if current_ratio > target_ratio: | 
					
						
						|  | samples_to_remove = 150 | 
					
						
						|  | print(f"\nRemoving {samples_to_remove} Turkish toxic samples from test set...") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | np.random.seed(42) | 
					
						
						|  | tr_toxic_samples = test_df[ | 
					
						
						|  | (test_df['lang'] == 'tr') & | 
					
						
						|  | (test_df['toxic'] > 0) | 
					
						
						|  | ] | 
					
						
						|  | remove_idx = tr_toxic_samples.sample(n=samples_to_remove).index | 
					
						
						|  | test_df = test_df.drop(remove_idx) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nInitial English Identity Hate Distribution:") | 
					
						
						|  | print("-" * 50) | 
					
						
						|  | for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]: | 
					
						
						|  | stats = get_class_stats(df, 'en', 'identity_hate') | 
					
						
						|  | print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nAugmenting English identity hate samples in validation set...") | 
					
						
						|  | en_train_hate = train_df[ | 
					
						
						|  | (train_df['lang'] == 'en') & | 
					
						
						|  | (train_df['identity_hate'] > 0) | 
					
						
						|  | ] | 
					
						
						|  | samples = en_train_hate.sample(n=50, replace=True, random_state=42) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | translator = Translator() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("Performing backtranslation (this may take a few minutes)...") | 
					
						
						|  | augmented_samples = [] | 
					
						
						|  | for _, row in tqdm(samples.iterrows(), total=len(samples)): | 
					
						
						|  |  | 
					
						
						|  | new_sample = row.copy() | 
					
						
						|  | new_sample['comment_text'] = backtranslate_text(row['comment_text'], translator) | 
					
						
						|  | augmented_samples.append(new_sample) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | val_df = pd.concat([val_df, pd.DataFrame(augmented_samples)], ignore_index=True) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | print("\nSaving final balanced datasets...") | 
					
						
						|  | train_df.to_csv(os.path.join(output_dir, 'train_final.csv'), index=False) | 
					
						
						|  | val_df.to_csv(os.path.join(output_dir, 'val_final.csv'), index=False) | 
					
						
						|  | test_df.to_csv(os.path.join(output_dir, 'test_final.csv'), index=False) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | stats = { | 
					
						
						|  | 'turkish_toxic': { | 
					
						
						|  | 'original_distribution': { | 
					
						
						|  | 'train': get_class_stats(train_df, 'tr', 'toxic'), | 
					
						
						|  | 'val': get_class_stats(val_df, 'tr', 'toxic'), | 
					
						
						|  | 'test': get_class_stats(test_df, 'tr', 'toxic') | 
					
						
						|  | }, | 
					
						
						|  | 'samples_removed': 150 | 
					
						
						|  | }, | 
					
						
						|  | 'english_identity_hate': { | 
					
						
						|  | 'original_distribution': { | 
					
						
						|  | 'train': get_class_stats(train_df, 'en', 'identity_hate'), | 
					
						
						|  | 'val': get_class_stats(val_df, 'en', 'identity_hate'), | 
					
						
						|  | 'test': get_class_stats(test_df, 'en', 'identity_hate') | 
					
						
						|  | }, | 
					
						
						|  | 'samples_added': 50 | 
					
						
						|  | } | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | with open(os.path.join(output_dir, 'balancing_stats.json'), 'w') as f: | 
					
						
						|  | json.dump(stats, f, indent=2) | 
					
						
						|  |  | 
					
						
						|  | return train_df, val_df, test_df | 
					
						
						|  |  | 
					
						
						|  | def validate_final_distributions(train_df, val_df, test_df): | 
					
						
						|  | """Validate the final distributions of all classes across languages""" | 
					
						
						|  | print("\nFinal Distribution Validation:") | 
					
						
						|  | print("-" * 50) | 
					
						
						|  |  | 
					
						
						|  | classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] | 
					
						
						|  | languages = sorted(train_df['lang'].unique()) | 
					
						
						|  |  | 
					
						
						|  | for lang in languages: | 
					
						
						|  | print(f"\n{lang.upper()}:") | 
					
						
						|  | for class_name in classes: | 
					
						
						|  | print(f"\n  {class_name.upper()}:") | 
					
						
						|  | for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]: | 
					
						
						|  | stats = get_class_stats(df, lang, class_name) | 
					
						
						|  | print(f"    {name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})") | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | train_df, val_df, test_df = balance_dataset_distributions() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | validate_final_distributions(train_df, val_df, test_df) |