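"""Balance class distributions in a multilingual toxicity dataset.

Removes excess Turkish toxic samples from the test split and augments
English identity-hate samples in the validation split via googletrans
backtranslation (en -> intermediate language -> en).
"""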
import pandas as pd
import numpy as np
from pathlib import Path
import json
import os
from googletrans import Translator
from tqdm import tqdm
import time


def get_class_stats(df, lang, column):
    """Calculate statistics for a specific class and language."""
    lang_df = df[df['lang'] == lang]
    total = int(len(lang_df))
    positive_count = int(lang_df[column].sum())
    return {
        'total': total,
        'positive_count': positive_count,
        'positive_ratio': float(positive_count / total if total > 0 else 0)
    }
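
# Illustrative usage (the numbers below are hypothetical):
#   get_class_stats(train_df, 'en', 'toxic')
#   -> {'total': 10000, 'positive_count': 950, 'positive_ratio': 0.095}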


def backtranslate_text(text, translator, intermediate_lang='fr'):
    """Backtranslate text using an intermediate language."""
    try:
        # Add delay to avoid rate limiting
        time.sleep(1)
        # Translate to intermediate language
        intermediate = translator.translate(text, dest=intermediate_lang).text
        # Translate back to English
        time.sleep(1)
        back_to_en = translator.translate(intermediate, dest='en').text
        return back_to_en
    except Exception as e:
        # Fall back to the original, untranslated text on failure
        print(f"Translation error: {e}")
        return text
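
# Note: googletrans 4.0.0-rc1 talks to an unofficial Google Translate
# endpoint, so requests can be throttled or fail intermittently; hence the
# sleeps between calls and the fall-back to the original text above.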


def balance_dataset_distributions(input_dir='dataset/balanced', output_dir='dataset/final_balanced'):
    """Balance Turkish toxic class and augment English identity hate samples."""
    print("\n=== Balancing Dataset Distributions ===\n")

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Load datasets
    print("Loading datasets...")
    train_df = pd.read_csv(os.path.join(input_dir, 'train_balanced.csv'))
    val_df = pd.read_csv(os.path.join(input_dir, 'val_balanced.csv'))
    test_df = pd.read_csv(os.path.join(input_dir, 'test_balanced.csv'))
    # 1. Fix Turkish toxic class balance
    print("\nInitial Turkish Toxic Distribution:")
    print("-" * 50)
    for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        stats = get_class_stats(df, 'tr', 'toxic')
        print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")

    # Remove excess Turkish toxic samples from the test set so its positive
    # ratio moves toward the training-set ratio
    target_ratio = get_class_stats(train_df, 'tr', 'toxic')['positive_ratio']
    current_ratio = get_class_stats(test_df, 'tr', 'toxic')['positive_ratio']
    if current_ratio > target_ratio:
        samples_to_remove = 150  # As specified
        print(f"\nRemoving {samples_to_remove} Turkish toxic samples from test set...")
        # Identify and remove samples reproducibly
        np.random.seed(42)
        tr_toxic_samples = test_df[
            (test_df['lang'] == 'tr') &
            (test_df['toxic'] > 0)
        ]
        remove_idx = tr_toxic_samples.sample(n=samples_to_remove).index
        test_df = test_df.drop(remove_idx)
    # 2. Augment English identity hate in validation
    print("\nInitial English Identity Hate Distribution:")
    print("-" * 50)
    for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        stats = get_class_stats(df, 'en', 'identity_hate')
        print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")

    # Select samples for backtranslation
    print("\nAugmenting English identity hate samples in validation set...")
    en_train_hate = train_df[
        (train_df['lang'] == 'en') &
        (train_df['identity_hate'] > 0)
    ]
    samples = en_train_hate.sample(n=50, replace=True, random_state=42)

    # Initialize translator
    translator = Translator()

    # Perform backtranslation
    print("Performing backtranslation (this may take a few minutes)...")
    augmented_samples = []
    for _, row in tqdm(samples.iterrows(), total=len(samples)):
        # Create new sample with backtranslated text
        new_sample = row.copy()
        new_sample['comment_text'] = backtranslate_text(row['comment_text'], translator)
        augmented_samples.append(new_sample)

    # Add augmented samples to validation set
    val_df = pd.concat([val_df, pd.DataFrame(augmented_samples)], ignore_index=True)
    # Save balanced datasets
    print("\nSaving final balanced datasets...")
    train_df.to_csv(os.path.join(output_dir, 'train_final.csv'), index=False)
    val_df.to_csv(os.path.join(output_dir, 'val_final.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test_final.csv'), index=False)

    # Save balancing statistics (computed on the datasets after balancing)
    stats = {
        'turkish_toxic': {
            'final_distribution': {
                'train': get_class_stats(train_df, 'tr', 'toxic'),
                'val': get_class_stats(val_df, 'tr', 'toxic'),
                'test': get_class_stats(test_df, 'tr', 'toxic')
            },
            'samples_removed': 150
        },
        'english_identity_hate': {
            'final_distribution': {
                'train': get_class_stats(train_df, 'en', 'identity_hate'),
                'val': get_class_stats(val_df, 'en', 'identity_hate'),
                'test': get_class_stats(test_df, 'en', 'identity_hate')
            },
            'samples_added': 50
        }
    }
    with open(os.path.join(output_dir, 'balancing_stats.json'), 'w') as f:
        json.dump(stats, f, indent=2)

    return train_df, val_df, test_df


def validate_final_distributions(train_df, val_df, test_df):
    """Validate the final distributions of all classes across languages."""
    print("\nFinal Distribution Validation:")
    print("-" * 50)
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    languages = sorted(train_df['lang'].unique())
    for lang in languages:
        print(f"\n{lang.upper()}:")
        for class_name in classes:
            print(f"\n  {class_name.upper()}:")
            for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
                stats = get_class_stats(df, lang, class_name)
                print(f"    {name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")


if __name__ == "__main__":
    # First install the required package if not already installed
    # !pip install googletrans==4.0.0-rc1

    # Balance datasets
    train_df, val_df, test_df = balance_dataset_distributions()

    # Validate final distributions
    validate_final_distributions(train_df, val_df, test_df)