# Source: Hugging Face upload by Deeptanshuu ("Upload folder using huggingface_hub", commit d187b57, verified)
import pandas as pd
import numpy as np
from pathlib import Path
import json
import os
from googletrans import Translator
from tqdm import tqdm
import time
def get_class_stats(df, lang, column):
    """Summarize one binary label for one language subset of *df*.

    Args:
        df: DataFrame with a 'lang' column and binary label columns.
        lang: Language code to filter rows on (e.g. 'en', 'tr').
        column: Name of the binary label column to summarize.

    Returns:
        Dict with 'total' (row count for the language), 'positive_count'
        (sum of the label column), and 'positive_ratio' (0.0 when empty).
    """
    subset = df[df['lang'] == lang]
    n_rows = int(len(subset))
    n_positive = int(subset[column].sum())
    ratio = n_positive / n_rows if n_rows > 0 else 0
    return {
        'total': n_rows,
        'positive_count': n_positive,
        'positive_ratio': float(ratio),
    }
def backtranslate_text(text, translator, intermediate_lang='fr', delay=1.0):
    """Backtranslate *text* (en -> intermediate -> en) for data augmentation.

    Args:
        text: English text to augment.
        translator: Object exposing translate(text, dest=...) that returns an
            object with a .text attribute (e.g. googletrans.Translator).
        intermediate_lang: Pivot language code for the round trip.
        delay: Seconds to sleep before each translation call to avoid
            rate limiting. Defaults to 1.0 (previous hard-coded value);
            set to 0 to disable, e.g. in tests.

    Returns:
        The backtranslated text, or the original *text* if translation fails.
    """
    try:
        # Delay before each request to avoid rate limiting.
        time.sleep(delay)
        # Translate to intermediate language.
        intermediate = translator.translate(text, dest=intermediate_lang).text
        # Translate back to English.
        time.sleep(delay)
        back_to_en = translator.translate(intermediate, dest='en').text
        return back_to_en
    except Exception as e:
        # Best-effort: keep the original sample rather than losing it.
        print(f"Translation error: {str(e)}")
        return text
def balance_dataset_distributions(input_dir='dataset/balanced', output_dir='dataset/final_balanced'):
    """Balance Turkish toxic class and augment English identity hate samples.

    Reads {train,val,test}_balanced.csv from *input_dir*, removes excess
    Turkish toxic samples from the test split, augments English identity-hate
    samples in the validation split via backtranslation, then writes
    {train,val,test}_final.csv and balancing_stats.json to *output_dir*.

    Args:
        input_dir: Directory holding the balanced input CSVs.
        output_dir: Output directory (created if missing).

    Returns:
        Tuple of (train_df, val_df, test_df) after balancing.
    """
    print("\n=== Balancing Dataset Distributions ===\n")
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Load datasets
    print("Loading datasets...")
    train_df = pd.read_csv(os.path.join(input_dir, 'train_balanced.csv'))
    val_df = pd.read_csv(os.path.join(input_dir, 'val_balanced.csv'))
    test_df = pd.read_csv(os.path.join(input_dir, 'test_balanced.csv'))
    # Snapshot pre-modification distributions so the saved stats really are
    # the originals (previously they were computed AFTER editing the splits).
    original_tr_toxic = {
        'train': get_class_stats(train_df, 'tr', 'toxic'),
        'val': get_class_stats(val_df, 'tr', 'toxic'),
        'test': get_class_stats(test_df, 'tr', 'toxic')
    }
    original_en_hate = {
        'train': get_class_stats(train_df, 'en', 'identity_hate'),
        'val': get_class_stats(val_df, 'en', 'identity_hate'),
        'test': get_class_stats(test_df, 'en', 'identity_hate')
    }
    # 1. Fix Turkish Toxic Class Balance
    print("\nInitial Turkish Toxic Distribution:")
    print("-" * 50)
    for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        stats = get_class_stats(df, 'tr', 'toxic')
        print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")
    # Remove excess Turkish toxic samples from test only when test is
    # over-represented relative to train.
    target_ratio = original_tr_toxic['train']['positive_ratio']
    current_ratio = original_tr_toxic['test']['positive_ratio']
    samples_removed = 0
    if current_ratio > target_ratio:
        tr_toxic_samples = test_df[
            (test_df['lang'] == 'tr') &
            (test_df['toxic'] > 0)
        ]
        # Cap at the available population: sample(n=...) raises ValueError
        # when n exceeds the number of rows (without replacement).
        samples_removed = min(150, len(tr_toxic_samples))
        print(f"\nRemoving {samples_removed} Turkish toxic samples from test set...")
        # Seed so the removed rows are reproducible across runs.
        np.random.seed(42)
        remove_idx = tr_toxic_samples.sample(n=samples_removed).index
        test_df = test_df.drop(remove_idx)
    # 2. Augment English Identity Hate in Validation
    print("\nInitial English Identity Hate Distribution:")
    print("-" * 50)
    for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        stats = get_class_stats(df, 'en', 'identity_hate')
        print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")
    # Select samples for backtranslation
    print("\nAugmenting English identity hate samples in validation set...")
    en_train_hate = train_df[
        (train_df['lang'] == 'en') &
        (train_df['identity_hate'] > 0)
    ]
    augmented_samples = []
    if en_train_hate.empty:
        # sample() on an empty frame would raise; skip augmentation instead.
        print("No English identity hate samples available; skipping augmentation.")
    else:
        samples = en_train_hate.sample(n=50, replace=True, random_state=42)
        # Initialize translator
        translator = Translator()
        # Perform backtranslation
        print("Performing backtranslation (this may take a few minutes)...")
        for _, row in tqdm(samples.iterrows(), total=len(samples)):
            # Create new sample with backtranslated text
            new_sample = row.copy()
            new_sample['comment_text'] = backtranslate_text(row['comment_text'], translator)
            augmented_samples.append(new_sample)
        # Add augmented samples to validation set
        val_df = pd.concat([val_df, pd.DataFrame(augmented_samples)], ignore_index=True)
    # Save balanced datasets
    print("\nSaving final balanced datasets...")
    train_df.to_csv(os.path.join(output_dir, 'train_final.csv'), index=False)
    val_df.to_csv(os.path.join(output_dir, 'val_final.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test_final.csv'), index=False)
    # Save balancing statistics; counts reflect what actually happened rather
    # than the hard-coded 150/50 the old code recorded unconditionally.
    stats = {
        'turkish_toxic': {
            'original_distribution': original_tr_toxic,
            'samples_removed': samples_removed
        },
        'english_identity_hate': {
            'original_distribution': original_en_hate,
            'samples_added': len(augmented_samples)
        }
    }
    with open(os.path.join(output_dir, 'balancing_stats.json'), 'w') as f:
        json.dump(stats, f, indent=2)
    return train_df, val_df, test_df
def validate_final_distributions(train_df, val_df, test_df):
    """Print positive counts/ratios per language and class for every split.

    Args:
        train_df, val_df, test_df: Split DataFrames with a 'lang' column and
            the six binary toxicity label columns.
    """
    print("\nFinal Distribution Validation:")
    print("-" * 50)
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    splits = (('Train', train_df), ('Val', val_df), ('Test', test_df))
    # Languages come from the train split; report every label for each one.
    for language in sorted(train_df['lang'].unique()):
        print(f"\n{language.upper()}:")
        for label in label_columns:
            print(f"\n  {label.upper()}:")
            for split_name, split_df in splits:
                split_stats = get_class_stats(split_df, language, label)
                print(f"    {split_name}: {split_stats['positive_count']}/{split_stats['total']} ({split_stats['positive_ratio']:.2%})")
if __name__ == "__main__":
# First install required package if not already installed
# !pip install googletrans==4.0.0-rc1
# Balance datasets
train_df, val_df, test_df = balance_dataset_distributions()
# Validate final distributions
validate_final_distributions(train_df, val_df, test_df)