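"""Balance class distributions across the multilingual toxicity dataset splits.

Starting from the splits in ``dataset/balanced``, this script:

1. Trims excess Turkish toxic samples from the test set so its positive
   ratio moves toward the training set's ratio.
2. Augments English identity-hate samples in the validation set via
   backtranslation (en -> fr -> en) using googletrans.

The final splits and a ``balancing_stats.json`` summary are written to
``dataset/final_balanced``.
"""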
import pandas as pd
from pathlib import Path
import json
import os
from googletrans import Translator
from tqdm import tqdm
import time

def get_class_stats(df, lang, column):
    """Calculate statistics for a specific class and language"""
    lang_df = df[df['lang'] == lang]
    total = int(len(lang_df))
    positive_count = int(lang_df[column].sum())
    return {
        'total': total,
        'positive_count': positive_count,
        'positive_ratio': float(positive_count / total if total > 0 else 0)
    }

def backtranslate_text(text, translator, intermediate_lang='fr'):
    """Backtranslate text via an intermediate language (en -> fr -> en).

    Returns the original text unchanged if any translation call fails.
    """
    try:
        # Pause between requests to avoid rate limiting
        time.sleep(1)
        # Translate to the intermediate language
        intermediate = translator.translate(text, dest=intermediate_lang).text
        # Translate back to English
        time.sleep(1)
        back_to_en = translator.translate(intermediate, dest='en').text
        return back_to_en
    except Exception as e:
        print(f"Translation error: {e}")
        return text

def balance_dataset_distributions(input_dir='dataset/balanced', output_dir='dataset/final_balanced'):
    """Balance Turkish toxic class and augment English identity hate samples"""
    print("\n=== Balancing Dataset Distributions ===\n")
    
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Load datasets
    print("Loading datasets...")
    train_df = pd.read_csv(os.path.join(input_dir, 'train_balanced.csv'))
    val_df = pd.read_csv(os.path.join(input_dir, 'val_balanced.csv'))
    test_df = pd.read_csv(os.path.join(input_dir, 'test_balanced.csv'))
    
    # 1. Fix Turkish Toxic Class Balance
    print("\nInitial Turkish Toxic Distribution:")
    print("-" * 50)
    for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        stats = get_class_stats(df, 'tr', 'toxic')
        print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")
    
    # Remove excess Turkish toxic samples from the test set
    target_ratio = get_class_stats(train_df, 'tr', 'toxic')['positive_ratio']
    current_ratio = get_class_stats(test_df, 'tr', 'toxic')['positive_ratio']
    n_removed = 0
    
    if current_ratio > target_ratio:
        samples_to_remove = 150  # As specified
        print(f"\nRemoving {samples_to_remove} Turkish toxic samples from test set...")
        
        # Identify a reproducible random subset of Turkish toxic samples
        tr_toxic_samples = test_df[
            (test_df['lang'] == 'tr') &
            (test_df['toxic'] > 0)
        ]
        # Guard against requesting more samples than are available
        samples_to_remove = min(samples_to_remove, len(tr_toxic_samples))
        remove_idx = tr_toxic_samples.sample(n=samples_to_remove, random_state=42).index
        test_df = test_df.drop(remove_idx)
        n_removed = len(remove_idx)
    
    # 2. Augment English Identity Hate in Validation
    print("\nInitial English Identity Hate Distribution:")
    print("-" * 50)
    for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
        stats = get_class_stats(df, 'en', 'identity_hate')
        print(f"{name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")
    
    # Select samples for backtranslation
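    # Backtranslation (en -> fr -> en) produces paraphrases of minority-class
    # examples, a common lightweight augmentation technique for text data.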
    print("\nAugmenting English identity hate samples in validation set...")
    en_train_hate = train_df[
        (train_df['lang'] == 'en') & 
        (train_df['identity_hate'] > 0)
    ]
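    # Sample with replacement since fewer than 50 positive examples may exist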
    samples = en_train_hate.sample(n=50, replace=True, random_state=42)
    
    # Initialize translator
    translator = Translator()
    
    # Perform backtranslation
    print("Performing backtranslation (this may take a few minutes)...")
    augmented_samples = []
    for _, row in tqdm(samples.iterrows(), total=len(samples)):
        # Create new sample with backtranslated text
        new_sample = row.copy()
        new_sample['comment_text'] = backtranslate_text(row['comment_text'], translator)
        augmented_samples.append(new_sample)
    
    # Add augmented samples to validation set
    val_df = pd.concat([val_df, pd.DataFrame(augmented_samples)], ignore_index=True)
    
    # Save balanced datasets
    print("\nSaving final balanced datasets...")
    train_df.to_csv(os.path.join(output_dir, 'train_final.csv'), index=False)
    val_df.to_csv(os.path.join(output_dir, 'val_final.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, 'test_final.csv'), index=False)
    
    # Save balancing statistics (distributions are measured after balancing)
    stats = {
        'turkish_toxic': {
            'final_distribution': {
                'train': get_class_stats(train_df, 'tr', 'toxic'),
                'val': get_class_stats(val_df, 'tr', 'toxic'),
                'test': get_class_stats(test_df, 'tr', 'toxic')
            },
            'samples_removed': n_removed
        },
        'english_identity_hate': {
            'final_distribution': {
                'train': get_class_stats(train_df, 'en', 'identity_hate'),
                'val': get_class_stats(val_df, 'en', 'identity_hate'),
                'test': get_class_stats(test_df, 'en', 'identity_hate')
            },
            'samples_added': len(augmented_samples)
        }
    }
    
    with open(os.path.join(output_dir, 'balancing_stats.json'), 'w') as f:
        json.dump(stats, f, indent=2)
    
    return train_df, val_df, test_df

def validate_final_distributions(train_df, val_df, test_df):
    """Validate the final distributions of all classes across languages"""
    print("\nFinal Distribution Validation:")
    print("-" * 50)
    
    classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    languages = sorted(train_df['lang'].unique())
    
    for lang in languages:
        print(f"\n{lang.upper()}:")
        for class_name in classes:
            print(f"\n  {class_name.upper()}:")
            for name, df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
                stats = get_class_stats(df, lang, class_name)
                print(f"    {name}: {stats['positive_count']}/{stats['total']} ({stats['positive_ratio']:.2%})")

if __name__ == "__main__":
    # Requires the synchronous googletrans API:
    #   pip install googletrans==4.0.0-rc1
    
    # Balance datasets
    train_df, val_df, test_df = balance_dataset_distributions()
    
    # Validate final distributions
    validate_final_distributions(train_df, val_df, test_df)