import logging
from pathlib import Path

import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(message)s'
)
logger = logging.getLogger(__name__)

def merge_datasets():
    """Merge augmented threat dataset with main dataset"""
    try:
        # Load main dataset
        logger.info("Loading main dataset...")
        main_df = pd.read_csv("dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv")
        logger.info(f"Main dataset: {len(main_df):,} rows")
        
        # Load the most recent augmented dataset (timestamped filenames sort lexicographically)
        augmented_path = Path("dataset/augmented")
        augmented_files = sorted(augmented_path.glob("threat_augmented_*.csv"))
        if not augmented_files:
            raise FileNotFoundError(f"No threat_augmented_*.csv files found in {augmented_path}")
        latest_augmented = augmented_files[-1]
        logger.info(f"Loading augmented dataset: {latest_augmented.name}")
        aug_df = pd.read_csv(latest_augmented)
        logger.info(f"Augmented dataset: {len(aug_df):,} rows")
        
        # Standardize columns for augmented data
        logger.info("Standardizing columns...")
        aug_df_standardized = pd.DataFrame({
            'comment_text': aug_df['text'],
            'toxic': 1,
            'severe_toxic': 0,
            'obscene': 0,
            'threat': 1,
            'insult': 0,
            'identity_hate': 0,
            'lang': 'en'
        })
        
        # Check for duplicates between datasets
        logger.info("Checking for duplicates...")
        combined_texts = pd.concat(
            [main_df['comment_text'], aug_df_standardized['comment_text']],
            ignore_index=True
        )
        # keep='first' flags augmented rows that repeat main-dataset text
        # (or earlier augmented text) as duplicates
        duplicates = combined_texts.duplicated(keep='first')
        aug_duplicates = duplicates.iloc[len(main_df):].to_numpy()
        logger.info(f"Found {aug_duplicates.sum()} duplicates in augmented data")

        # Remove duplicates from augmented data
        aug_df_standardized = aug_df_standardized[~aug_duplicates]
        logger.info(f"Augmented dataset after duplicate removal: {len(aug_df_standardized):,} rows")
        
        # Merge datasets
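        # ignore_index=True re-numbers the merged rows 0..N-1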
        merged_df = pd.concat([main_df, aug_df_standardized], ignore_index=True)
        logger.info(f"Final merged dataset: {len(merged_df):,} rows")
        
        # Save merged dataset
        output_path = f"dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv"
        merged_df.to_csv(output_path, index=False)
        logger.info(f"Saved merged dataset to: {output_path}")
        
        # Print statistics
        logger.info("\nDataset Statistics:")
        logger.info(f"Original samples: {len(main_df):,}")
        logger.info(f"Added threat samples: {len(aug_df_standardized):,}")
        logger.info(f"Total samples: {len(merged_df):,}")
        logger.info(f"Threat samples in final dataset: {merged_df['threat'].sum():,}")
        
        return merged_df
        
    except Exception as e:
        logger.error(f"Error merging datasets: {str(e)}")
        raise

if __name__ == "__main__":
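    # Run from the repository root so the relative dataset/ paths resolve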
    merged_df = merge_datasets()