"""Add meaningful, human-readable IDs to a multilingual toxicity dataset.

Each ID encodes the comment's language, its toxicity labels, and a short
content hash, so a row's annotations can be read off its ID at a glance.
"""

import pandas as pd
from pathlib import Path
import os
import hashlib
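
# MD5 serves here only as a cheap content fingerprint, not for security;
# six hex characters keep IDs short at the cost of some collision risk,
# which the duplicate-ID check in add_dataset_ids() handles.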


def generate_comment_id(row, toxicity_cols):
    """Generate a unique ID encoding language and toxicity information."""
    # One character per label: '1' if the toxicity flag is set, '0' otherwise.
    tox_code = ''.join(['1' if row[col] > 0 else '0' for col in toxicity_cols])
    # Short content fingerprint; str() guards against non-string values (NaN).
    text_hash = hashlib.md5(str(row['comment_text']).encode()).hexdigest()[:6]
    return f"{row['lang']}_{tox_code}_{text_hash}"
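
# Illustrative example (hypothetical hash value): an English comment flagged
# as toxic and insult maps to an ID like "en_100010_a3f9c1" -- language code,
# one bit per label in toxicity_cols order, then the first six hex characters
# of the comment text's MD5 digest.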


def add_dataset_ids(input_file, output_file=None):
    """Add meaningful IDs to the dataset and save a copy with an 'id' column."""
    print(f"\nReading dataset: {input_file}")
    df = pd.read_csv(input_file)

    total_rows = len(df)
    print(f"\nInitial dataset size: {total_rows:,} comments")

    # Binary label columns, in the order they are encoded into the ID bitmask.
    toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    print("\nGenerating IDs...")
    df['id'] = df.apply(lambda row: generate_comment_id(row, toxicity_cols), axis=1)
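
    # Note: df.apply(..., axis=1) is row-wise and slow on a ~360K-row frame.
    # A partially vectorized sketch of the same construction (an alternative,
    # not the script's method):
    #   bits = df[toxicity_cols].gt(0).astype(int).astype(str).apply(''.join, axis=1)
    #   hashes = df['comment_text'].astype(str).map(
    #       lambda t: hashlib.md5(t.encode()).hexdigest()[:6])
    #   df['id'] = df['lang'] + '_' + bits + '_' + hashes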

    unique_ids = df['id'].nunique()
    print(f"\nGenerated {unique_ids:,} unique IDs")

    if unique_ids < total_rows:
        print(f"Warning: {total_rows - unique_ids:,} duplicate IDs found")
        # cumcount() numbers repeated IDs 0, 1, 2, ... within each group;
        # appending it makes every ID unique (first occurrences get '_0' too).
        df['id'] = df['id'] + '_' + df.groupby('id').cumcount().astype(str)
        print("Added suffixes to make IDs unique")

    print("\nSample IDs by language:")
    print("-" * 50)
    for lang in df['lang'].unique():
        lang_df = df[df['lang'] == lang]
        lang_sample = lang_df.sample(n=min(3, len(lang_df)), random_state=42)
        print(f"\n{lang.upper()}:")
        for _, row in lang_sample.iterrows():
            tox_types = [col for col in toxicity_cols if row[col] > 0]
            print(f"ID: {row['id']}")
            print(f"Toxicity: {', '.join(tox_types) if tox_types else 'None'}")
            print(f"Text: {str(row['comment_text'])[:100]}...")

    # Put the new 'id' column first in the saved CSV.
    cols = ['id'] + [col for col in df.columns if col != 'id']
    df = df[cols]

    if output_file is None:
        base, ext = os.path.splitext(input_file)
        output_file = f"{base}_with_ids{ext}"

    # os.path.dirname() is '' for a bare filename and os.makedirs('') raises,
    # so only create the directory when there is one.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    print(f"\nSaving dataset with IDs to: {output_file}")
    df.to_csv(output_file, index=False)
    print(f"File size: {Path(output_file).stat().st_size / (1024 * 1024):.1f} MB")

    return df


if __name__ == "__main__":
    input_file = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary.csv"
    output_file = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_binary_with_ids.csv"

    df_with_ids = add_dataset_ids(input_file, output_file)
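
# Usage sketch: run this file directly; the paths above assume the repo's
# dataset/ layout. The output CSV matches the input except for the new leading
# 'id' column. With output_file=None the result is written next to the input
# as <input>_with_ids.csv.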