|
import pandas as pd |
|
from pathlib import Path |
|
import sys |
|
from tqdm import tqdm |
|
|
|
def remove_english_comments(input_path, output_path=None):
    """Write a copy of a CSV dataset with the English rows removed.

    Reads ``input_path`` with pandas, drops every row whose ``lang`` column
    is exactly ``'en'``, writes the remaining rows to ``output_path`` and
    prints a summary (row counts, output location and size) to stdout.

    Args:
        input_path: Path of the UTF-8 encoded CSV file to read. It must
            contain a ``lang`` column.
        output_path: Path of the CSV file to write. When ``None``, the
            output is placed next to the input and named
            ``<input stem>_non_english.csv``.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 1 when reading, filtering or writing
            fails (including a missing ``lang`` column). The error message
            is printed to stdout before exiting.
    """
    print(f"\nReading input file: {input_path}")

    if output_path is None:
        source = Path(input_path)
        # with_name() replaces the whole final component, so the old suffix
        # does not need to be stripped first.
        output_path = str(source.with_name(f"{source.stem}_non_english.csv"))

    try:
        df = pd.read_csv(input_path, encoding='utf-8')
        total_rows = len(df)

        print("\nDataset Info:")
        print(f"Initial Rows: {total_rows:,}")
        print(f"Columns: {', '.join(df.columns)}")

        # Fail with an explicit message instead of the bare KeyError text
        # ("'lang'") that indexing a missing column would produce.
        if 'lang' not in df.columns:
            raise ValueError(
                "Input file has no 'lang' column; "
                "cannot identify English comments"
            )

        print("\nFiltering out English comments...")
        # Exact, case-sensitive match; rows with a missing lang value are kept.
        non_english_df = df[df['lang'] != 'en']
        remaining_rows = len(non_english_df)

        print(f"\nSaving to: {output_path}")
        non_english_df.to_csv(output_path, index=False, encoding='utf-8')

        english_rows = total_rows - remaining_rows
        size_mb = Path(output_path).stat().st_size / (1024 * 1024)

        print("\n✓ Successfully removed English comments")
        print(f"Initial rows: {total_rows:,}")
        print(f"Remaining non-English rows: {remaining_rows:,}")
        print(f"Removed English rows: {english_rows:,}")
        print(f"Output file: {output_path}")
        print(f"Output file size: {size_mb:.1f} MB")

    except Exception as e:
        # Script-level boundary: report the failure and exit non-zero so
        # shell callers can detect it.
        print(f"\n❌ Error: {e}")
        sys.exit(1)
|
|
|
if __name__ == "__main__":
    # Raw multilingual dataset; the filtered copy is written alongside it
    # with "_non_english" inserted before the ".csv" extension.
    raw_csv = "dataset/raw/MULTILINGUAL_TOXIC_DATASET_347k_7LANG.csv"

    remove_english_comments(
        raw_csv,
        raw_csv.replace(".csv", "_non_english.csv"),
    )