Multilingual_Toxic_Comment_Classifier / utils / remove_english.py

Upload folder using huggingface_hub

d187b57 verified 3 months ago

1.9 kB

	import pandas as pd
	from pathlib import Path
	import sys
	from tqdm import tqdm

	def remove_english_comments(input_path, output_path=None):
	    """Write a copy of a CSV dataset with its English rows removed.

	    The input CSV is read as UTF-8, every row whose ``lang`` value is
	    ``en`` (ignoring case and surrounding whitespace) is dropped, and the
	    remaining rows are written to ``output_path`` as UTF-8 CSV without the
	    index. Progress messages and summary statistics are printed to stdout.

	    Args:
	        input_path: Path of the CSV file to read. It must contain a
	            ``lang`` column.
	        output_path: Path of the CSV file to write. When ``None``, the
	            output is placed next to the input as
	            ``<input stem>_non_english.csv``.

	    Raises:
	        SystemExit: With exit code 1 if reading, filtering or writing
	            fails for any reason; the error message is printed first.
	    """
	    print(f"\nReading input file: {input_path}")

	    # If no output path specified, use input name with _non_english suffix
	    if output_path is None:
	        source = Path(input_path)
	        output_path = str(source.with_name(f"{source.stem}_non_english.csv"))

	    try:
	        # Read input file with UTF-8 encoding
	        df = pd.read_csv(input_path, encoding='utf-8')
	        total_rows = len(df)

	        print(f"\nDataset Info:")
	        print(f"Initial Rows: {total_rows:,}")
	        print(f"Columns: {', '.join(df.columns)}")

	        # Fail with a readable message instead of a bare KeyError repr.
	        if 'lang' not in df.columns:
	            raise ValueError("input file has no 'lang' column")

	        # Filter out English comments. Codes are normalised first so that
	        # variants such as 'EN' or ' en ' are also recognised as English;
	        # missing values become the string 'nan' and are therefore kept.
	        print("\nFiltering out English comments...")
	        lang_codes = df['lang'].astype(str).str.strip().str.lower()
	        non_english_df = df[lang_codes != 'en']

	        # Save to CSV with UTF-8 encoding
	        print(f"\nSaving to: {output_path}")
	        non_english_df.to_csv(output_path, index=False, encoding='utf-8')

	        # Get statistics
	        remaining_rows = len(non_english_df)
	        english_rows = total_rows - remaining_rows

	        print(f"\n✓ Successfully removed English comments")
	        print(f"Initial rows: {total_rows:,}")
	        print(f"Remaining non-English rows: {remaining_rows:,}")
	        print(f"Removed English rows: {english_rows:,}")
	        print(f"Output file: {output_path}")
	        print(f"Output file size: {Path(output_path).stat().st_size / (1024*1024):.1f} MB")

	    except Exception as e:
	        # Script-style boundary: report the problem and stop with a
	        # non-zero status rather than propagating a traceback.
	        print(f"\n❌ Error: {str(e)}")
	        sys.exit(1)

	if __name__ == "__main__":
	    # Usage: python remove_english.py [INPUT_CSV [OUTPUT_CSV]]
	    # With no arguments the script processes the project's raw dataset.
	    cli_args = sys.argv[1:]
	    input_path = (
	        cli_args[0]
	        if cli_args
	        else "dataset/raw/MULTILINGUAL_TOXIC_DATASET_347k_7LANG.csv"
	    )
	    # Passing None lets remove_english_comments derive
	    # "<input stem>_non_english.csv" next to the input, which avoids
	    # str.replace rewriting every ".csv" occurrence in the path (or
	    # returning the input path unchanged when there is none).
	    output_path = cli_args[1] if len(cli_args) > 1 else None

	    remove_english_comments(input_path, output_path)