|
import pandas as pd |
|
import re |
|
from bs4 import BeautifulSoup |
|
from tqdm import tqdm |
|
import logging |
|
from pathlib import Path |
|
|
|
# Patterns are compiled once at import time because clean_text runs per row.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,!?-]')
_REPEATED_PUNCT_RE = re.compile(r'([.,!?])\1+')
_SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([.,!?])')


def clean_text(text):
    """Clean text by removing URLs, HTML tags, and special characters.

    Processing steps, in order:
      1. Convert the input to ``str``.
      2. Remove ``http://`` / ``https://`` URLs.
      3. Strip HTML markup with BeautifulSoup's ``html.parser``.
      4. Replace every character that is not a word character, whitespace,
         or one of ``. , ! ? -`` with a space.
      5. Collapse runs of whitespace into single spaces.
      6. Collapse repeated punctuation (``!!!`` becomes ``!``).
      7. Remove whitespace that precedes punctuation.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            Missing values (``None``, NaN, ``pd.NA``, ``NaT``) are treated as
            empty text.

    Returns:
        The cleaned, stripped string. Returns ``''`` for missing values. If
        any step raises, the error is logged and the text as it stood when
        the failure occurred is returned.
    """
    # A missing value must not be stringified into the literal text
    # "None"/"nan": that would look like a real comment and would survive
    # any later filtering of empty rows.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ''

    try:
        text = str(text)

        # URLs are removed before HTML parsing so their contents are not
        # kept as visible text.
        text = _URL_RE.sub('', text)

        text = BeautifulSoup(text, "html.parser").get_text()

        text = _DISALLOWED_CHARS_RE.sub(' ', text)

        # split()/join normalises every whitespace run to one space.
        text = ' '.join(text.split())

        text = _REPEATED_PUNCT_RE.sub(r'\1', text)

        text = _SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)

        return text.strip()

    except Exception as e:
        # Best effort: keep whatever was produced so far rather than
        # aborting the whole dataset run on one bad value.
        logging.error(f"Error cleaning text: {str(e)}")
        return text
|
|
|
def try_read_csv(file_path):
    """Try different encodings to read the CSV file.

    Encodings are attempted in order: UTF-8, then Windows-1252, then Latin-1.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The ``DataFrame`` produced by the first encoding that reads the file
        without raising.

    Raises:
        ValueError: If every attempt fails. The last underlying exception is
            attached as ``__cause__`` so the real reason (for example a
            missing file) is not lost.
    """
    # Order matters: latin1 maps every possible byte, so it never raises
    # UnicodeDecodeError and anything listed after it can never be reached
    # for a decoding failure. cp1252 therefore has to come first.
    # 'iso-8859-1' is an alias of latin1 and is omitted.
    encodings = ['utf-8', 'cp1252', 'latin1']
    last_error = None

    for encoding in encodings:
        try:
            print(f"Trying {encoding} encoding...")
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as e:
            # Wrong encoding guess: silently move on to the next candidate.
            last_error = e
            continue
        except Exception as e:
            print(f"Error with {encoding}: {str(e)}")
            last_error = e
            continue

    raise ValueError(
        "Could not read file with any of the attempted encodings"
    ) from last_error
|
|
|
def clean_dataset(input_path, output_path=None):
    """Clean comment text in a dataset and write the result to a CSV file.

    The file is loaded with ``try_read_csv``. The ``comment_text`` column
    (or, if absent, the first column whose name contains "text" or
    "comment") is passed through ``clean_text``. Rows whose cleaned text is
    empty are dropped, and the result is saved as UTF-8 CSV without the
    index. Progress, summary statistics and up to three before/after samples
    are printed.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Destination CSV path. Defaults to
            ``<input stem>_cleaned.csv`` in the same directory as the input.

    Returns:
        None. Errors raised during processing are caught and reported on
        stdout rather than propagated.
    """
    print(f"\nReading input file: {input_path}")

    # Default output lives next to the input: "<stem>_cleaned.csv".
    if output_path is None:
        source = Path(input_path)
        output_path = str(source.with_name(f"{source.stem}_cleaned.csv"))

    try:
        df = try_read_csv(input_path)
        total_rows = len(df)

        print("\nDataset Info:")
        print(f"Initial Rows: {total_rows:,}")
        # Column labels are not guaranteed to be strings.
        print(f"Columns: {', '.join(map(str, df.columns))}")

        # Fall back to the first column that looks like it holds comments.
        if 'comment_text' not in df.columns:
            text_columns = [
                col for col in df.columns
                if 'text' in str(col).lower() or 'comment' in str(col).lower()
            ]
            if not text_columns:
                raise ValueError("Could not find comment text column")
            print(f"\nUsing '{text_columns[0]}' as comment column")
            df['comment_text'] = df[text_columns[0]]

        print("\nCleaning comments...")
        tqdm.pandas()
        original_text = df['comment_text']
        cleaned_text = original_text.progress_apply(clean_text)

        non_empty_mask = cleaned_text.str.strip().str.len() > 0

        # Capture the raw text of the sample rows *before* the column is
        # overwritten; otherwise the "Original" lines printed below would
        # just repeat the cleaned text.
        sample_originals = original_text[non_empty_mask].head(3).tolist()

        df['comment_text'] = cleaned_text
        df = df[non_empty_mask]

        print(f"\nSaving to: {output_path}")
        df.to_csv(output_path, index=False, encoding='utf-8')

        print("\n✓ Successfully cleaned comments")
        print(f"Initial rows: {total_rows:,}")
        print(f"Final rows: {len(df):,}")
        print(f"Removed empty rows: {total_rows - len(df):,}")
        print(f"Output file: {output_path}")
        print(f"Output file size: {Path(output_path).stat().st_size / (1024*1024):.1f} MB")

        print("\nSample of cleaned comments:")
        sample_cleaned = df['comment_text'].head(3)
        for i, (orig, cleaned) in enumerate(zip(sample_originals, sample_cleaned), start=1):
            print(f"\nExample {i}:")
            # Raw values may be non-strings (e.g. numbers), so convert first.
            print(f"Original : {str(orig)[:100]}...")
            print(f"Cleaned : {cleaned[:100]}...")

    except Exception as e:
        # Top-level boundary for this script: report and give up quietly.
        print(f"\n❌ Error: {str(e)}")
        return
|
|
|
if __name__ == "__main__":
    # Script entry point: clean the raw English comment dump and write the
    # cleaned copy alongside it.
    input_path = "dataset/raw/english-trash.csv"
    output_path = "dataset/raw/english-comments-cleaned.csv"

    clean_dataset(input_path=input_path, output_path=output_path)