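"""Clean comment text in a CSV dataset.

Reads a CSV file (trying several encodings), strips URLs, HTML tags, and
special characters from the 'comment_text' column, drops rows left empty
by cleaning, and writes the result to a new CSV.
"""
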
import pandas as pd
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import logging
from pathlib import Path

def clean_text(text):
    """Clean text by removing URLs, HTML tags, and special characters"""
    try:
        # Convert to string if not already
        text = str(text)
        
        # Remove URLs (match from the scheme through the next whitespace)
        text = re.sub(r'https?://\S+', '', text)
        
        # Remove HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()
        
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # Remove multiple punctuation
        text = re.sub(r'([.,!?])\1+', r'\1', text)
        
        # Remove spaces before punctuation
        text = re.sub(r'\s+([.,!?])', r'\1', text)
        
        return text.strip()
    except Exception as e:
        logging.error(f"Error cleaning text: {str(e)}")
        return text
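# Illustrative example (hypothetical input string):
#   clean_text("Check <b>this</b> out!!! https://example.com")
#   -> "Check this out!"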

def try_read_csv(file_path):
    """Try different encodings to read the CSV file"""
    # 'iso-8859-1' is an alias of 'latin1' in Python, so it is dropped;
    # 'latin1' decodes any byte sequence, so it is kept last as a catch-all
    encodings = ['utf-8', 'cp1252', 'latin1']
    
    for encoding in encodings:
        try:
            print(f"Trying {encoding} encoding...")
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Error with {encoding}: {str(e)}")
            continue
    
    raise ValueError("Could not read file with any of the attempted encodings")
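# Illustrative usage (path taken from the __main__ block below):
#   df = try_read_csv("dataset/raw/english-trash.csv")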

def clean_dataset(input_path, output_path=None):
    """Clean comment text in a dataset"""
    print(f"\nReading input file: {input_path}")
    
    # If no output path specified, use input name with _cleaned suffix
    if output_path is None:
        output_path = str(Path(input_path).with_name(f"{Path(input_path).stem}_cleaned.csv"))
    
    try:
        # Try reading with different encodings
        df = try_read_csv(input_path)
        total_rows = len(df)
        
        print(f"\nDataset Info:")
        print(f"Initial Rows: {total_rows:,}")
        print(f"Columns: {', '.join(df.columns)}")
        
        # Verify 'comment_text' column exists
        if 'comment_text' not in df.columns:
            # Try to find a column that might contain the comments
            text_columns = [col for col in df.columns if 'text' in col.lower() or 'comment' in col.lower()]
            if text_columns:
                print(f"\nUsing '{text_columns[0]}' as comment column")
                df['comment_text'] = df[text_columns[0]]
            else:
                raise ValueError("Could not find comment text column")
        
        # Keep the original text so before/after samples can be shown later
        original_text = df['comment_text'].copy()

        # Clean comment text with progress bar
        print("\nCleaning comments...")
        tqdm.pandas()
        df['comment_text'] = df['comment_text'].progress_apply(clean_text)
        
        # Remove empty comments
        non_empty_mask = df['comment_text'].str.strip().str.len() > 0
        df = df[non_empty_mask]
        
        # Save cleaned dataset
        print(f"\nSaving to: {output_path}")
        df.to_csv(output_path, index=False, encoding='utf-8')
        
        # Print statistics
        print(f"\n✓ Successfully cleaned comments")
        print(f"Initial rows: {total_rows:,}")
        print(f"Final rows: {len(df):,}")
        print(f"Removed empty rows: {total_rows - len(df):,}")
        print(f"Output file: {output_path}")
        print(f"Output file size: {Path(output_path).stat().st_size / (1024*1024):.1f} MB")
        
        # Show a few before/after samples (row indices survive the filtering above)
        print("\nSample of cleaned comments:")
        for i, idx in enumerate(df.index[:3]):
            print(f"\nExample {i+1}:")
            print(f"Original : {str(original_text.loc[idx])[:100]}...")
            print(f"Cleaned  : {df.loc[idx, 'comment_text'][:100]}...")
        
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")
        return
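# Illustrative call relying on the default output naming:
#   clean_dataset("dataset/raw/english-trash.csv")
#   -> writes "dataset/raw/english-trash_cleaned.csv"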

if __name__ == "__main__":
    input_path = "dataset/raw/english-trash.csv"
    output_path = "dataset/raw/english-comments-cleaned.csv"
    
    clean_dataset(input_path, output_path)