import time
from pathlib import Path
from typing import Optional

import pandas as pd
from tqdm import tqdm

from text_preprocessor import TextPreprocessor

def process_dataset(input_path: str, output_path: Optional[str] = None, batch_size: int = 1000):
    """
    Process a dataset using the TextPreprocessor with efficient batch processing.
    
    Args:
        input_path: Path to input CSV file
        output_path: Path to save processed CSV file. If None, will use input name with _processed suffix
        batch_size: Number of texts to process in each batch
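
    Example:
        # Paths mirror the __main__ block below; batch_size here is illustrative
        process_dataset("dataset/split/train.csv",
                        "dataset/split/train_no_stopwords.csv",
                        batch_size=500)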
    """
    # Resolve paths; by default the output sits next to the input with a _processed suffix
    input_path = Path(input_path)
    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem}_processed{input_path.suffix}"
    
    # Initialize preprocessor
    preprocessor = TextPreprocessor()
    
    print(f"\nProcessing dataset: {input_path}")
    start_time = time.time()
    
    try:
        # Read the dataset
        print("Reading dataset...")
        df = pd.read_csv(input_path)
        total_rows = len(df)
        print(f"Total rows: {total_rows:,}")

        # Snapshot a sample of the raw text now, before it is overwritten
        # in place, so the vocabulary statistics at the end can compare
        # before/after on the same rows
        sample_size = min(1000, total_rows)
        original_sample = df['comment_text'].head(sample_size).astype(str).copy()
        
        # Process in batches with progress bar
        print("\nProcessing text...")
        
        # Calculate number of batches
        num_batches = (total_rows + batch_size - 1) // batch_size
        
        for i in tqdm(range(0, total_rows, batch_size), total=num_batches, desc="Processing batches"):
            # Get batch
            batch_start = i
            batch_end = min(i + batch_size, total_rows)
            
            # Process each text in the batch
            for idx in range(batch_start, batch_end):
                text = df.loc[idx, 'comment_text']
                # Skip missing values, which pandas reads in as float NaN
                if pd.isna(text):
                    continue
                lang = df.loc[idx, 'lang'] if 'lang' in df.columns else 'en'
                
                # Process text
                processed = preprocessor.preprocess_text(
                    text,
                    lang=lang,
                    clean_options={
                        'remove_stops': True,
                        'remove_numbers': True,
                        'remove_urls': True,
                        'remove_emails': True,
                        'remove_mentions': True,
                        'remove_hashtags': True,
                        'expand_contractions': True,
                        'remove_accents': False,
                        'min_word_length': 2
                    },
                    do_stemming=True
                )
                
                # Update the text directly
                df.loc[idx, 'comment_text'] = processed
            
            # Optional: Print sample from first batch
            if i == 0:
                print("\nSample processing results:")
                for j in range(min(3, batch_end)):
                    print(f"\nProcessed text {j+1}: {str(df.loc[j, 'comment_text'])[:100]}...")
        
        # Save processed dataset
        print(f"\nSaving processed dataset to: {output_path}")
        df.to_csv(output_path, index=False)
        
        # Print statistics
        end_time = time.time()
        processing_time = end_time - start_time
        
        print("\nProcessing Complete!")
        print("-" * 50)
        print(f"Total rows processed: {total_rows:,}")
        print(f"Processing time: {processing_time/60:.2f} minutes")
        print(f"Average time per text: {processing_time/total_rows*1000:.2f} ms")
        print(f"Output file size: {Path(output_path).stat().st_size/1024/1024:.1f} MB")
        
        # Compare unique words before and after processing on the sampled rows
        print("\nVocabulary Statistics:")
        original_words = set(' '.join(original_sample).split())
        processed_words = set(' '.join(df['comment_text'].head(sample_size).astype(str)).split())
        print(f"Sample unique words (first {sample_size:,} rows):")
        print(f"Before processing: {len(original_words):,}")
        print(f"After processing : {len(processed_words):,}")
        print(f"Reduction: {(1 - len(processed_words)/len(original_words))*100:.1f}%")
        
    except Exception as e:
        print(f"\nError processing dataset: {str(e)}")
        raise

if __name__ == "__main__":
    # Process training dataset
    input_file = "dataset/split/train.csv"
    output_file = "dataset/split/train_no_stopwords.csv"
    
    process_dataset(input_file, output_file)
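
# For reference, the TextPreprocessor interface this script assumes, inferred
# only from the calls above; the real implementation in text_preprocessor.py
# may differ:
#
#   class TextPreprocessor:
#       def preprocess_text(self, text: str, lang: str = 'en',
#                           clean_options: dict | None = None,
#                           do_stemming: bool = False) -> str:
#           """Return `text` cleaned per clean_options, optionally stemmed."""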