import time
from pathlib import Path
from typing import Optional

import pandas as pd
from tqdm import tqdm

from text_preprocessor import TextPreprocessor
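
# Expects a CSV with a 'comment_text' column and, optionally, a 'lang'
# column of per-row language codes; TextPreprocessor is assumed to be the
# project-local preprocessing class defined in text_preprocessor.py.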


def process_dataset(input_path: str, output_path: Optional[str] = None, batch_size: int = 1000):
    """
    Process a dataset with TextPreprocessor, working through the rows in
    fixed-size batches so progress can be reported as the run proceeds.

    Args:
        input_path: Path to the input CSV file.
        output_path: Path to save the processed CSV file. If None, the input
            name with a _processed suffix is used.
        batch_size: Number of texts to process in each batch.
    """
    input_path = Path(input_path)
    if output_path is None:
        output_path = input_path.parent / f"{input_path.stem}_processed{input_path.suffix}"

    preprocessor = TextPreprocessor()

    print(f"\nProcessing dataset: {input_path}")
    start_time = time.time()

    try:
        print("Reading dataset...")
        df = pd.read_csv(input_path)
        total_rows = len(df)
        print(f"Total rows: {total_rows:,}")

        # Keep a copy of a sample of the raw text before it is overwritten in
        # place, so the vocabulary statistics at the end can compare the
        # vocabularies before and after processing.
        sample_size = min(1000, total_rows)
        original_sample = df['comment_text'].head(sample_size).astype(str).copy()

        print("\nProcessing text...")
        # Ceiling division: the final batch may be smaller than batch_size.
        num_batches = (total_rows + batch_size - 1) // batch_size

        for i in tqdm(range(0, total_rows, batch_size), total=num_batches, desc="Processing batches"):
            batch_start = i
            batch_end = min(i + batch_size, total_rows)

            for idx in range(batch_start, batch_end):
                text = df.loc[idx, 'comment_text']
                # Default to English when the dataset has no language column.
                lang = df.loc[idx, 'lang'] if 'lang' in df.columns else 'en'

                # Clean and stem the comment text; the processed string
                # replaces the original below.
                processed = preprocessor.preprocess_text(
                    text,
                    lang=lang,
                    clean_options={
                        'remove_stops': True,
                        'remove_numbers': True,
                        'remove_urls': True,
                        'remove_emails': True,
                        'remove_mentions': True,
                        'remove_hashtags': True,
                        'expand_contractions': True,
                        'remove_accents': False,
                        'min_word_length': 2
                    },
                    do_stemming=True
                )

                df.loc[idx, 'comment_text'] = processed

            # Preview a few results from the first batch as a sanity check;
            # capped at batch_end so tiny datasets do not index out of range.
            if i == 0:
                print("\nSample processing results:")
                for j in range(min(3, batch_end)):
                    print(f"\nProcessed text {j+1}: {df.loc[j, 'comment_text'][:100]}...")

        print(f"\nSaving processed dataset to: {output_path}")
        df.to_csv(output_path, index=False)

        end_time = time.time()
        processing_time = end_time - start_time

        print("\nProcessing Complete!")
        print("-" * 50)
        print(f"Total rows processed: {total_rows:,}")
        print(f"Processing time: {processing_time/60:.2f} minutes")
        print(f"Average time per text: {processing_time/total_rows*1000:.2f} ms")
        print(f"Output file size: {Path(output_path).stat().st_size/1024/1024:.1f} MB")

        print("\nVocabulary Statistics:")
        original_words = set(' '.join(original_sample).split())
        processed_words = set(' '.join(df['comment_text'].head(sample_size).astype(str)).split())
        print(f"Sample unique words (first {sample_size:,} rows):")
        print(f"Before processing: {len(original_words):,}")
        print(f"After processing : {len(processed_words):,}")
        print(f"Reduction: {(1 - len(processed_words)/len(original_words))*100:.1f}%")

    except Exception as e:
        print(f"\nError processing dataset: {e}")
        raise


if __name__ == "__main__":
    input_file = "dataset/split/train.csv"
    output_file = "dataset/split/train_no_stopwords.csv"

    process_dataset(input_file, output_file)
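    # The output name reflects remove_stops=True above; rename it if you
    # change the cleaning options.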