File size: 1,924 Bytes
d187b57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
from pathlib import Path
import sys
from tqdm import tqdm

def convert_parquet_to_csv(parquet_path, csv_path=None, chunk_size=10000):
    """Convert a parquet file to CSV, reporting progress on stdout.

    The parquet file is read fully into a DataFrame and then written to
    ``csv_path`` in slices of ``chunk_size`` rows so that a progress bar can
    be advanced while the CSV is produced. The header row is always written,
    so an empty dataset yields a header-only CSV.

    Args:
        parquet_path: Path of the parquet file to read.
        csv_path: Destination CSV path. When ``None``, ``parquet_path`` with
            its suffix replaced by ``.csv`` is used.
        chunk_size: Number of rows written per ``to_csv`` call (and per
            progress-bar update). Must be a positive integer.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        SystemExit: With exit status 1 if reading, writing, or inspecting the
            output file fails; the error message is printed first.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    print(f"\nReading parquet file: {parquet_path}")

    # If no CSV path specified, use the same name with .csv extension
    if csv_path is None:
        csv_path = str(Path(parquet_path).with_suffix('.csv'))

    try:
        # The whole dataset is loaded here; chunking below only paces the
        # write and the progress bar, it does not reduce peak memory.
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        print("\nDataset Info:")
        print(f"Rows: {total_rows:,}")
        # Column labels are not guaranteed to be strings, so convert them
        # before joining.
        print(f"Columns: {', '.join(map(str, df.columns))}")
        print(f"\nSaving to CSV: {csv_path}")

        # Open the destination once instead of reopening it for every chunk.
        # newline="" lets pandas control the line terminator itself.
        with open(csv_path, "w", encoding="utf-8", newline="") as out_file:
            # Write the header from a zero-row slice so the file is created
            # with its header even when the dataset has no rows.
            df.iloc[:0].to_csv(out_file, header=True, index=False)

            with tqdm(total=total_rows, desc="Converting") as pbar:
                for start in range(0, total_rows, chunk_size):
                    # iloc clamps the upper bound to the frame length.
                    chunk = df.iloc[start:start + chunk_size]
                    chunk.to_csv(out_file, header=False, index=False)
                    pbar.update(len(chunk))

        print("\n✓ Successfully converted to CSV")
        print(f"Output file size: {Path(csv_path).stat().st_size / (1024*1024):.1f} MB")

    except Exception as e:
        # Script-level boundary: report the failure and stop with a
        # non-zero exit status.
        print(f"\n❌ Error: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    # Script entry point: convert the hard-coded raw parquet dataset into
    # a CSV file under a different, explicitly chosen name.
    # NOTE(review): the input file name contains a space before ".parquet";
    # confirm this matches the file on disk.
    convert_parquet_to_csv(
        parquet_path="dataset/raw/jigsaw-toxic-comment-train-processed-seqlen128_original .parquet",
        csv_path="dataset/raw/jigsaw-en-only-toxic-comment-train-processed-seqlen128_original.csv",
    )