import sys
from pathlib import Path

import pandas as pd
from tqdm import tqdm


def convert_parquet_to_csv(parquet_path, csv_path=None, chunk_size=10_000):
    """Convert a parquet file to CSV, showing a progress bar while writing.

    The parquet file is loaded in full with ``pd.read_parquet`` and then
    written out in slices of ``chunk_size`` rows so the progress bar can
    advance between slices.

    Args:
        parquet_path: Path of the parquet file to read.
        csv_path: Destination CSV path. When ``None``, ``parquet_path`` with
            its suffix replaced by ``.csv`` is used.
        chunk_size: Number of rows written per progress update. Must be
            positive.

    Returns:
        The path of the CSV file that was written.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        SystemExit: With exit code 1 when reading or writing fails; the
            error message is printed before exiting.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    print(f"\nReading parquet file: {parquet_path}")

    if csv_path is None:
        csv_path = str(Path(parquet_path).with_suffix('.csv'))

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        print("\nDataset Info:")
        print(f"Rows: {total_rows:,}")
        # Column labels are not necessarily strings; convert them so that
        # str.join cannot raise TypeError.
        print(f"Columns: {', '.join(map(str, df.columns))}")
        print(f"\nSaving to CSV: {csv_path}")

        # Open the output once instead of re-opening it for every chunk.
        # The pandas to_csv documentation asks for file objects to be opened
        # with newline=''; utf-8 is to_csv's default encoding for paths.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            with tqdm(total=total_rows, desc="Converting") as pbar:
                # Write the header up front so that a dataset with zero rows
                # still produces a CSV file (header only) rather than no
                # file at all.
                df.iloc[:0].to_csv(csv_file, index=False)

                for start in range(0, total_rows, chunk_size):
                    # iloc slicing clamps at the end of the frame, so the
                    # final chunk may be shorter than chunk_size.
                    chunk = df.iloc[start:start + chunk_size]
                    chunk.to_csv(csv_file, header=False, index=False)
                    pbar.update(len(chunk))

        print("\n✓ Successfully converted to CSV")
        print(f"Output file size: {Path(csv_path).stat().st_size / (1024*1024):.1f} MB")

    except Exception as e:
        # Script-style boundary: report the failure and exit non-zero.
        print(f"\n❌ Error: {str(e)}")
        sys.exit(1)

    return csv_path


if __name__ == "__main__":
    # Usage: python <script> [PARQUET_PATH [CSV_PATH]]
    #
    # With no arguments the hard-coded dataset paths below are used.
    # With only PARQUET_PATH, the CSV path is derived from it by
    # convert_parquet_to_csv (csv_path=None).
    cli_args = sys.argv[1:]

    if cli_args:
        parquet_path = cli_args[0]
        csv_path = cli_args[1] if len(cli_args) > 1 else None
    else:
        # NOTE(review): this default contains a space before ".parquet" --
        # confirm it matches the actual file name on disk.
        parquet_path = "dataset/raw/jigsaw-toxic-comment-train-processed-seqlen128_original .parquet"
        csv_path = "dataset/raw/jigsaw-en-only-toxic-comment-train-processed-seqlen128_original.csv"

    convert_parquet_to_csv(parquet_path, csv_path)