import glob
import os
import subprocess
import zipfile

import pandas as pd


def download_and_extract_dataset(dataset="Cornell-University/arxiv", dest_dir="arxiv_dataset"):
    """Download the dataset with the Kaggle CLI and unpack it into dest_dir."""
    os.makedirs(dest_dir, exist_ok=True)

    print("Downloading dataset...")
    try:
        # Requires the `kaggle` CLI to be installed and configured with API credentials.
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset, "-p", dest_dir],
            check=True,
        )
        print("Download finished")
    except subprocess.CalledProcessError as e:
        print("Download error:", e)
        return

    # The Kaggle CLI saves the archive under the dataset slug, i.e. arxiv.zip here.
    zip_filename = os.path.join(dest_dir, "arxiv.zip")
    if os.path.exists(zip_filename):
        print("Unpacking dataset...")
        with zipfile.ZipFile(zip_filename, "r") as zip_ref:
            zip_ref.extractall(dest_dir)
        print("Unpacking finished")
    else:
        print("Zip file not found")


def filter_csv(dest_dir="arxiv_dataset"):
    """Keep only the columns needed downstream and write them to a new CSV."""
    csv_files = glob.glob(os.path.join(dest_dir, "*.csv"))
    if not csv_files:
        print("No CSV file found in", dest_dir)
        return

    df = pd.read_csv(csv_files[0])
    df = df[["title", "authors", "abstract", "categories"]]
    df.to_csv(os.path.join(dest_dir, "filtered_arxiv.csv"), index=False)


if __name__ == "__main__":
    download_and_extract_dataset()
    filter_csv()