Spaces:
Sleeping
Sleeping
File size: 1,238 Bytes
29c361b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import os
import subprocess
import zipfile
import glob
import pandas as pd
def download_and_extract_dataset(dataset="Cornell-University/arxiv", dest_dir="arxiv_dataset"):
    """Download a Kaggle dataset with the ``kaggle`` CLI and unpack it.

    Progress and failures are reported with ``print``; the function never
    raises for a failed download or a missing archive, it just returns.

    Args:
        dataset: Kaggle dataset identifier in ``owner/slug`` form.
        dest_dir: Directory that receives the downloaded archive and the
            extracted files. Created if it does not exist.

    Returns:
        None.
    """
    os.makedirs(dest_dir, exist_ok=True)

    print("Downloading dataset...")
    try:
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset, "-p", dest_dir],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ``kaggle`` binary, which
        # would otherwise escape as an unhandled traceback.
        print("Downloading error: ", e)
        return
    print("Download finished")

    # The archive is named after the dataset slug (the part after the last
    # "/"), so derive it instead of assuming "arxiv.zip"; the default dataset
    # still resolves to "arxiv.zip".
    slug = dataset.rstrip("/").rsplit("/", 1)[-1]
    zip_filename = os.path.join(dest_dir, slug + ".zip")
    if not os.path.exists(zip_filename):
        print("Zip-file is not found")
        return

    print("Unpacking dataset...")
    with zipfile.ZipFile(zip_filename, "r") as zip_ref:
        zip_ref.extractall(dest_dir)
    print("Unpacking finished")
def filter_csv(dest_dir="arxiv_dataset"):
    """Write a reduced copy of a CSV found in ``dest_dir``.

    The first CSV in ``dest_dir`` (in sorted order, ignoring this function's
    own output file) is loaded, reduced to the ``title``, ``authors``,
    ``abstract`` and ``categories`` columns, and saved as
    ``filtered_arxiv.csv`` in the same directory without the index column.

    If no source CSV is present, a message is printed and nothing is written.

    Args:
        dest_dir: Directory containing the source CSV; also receives the
            filtered output.

    Returns:
        None.

    Raises:
        KeyError: If the source CSV lacks any of the four expected columns.
    """
    output_name = "filtered_arxiv.csv"
    # NOTE(review): the default Kaggle arXiv dataset may ship a JSON snapshot
    # rather than a CSV -- confirm that a CSV is actually produced upstream.
    # glob.escape keeps a dest_dir containing "[", "*" or "?" from being
    # interpreted as a pattern.
    pattern = os.path.join(glob.escape(dest_dir), "*.csv")
    # Skip our own output so a re-run never filters the filtered file, and
    # sort so the choice is deterministic when several CSVs exist.
    candidates = sorted(
        path for path in glob.glob(pattern)
        if os.path.basename(path) != output_name
    )
    if not candidates:
        print("CSV-file is not found")
        return

    df = pd.read_csv(candidates[0])
    df = df[["title", "authors", "abstract", "categories"]]
    df.to_csv(os.path.join(dest_dir, output_name), index=False)
if __name__ == "__main__":
    # Script entry point: run the pipeline stages in order with their default
    # arguments -- first download/unpack the dataset, then write the filtered CSV.
    for stage in (download_and_extract_dataset, filter_csv):
        stage()
|