igorithm committed on
Commit
29c361b
·
verified ·
1 Parent(s): 7aae4ed

Add download_dataset.py

Browse files
category_classification/dataset/download_dataset.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import zipfile
4
+ import glob
5
+ import pandas as pd
6
+
7
def download_and_extract_dataset(dataset="Cornell-University/arxiv", dest_dir="arxiv_dataset"):
    """Download a Kaggle dataset archive with the ``kaggle`` CLI and unpack it.

    Args:
        dataset: Dataset identifier passed to ``kaggle datasets download -d``
            (``"<owner>/<slug>"``).
        dest_dir: Directory that receives the archive and its extracted
            contents. It is created when it does not exist.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        download stops the function before any unpacking is attempted.
    """
    os.makedirs(dest_dir, exist_ok=True)
    print("Downloading dataset...")

    try:
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset, "-p", dest_dir],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI ran but exited non-zero.
        # OSError: the CLI could not be started at all (e.g. it is not
        # installed or not executable), which would otherwise crash here.
        print("Downloading error: ", e)
        return
    print("Download finished")

    # The kaggle CLI is expected to save the archive as "<slug>.zip", where
    # <slug> is the part of the identifier after the owner. For the default
    # dataset this is "arxiv.zip", the name that was previously hard-coded.
    archive_name = dataset.rstrip("/").split("/")[-1] + ".zip"
    zip_filename = os.path.join(dest_dir, archive_name)

    if not os.path.exists(zip_filename):
        print("Zip-file is not found")
        return

    print("Unpacking dataset...")
    with zipfile.ZipFile(zip_filename, "r") as zip_ref:
        zip_ref.extractall(dest_dir)
    print("Unpacking finished")
30
+
31
def filter_csv(dest_dir="arxiv_dataset"):
    """Write a reduced copy of a CSV found in ``dest_dir``.

    The first ``*.csv`` file in ``dest_dir`` (in sorted name order) is read
    and only the ``title``, ``authors``, ``abstract`` and ``categories``
    columns are kept. The result is saved as ``filtered_arxiv.csv`` in the
    same directory, without the index column.

    Args:
        dest_dir: Directory that is searched for the input CSV and that
            receives the filtered output.

    Returns:
        None. When no input CSV is present a message is printed and nothing
        is written.

    Raises:
        KeyError: If the input CSV lacks one of the four expected columns.
    """
    output_name = "filtered_arxiv.csv"

    # The output is written into the directory being searched, so it must be
    # excluded or a second run could pick up its own previous result.
    # Sorting makes the choice deterministic when several CSVs are present.
    candidates = sorted(
        path
        for path in glob.glob(os.path.join(dest_dir, "*.csv"))
        if os.path.basename(path) != output_name
    )
    if not candidates:
        print("CSV-file is not found")
        return

    df = pd.read_csv(candidates[0])
    df = df[["title", "authors", "abstract", "categories"]]
    df.to_csv(os.path.join(dest_dir, output_name), index=False)
36
+
37
if __name__ == "__main__":
    # Script entry point: run the pipeline stages in order, each with its
    # default arguments -- first fetch and unpack the archive, then write
    # the reduced CSV.
    for stage in (download_and_extract_dataset, filter_csv):
        stage()