Spaces:
Running
Running
import datasets | |
import pandas as pd | |
from tqdm.notebook import tqdm | |
import json | |
import os | |
def load_dataset(dataset_name, **kwargs):
    """Load a supported dataset and return it as a ``datasets.Dataset``.

    Only ``"Stocknet"`` is supported. The loader walks a two-level directory
    tree (``<root>/<stock>/<date file>``), parses every non-blank line of
    every date file as a JSON document, and stacks all parsed records into a
    single table.

    Args:
        dataset_name: Name of the dataset to load; must be ``"Stocknet"``.
        **kwargs: Optional settings. ``root_path`` overrides the directory
            that holds the per-stock folders (defaults to
            ``../../../stocknet-dataset/tweet/raw``, resolved against the
            current working directory). Other keys are ignored.

    Returns:
        A ``datasets.Dataset`` built from the concatenated records, with a
        fresh 0..n-1 row index. It is empty when no records are found.

    Raises:
        NotImplementedError: If ``dataset_name`` is not ``"Stocknet"``.
    """
    if dataset_name != "Stocknet":
        raise NotImplementedError("Only support Stocknet dataset for now")

    root_path = kwargs.get("root_path", r"../../../stocknet-dataset/tweet/raw")

    frames = []
    # Sort the listings so the resulting row order does not depend on the
    # arbitrary order in which the filesystem returns directory entries.
    stock_lists = sorted(os.listdir(root_path))
    for stock in tqdm(stock_lists, desc="Loading Stocknet dataset..."):
        stock_path = os.path.join(root_path, stock)
        if not os.path.isdir(stock_path):
            # Stray files next to the stock folders (e.g. OS metadata) would
            # make os.listdir() fail; they carry no records, so skip them.
            continue
        for date in sorted(os.listdir(stock_path)):
            # Open the file for *this* date. Indexing date_files[0] here would
            # re-read the first file on every iteration and drop the rest.
            with open(os.path.join(stock_path, date), encoding="utf-8") as f:
                records = [json.loads(line) for line in f if line.strip()]
            if records:
                frames.append(pd.DataFrame(records))

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all previously accumulated rows on every file.
    if frames:
        combined = pd.concat(frames, axis=0, ignore_index=True)
    else:
        combined = pd.DataFrame()
    return datasets.Dataset.from_pandas(combined)