Sigrid De los Santos
Remove remaining binary file for Hugging Face
9df4cc0
import datasets
import pandas as pd
from tqdm.notebook import tqdm
import json
import os
def _read_json_lines(path):
    """Parse a JSON-lines file at *path* into a list of decoded records."""
    with open(path, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads fail on an empty string.
        return [json.loads(line) for line in f if line.strip()]


def load_dataset(dataset_name: str, **kwargs) -> "datasets.Dataset":
    """Load a supported dataset as a Hugging Face ``datasets.Dataset``.

    For ``"Stocknet"``, every file under
    ``../../../stocknet-dataset/tweet/raw/<stock>/<date>`` is read as JSON
    lines and all records are stacked into a single table with a fresh
    0..n-1 index.

    Args:
        dataset_name: Name of the dataset to load. Only ``"Stocknet"`` is
            supported.
        **kwargs: Accepted for interface compatibility; currently unused.

    Returns:
        A ``datasets.Dataset`` built from the concatenated records.

    Raises:
        NotImplementedError: If ``dataset_name`` is not ``"Stocknet"``.
        FileNotFoundError: If the Stocknet raw-tweet directory is missing
            (raised by ``os.listdir``).
    """
    # Guard clause: reject unsupported names before touching the filesystem.
    if dataset_name != "Stocknet":
        raise NotImplementedError("Only support Stocknet dataset for now")

    # NOTE: relative path, resolved against the current working directory.
    root_path = r"../../../stocknet-dataset/tweet/raw"

    frames = []
    for stock in tqdm(os.listdir(root_path), desc="Loading Stocknet dataset..."):
        stock_path = os.path.join(root_path, stock)
        for date in os.listdir(stock_path):
            # Read the file for *this* date. The previous code always opened
            # date_files[0], so every iteration re-read the first file.
            records = _read_json_lines(os.path.join(stock_path, date))
            frames.append(pd.DataFrame(records))

    # Concatenate once at the end; concatenating inside the loop re-copies
    # the accumulated frame on every file. pd.concat rejects an empty list,
    # so fall back to an empty frame when no files were found.
    combined = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    combined = combined.reset_index(drop=True)
    return datasets.Dataset.from_pandas(combined)