Valeriy Sinyukov
Fixed file path to written dataset, add option to save to csv
ce2c261
raw
history blame contribute delete
1.91 kB
from pathlib import Path
from kaggle import api as kapi
import pandas as pd
from sklearn.model_selection import train_test_split as sk_train_test_split
def download_dataset(dest_dir, dataset, filename):
    """Fetch one file of a Kaggle dataset unless it is already on disk.

    Args:
        dest_dir: Directory the file is expected in / downloaded into.
        dataset: Dataset identifier forwarded to the Kaggle API.
        filename: Name of the file inside the dataset.

    Returns:
        None. Progress is reported on stdout.
    """
    target = Path(dest_dir) / filename
    if not target.exists():
        print("Downloading dataset...")
        kapi.dataset_download_file(
            dataset=dataset,
            file_name=filename,
            path=dest_dir,
            quiet=False,
        )
    else:
        # Skip the network round-trip when a previous run left the file behind.
        print("Dataset already exists, do not download")
def read_dataset(dest_dir, filename) -> pd.DataFrame:
    """Load a JSON Lines file into a DataFrame.

    Note: takes a lot of RAM, since the whole file is read into one frame.

    Args:
        dest_dir: Directory containing the file.
        filename: Name of the JSON Lines file (one JSON object per line).

    Returns:
        The parsed DataFrame.
    """
    print("Reading dataset...")
    frame = pd.read_json(Path(dest_dir) / filename, lines=True)
    print("Dataset read")
    return frame
def download_and_read_dataset(dest_dir, dataset, filename):
    """Make sure the dataset file is present locally, then load it.

    Args:
        dest_dir: Directory the file lives in / is downloaded into.
        dataset: Dataset identifier passed on to ``download_dataset``.
        filename: Name of the file to download and read.

    Returns:
        Whatever ``read_dataset`` returns for the file.
    """
    download_dataset(dest_dir, dataset, filename)
    frame = read_dataset(dest_dir, filename)
    return frame
def filter_columns(df: pd.DataFrame, columns) -> pd.DataFrame:
    """Keep only the requested columns of *df*.

    Args:
        df: Source frame.
        columns: Column selection used to index ``df``.

    Returns:
        The result of ``df[columns]``; the input frame is not modified.
    """
    print("Removing unwanted columns...")
    kept = df[columns]
    print("Columns removed...")
    return kept
def create_features_labels(df: pd.DataFrame, old_label, new_label):
    """Split *df* into a feature frame and a coarse label series.

    Each value of the ``old_label`` column is treated as a
    whitespace-separated list of categories. Only the first category is
    used, truncated at its first ``"."`` (``"cs.LG stat.ML"`` -> ``"cs"``);
    a category without a dot is kept whole.

    Args:
        df: Frame containing the ``old_label`` column.
        old_label: Name of the column holding the category strings.
        new_label: Name given to the returned label series.

    Returns:
        A ``(features, labels)`` tuple: *df* without the ``old_label``
        column, and the derived labels renamed to ``new_label``.

    Raises:
        IndexError: If a category string is empty or whitespace-only.
    """

    def _primary_group(raw):
        first = raw.split()[0]
        # partition() yields the whole string when there is no dot.
        return first.partition(".")[0]

    labels = df[old_label].apply(_primary_group).rename(new_label)
    features = df.drop(columns=old_label)
    return features, labels
def train_test_split(X, y, test_size=0.25, random_state=None):
    """Split features and labels into train and test parts, stratified by *y*.

    Thin wrapper around scikit-learn's ``train_test_split`` that always
    stratifies on the labels.

    Args:
        X: Features to split.
        y: Labels to split; also used as the stratification target.
        test_size: Forwarded to scikit-learn as ``test_size``.
        random_state: Optional seed forwarded to scikit-learn so the split
            can be reproduced. The default ``None`` keeps the previous
            behavior (a different shuffle on every call).

    Returns:
        Whatever scikit-learn's ``train_test_split`` returns for two input
        arrays (train/test parts of ``X`` followed by those of ``y``).
    """
    return sk_train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )
def write_dataset(dest_dir, X, y, filename, to_json: bool = True):
    """Write features and labels side by side to a single file.

    Args:
        dest_dir: Output directory; created (with parents) if missing.
        X: Features; concatenated column-wise with ``y``.
        y: Labels; concatenated column-wise with ``X``.
        filename: Name of the output file inside ``dest_dir``.
        to_json: When true (default) write JSON Lines (one record per line);
            otherwise write CSV.

    Note:
        The JSON output uses ``orient="records"`` and therefore carries no
        index, whereas the CSV output is written with pandas' defaults and
        so includes the index as its first column.
    """
    dest_dir = Path(dest_dir)
    # Create the target directory up front so writing into a fresh
    # location does not fail with a missing-directory error.
    dest_dir.mkdir(parents=True, exist_ok=True)
    target = dest_dir / filename

    df = pd.concat((X, y), axis=1)
    if to_json:
        df.to_json(target, orient="records", lines=True)
    else:
        df.to_csv(target)