from pathlib import Path
from typing import Optional, Tuple, Union

from kaggle import api as kapi
import pandas as pd
from sklearn.model_selection import train_test_split as sk_train_test_split


def download_dataset(dest_dir: Union[str, Path], dataset: str, filename: str) -> None:
    """Download one file of a Kaggle dataset unless it is already on disk.

    Args:
        dest_dir: Directory the file is looked up in and downloaded to.
        dataset: Dataset identifier passed to the Kaggle API.
        filename: Name of the file inside the dataset.
    """
    if (Path(dest_dir) / filename).exists():
        print('Dataset already exists, do not download')
        return

    # NOTE(review): depending on the Kaggle client version the downloaded file
    # may be stored as an archive under a different name, in which case the
    # existence check above would never match -- confirm against the client.
    print('Downloading dataset...')
    kapi.dataset_download_file(
        dataset=dataset, file_name=filename, path=dest_dir, quiet=False
    )


def read_dataset(dest_dir: Union[str, Path], filename: str) -> pd.DataFrame:
    """Read a JSON Lines file into a DataFrame.

    The whole file is parsed in one call, so this takes a lot of RAM on
    large datasets.

    Args:
        dest_dir: Directory containing the file.
        filename: Name of the JSON Lines file.

    Returns:
        The parsed DataFrame, one row per JSON line.
    """
    print('Reading dataset...')
    json_file_path = Path(dest_dir) / filename
    df = pd.read_json(json_file_path, lines=True)
    print('Dataset read')
    return df


def download_and_read_dataset(
    dest_dir: Union[str, Path], dataset: str, filename: str
) -> pd.DataFrame:
    """Download the dataset file if it is missing, then load it.

    Args:
        dest_dir: Directory the file is stored in.
        dataset: Dataset identifier passed to the Kaggle API.
        filename: Name of the JSON Lines file inside the dataset.

    Returns:
        The DataFrame produced by ``read_dataset``.
    """
    download_dataset(dest_dir=dest_dir, dataset=dataset, filename=filename)
    return read_dataset(dest_dir=dest_dir, filename=filename)


def filter_columns(df: pd.DataFrame, columns) -> pd.DataFrame:
    """Return ``df`` restricted to ``columns``.

    Args:
        df: Source DataFrame.
        columns: Column selection used to index ``df`` (e.g. a list of names).

    Returns:
        The result of ``df[columns]``.
    """
    print("Removing unwanted columns...")
    df = df[columns]
    print("Columns removed...")
    return df


def _primary_category(categories: str) -> str:
    """Return the top-level part of the first whitespace-separated category.

    For example ``"math.CO cs.CG"`` gives ``"math"`` and ``"hep-ph"`` gives
    ``"hep-ph"``. An empty or whitespace-only string gives ``""`` instead of
    raising ``IndexError``, so callers can filter such rows out.
    """
    tokens = categories.split(maxsplit=1)
    if not tokens:
        return ""
    # Everything before the first '.', or the whole token when there is none.
    return tokens[0].partition(".")[0]


def create_features_labels(
    df: pd.DataFrame, old_label, new_label
) -> Tuple[pd.DataFrame, pd.Series]:
    """Split ``df`` into features and a simplified label column.

    Each value of ``df[old_label]`` is reduced to the top-level part of its
    first whitespace-separated category (the text before the first ``.``).
    Blank values map to an empty label.

    Args:
        df: Source DataFrame.
        old_label: Name of the column holding the category strings.
        new_label: Name given to the returned label series.

    Returns:
        ``(features, labels)`` where ``features`` is ``df`` without
        ``old_label`` and ``labels`` is the transformed column renamed to
        ``new_label``.
    """
    labels = df[old_label].apply(_primary_category).rename(new_label)
    features = df.drop(columns=old_label)
    return features, labels


def train_test_split(X, y, test_size=0.25, random_state: Optional[int] = None):
    """Split ``X`` and ``y`` into train and test parts, stratified on ``y``.

    Args:
        X: Features.
        y: Labels; also used as the stratification target.
        test_size: Forwarded to scikit-learn's ``train_test_split``.
        random_state: Optional seed forwarded to scikit-learn so the split
            can be reproduced. ``None`` (the default) keeps the previous
            non-deterministic behaviour.

    Returns:
        The value returned by scikit-learn's ``train_test_split`` for the two
        inputs (X_train, X_test, y_train, y_test).
    """
    return sk_train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )


def write_dataset(
    dest_dir: Union[str, Path], X, y, filename: str, to_json: bool = True
) -> None:
    """Write features and labels side by side to a single file.

    Args:
        dest_dir: Output directory; created if it does not exist.
        X: Features.
        y: Labels, concatenated to ``X`` as additional column(s).
        filename: Name of the output file inside ``dest_dir``.
        to_json: When true write JSON Lines (one record per line),
            otherwise write CSV.
    """
    dest_dir = Path(dest_dir)
    # Avoid an OSError from pandas when the target directory is missing.
    dest_dir.mkdir(parents=True, exist_ok=True)

    df = pd.concat((X, y), axis=1)
    if to_json:
        df.to_json(dest_dir / filename, orient="records", lines=True)
    else:
        df.to_csv(dest_dir / filename)