import streamlit as st
import pandas as pd
import json
import io

from utils.dataset_utils import get_dataset_info, detect_dataset_format


def render_dataset_uploader():
    """Render the dataset upload component, which supports CSV and JSON formats."""
    st.markdown("""
        <div class="upload-container">
            <p>Upload your dataset in CSV or JSON format</p>
        </div>
    """, unsafe_allow_html=True)

    uploaded_file = st.file_uploader(
        "Choose a file",
        type=["csv", "json"],
        help="Upload a CSV or JSON file containing your dataset"
    )

st.markdown("Or use a sample dataset:") |
|
sample_dataset = st.selectbox( |
|
"Select a sample dataset", |
|
["None", "Iris Dataset", "Titanic Dataset", "Boston Housing Dataset"] |
|
) |
|
|
|
|
|
    if uploaded_file is not None:
        try:
            file_extension = uploaded_file.name.split(".")[-1].lower()

            if file_extension == "csv":
                df = pd.read_csv(uploaded_file)
                dataset_type = "csv"
            elif file_extension == "json":
                # Try standard JSON first, then JSON Lines, then a manual parse.
                try:
                    df = pd.read_json(uploaded_file)
                    dataset_type = "json"
                except ValueError:
                    uploaded_file.seek(0)  # rewind the buffer before retrying
                    try:
                        df = pd.read_json(uploaded_file, lines=True)
                        dataset_type = "jsonl"
                    except ValueError:
                        content = json.loads(uploaded_file.getvalue().decode("utf-8"))
                        if isinstance(content, list):
                            df = pd.DataFrame(content)
                        elif isinstance(content, dict):
                            # If the object wraps a list of records, use the first
                            # list found; otherwise treat it as a single record.
                            if any(isinstance(v, list) for v in content.values()):
                                for key, value in content.items():
                                    if isinstance(value, list):
                                        df = pd.DataFrame(value)
                                        break
                            else:
                                df = pd.DataFrame([content])
                        else:
                            st.error("Unsupported JSON structure: expected a list or an object.")
                            return
                        dataset_type = "json"
            else:
                st.error(f"Unsupported file format: {file_extension}")
                return

            # Store the parsed dataset and its metadata in the session state.
            st.session_state.dataset = df
            st.session_state.dataset_name = uploaded_file.name
            st.session_state.dataset_type = dataset_type
            st.session_state.dataset_info = get_dataset_info(df)

        except Exception as e:
            st.error(f"Error loading dataset: {str(e)}")

elif sample_dataset != "None": |
|
try: |
|
if sample_dataset == "Iris Dataset": |
|
|
|
from sklearn.datasets import load_iris |
|
iris = load_iris() |
|
df = pd.DataFrame(data=iris.data, columns=iris.feature_names) |
|
df['target'] = iris.target |
|
dataset_type = "csv" |
|
|
|
elif sample_dataset == "Titanic Dataset": |
|
|
|
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv" |
|
df = pd.read_csv(url) |
|
dataset_type = "csv" |
|
|
|
elif sample_dataset == "Boston Housing Dataset": |
|
|
|
from sklearn.datasets import fetch_california_housing |
|
housing = fetch_california_housing() |
|
df = pd.DataFrame(data=housing.data, columns=housing.feature_names) |
|
df['target'] = housing.target |
|
dataset_type = "csv" |
|
|
|
|
|
st.session_state.dataset = df |
|
st.session_state.dataset_name = sample_dataset |
|
st.session_state.dataset_type = dataset_type |
|
st.session_state.dataset_info = get_dataset_info(df) |
|
|
|
except Exception as e: |
|
st.error(f"Error loading sample dataset: {str(e)}") |