Spaces:
Running
Running
""" | |
Dataset version control UI component for the ML Dataset & Code Generation Manager. | |
Provides UI for viewing, comparing, and restoring dataset versions. | |
""" | |
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import datetime | |
import hashlib | |
import plotly.express as px | |
from pathlib import Path | |
from typing import Dict, List, Optional, Tuple, Any | |
from database import version_control | |
def _rerun_app() -> None:
    """Restart the Streamlit script using whichever rerun API is available."""
    # Newer Streamlit exposes st.rerun; st.experimental_rerun is the legacy
    # name. Prefer the current API and fall back so older installs keep working.
    rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun")
    rerun()


def _format_timestamp(timestamp) -> str:
    """Format a version timestamp the way every label in this UI shows it."""
    return timestamp.strftime("%Y-%m-%d %H:%M:%S")


def _render_version_comparison(comparison: Dict[str, Any]) -> None:
    """Render the comparison stored in st.session_state["version_comparison"].

    The keys read here are the ones this UI expects from
    version_control.compare_versions; the optional column sections are
    skipped when their key is missing or empty.
    """
    st.subheader("Version Comparison")

    left, right = st.columns(2)
    with left:
        st.write(f"**Version 1:** {comparison['version1']}")
        st.write(f"**Date:** {_format_timestamp(comparison['version1_timestamp'])}")
    with right:
        st.write(f"**Version 2:** {comparison['version2']}")
        st.write(f"**Date:** {_format_timestamp(comparison['version2_timestamp'])}")

    # Show the signed delta once (previously rendered as e.g. "5 (+5)").
    rows_diff = comparison["rows_diff"]
    st.write(f"**Rows Changed:** {'+' if rows_diff > 0 else ''}{rows_diff}")

    columns_added = comparison.get("columns_added")
    if columns_added:
        st.write("**Columns Added:**")
        for col in columns_added:
            st.write(f"- {col}")

    columns_removed = comparison.get("columns_removed")
    if columns_removed:
        st.write("**Columns Removed:**")
        for col in columns_removed:
            st.write(f"- {col}")

    columns_diff = comparison.get("columns_diff")
    if columns_diff:
        st.write("**Columns Changed:**")
        for col, diff in columns_diff.items():
            if diff.get("type_changed", False):
                st.write(f"- {col}: Type changed from {diff['type1']} to {diff['type2']}")
            elif diff.get("values_changed", False):
                st.write(f"- {col}: Values changed")

    if st.button("Clear Comparison"):
        del st.session_state["version_comparison"]
        _rerun_app()


def render_version_control_ui(dataset_id: int, df: Optional[pd.DataFrame] = None):
    """
    Render the version control UI for a dataset.

    Shows the version history table, lets the user inspect, view, restore and
    compare versions, and renders any "view" or "comparison" result kept in
    ``st.session_state`` under the keys ``viewing_version_df``,
    ``viewing_version_id`` and ``version_comparison``.

    Args:
        dataset_id: ID of the dataset
        df: Current DataFrame of the dataset (optional). Without it the
            initial version cannot be created and restore is refused.
    """
    st.header("Dataset Version Control")

    # Get all versions of the dataset
    versions = version_control.get_versions(dataset_id)
    if not versions:
        st.info("No versions found for this dataset. Save changes to create the first version.")
        if df is not None and st.button("Create Initial Version"):
            version = version_control.create_version(
                dataset_id=dataset_id,
                df=df,
                description="Initial version",
            )
            st.success(f"Created initial version: {version.version_id}")
            _rerun_app()
        return

    # Display version history
    st.subheader("Version History")
    history_df = pd.DataFrame(
        [
            {
                "Version ID": v.version_id,
                "Date": _format_timestamp(v.timestamp),
                "Rows": v.metadata.get("rows", "N/A"),
                "Columns": v.metadata.get("columns", "N/A"),
                "Description": v.description,
            }
            for v in versions
        ]
    )
    st.dataframe(history_df, use_container_width=True)

    # Index versions once so labels and lookups do not rescan the list for
    # every option; setdefault keeps the first entry for a given ID.
    versions_by_id: Dict[Any, Any] = {}
    for v in versions:
        versions_by_id.setdefault(v.version_id, v)
    version_ids = [v.version_id for v in versions]

    def _version_label(version_id) -> str:
        match = versions_by_id.get(version_id)
        stamp = _format_timestamp(match.timestamp) if match is not None else ""
        return f"{version_id} - {stamp}"

    # Version actions section
    st.subheader("Version Actions")
    col1, col2 = st.columns(2)

    with col1:
        selected_version = st.selectbox(
            "Select Version",
            options=version_ids,
            format_func=_version_label,
        )
        # Get selected version object
        selected_v = versions_by_id.get(selected_version)
        if selected_v:
            st.write(f"**Description:** {selected_v.description}")
            st.write(f"**Created:** {_format_timestamp(selected_v.timestamp)}")
            # Display metadata
            if selected_v.metadata:
                with st.expander("Version Metadata"):
                    for key, value in selected_v.metadata.items():
                        if key != "column_names":  # Show column names separately
                            st.write(f"**{key}:** {value}")
                    if "column_names" in selected_v.metadata:
                        st.write("**Columns:**")
                        # Column labels are not guaranteed to be strings.
                        st.write(", ".join(str(name) for name in selected_v.metadata["column_names"]))

    with col2:
        st.write("**Actions:**")
        if selected_v:
            # Load selected version; kept in session state so the table
            # survives the rerun triggered by the next widget interaction.
            if st.button("View Version Data"):
                loaded_df = version_control.load_version_data(selected_v)
                st.session_state["viewing_version_df"] = loaded_df
                st.session_state["viewing_version_id"] = selected_v.version_id

            # Restore version
            if st.button("Restore This Version"):
                if df is not None:
                    description = st.session_state.get(
                        "restore_description",
                        f"Restored from {selected_v.version_id}",
                    )
                    new_version = version_control.restore_version(
                        dataset_id=dataset_id,
                        version_id=selected_v.version_id,
                        description=description,
                    )
                    st.success(
                        f"Restored version {selected_v.version_id} as new version {new_version.version_id}"
                    )
                    _rerun_app()
                else:
                    st.error("Cannot restore version: No dataset provided")

        # Compare versions
        if len(versions) > 1:
            st.write("**Compare Versions:**")
            compare_v1 = st.selectbox("Version 1", options=version_ids, key="compare_v1")
            compare_v2 = st.selectbox("Version 2", options=version_ids, key="compare_v2")
            if st.button("Compare Versions"):
                if compare_v1 != compare_v2:
                    comparison = version_control.compare_versions(
                        dataset_id=dataset_id,
                        version_id1=compare_v1,
                        version_id2=compare_v2,
                    )
                    st.session_state["version_comparison"] = comparison
                else:
                    st.warning("Please select different versions to compare")

    # Show version data if requested
    if "viewing_version_df" in st.session_state:
        st.subheader(f"Data for Version: {st.session_state['viewing_version_id']}")
        st.dataframe(st.session_state["viewing_version_df"], use_container_width=True)
        if st.button("Clear Version View"):
            del st.session_state["viewing_version_df"]
            del st.session_state["viewing_version_id"]
            _rerun_app()

    # Show version comparison if requested
    if "version_comparison" in st.session_state:
        _render_version_comparison(st.session_state["version_comparison"])
def render_save_version_ui(dataset_id: int, df: pd.DataFrame):
    """
    Render UI for saving a new version of a dataset.

    When a previous version exists, a best-effort summary of the row/column
    count differences is shown first, plus a notice when the content hash of
    ``df`` equals the ``content_hash`` stored in that version's metadata.

    Args:
        dataset_id: ID of the dataset
        df: DataFrame to save

    Returns:
        The value returned by ``version_control.create_version`` when the
        "Save Version" button was pressed during this run, otherwise ``None``.
    """
    st.subheader("Save Current Version")

    # Get latest version if any
    latest_version = version_control.get_latest_version(dataset_id)

    # Calculate changes if a previous version exists
    if latest_version:
        try:
            prev_df = version_control.load_version_data(latest_version)
            rows_diff = len(df) - len(prev_df)
            cols_diff = len(df.columns) - len(prev_df.columns)
            st.write("Changes from last version:")
            st.write(f"- Rows: {'+' if rows_diff > 0 else ''}{rows_diff}")
            st.write(f"- Columns: {'+' if cols_diff > 0 else ''}{cols_diff}")

            # Check content hash (first 8 hex digits of the MD5 of the JSON dump)
            current_hash = hashlib.md5(df.to_json().encode()).hexdigest()[:8]
            if current_hash == latest_version.metadata.get("content_hash"):
                st.info("No changes detected in the data content since the last version.")
        except Exception:
            # The comparison is informational only, so any ordinary failure is
            # downgraded to a warning. A bare `except:` would also trap
            # KeyboardInterrupt/SystemExit and other BaseException-derived
            # control-flow signals, which must be allowed to propagate.
            st.warning("Could not compare with previous version.")

    # Input for version description
    description = st.text_area(
        "Version Description",
        placeholder="Describe the changes in this version",
        key="version_description",
    )

    # Save button
    if st.button("Save Version"):
        version = version_control.create_version(
            dataset_id=dataset_id,
            df=df,
            description=description,
        )
        st.success(f"Created new version: {version.version_id}")
        return version

    return None
def render_version_visualization(dataset_id: int):
    """
    Plot how a dataset's row and column counts evolve across its versions.

    Fetches every version via ``version_control.get_versions`` and draws two
    line charts over the version timestamps: one for the "rows" metadata
    entry and one for the "columns" metadata entry (each defaulting to 0 when
    absent). Shows an info message instead when there are no versions.

    Args:
        dataset_id: ID of the dataset
    """
    all_versions = version_control.get_versions(dataset_id)
    if not all_versions:
        st.info("No versions available to visualize.")
        return

    st.subheader("Version Metrics Visualization")

    # One record per version; the full ID and description feed the hover text.
    records = [
        {
            "Version": item.version_id[:8] + "...",  # Truncated ID for display
            "Date": item.timestamp,
            "Rows": item.metadata.get("rows", 0),
            "Columns": item.metadata.get("columns", 0),
            "Full Version ID": item.version_id,  # For tooltip
            "Description": item.description,
        }
        for item in all_versions
    ]
    viz_df = pd.DataFrame(records)

    # Both charts share every setting except the plotted metric and the title.
    chart_specs = (
        ("Rows", "Dataset Size (Rows) Across Versions"),
        ("Columns", "Dataset Structure (Columns) Across Versions"),
    )
    for metric, chart_title in chart_specs:
        figure = px.line(
            viz_df,
            x="Date",
            y=metric,
            title=chart_title,
            markers=True,
            hover_data=["Full Version ID", "Description"],
        )
        st.plotly_chart(figure, use_container_width=True)