Spaces:
Running
Running
File size: 5,368 Bytes
0cb60c7 67f471c 771365f 0cb60c7 c9d2489 67f471c 771365f 0cb60c7 771365f 0cb60c7 9a72b36 830b865 acc4e78 276ed24 9a72b36 830b865 276ed24 0cb60c7 771365f 0cb60c7 830b865 0cb60c7 276ed24 771365f 0cb60c7 9138597 0cb60c7 771365f 0cb60c7 771365f 0cb60c7 771365f 0cb60c7 9138597 771365f 0cb60c7 9a72b36 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import gradio as gr
import pandas as pd
import sweetviz as sv
import tempfile
import os
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
import io
import base64
class DataAnalyzer:
    """Build a Sweetviz report and UMAP plots for a pandas DataFrame.

    The DataFrame passed to ``generate_sweetviz_report`` is kept on the
    instance (``self.df``) so that ``encode_and_visualize`` can later work on
    one of its columns.
    """

    # Accepted ``encoder_type`` values mapped to the name of the matching
    # class in ``category_encoders``. The class is looked up on demand so that
    # only the requested encoder is instantiated.
    _ENCODER_CLASS_NAMES = {
        'binary': 'BinaryEncoder',
        'onehot': 'OneHotEncoder',
        'catboost': 'CatBoostEncoder',
        'count': 'CountEncoder',
    }

    def __init__(self):
        # Scratch directory that holds the intermediate Sweetviz HTML file.
        self.temp_dir = tempfile.mkdtemp()
        # DataFrame of the latest report; None until a report was requested.
        self.df = None

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on *df* and return the report as an HTML fragment.

        Args:
            df: DataFrame to analyse. It is stored on the instance for later
                calls to ``encode_and_visualize``.

        Returns:
            A string containing the Sweetviz HTML wrapped in a single-cell
            table with a scrollable, 800px-high container.
        """
        self.df = df  # Store DataFrame for other analyses
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # The file is only a hand-off from Sweetviz; remove it even when
            # writing or reading it failed so it does not pile up on disk.
            if os.path.exists(report_path):
                os.remove(report_path)

        return f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

    def encode_and_visualize(self, column_name, encoder_type='binary'):
        """Encode one column, project it with UMAP and render a scatter plot.

        Args:
            column_name: Name of the column in the stored DataFrame to encode.
            encoder_type: One of ``'binary'``, ``'onehot'``, ``'catboost'`` or
                ``'count'``.

        Returns:
            An ``io.BytesIO`` positioned at offset 0 that contains the plot as
            PNG data, or ``None`` when no DataFrame has been stored yet or the
            column does not exist in it.

        Raises:
            ValueError: If *encoder_type* is not one of the supported names.
        """
        if self.df is None or column_name not in self.df.columns:
            return None

        # Fail with a clear message instead of an AttributeError on None.
        if encoder_type not in self._ENCODER_CLASS_NAMES:
            supported = ', '.join(sorted(self._ENCODER_CLASS_NAMES))
            raise ValueError(
                f"Unknown encoder_type {encoder_type!r}; expected one of: {supported}"
            )

        # Create DataFrame with only the selected column
        df_subset = self.df[[column_name]].copy()

        # NOTE(review): CatBoostEncoder is a target-based encoder and no
        # target is passed here, so the 'catboost' option presumably fails in
        # fit_transform -- confirm against the category_encoders version used.
        encoder = getattr(ce, self._ENCODER_CLASS_NAMES[encoder_type])()
        encoded_df = encoder.fit_transform(df_subset)

        # Scale the encoded features
        scaled_data = StandardScaler().fit_transform(encoded_df)

        # Apply UMAP (fixed random_state keeps the layout reproducible)
        reducer = umap.UMAP(
            n_neighbors=15,
            min_dist=0.1,
            n_components=2,
            random_state=42,
        )
        embedding = reducer.fit_transform(scaled_data)

        # Create visualization on an explicit figure so it can always be
        # closed, even when drawing or saving raises.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            scatter = ax.scatter(
                embedding[:, 0],
                embedding[:, 1],
                # Colour each point by the integer code of its category.
                c=pd.factorize(df_subset[column_name])[0],
                cmap='viridis',
                alpha=0.6,
            )
            fig.colorbar(scatter, ax=ax)
            ax.set_title(f'UMAP visualization of {column_name}\nusing {encoder_type} encoding')
            ax.set_xlabel('UMAP1')
            ax.set_ylabel('UMAP2')

            # Save plot to bytes
            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)

        buf.seek(0)
        return buf
def create_interface():
    """Build the Gradio dashboard and wire its event handlers.

    The UI has two tabs: one that shows a Sweetviz report for an uploaded CSV
    file, and one that shows a UMAP plot for a chosen categorical column and
    encoder. A single ``DataAnalyzer`` instance is shared by all handlers.

    Returns:
        The ``gr.Blocks`` application, not yet launched.
    """
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")

        with gr.Tabs():
            with gr.TabItem("Sweetviz Analysis"):
                file_input = gr.File(label="Upload CSV")
                report_html = gr.HTML()

            with gr.TabItem("Categorical Analysis"):
                with gr.Row():
                    column_dropdown = gr.Dropdown(
                        label="Select Categorical Column",
                        choices=[],
                        interactive=True
                    )
                    encoder_dropdown = gr.Dropdown(
                        label="Select Encoder",
                        choices=['binary', 'onehot', 'catboost', 'count'],
                        value='binary',
                        interactive=True
                    )
                plot_output = gr.Image(label="UMAP Visualization")

        def process_file(file):
            """Read the uploaded CSV; return (report HTML, dropdown update)."""
            if file is None:
                # Upload was cleared: drop the choices and any stale selection.
                return None, gr.Dropdown(choices=[], value=None)
            try:
                # Accept both an object exposing ``.name`` and a plain path.
                df = pd.read_csv(getattr(file, "name", file))
                # Get categorical columns
                cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
                report = analyzer.generate_sweetviz_report(df)
                # Reset the selection: the previous column may not exist in
                # the newly uploaded file.
                return report, gr.Dropdown(choices=cat_columns, value=None)
            except Exception as e:
                return f"Error generating report: {str(e)}", gr.Dropdown(choices=[], value=None)

        def update_plot(column, encoder_type):
            """Return the UMAP plot as an image array, or None if unavailable."""
            if not column:
                return None
            try:
                plot_bytes = analyzer.encode_and_visualize(column, encoder_type)
                if plot_bytes is None:
                    return None
                # gr.Image does not take a BytesIO value, so decode the PNG
                # into a numpy array before handing it to the component.
                image = plt.imread(plot_bytes, format='png')
            except Exception as e:
                # Keep the best-effort behaviour (empty plot) but tell the
                # user why instead of failing silently.
                gr.Warning(f"Could not generate plot: {e}")
                return None
            # PNG data is decoded as floats in [0, 1]; convert to 8-bit.
            if np.issubdtype(image.dtype, np.floating):
                image = (image * 255).round().astype(np.uint8)
            return image

        file_input.change(
            fn=process_file,
            inputs=[file_input],
            outputs=[report_html, column_dropdown]
        )
        # Redraw whenever either the column or the encoder selection changes.
        column_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output]
        )
        encoder_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output]
        )

    return demo
if __name__ == "__main__":
    # Build the dashboard and start serving it when run as a script.
    demo = create_interface()
    demo.launch(show_error=True)