# csv-plus-plus / app.py
# Hugging Face file-viewer header captured together with the source:
#   baconnier's picture | Update app.py | b2f41cc verified | raw | history blame | 9.23 kB
import gradio as gr
import pandas as pd
import sweetviz as sv
import tempfile
import os
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from autoviz.AutoViz_Class import AutoViz_Class
import shutil
import warnings
warnings.filterwarnings('ignore')
class DataAnalyzer:
    """Build HTML analysis reports (Sweetviz and AutoViz) for a pandas DataFrame.

    Each instance owns a scratch directory (``temp_dir``). The report
    libraries write their output there; it is read back, returned as an HTML
    string, and the per-call files are removed again.
    """

    # Returned by both report generators when no DataFrame is available.
    NO_DATA_MESSAGE = "Please upload a dataset first"
    # Row cap applied (by random sampling) before AutoViz runs.
    MAX_ROWS = 5000
    # Column cap handed to AutoViz as ``max_cols_analyzed``.
    MAX_COLS = 30
    # Text columns with fewer distinct values than this become categorical.
    MAX_CATEGORIES = 50

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        self.df = None
        self.AV = AutoViz_Class()

    def generate_sweetviz_report(self, df):
        """Return the Sweetviz report for *df* wrapped in a scrollable HTML table.

        Args:
            df: DataFrame to analyze, or ``None``.

        Returns:
            The report HTML, or ``NO_DATA_MESSAGE`` when *df* is ``None``.
            Errors raised by Sweetviz or by file I/O propagate to the caller.
        """
        if df is None:
            return self.NO_DATA_MESSAGE
        self.df = df
        report = sv.analyze(df)

        # A unique file per call, so overlapping requests on the shared
        # analyzer cannot overwrite or delete each other's report.
        fd, report_path = tempfile.mkstemp(suffix=".html", dir=self.temp_dir)
        os.close(fd)
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the scratch file even when rendering or reading fails.
            os.remove(report_path)

        return f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

    @staticmethod
    def _prepare_dataframe(df, max_rows=5000, max_categories=50):
        """Return a cleaned copy of *df* for plotting; *df* itself is not modified.

        Steps, in order:
          1. Text columns are parsed as numbers if every value parses,
             otherwise as datetimes if every value parses, otherwise left alone.
             Non-text columns are never reinterpreted.
          2. Every datetime column is replaced by ``<col>_year`` and
             ``<col>_month`` columns.
          3. Remaining text columns with fewer than *max_categories* distinct
             values become ``category`` dtype.
          4. If more than *max_rows* rows remain, a fixed-seed random sample
             of *max_rows* rows is kept.
        """
        types = pd.api.types

        def is_text(series):
            return types.is_object_dtype(series) or types.is_string_dtype(series)

        df = df.copy()

        for col in list(df.columns):
            if not is_text(df[col]):
                # Numeric columns must be skipped: pd.to_datetime would read
                # plain integers as nanoseconds since 1970.
                continue
            try:
                df[col] = pd.to_numeric(df[col])
                continue
            except (ValueError, TypeError):
                pass  # not numeric; try datetime next
            try:
                df[col] = pd.to_datetime(df[col])
            except (ValueError, TypeError, OverflowError):
                pass  # not a date either; keep the column as text

        datetime_columns = [
            col for col in df.columns
            if types.is_datetime64_any_dtype(df[col])
        ]
        for col in datetime_columns:
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_month'] = df[col].dt.month
            df = df.drop(columns=[col])

        for col in list(df.columns):
            if is_text(df[col]) and df[col].nunique() < max_categories:
                df[col] = df[col].astype('category')

        if len(df) > max_rows:
            df = df.sample(n=max_rows, random_state=42)
        return df

    @staticmethod
    def _collect_visualizations(directory):
        """Return the non-blank contents of every .html/.svg file under *directory*.

        The tree is searched recursively, in sorted order, because the
        plotting library may save into a sub-folder of the directory it is
        given. Unreadable files are reported on stdout and skipped.
        """
        parts = []
        for root, dirs, files in os.walk(directory):
            dirs.sort()  # deterministic traversal order
            for name in sorted(files):
                if not name.endswith(('.html', '.svg')):
                    continue
                try:
                    with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
                        content = f.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Error reading file {name}: {str(e)}")
                    continue
                if content.strip():
                    parts.append(content)
        return parts

    def generate_autoviz_report(self, df):
        """Run AutoViz on a preprocessed copy of *df* and return the charts as HTML.

        Args:
            df: DataFrame to analyze, or ``None``.

        Returns:
            ``NO_DATA_MESSAGE`` when *df* is ``None``; otherwise an HTML
            fragment holding either the collected charts, a data summary when
            no chart files were found, or an error panel with the stack trace
            if anything in the pipeline raised.
        """
        if df is None:
            return self.NO_DATA_MESSAGE

        # Function-scope imports keep the class independent of the module header.
        import html
        import traceback

        # Private output directory per call; removed in ``finally``.
        viz_temp_dir = tempfile.mkdtemp(prefix="autoviz_", dir=self.temp_dir)
        try:
            df = self._prepare_dataframe(
                df,
                max_rows=self.MAX_ROWS,
                max_categories=self.MAX_CATEGORIES,
            )

            # Debug output for the server log.
            print("\nDataset Info:")
            df.info()  # prints by itself; wrapping it in print() adds "None"
            print("\nColumn Types:")
            print(df.dtypes)

            plt.close('all')

            # NOTE(review): verbose=2 is AutoViz's documented "save charts
            # instead of displaying them" mode, which is what the collection
            # step below relies on - confirm against the installed version.
            self.AV.AutoViz(
                filename='',
                sep=',',
                depVar='',
                dfte=df,
                header=0,
                verbose=2,
                lowess=False,
                chart_format='svg',
                max_rows_analyzed=self.MAX_ROWS,
                max_cols_analyzed=self.MAX_COLS,
                save_plot_dir=viz_temp_dir,
            )

            html_parts = self._collect_visualizations(viz_temp_dir)
            # Column names come from the uploaded file: escape before embedding.
            types_text = html.escape(df.dtypes.to_string())

            if not html_parts:
                return f"""
                <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                <h3>Data Summary</h3>
                <p>Total Rows: {len(df)}</p>
                <p>Total Columns: {len(df.columns)}</p>
                <p>Column Types:</p>
                <pre>{types_text}</pre>
                <hr>
                <h3>No visualizations were generated</h3>
                <p>This might be due to:</p>
                <ul>
                <li>All columns being categorical with high cardinality</li>
                <li>No numeric columns for analysis</li>
                <li>Data format not suitable for visualization</li>
                </ul>
                </div>
                """

            return f"""
            <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
            <h2 style="text-align: center;">AutoViz Analysis Report</h2>
            <div style="margin: 20px;">
            <h3>Dataset Summary</h3>
            <p>Rows analyzed: {len(df)}</p>
            <p>Columns: {len(df.columns)}</p>
            <p>Column Types:</p>
            <pre>{types_text}</pre>
            </div>
            <hr>
            {'<hr>'.join(html_parts)}
            </div>
            """
        except Exception as e:
            # Top-level boundary: turn any failure into a visible error panel.
            # Built without indentation inside <pre> so the text renders flush left.
            dataset_info = html.escape(
                f"Rows: {len(df)}\n"
                f"Columns: {len(df.columns)}\n"
                f"Types:\n{df.dtypes.to_string()}"
            )
            return (
                '<div style="padding: 20px; border: 1px solid red; border-radius: 5px;">\n'
                '<h3>Error in AutoViz Analysis</h3>\n'
                f'<p>Error details: {html.escape(str(e))}</p>\n'
                '<p>Stack trace:</p>\n'
                f'<pre>{html.escape(traceback.format_exc())}</pre>\n'
                '<p>Dataset Info:</p>\n'
                f'<pre>\n{dataset_info}\n</pre>\n'
                '</div>\n'
            )
        finally:
            shutil.rmtree(viz_temp_dir, ignore_errors=True)
def create_interface():
    """Build and return the Gradio Blocks dashboard.

    The dashboard has three tabs: CSV upload with a preview of the first
    rows, a Sweetviz report, and an AutoViz report. The uploaded DataFrame is
    kept in a ``gr.State`` so both report tabs read the same data.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    analyzer = DataAnalyzer()

    # NOTE(review): the original indentation was lost, so which components sit
    # inside each gr.Row() is reconstructed - confirm against the intended layout.
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")

        # Store the dataframe in a state variable
        current_df = gr.State(None)

        with gr.Tabs():
            # First Tab: Data Upload & Preview
            with gr.TabItem("Data Upload & Preview"):
                with gr.Row():
                    file_input = gr.File(label="Upload CSV")
                    data_preview = gr.Dataframe(label="Data Preview", interactive=False)

                def load_data(file):
                    """Read the uploaded CSV; return (preview rows, full DataFrame)."""
                    if file is None:
                        return None, None
                    # The upload may arrive as an object exposing ``.name`` or
                    # as a plain path string; accept either.
                    path = getattr(file, "name", file)
                    try:
                        df = pd.read_csv(path)
                    except Exception as e:
                        # Best-effort: clear preview and state, but leave a
                        # trace in the log instead of failing silently.
                        print(f"Error loading CSV {path!r}: {e}")
                        return None, None
                    return df.head(), df

                file_input.change(
                    fn=load_data,
                    inputs=[file_input],
                    outputs=[data_preview, current_df],
                )

            # Second Tab: Sweetviz Analysis
            with gr.TabItem("Sweetviz Analysis"):
                with gr.Row():
                    sweetviz_button = gr.Button("Generate Sweetviz Report")
                    sweetviz_output = gr.HTML(label="Sweetviz Report")

                # The analyzer method already returns the "upload first"
                # message for None, so no wrapper function is needed.
                sweetviz_button.click(
                    fn=analyzer.generate_sweetviz_report,
                    inputs=[current_df],
                    outputs=[sweetviz_output],
                )

            # Third Tab: AutoViz Analysis
            with gr.TabItem("AutoViz Analysis"):
                with gr.Row():
                    autoviz_button = gr.Button("Generate AutoViz Report")
                    autoviz_output = gr.HTML(label="AutoViz Report")

                autoviz_button.click(
                    fn=analyzer.generate_autoviz_report,
                    inputs=[current_df],
                    outputs=[autoviz_output],
                )

    return demo
# Script entry point: build the dashboard and start the Gradio server.
if __name__ == "__main__":
    demo = create_interface()
    # show_error=True is forwarded to Gradio's launch().
    demo.launch(show_error=True)