# csv-plus-plus / app.py
# Author: baconnier; commit 179691f ("Update app.py", verified); 8.16 kB
import gradio as gr
import pandas as pd
import sweetviz as sv
import tempfile
import os
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from autoviz.AutoViz_Class import AutoViz_Class
import shutil
import warnings
warnings.filterwarnings('ignore')
class DataAnalyzer:
    """Produce embeddable HTML reports (Sweetviz and AutoViz) for a DataFrame.

    Each instance creates its own scratch directory with ``tempfile.mkdtemp``.
    Report files are written there and removed again once their content has
    been read back into memory.
    """

    # Frames with more rows than this are randomly sampled before AutoViz runs.
    _MAX_AUTOVIZ_ROWS = 5000

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        self.df = None
        self.AV = AutoViz_Class()

    @staticmethod
    def _coerce_numeric_strings(df):
        """Return a copy of *df* with fully numeric text columns converted.

        Only object/string-typed columns are considered. A column is replaced
        by its numeric version when every value parses; otherwise it is kept
        unchanged. The input frame is never modified.
        """
        converted = df.copy()
        for col in converted.columns:
            series = converted[col]
            is_text = (
                pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series)
            )
            if not is_text:
                continue
            try:
                converted[col] = pd.to_numeric(series)
            except (ValueError, TypeError):
                # Not a purely numeric column: leave it as it is.
                continue
        return converted

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on *df* and return the report as an HTML string.

        Args:
            df: The DataFrame to analyze, or ``None`` if nothing was uploaded.

        Returns:
            The Sweetviz HTML wrapped in a single-cell table with a scrollable
            800px-high container, or a plain prompt string when *df* is None.
        """
        if df is None:
            return "Please upload a dataset first"

        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report = sv.analyze(df)
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the file even when analysis or reading fails part-way.
            if os.path.exists(report_path):
                os.remove(report_path)

        return f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

    def generate_autoviz_report(self, df):
        """Run AutoViz on *df* and return the combined charts as HTML.

        Args:
            df: The DataFrame to analyze, or ``None`` if nothing was uploaded.

        Returns:
            One HTML string: the concatenated non-empty ``.html`` files that
            AutoViz saved, a notice when none were produced, an error panel
            when any step raised, or a plain prompt string when *df* is None.
        """
        from html import escape

        if df is None:
            return "Please upload a dataset first"

        viz_temp_dir = os.path.join(self.temp_dir, "autoviz_output")
        # Start from an empty output directory so stale charts are not reused.
        if os.path.exists(viz_temp_dir):
            shutil.rmtree(viz_temp_dir)
        os.makedirs(viz_temp_dir)

        try:
            # Sample data if it's too large
            if len(df) > self._MAX_AUTOVIZ_ROWS:
                df = df.sample(n=self._MAX_AUTOVIZ_ROWS, random_state=42)
            # Convert numeric columns that might be stored as strings
            df = self._coerce_numeric_strings(df)

            plt.close('all')  # Close any existing plots

            # The returned frame is not needed; only the saved chart files are.
            self.AV.AutoViz(
                filename='',
                sep=',',
                depVar='',
                dfte=df,
                header=0,
                verbose=1,  # Set to 1 to see progress
                lowess=False,
                chart_format='html',
                max_rows_analyzed=5000,
                max_cols_analyzed=30,
                save_plot_dir=viz_temp_dir,
                ignore_warnings=True,
                sampling=True,  # Enable sampling
                sample_size=5000,
            )

            # Collect and combine HTML files
            html_parts = []
            if os.path.exists(viz_temp_dir):
                for name in sorted(os.listdir(viz_temp_dir)):
                    if not name.endswith('.html'):
                        continue
                    file_path = os.path.join(viz_temp_dir, name)
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()
                    except (OSError, UnicodeDecodeError) as e:
                        # One unreadable chart should not discard the others.
                        print(f"Error reading file {name}: {str(e)}")
                        continue
                    if content.strip():
                        html_parts.append(content)

            if not html_parts:
                return """
                <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                <h3>No visualizations were generated</h3>
                <p>This might be due to:</p>
                <ul>
                <li>Data format issues</li>
                <li>Too few unique values in columns</li>
                <li>All categorical data with high cardinality</li>
                </ul>
                <p>Try with a different dataset or check your data formatting.</p>
                </div>
                """

            # Combine all HTML content with proper styling
            return f"""
            <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
            <h2 style="text-align: center;">AutoViz Analysis Report</h2>
            <p style="text-align: center;">Analysis of {len(df)} rows and {len(df.columns)} columns</p>
            <hr>
            {'<hr>'.join(html_parts)}
            </div>
            """
        except Exception as e:
            # UI boundary: report the failure instead of raising. The message
            # is escaped because it may echo column names or cell values.
            return f"""
            <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
            <h3>Error in AutoViz Analysis</h3>
            <p>Error details: {escape(str(e))}</p>
            <p>Troubleshooting steps:</p>
            <ul>
            <li>Check if your data contains valid numerical or categorical values</li>
            <li>Ensure there are no completely empty columns</li>
            <li>Try with a smaller dataset</li>
            <li>Check for any special characters in column names</li>
            </ul>
            </div>
            """
        finally:
            if os.path.exists(viz_temp_dir):
                shutil.rmtree(viz_temp_dir)
def create_interface():
    """Build the Gradio dashboard and return it without launching.

    The app has three tabs: CSV upload with a preview of the first rows, a
    Sweetviz report, and an AutoViz report. The uploaded DataFrame is kept in
    a ``gr.State`` so both report buttons work on the same data.

    Returns:
        The configured ``gr.Blocks`` instance.
    """
    analyzer = DataAnalyzer()

    def load_data(file):
        """Read the uploaded CSV; return ``(preview, full_frame)``.

        Returns ``(None, None)`` when nothing is uploaded or the file cannot
        be parsed. In the latter case a warning is shown to the user.
        """
        if file is None:
            return None, None
        # Accept both an upload object exposing ``.name`` and a plain path.
        path = getattr(file, "name", file)
        try:
            df = pd.read_csv(path)
        except Exception as exc:
            # UI boundary: tell the user why the preview is empty instead of
            # failing silently.
            gr.Warning(f"Could not read the uploaded file as CSV: {exc}")
            return None, None
        return df.head(), df

    # NOTE(review): the original indentation was lost, so the exact nesting of
    # components inside each gr.Row is an assumption (inputs in the row,
    # outputs below it). Confirm against the intended layout.
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")

        # Store the dataframe in a state variable
        current_df = gr.State(None)

        with gr.Tabs():
            # First Tab: Data Upload & Preview
            with gr.TabItem("Data Upload & Preview"):
                with gr.Row():
                    file_input = gr.File(label="Upload CSV")
                data_preview = gr.Dataframe(label="Data Preview", interactive=False)

                file_input.change(
                    fn=load_data,
                    inputs=[file_input],
                    outputs=[data_preview, current_df],
                )

            # Second Tab: Sweetviz Analysis
            with gr.TabItem("Sweetviz Analysis"):
                with gr.Row():
                    sweetviz_button = gr.Button("Generate Sweetviz Report")
                sweetviz_output = gr.HTML(label="Sweetviz Report")

                # The analyzer method already handles a missing dataset.
                sweetviz_button.click(
                    fn=analyzer.generate_sweetviz_report,
                    inputs=[current_df],
                    outputs=[sweetviz_output],
                )

            # Third Tab: AutoViz Analysis
            with gr.TabItem("AutoViz Analysis"):
                with gr.Row():
                    autoviz_button = gr.Button("Generate AutoViz Report")
                autoviz_output = gr.HTML(label="AutoViz Report")

                # The analyzer method already handles a missing dataset.
                autoviz_button.click(
                    fn=analyzer.generate_autoviz_report,
                    inputs=[current_df],
                    outputs=[autoviz_output],
                )

    return demo
if __name__ == "__main__":
    # Script entry point: build the dashboard and start the Gradio server.
    demo = create_interface()
    demo.launch(show_error=True)