# Uploaded by: shukdevdattaEX
# Commit message: Update app.py
# Commit: 644cdff (verified)
import gradio as gr #
import pandas as pd
import aiohttp
import asyncio
import json
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import Optional, Tuple, Dict, Any, List
import logging
from datetime import datetime, timedelta
import re
from jinja2 import Template
import markdown
import zipfile
import io
import base64
from scipy import stats
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Root logging setup: INFO and above, rendered as
# "timestamp - logger name - level - message".
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
class AdvancedDataAnalyzer:
def __init__(self):
    """Initialise the endpoint, upload limits and per-session state."""
    bytes_per_megabyte = 1024 * 1024

    # Chat-completions endpoint that analysis requests are posted to.
    self.api_base_url = "https://llm.chutes.ai/v1/chat/completions"

    # Upload constraints read by validate_file.
    self.max_file_size = 100 * bytes_per_megabyte  # 100MB
    self.supported_formats = ['.csv', '.xlsx', '.xls', '.json', '.parquet', '.tsv']

    # Mutable state that is filled in as files are processed.
    self.conversation_history = []
    self.analysis_cache = {}
    self.current_df = None
    self.current_charts = None
def validate_api_key(self, api_key: str) -> Tuple[bool, str]:
    """Run a cheap, offline plausibility check on an API key.

    Returns a ``(is_valid, message)`` pair. A key passes when, after
    trimming whitespace, it has at least 10 characters and either starts
    with a recognised prefix or is longer than 20 characters.
    """
    token = api_key.strip() if api_key else ""
    if len(token) < 10:
        return False, "API key must be at least 10 characters long"

    # Short keys are only accepted when they carry a known prefix.
    has_known_prefix = token.startswith(('sk-', 'pk-', 'Bearer '))
    if not has_known_prefix and len(token) <= 20:
        return False, "API key format appears invalid"

    return True, "Valid API key format"
def validate_file(self, file) -> Tuple[bool, str]:
    """Check an uploaded file's size and extension before it is parsed.

    ``file`` is any object exposing the on-disk path as ``.name``.
    Returns ``(is_valid, message)``; every failure, including an
    unreadable path, is reported through the message rather than raised.
    """
    if not file:
        return False, "No file uploaded"

    try:
        path = file.name
        size_in_bytes = os.path.getsize(path)

        limit = self.max_file_size
        if size_in_bytes > limit:
            return False, f"File too large. Maximum size: {limit // (1024*1024)}MB"
        if size_in_bytes == 0:
            return False, "File is empty"

        _, suffix = os.path.splitext(path)
        if suffix.lower() in self.supported_formats:
            return True, "File validation passed"
        return False, f"Unsupported format. Supported: {', '.join(self.supported_formats)}"
    except Exception as e:
        return False, f"File validation error: {str(e)}"
async def analyze_with_chutes(self, api_token: str, data_summary: str, user_question: Optional[str] = None, analysis_type: str = "comprehensive") -> str:
    """Ask the chat-completions API for a written analysis of ``data_summary``.

    Args:
        api_token: Bearer token for the API; surrounding whitespace is ignored.
        data_summary: Text description of the dataset, embedded in the prompt.
        user_question: Optional question. With ``analysis_type="question"`` it
            is part of the prompt template; otherwise it is appended to it.
        analysis_type: ``"comprehensive"``, ``"quick"`` or ``"question"``.
            Unknown values fall back to ``"comprehensive"``.

    Returns:
        The model's markdown answer, or a markdown-formatted error message.
        Failures are reported in the returned string, never raised.

    Side effects:
        On success a (truncated) record of the exchange is appended to
        ``self.conversation_history``.
    """
    headers = {
        # Tolerate a missing token here; the API then answers 401, which is
        # reported like any other authentication failure.
        "Authorization": f"Bearer {(api_token or '').strip()}",
        "Content-Type": "application/json",
        "User-Agent": "SmartDataAnalyzer/2.0"
    }

    # Prompt templates keyed by analysis type.
    prompts = {
        "comprehensive": f"""You are a senior data scientist with 10+ years of experience. Analyze this dataset comprehensively:
{data_summary}
Provide a thorough analysis with:
1. **Executive Summary**: 3-4 key takeaways for stakeholders
2. **Statistical Insights**: Important numbers, distributions, and what they reveal
3. **Pattern Recognition**: Trends, correlations, seasonality, anomalies
4. **Data Quality Assessment**: Completeness, accuracy, consistency issues
5. **Business Intelligence**: Actionable insights and opportunities
6. **Risk Analysis**: Potential data quality issues or business risks
7. **Recommendations**: Specific, prioritized next steps
Use bullet points, specific numbers, and clear explanations.""",
        "quick": f"""Provide a quick but insightful analysis of this dataset:
{data_summary}
Focus on:
- Top 3 most important findings
- Any obvious patterns or anomalies
- Key business insights
- Quick recommendations
Keep it concise but valuable.""",
        "question": f"""Based on this dataset:
{data_summary}
User's specific question: {user_question}
Provide a detailed, data-driven answer with:
- Direct answer to the question
- Supporting evidence from the data
- Additional related insights
- Specific recommendations
- Follow-up questions to consider"""
    }
    prompt = prompts.get(analysis_type, prompts["comprehensive"])
    if user_question and analysis_type != "question":
        prompt += f"\n\nUser's additional question: {user_question}"

    body = {
        "model": "openai/gpt-oss-20b",
        "messages": [
            {
                "role": "system",
                "content": """You are an expert data scientist and business analyst. Provide clear, actionable insights with specific data points. Use markdown formatting for better readability. Always include:
- Specific numbers and percentages
- Clear section headers
- Bullet points for key insights
- Bold text for important findings
- Recommendations with priority levels"""
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "stream": True,
        "max_tokens": 4000,
        "temperature": 0.3,
        "top_p": 0.9
    }

    try:
        timeout = aiohttp.ClientTimeout(total=45)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(self.api_base_url, headers=headers, json=body) as response:
                if response.status == 401:
                    return "❌ **Authentication Error**: Invalid API key. Please verify your Chutes API token."
                if response.status == 429:
                    return "⏳ **Rate Limit Exceeded**: Too many requests. Please wait 30 seconds and try again."
                if response.status == 503:
                    return "🔧 **Service Unavailable**: API temporarily unavailable. Please try again later."
                if response.status != 200:
                    error_text = await response.text()
                    return f"❌ **API Error {response.status}**: {error_text[:200]}"

                # The body is a server-sent-event stream: one "data: <json>"
                # line per chunk, terminated by "data: [DONE]".
                pieces = []
                async for raw_line in response.content:
                    # A stray invalid byte should not discard the whole answer.
                    line = raw_line.decode("utf-8", errors="replace").strip()
                    if not line.startswith("data: "):
                        continue
                    payload = line[6:]
                    if payload == "[DONE]":
                        break
                    try:
                        chunk = json.loads(payload)
                    except json.JSONDecodeError:
                        continue  # keep-alive or partial line; skip it
                    if not isinstance(chunk, dict):
                        continue
                    choices = chunk.get("choices") or []
                    if not choices:
                        continue
                    delta = choices[0].get("delta") or {}
                    content = delta.get("content", "")
                    if content:
                        pieces.append(content)
                full_response = "".join(pieces)

                if not full_response:
                    return "⚠️ **Empty Response**: No analysis received. Please try again."

                # Keep only a preview of the answer in the history.
                preview = full_response[:500] + "..." if len(full_response) > 500 else full_response
                self.conversation_history.append({
                    "timestamp": datetime.now(),
                    "question": user_question or "General Analysis",
                    "response": preview
                })
                return full_response
    except asyncio.TimeoutError:
        return "⏰ **Timeout Error**: Analysis took too long. Try with a smaller file or simpler question."
    except aiohttp.ClientError as e:
        logger.error("HTTP Error: %s", e)
        return "🌐 **Connection Error**: Unable to reach API. Check your internet connection."
    except Exception as e:
        logger.error("Unexpected API Error: %s", e)
        return f"❌ **Unexpected Error**: {str(e)}"
def process_file(self, file_path: str, sample_size: int = None) -> Tuple[pd.DataFrame, str, str]:
    """Load a data file, clean it and build its summary and charts.

    Args:
        file_path: Path to a ``.csv``, ``.tsv``, ``.xlsx``, ``.xls``,
            ``.json`` or ``.parquet`` file.
        sample_size: If given and the file has more rows, a reproducible
            random sample of this many rows is analysed instead.

    Returns:
        ``(dataframe, markdown_summary, charts_html)``.

    Raises:
        Exception: Any loading or processing failure, wrapped with an
            "Error processing file: ..." message (original error chained).

    Side effects:
        Stores the cleaned frame on ``self.current_df``.
    """
    try:
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            df = self._read_csv_with_fallbacks(file_path)
        elif file_extension == '.tsv':
            df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        elif file_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, engine='openpyxl' if file_extension == '.xlsx' else 'xlrd')
        elif file_extension == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                df = pd.json_normalize(data)
            else:
                try:
                    df = pd.DataFrame(data)
                except ValueError:
                    # e.g. a flat object of scalars: treat it as one record.
                    df = pd.json_normalize(data)
        elif file_extension == '.parquet':
            df = pd.read_parquet(file_path)
        else:
            # Previously this fell through and failed with an unrelated
            # "referenced before assignment" error.
            raise ValueError(f"Unsupported file format: {file_extension or '(none)'}")

        # Normalise header text: stringify, trim, collapse runs of whitespace.
        df.columns = df.columns.astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)
        # Drop rows and columns that contain no data at all.
        df = df.dropna(how='all').dropna(axis=1, how='all')

        # Sample large datasets for performance (fixed seed => reproducible).
        original_size = len(df)
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            logger.info("Sampled %s rows from %s total rows", sample_size, original_size)

        df = self.auto_detect_types(df)
        self.current_df = df

        data_summary = self.generate_comprehensive_summary(df, original_size)
        charts_html = self.generate_advanced_visualizations(df)
        return df, data_summary, charts_html
    except Exception as e:
        logger.error("File processing error: %s", e)
        raise Exception(f"Error processing file: {str(e)}") from e

@staticmethod
def _read_csv_with_fallbacks(file_path: str) -> pd.DataFrame:
    """Read a CSV, probing common encodings and separators.

    The first parse that yields more than one column wins. If an encoding
    decodes cleanly but no separator splits the data, the file is taken to
    be a genuine single-column CSV and that parse is returned.
    """
    for encoding in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
        single_column = None
        for sep in [',', ';', '\t', '|']:
            try:
                frame = pd.read_csv(file_path, encoding=encoding, sep=sep, low_memory=False)
            except UnicodeDecodeError:
                break  # wrong encoding; no separator can fix that
            except pd.errors.ParserError:
                continue
            if frame.shape[1] > 1:
                return frame
            if single_column is None:
                single_column = frame
        if single_column is not None:
            return single_column
    raise ValueError("Could not decode CSV file with any supported encoding/separator")
def auto_detect_types(self, df: pd.DataFrame) -> pd.DataFrame:
    """Convert ``object`` columns to datetime, numeric or category dtypes.

    For each object-typed column, in order:

    1. If the column name hints at a date/time and every non-null value
       parses as a datetime, convert to datetime.
    2. Otherwise, strip currency symbols, thousands separators, percent
       signs and whitespace; if more than 70% of all rows then parse as
       numbers, convert to numeric (unparseable values become NaN).
    3. Otherwise, if the column has fewer than 50 distinct values covering
       less than 10% of the rows, convert to ``category``.

    Only formatting characters are stripped before the numeric test, so
    free text that merely contains digits (e.g. "Apt 4B") is left alone.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    row_count = len(df)
    if row_count == 0:
        return df  # nothing to infer from; also avoids dividing by zero

    date_keywords = ('date', 'time', 'created', 'updated', 'timestamp')
    # $ , € £ ¥ ₹ % and whitespace, written as escapes to stay encoding-safe.
    formatting_chars = r'[$,%\u20ac\u00a3\u00a5\u20b9\s]'

    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        column = df[col]
        non_null = int(column.notna().sum())

        # 1. Datetime, only for columns whose name suggests it.
        if non_null and any(keyword in str(col).lower() for keyword in date_keywords):
            try:
                parsed = pd.to_datetime(column, errors='coerce')
            except (ValueError, TypeError, OverflowError):
                parsed = None
            # All-or-nothing: a misleading name must not blank out real text.
            if parsed is not None and int(parsed.notna().sum()) == non_null:
                df[col] = parsed
                continue

        # 2. Numeric.
        cleaned = column.astype(str).str.replace(formatting_chars, '', regex=True)
        numeric = pd.to_numeric(cleaned, errors='coerce')
        if numeric.notna().sum() / row_count > 0.7:
            df[col] = numeric
            continue

        # 3. Low-cardinality text becomes a category.
        try:
            unique_count = column.nunique()
        except TypeError:
            continue  # unhashable values (lists/dicts) cannot be categorised
        if unique_count / row_count < 0.1 and unique_count < 50:
            df[col] = column.astype('category')

    return df
def generate_comprehensive_summary(self, df: pd.DataFrame, original_size: int = None) -> str:
    """Build a markdown profile of ``df``.

    Sections, in order: header metadata, column dtype distribution,
    missing-data report, numeric statistics (first 8 numeric columns),
    categorical statistics (first 8 object/category columns), temporal
    ranges (first 3 datetime columns), duplicate and correlation
    profiling, and a rendering of the first three rows.

    Args:
        df: Frame to describe. Empty frames and all-null columns are
            handled without raising.
        original_size: Row count before sampling; when it differs from
            ``len(df)`` a note about sampling is included.

    Returns:
        The report as a single newline-joined markdown string.
    """
    row_count, column_count = df.shape
    total_cells = row_count * column_count
    missing_data = df.isnull().sum()
    total_missing = int(missing_data.sum())

    summary = []

    # Header.
    summary.append("# 📊 Advanced Dataset Analysis Report")
    summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    summary.append(f"**Dataset Size**: {row_count:,} rows × {column_count} columns")
    if original_size and original_size != row_count:
        summary.append(f"**Original Size**: {original_size:,} rows (sampled for performance)")
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    summary.append(f"**Memory Usage**: {memory_usage:.2f} MB")
    # With no cells there is nothing missing, so report full density.
    density = 1 - total_missing / total_cells if total_cells else 1.0
    summary.append(f"**Data Density**: {density:.1%} complete\n")

    # Column type distribution.
    summary.append("## 📋 Column Type Distribution:")
    for dtype, count in df.dtypes.value_counts().items():
        percentage = count / column_count * 100
        summary.append(f"- **{dtype}**: {count} columns ({percentage:.1f}%)")

    # Missing data (worst ten columns).
    missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
    if len(missing_summary) > 0:
        missing_pct = (missing_data / row_count * 100).round(2)
        summary.append("\n## ⚠️ Data Quality Issues:")
        summary.append(f"**Total Missing Values**: {total_missing:,} ({total_missing/total_cells*100:.2f}% of all data)")
        for col, count in missing_summary.head(10).items():
            pct = missing_pct[col]
            severity = "🔴 Critical" if pct > 50 else "🟡 Moderate" if pct > 20 else "🟢 Minor"
            summary.append(f"- **{col}**: {count:,} missing ({pct}%) - {severity}")
    else:
        summary.append("\n## ✅ Data Quality: Perfect! No missing values detected")

    # Numeric columns.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        summary.append(f"\n## 📈 Numerical Analysis ({len(numeric_cols)} columns):")
        for col in numeric_cols[:8]:
            values = df[col]
            present = values.dropna()
            stats_data = values.describe()
            skewness = stats.skew(present) if len(present) > 0 else float('nan')

            # Outliers by the 1.5 * IQR rule.
            Q1 = stats_data['25%']
            Q3 = stats_data['75%']
            IQR = Q3 - Q1
            outliers = int(((values < (Q1 - 1.5 * IQR)) | (values > (Q3 + 1.5 * IQR))).sum())
            outlier_pct = outliers / row_count * 100 if row_count else 0.0

            # Shape label from skewness; NaN (constant or empty data) gets
            # its own label instead of being reported as left-skewed.
            if pd.isna(skewness):
                distribution = "Undetermined"
            elif abs(skewness) < 0.5:
                distribution = "Normal"
            elif skewness >= 0.5:
                distribution = "Right-skewed"
            else:
                distribution = "Left-skewed"

            summary.append(f"- **{col}**:")
            summary.append(f" - Range: {stats_data['min']:.2f} to {stats_data['max']:.2f}")
            summary.append(f" - Central: μ={stats_data['mean']:.2f}, median={stats_data['50%']:.2f}")
            summary.append(f" - Spread: σ={stats_data['std']:.2f}, IQR={IQR:.2f}")
            summary.append(f" - Shape: {distribution} (skew={skewness:.2f})")
            summary.append(f" - Outliers: {outliers} ({outlier_pct:.1f}%)")

    # Categorical columns.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        summary.append(f"\n## 📝 Categorical Analysis ({len(categorical_cols)} columns):")
        for col in categorical_cols[:8]:
            values = df[col]
            total_count = int(values.notna().sum())
            summary.append(f"- **{col}**:")
            if total_count == 0:
                # No values: cardinality and most-frequent value are undefined.
                summary.append(" - No non-null values")
                continue

            unique_count = values.nunique()
            cardinality_ratio = unique_count / total_count
            if cardinality_ratio > 0.9:
                cardinality = "🔴 Very High (likely ID field)"
            elif cardinality_ratio > 0.5:
                cardinality = "🟡 High"
            elif cardinality_ratio > 0.1:
                cardinality = "🟢 Medium"
            else:
                cardinality = "🔵 Low"

            value_counts = values.value_counts()
            most_common = value_counts.iloc[0]
            most_common_pct = most_common / total_count * 100
            summary.append(f" - Unique values: {unique_count:,} ({cardinality})")
            summary.append(f" - Most frequent: '{value_counts.index[0]}' ({most_common:,} times, {most_common_pct:.1f}%)")
            if len(value_counts) > 1:
                entropy = stats.entropy(value_counts.values)
                summary.append(f" - Diversity index: {entropy:.2f}")

    # Datetime columns.
    datetime_cols = df.select_dtypes(include=['datetime64']).columns
    if len(datetime_cols) > 0:
        summary.append(f"\n## 📅 Temporal Analysis ({len(datetime_cols)} columns):")
        for col in datetime_cols[:3]:
            earliest, latest = df[col].min(), df[col].max()
            if pd.isna(earliest) or pd.isna(latest):
                summary.append(f"- **{col}**: no valid dates")
                continue
            summary.append(f"- **{col}**: {earliest} to {latest} (span: {(latest - earliest).days} days)")

    # Duplicates and strong correlations.
    summary.append("\n## 🔍 Advanced Data Profiling:")
    duplicate_rows = int(df.duplicated().sum())
    duplicate_pct = duplicate_rows / row_count * 100 if row_count else 0.0
    summary.append(f"- **Duplicate rows**: {duplicate_rows:,} ({duplicate_pct:.2f}%)")

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        names = corr_matrix.columns
        high_corr_pairs = []
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:  # strong-correlation threshold
                    high_corr_pairs.append((names[i], names[j], corr_val))
        if high_corr_pairs:
            summary.append("- **Strong correlations detected**:")
            strongest = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)[:5]
            for col1, col2, corr_val in strongest:
                summary.append(f" - {col1} ↔ {col2}: {corr_val:.3f}")

    # First three rows. Rows are numbered by position because index labels
    # may be non-numeric or, after sampling, arbitrary.
    summary.append("\n## 🔍 Enhanced Data Sample (First 3 Rows):")
    for position, (_, row) in enumerate(df.head(3).iterrows(), start=1):
        summary.append(f"\n**Row {position}:**")
        for col, val in row.items():
            if pd.api.types.is_scalar(val) and pd.isna(val):
                formatted_val = "❌ Missing"
            elif isinstance(val, (bool, np.bool_)):
                formatted_val = str(val)
            elif isinstance(val, (int, np.integer)):
                formatted_val = f"{int(val):,}"
            elif isinstance(val, (float, np.floating)):
                formatted_val = f"{float(val):,.2f}"
            else:
                text = str(val)
                formatted_val = text[:50] + ("..." if len(text) > 50 else "")
            summary.append(f" - **{col}**: {formatted_val}")

    return "\n".join(summary)
def generate_advanced_visualizations(self, df: pd.DataFrame) -> str:
    """Render the dataset's charts as one HTML string of Plotly figures.

    Produces, where the data allows: a missing-data overview, a
    lower-triangle correlation heatmap, histogram + box plot for the first
    four numeric columns, bar + pie charts for the first three categorical
    columns with at most 25 distinct values, a monthly time series for the
    first datetime/numeric column pair, a dataset overview bar chart and a
    data-quality gauge.

    Args:
        df: Frame to visualise.

    Returns:
        Concatenated HTML fragments. If anything fails, a single ``<p>``
        element describing the error is returned instead (no exception
        propagates).

    Side effects:
        On success, stores the list of fragments on ``self.current_charts``.
    """
    charts_html = []
    try:
        row_count = len(df)
        missing_data = df.isnull().sum()
        total_missing = missing_data.sum()
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        datetime_cols = df.select_dtypes(include=['datetime64']).columns
        duplicate_rows = df.duplicated().sum()

        # 1. Missing data: counts and percentages side by side.
        if total_missing > 0:
            missing_pct = (missing_data / row_count * 100).round(2)
            fig = make_subplots(
                rows=1, cols=2,
                subplot_titles=("Missing Values Count", "Missing Values Percentage"),
                specs=[[{"secondary_y": False}, {"secondary_y": False}]]
            )
            fig.add_trace(
                go.Bar(x=missing_data.index, y=missing_data.values, name="Count",
                       marker_color='rgb(255, 99, 132)'),
                row=1, col=1
            )
            fig.add_trace(
                go.Bar(x=missing_pct.index, y=missing_pct.values, name="Percentage",
                       marker_color='rgb(255, 159, 64)'),
                row=1, col=2
            )
            fig.update_layout(
                title_text="🔍 Comprehensive Missing Data Analysis",
                title_x=0.5,
                height=500,
                showlegend=False
            )
            fig.update_xaxes(tickangle=-45)
            charts_html.append("<h3>📊 Data Quality Analysis</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_analysis"))

        # 2. Correlation heatmap; the diagonal and upper triangle are hidden
        #    because they only duplicate the lower triangle.
        if len(numeric_cols) > 1:
            corr_matrix = df[numeric_cols].corr()
            upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
            fig = px.imshow(
                corr_matrix.mask(upper_triangle),
                title="🔗 Advanced Correlation Matrix (Lower Triangle)",
                color_continuous_scale='RdBu_r',
                aspect="auto",
                text_auto=True,
                labels=dict(color="Correlation")
            )
            fig.update_layout(
                height=600,
                title_x=0.5,
                font=dict(size=10)
            )
            charts_html.append("<h3>📈 Statistical Relationships</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_matrix"))

        # 3. Distribution of the first four numeric columns.
        if len(numeric_cols) > 0:
            charts_html.append("<h3>📊 Statistical Distributions</h3>")
            for i, col in enumerate(numeric_cols[:4]):
                present = df[col].dropna()
                fig = make_subplots(
                    rows=2, cols=1,
                    subplot_titles=(f"Distribution of {col}", f"Box Plot - {col}"),
                    vertical_spacing=0.12
                )
                fig.add_trace(
                    go.Histogram(x=present, name="Frequency",
                                 marker_color='rgb(75, 192, 192)', opacity=0.7,
                                 nbinsx=30),
                    row=1, col=1
                )
                fig.add_trace(
                    go.Box(y=present, name="Distribution",
                           marker_color='rgb(153, 102, 255)'),
                    row=2, col=1
                )
                # Mark mean and median on the histogram.
                mean_val = df[col].mean()
                median_val = df[col].median()
                fig.add_vline(x=mean_val, line_dash="dash", line_color="red",
                              annotation_text=f"Mean: {mean_val:.2f}", row=1, col=1)
                fig.add_vline(x=median_val, line_dash="dot", line_color="blue",
                              annotation_text=f"Median: {median_val:.2f}", row=1, col=1)
                fig.update_layout(
                    height=600,
                    title_text=f"📊 Statistical Analysis: {col}",
                    title_x=0.5,
                    showlegend=False
                )
                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"distribution_{i}"))

        # 4. Categorical columns with a manageable number of categories.
        if len(categorical_cols) > 0:
            charts_html.append("<h3>📝 Categorical Data Insights</h3>")
            for i, col in enumerate(categorical_cols[:3]):
                if df[col].nunique() > 25:
                    continue
                value_counts = df[col].value_counts().head(15)
                top_10 = value_counts.head(10)  # pie limited for readability
                fig = make_subplots(
                    rows=1, cols=2,
                    subplot_titles=(f"Top Values - {col}", f"Distribution - {col}"),
                    specs=[[{"type": "bar"}, {"type": "pie"}]]
                )
                fig.add_trace(
                    go.Bar(x=value_counts.values, y=value_counts.index,
                           orientation='h', name="Count",
                           marker_color='rgb(54, 162, 235)'),
                    row=1, col=1
                )
                fig.add_trace(
                    go.Pie(labels=top_10.index, values=top_10.values,
                           name="Distribution"),
                    row=1, col=2
                )
                fig.update_layout(
                    height=500,
                    title_text=f"📊 Category Analysis: {col}",
                    title_x=0.5,
                    showlegend=False
                )
                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"categorical_{i}"))

        # 5. Monthly time series for the first datetime/numeric pair.
        if len(datetime_cols) > 0 and len(numeric_cols) > 0:
            charts_html.append("<h3>⏰ Temporal Analysis</h3>")
            date_col = datetime_cols[0]
            value_col = numeric_cols[0]
            # Group on a derived key instead of copying the whole frame and
            # adding a helper column to it.
            month_key = df[date_col].dt.to_period('M')
            monthly = df[value_col].groupby(month_key).agg(['mean', 'sum'])
            month_labels = monthly.index.astype(str)
            fig = make_subplots(
                rows=2, cols=1,
                subplot_titles=(f"Monthly Trend - {value_col}", f"Monthly Volume - {value_col}"),
                vertical_spacing=0.1
            )
            fig.add_trace(
                go.Scatter(x=month_labels, y=monthly['mean'],
                           mode='lines+markers', name="Average",
                           line=dict(color='rgb(75, 192, 192)', width=3)),
                row=1, col=1
            )
            fig.add_trace(
                go.Bar(x=month_labels, y=monthly['sum'],
                       name="Total", marker_color='rgb(153, 102, 255)'),
                row=2, col=1
            )
            fig.update_layout(
                height=600,
                title_text="📈 Time Series Analysis",
                title_x=0.5,
                showlegend=False
            )
            fig.update_xaxes(tickangle=-45)
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="timeseries_analysis"))

        # 6. Dataset overview.
        summary_data = {
            'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns',
                       'DateTime Columns', 'Missing Values', 'Duplicate Rows', 'Memory (MB)'],
            'Count': [
                row_count,
                len(df.columns),
                len(numeric_cols),
                len(categorical_cols),
                len(datetime_cols),
                total_missing,
                duplicate_rows,
                round(df.memory_usage(deep=True).sum() / 1024**2, 2)
            ]
        }
        fig = px.bar(
            summary_data,
            x='Metric',
            y='Count',
            title="📋 Comprehensive Dataset Overview",
            color='Count',
            color_continuous_scale='Viridis',
            text='Count'
        )
        fig.update_traces(texttemplate='%{text}', textposition='outside')
        fig.update_layout(
            height=500,
            title_x=0.5,
            showlegend=False,
            xaxis_tickangle=-45
        )
        charts_html.append("<h3>📊 Dataset Dashboard</h3>")
        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_dashboard"))

        # 7. Quality gauge: 100 minus the missing-cell percentage and a
        #    penalty of up to 10 points for duplicate rows. An empty frame
        #    scores 0.
        total_cells = df.shape[0] * df.shape[1]
        if total_cells:
            duplicate_penalty = duplicate_rows / row_count * 10
            quality_score = max(0, 100 - (total_missing / total_cells * 100) - duplicate_penalty)
        else:
            quality_score = 0
        fig = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=quality_score,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': "📊 Data Quality Score"},
            delta={'reference': 95},
            gauge={
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "yellow"},
                    {'range': [80, 100], 'color': "lightgreen"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90
                }
            }
        ))
        fig.update_layout(height=400, title_x=0.5)
        charts_html.append("<h3>🎯 Quality Assessment</h3>")
        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="quality_score"))

        self.current_charts = charts_html
        return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
    except Exception as e:
        from html import escape

        logger.exception("Chart generation error: %s", e)
        # The message is embedded in HTML, so it must not be able to inject markup.
        return f"<p>❌ Advanced chart generation failed: {escape(str(e))}</p>"
def generate_insights_summary(self, df: pd.DataFrame) -> str:
    """Derive a few rule-based insights from ``df`` without calling the AI.

    Rules applied:
      * dataset size: more than 100,000 rows, or fewer than 100 rows;
      * overall missing-cell share: above 20%, or below 5%;
      * numeric columns where over 10% of rows are IQR outliers;
      * object/category columns whose distinct values exceed 80% of the
        row count (reported as likely identifier fields).

    Args:
        df: Frame to inspect. Empty frames are handled without raising.

    Returns:
        A markdown bullet list, headed by a title line.
    """
    insights = ["## 🚀 Quick Automated Insights:"]
    row_count = len(df)

    # Dataset size.
    if row_count > 100000:
        insights.append("- 📈 **Large Dataset**: This is a substantial dataset that may reveal enterprise-level patterns")
    elif row_count < 100:
        insights.append("- 📉 **Small Dataset**: Consider collecting more data for robust statistical analysis")

    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        # No cells: every remaining rule would divide by zero.
        return "\n".join(insights)

    # Missing data.
    missing_pct = (df.isnull().sum().sum() / total_cells) * 100
    if missing_pct > 20:
        insights.append("- ⚠️ **Data Quality Concern**: High percentage of missing data may impact analysis reliability")
    elif missing_pct < 5:
        insights.append("- ✅ **Excellent Data Quality**: Very low missing data percentage")

    # Numeric columns dominated by outliers (1.5 * IQR rule).
    outlier_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        outliers = int(((values < (Q1 - 1.5 * IQR)) | (values > (Q3 + 1.5 * IQR))).sum())
        if outliers / row_count > 0.1:
            outlier_cols.append(col)
    if outlier_cols:
        insights.append(f"- 🎯 **Outlier Detection**: {len(outlier_cols)} columns have significant outliers")

    # Near-unique text columns look like identifiers.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() / row_count > 0.8]
    if high_cardinality_cols:
        insights.append(f"- 🔍 **ID Fields Detected**: {len(high_cardinality_cols)} columns appear to be identifier fields")

    return "\n".join(insights)
def export_comprehensive_report(self, analysis_text: str, data_summary: str, file_name: str, format_type: str) -> Tuple[Optional[str], str]:
    """Write the analysis to a timestamped report file.

    Args:
        analysis_text: The AI analysis text to include.
        data_summary: The technical data summary to include.
        file_name: Name of the analysed file; its stem (including any
            directory part) prefixes the report file name. Falls back to
            ``"data_analysis"`` when empty.
        format_type: ``"HTML"`` for the HTML report; any other value
            produces the Markdown report.

    Returns:
        ``(report_path, status_message)``. On failure ``report_path`` is
        ``None`` and the message describes the error; nothing is raised.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    file_base_name = os.path.splitext(file_name)[0] if file_name else "data_analysis"

    try:
        if format_type == "HTML":
            content = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_comprehensive_report_{timestamp}.html"
            message = f"✅ Comprehensive HTML report generated! File: {filename}"
        else:  # Markdown
            content = self.generate_markdown_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_analysis_report_{timestamp}.md"
            message = f"✅ Markdown report generated! File: {filename}"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)
        return filename, message
    except Exception as e:
        # Log with traceback; the caller only gets the short message.
        logger.exception("Report export error: %s", e)
        return None, f"❌ Error generating {format_type} report: {str(e)}"
def generate_enhanced_html_report(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
"""Generate premium HTML report with advanced styling"""
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Advanced Data Analysis Report</title>
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
<style>
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
line-height: 1.7;
color: #2c3e50;
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
min-height: 100vh;
}
.container {
max-width: 1400px;
margin: 0 auto;
padding: 20px;
}
.header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 40px;
border-radius: 15px;
margin-bottom: 30px;
text-align: center;
box-shadow: 0 10px 30px rgba(0,0,0,0.2);
}
.header h1 {
font-size: 2.5em;
margin-bottom: 10px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.header p {
font-size: 1.2em;
opacity: 0.9;
}
.section {
background: white;
padding: 30px;
margin-bottom: 25px;
border-radius: 12px;
box-shadow: 0 5px 20px rgba(0,0,0,0.1);
border-left: 4px solid #667eea;
transition: transform 0.2s ease;
}
.section:hover {
transform: translateY(-2px);
box-shadow: 0 8px 25px rgba(0,0,0,0.15);
}
.metadata {
background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%);
padding: 20px;
border-radius: 10px;
margin-bottom: 25px;
border: 1px solid #b3d9f2;
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
}
.metadata-item {
display: flex;
align-items: center;
gap: 8px;
}
.metadata-item i {
color: #667eea;
font-size: 1.1em;
}
h1, h2, h3 {
color: #2c3e50;
margin-bottom: 15px;
}
h2 {
border-bottom: 2px solid #667eea;
padding-bottom: 10px;
display: flex;
align-items: center;
gap: 10px;
}
h2:before {
content: "πŸ“Š";
font-size: 1.2em;
}
.chart-container {
margin: 25px 0;
padding: 20px;
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
border-radius: 10px;
border: 1px solid #e0e6ff;
}
.action-buttons {
display: flex;
gap: 15px;
margin: 20px 0;
flex-wrap: wrap;
}
.btn {
padding: 12px 24px;
border: none;
border-radius: 8px;
cursor: pointer;
font-size: 16px;
font-weight: 600;
transition: all 0.3s ease;
display: flex;
align-items: center;
gap: 8px;
text-decoration: none;
}
.btn-primary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.btn-primary:hover {
transform: translateY(-2px);
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
.btn-secondary {
background: #f8f9fa;
color: #495057;
border: 2px solid #dee2e6;
}
.btn-secondary:hover {
background: #e9ecef;
border-color: #adb5bd;
}
.footer {
text-align: center;
color: #6c757d;
margin-top: 40px;
padding: 30px;
background: white;
border-radius: 10px;
box-shadow: 0 5px 15px rgba(0,0,0,0.1);
}
.footer-links {
margin-top: 15px;
display: flex;
justify-content: center;
gap: 20px;
flex-wrap: wrap;
}
.footer-links a {
color: #667eea;
text-decoration: none;
font-weight: 500;
}
.footer-links a:hover {
text-decoration: underline;
}
pre {
background: #f8f9fa;
padding: 20px;
border-radius: 8px;
overflow-x: auto;
white-space: pre-wrap;
font-size: 14px;
border-left: 4px solid #28a745;
font-family: 'Consolas', 'Monaco', monospace;
}
.analysis-content {
font-size: 16px;
line-height: 1.8;
}
.analysis-content h1,
.analysis-content h2,
.analysis-content h3 {
margin-top: 25px;
margin-bottom: 15px;
}
.analysis-content ul,
.analysis-content ol {
margin-left: 20px;
margin-bottom: 15px;
}
.analysis-content li {
margin-bottom: 5px;
}
.analysis-content strong {
color: #2c3e50;
font-weight: 700;
}
.analysis-content code {
background: #f1f3f4;
padding: 2px 6px;
border-radius: 4px;
font-family: 'Consolas', monospace;
}
.analysis-content blockquote {
border-left: 4px solid #667eea;
padding-left: 20px;
margin: 20px 0;
font-style: italic;
color: #555;
}
table {
width: 100%;
border-collapse: collapse;
margin: 20px 0;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
th, td {
padding: 12px 15px;
text-align: left;
border-bottom: 1px solid #e9ecef;
}
th {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.5px;
}
tr:hover {
background-color: #f8f9ff;
}
.highlight-box {
background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%);
border: 1px solid #f39c12;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
}
.success-box {
background: linear-gradient(135deg, #d4edda 0%, #a8e6cf 100%);
border: 1px solid #28a745;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
}
.warning-box {
background: linear-gradient(135deg, #f8d7da 0%, #ff7675 100%);
border: 1px solid #dc3545;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
}
@media print {
.action-buttons, .btn {
display: none !important;
}
body {
background: white;
}
.section, .metadata, .footer {
box-shadow: none;
page-break-inside: avoid;
}
.header {
page-break-after: avoid;
}
}
@media (max-width: 768px) {
.container {
padding: 10px;
}
.header {
padding: 20px;
}
.header h1 {
font-size: 1.8em;
}
.section {
padding: 20px;
}
.metadata {
grid-template-columns: 1fr;
}
.action-buttons {
flex-direction: column;
}
}
</style>
<script>
function printReport() {
window.print();
}
function exportPDF() {
window.print();
}
function copyToClipboard(elementId) {
const element = document.getElementById(elementId);
const text = element.textContent;
navigator.clipboard.writeText(text).then(() => {
alert('Content copied to clipboard!');
});
}
// Add smooth scrolling
document.addEventListener('DOMContentLoaded', function() {
const links = document.querySelectorAll('a[href^="#"]');
links.forEach(link => {
link.addEventListener('click', function(e) {
e.preventDefault();
const target = document.querySelector(this.getAttribute('href'));
if (target) {
target.scrollIntoView({ behavior: 'smooth' });
}
});
});
});
</script>
</head>
<body>
<div class="container">
<div class="header">
<h1><i class="fas fa-chart-line"></i> Advanced Data Analysis Report</h1>
<p>Comprehensive AI-Powered Business Intelligence Dashboard</p>
</div>
<div class="metadata">
<div class="metadata-item">
<i class="fas fa-file-alt"></i>
<span><strong>File:</strong> {{ file_name }}</span>
</div>
<div class="metadata-item">
<i class="fas fa-calendar-alt"></i>
<span><strong>Generated:</strong> {{ timestamp }}</span>
</div>
<div class="metadata-item">
<i class="fas fa-robot"></i>
<span><strong>AI Model:</strong> OpenAI gpt-oss-20b</span>
</div>
<div class="metadata-item">
<i class="fas fa-shield-alt"></i>
<span><strong>Version:</strong> Smart Analyzer Pro v2.0</span>
</div>
</div>
<div class="action-buttons">
<button class="btn btn-primary" onclick="printReport()">
<i class="fas fa-print"></i> Print as PDF
</button>
<button class="btn btn-secondary" onclick="copyToClipboard('ai-analysis')">
<i class="fas fa-copy"></i> Copy Analysis
</button>
<button class="btn btn-secondary" onclick="copyToClipboard('technical-summary')">
<i class="fas fa-code"></i> Copy Technical Data
</button>
</div>
<div class="section">
<h2><i class="fas fa-brain"></i> AI-Powered Analysis & Strategic Insights</h2>
<div id="ai-analysis" class="analysis-content">{{ ai_analysis }}</div>
</div>
<div class="section">
<h2><i class="fas fa-chart-bar"></i> Interactive Data Visualizations</h2>
<div class="chart-container">
{{ charts_html }}
</div>
</div>
<div class="section">
<h2><i class="fas fa-database"></i> Technical Data Profile</h2>
<pre id="technical-summary">{{ data_summary }}</pre>
</div>
<div class="footer">
<div>
<h3><i class="fas fa-star"></i> Report Generated by AnalytixPro v2.0</h3>
<p>Powered by Advanced AI β€’ Professional Business Intelligence</p>
</div>
<div class="footer-links">
<a href="https://wa.me/8801719296601"><i class="fab fa-whatsapp"></i> WhatsApp Support</a>
<a href="https://mail.google.com/mail/?view=cm&fs=1&[email protected]" target="_blank"><i class="fas fa-envelope"></i> Email Support</a>
<a href="https://huggingface.co/shukdevdattaEX"><i class="fas fa-globe"></i> Visit Website</a>
</div>
<p style="margin-top: 15px; font-size: 0.9em; color: #6c757d;">
Β© 2025 AnalytixPro. Professional data analysis made simple.
</p>
</div>
</div>
</body>
</html>
"""
template = Template(html_template)
ai_analysis_html = markdown.markdown(analysis_text, extensions=['extra', 'tables', 'toc'])
charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
return template.render(
file_name=file_name,
timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
ai_analysis=ai_analysis_html,
charts_html=charts_content,
data_summary=data_summary
)
def generate_pdf_ready_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
    """Return the HTML document used for PDF export.

    The arguments are forwarded unchanged to
    ``generate_enhanced_html_report``; the resulting page is what gets
    printed to PDF.
    """
    printable_html = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
    return printable_html
def generate_excel_report(self, analysis_text: str, data_summary: str, filename: str):
    """Write the analysis to ``filename`` as a multi-sheet Excel workbook.

    Sheets are written in this order:
      * ``Original_Data``       - the loaded DataFrame (only when one is loaded)
      * ``Data_Summary``        - ``data_summary``, one line per row
      * ``AI_Analysis``         - ``analysis_text``, one line per row
      * ``Statistical_Summary`` - ``describe()`` of the numeric columns
        (only when a DataFrame with at least one numeric column is loaded)
    """
    frame = self.current_df
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        if frame is not None:
            frame.to_excel(writer, sheet_name='Original_Data', index=False)

        # Text sections become single-column sheets, one source line per row.
        pd.DataFrame({'Analysis_Summary': data_summary.split('\n')}).to_excel(
            writer, sheet_name='Data_Summary', index=False
        )
        pd.DataFrame({'AI_Analysis': analysis_text.split('\n')}).to_excel(
            writer, sheet_name='AI_Analysis', index=False
        )

        if frame is not None:
            numeric_names = frame.select_dtypes(include=[np.number]).columns
            if len(numeric_names) > 0:
                # Index is kept here: it carries the statistic names (count, mean, ...).
                frame[numeric_names].describe().to_excel(writer, sheet_name='Statistical_Summary')
def generate_markdown_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
    """Return the analysis as a standalone Markdown document.

    Args:
        analysis_text: AI analysis, inserted verbatim under the summary heading.
        data_summary: Technical data profile, inserted verbatim.
        file_name: Name of the analysed file shown in the report header.

    Returns:
        The Markdown text, stamped with the current local time.
    """
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Each horizontal rule is preceded by a blank line: a `---` placed directly
    # under a text line is parsed as a setext heading underline and would turn
    # the last line of the preceding section into a heading.
    return f"""# πŸ“Š Advanced Data Analysis Report
**File:** {file_name}
**Generated:** {generated_at}
**Analyzer:** AnalytixPro v2.0
**AI Model:** OpenAI gpt-oss-20b via Chutes API

---
## πŸš€ Executive Summary & AI Insights
{analysis_text}

---
## πŸ“‹ Technical Data Profile
{data_summary}

---
## πŸ“ž Support & Contact
- **WhatsApp Support:** +8801719296601
- **Email:** https://tinyurl.com/email-for-contact
- **Documentation:** Available upon request

---
*This report was generated using AnalytixPro v2.0 - Professional data analysis powered by advanced AI technology.*
"""
# Initialize the enhanced analyzer
# Single module-level instance; the handler functions below read and reset
# its state (current_df, current_charts, conversation_history, analysis_cache).
analyzer = AdvancedDataAnalyzer()
def _coerce_sample_size(sample_size):
    """Return ``sample_size`` as a positive int, or ``None`` when unset or unusable.

    Accepts ints, integral floats (``1000.0``) and numeric strings, so a value
    delivered as a float by a numeric input widget is honoured instead of being
    silently dropped.
    """
    if sample_size is None or isinstance(sample_size, bool):
        return None
    try:
        number = float(sample_size)
    except (TypeError, ValueError):
        return None
    # Rejects NaN/inf, fractional values and non-positive counts.
    if not number.is_integer() or number <= 0:
        return None
    return int(number)


async def comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Validate the inputs, profile the file and request the AI analysis.

    Args:
        file: Uploaded file object; its ``name`` attribute is the path on disk.
        api_key: API key passed to the analyzer's validation and AI call.
        user_question: Optional question forwarded to the AI call.
        analysis_type: Analysis mode forwarded to the AI call.
        sample_size: Optional row limit (int, integral float or numeric string).
        progress: Progress callback, called with a fraction and a description.

    Returns:
        A 6-tuple ``(response_markdown, data_summary, preview_html,
        charts_html, file_path, ai_analysis)``. On any validation or
        processing failure the tuple is ``(error_message, "", "", "", None, "")``.
    """
    started_at = datetime.now()
    # Validation phase
    progress(0.05, desc="πŸ” Validating inputs...")
    if not file:
        return "❌ Please upload a data file.", "", "", "", None, ""
    is_valid_key, key_msg = analyzer.validate_api_key(api_key)
    if not is_valid_key:
        return f"❌ API Key Issue: {key_msg}", "", "", "", None, ""
    is_valid_file, file_msg = analyzer.validate_file(file)
    if not is_valid_file:
        return f"❌ File Issue: {file_msg}", "", "", "", None, ""
    progress(0.15, desc="πŸ“ Loading and processing file...")
    try:
        # Process file with optional sampling
        sample_size_int = _coerce_sample_size(sample_size)
        df, data_summary, charts_html = analyzer.process_file(file.name, sample_size_int)
        progress(0.40, desc="πŸ“Š Generating visualizations...")
        # Generate quick insights
        quick_insights = analyzer.generate_insights_summary(df)
        progress(0.60, desc="πŸ€– AI analysis in progress...")
        # Get AI analysis
        ai_analysis = await analyzer.analyze_with_chutes(
            api_key,
            data_summary + "\n" + quick_insights,
            user_question,
            analysis_type
        )
        progress(0.90, desc="✨ Finalizing results...")
        # Report the time actually spent instead of a made-up figure.
        elapsed_seconds = (datetime.now() - started_at).total_seconds()
        # Format response with enhanced styling
        response = f"""# 🎯 Analysis Complete!
## πŸ“ˆ Key Findings
{ai_analysis}
{quick_insights}
---
**πŸ“Š Analysis Details:**
- **Processed**: {len(df):,} rows Γ— {df.shape[1]} columns
- **Analysis Type**: {analysis_type.title()}
- **Processing Time**: ~{elapsed_seconds:.1f} seconds
- **AI Model**: OpenAI gpt-oss-20b
- **Generated**: {datetime.now().strftime('%H:%M:%S')}
*πŸ’‘ Use the tabs below to explore data preview, download reports, or ask specific questions.*
"""
        # Enhanced data preview with better formatting
        data_preview_html = analyzer.generate_enhanced_preview(df)
        progress(1.0, desc="βœ… Analysis complete!")
        return response, data_summary, data_preview_html, charts_html, file.name, ai_analysis
    except Exception as e:
        # Top-level boundary for the UI: log with traceback, show a message.
        logger.exception("Comprehensive analysis error: %s", e)
        return f"❌ **Analysis Failed**: {str(e)}", "", "", "", None, ""
def sync_comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Blocking entry point: run ``comprehensive_analysis`` to completion and return its result."""
    pending = comprehensive_analysis(file, api_key, user_question, analysis_type, sample_size, progress)
    return asyncio.run(pending)
def quick_question_analysis(file, api_key, question, progress=gr.Progress()):
    """Answer one user question about the uploaded file.

    Returns a prompt to enter a question when ``question`` is blank; otherwise
    runs the analysis in "question" mode and returns only its markdown response.
    """
    if question.strip() == "":
        return "❓ Please enter a specific question about your data."
    # Element 0 of the analysis tuple is the formatted response text.
    answer_markdown, *_ = asyncio.run(
        comprehensive_analysis(file, api_key, question, "question", None, progress)
    )
    return answer_markdown
def generate_enhanced_preview(df: pd.DataFrame, rows: int = 20) -> str:
    """Render an HTML preview of ``df``.

    The output contains, in order: an inline stylesheet, a header with the
    row/column counts, a ``describe()`` table for the numeric columns (when
    there are any) and a table with the first ``rows`` rows.

    Args:
        df: Data to preview.
        rows: Number of leading rows to show.

    Returns:
        An HTML fragment. Cell contents are HTML-escaped, so markup inside the
        data is displayed as text rather than interpreted.
    """
    # A UI widget may hand over a float (e.g. 20.0); DataFrame.head needs an int.
    rows = int(rows)
    preview_df = df.head(rows)
    # Generate basic statistics for numeric columns
    stats_html = ""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        stats_df = df[numeric_cols].describe().round(2)
        stats_html = f"""
<div style="margin-bottom: 20px;">
<h4>πŸ“Š Quick Statistics (Numeric Columns)</h4>
{stats_df.to_html(classes="table table-striped", table_id="stats-table")}
</div>
"""
    # Main data preview. The data comes from an uploaded file, so pandas'
    # default escaping is kept on (the original passed escape=False, which let
    # cell values inject raw HTML/script into the page).
    preview_html = preview_df.to_html(
        classes="table table-striped table-hover",
        table_id="data-preview-table",
    )
    return f"""
<style>
.table {{
width: 100%;
border-collapse: collapse;
margin: 20px 0;
font-size: 14px;
background: white;
border-radius: 8px;
overflow: hidden;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}}
.table th {{
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 12px 8px;
text-align: left;
font-weight: bold;
position: sticky;
top: 0;
z-index: 10;
}}
.table td {{
padding: 10px 8px;
border-bottom: 1px solid #dee2e6;
max-width: 200px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}}
.table tr:hover {{
background-color: #f8f9ff;
}}
.table tr:nth-child(even) {{
background-color: #f8f9fa;
}}
#stats-table {{
font-size: 12px;
}}
#stats-table th {{
background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
}}
.preview-header {{
background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
padding: 15px;
border-radius: 8px;
margin-bottom: 15px;
border-left: 4px solid #667eea;
}}
</style>
<div class="preview-header">
<h4>πŸ“‹ Data Preview - First {rows} Rows</h4>
<p><strong>Total Rows:</strong> {len(df):,} | <strong>Columns:</strong> {df.shape[1]} | <strong>Showing:</strong> {len(preview_df)} rows</p>
</div>
{stats_html}
{preview_html}
"""
# Bind the method to the analyzer instance
# The plain function is stored as an instance attribute (not on the class), so
# no ``self`` is passed implicitly: it is called as
# analyzer.generate_enhanced_preview(df, rows).
analyzer.generate_enhanced_preview = generate_enhanced_preview
def clear_all_data():
    """Reset the analyzer's state and return blank values for the cleared UI outputs.

    Returns:
        A 9-tuple: ``None``, six empty strings, ``None``, and one empty string,
        in the order of the outputs wired to the clear button.
    """
    fresh_state = (
        ("current_df", None),
        ("current_charts", None),
        ("conversation_history", []),
        ("analysis_cache", {}),
    )
    for attribute, blank in fresh_state:
        setattr(analyzer, attribute, blank)
    return (None,) + ("",) * 6 + (None, "")
def export_report(analysis_text, data_summary, file_name, format_choice, ai_analysis=""):
    """Export the current analysis in the chosen format.

    The raw AI analysis is preferred as report content; the displayed analysis
    text is the fallback. Returns ``(file, status_message)``; when there is no
    content at all, the file is ``None`` and the status explains why.
    """
    content_to_export = ai_analysis or analysis_text
    if not content_to_export:
        return None, "❌ No analysis data available for download."
    exported = analyzer.export_comprehensive_report(content_to_export, data_summary, file_name, format_choice)
    report_file, status_message = exported[0], exported[1]
    return report_file, status_message
def batch_analyze_files(files, api_key, progress=gr.Progress()):
    """Run a "quick" analysis (sampled to 1000 rows) on each file and join the results.

    Args:
        files: Uploaded file objects; each one's ``name`` is its path on disk.
        api_key: API key forwarded to every per-file analysis.
        progress: Progress callback, called with a fraction and a description.

    Returns:
        One markdown document with a section per file. A failure in one file
        is reported in its own section and does not stop the batch.
    """
    if not files:
        return "❌ No files uploaded for batch analysis."
    total_files = len(files)
    results = []
    for index, file in enumerate(files):
        file_name = os.path.basename(file.name)
        # Fraction of files already finished: the bar must not read 100%
        # while the last file is still being processed.
        progress(index / total_files, desc=f"Processing file {index + 1}/{total_files}: {file_name}")
        try:
            result = asyncio.run(comprehensive_analysis(file, api_key, "", "quick", 1000, gr.Progress()))
            results.append(f"## πŸ“„ {file_name}\n{result[0]}\n---\n")
        except Exception as e:
            results.append(f"## ❌ {file_name}\nError: {str(e)}\n---\n")
    return "\n".join(results)
# Create the enhanced Gradio interface
# The ``css`` string defines the classes used further down: config-section,
# results-section and upload-area are attached through ``elem_classes``;
# main-header, feature-grid and feature-card are used by the raw HTML snippets.
with gr.Blocks(
title="πŸš€ AnalytixPro v2.0",
theme=gr.themes.Ocean(),
css="""
.gradio-container {
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
max-width: 1600px;
}
.main-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
border-radius: 15px;
margin-bottom: 20px;
text-align: center;
}
.upload-area {
border: 2px dashed #667eea;
border-radius: 12px;
padding: 25px;
text-align: center;
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
transition: all 0.3s ease;
}
.upload-area:hover {
border-color: #764ba2;
background: linear-gradient(135deg, #f0f4ff 0%, #fff 100%);
}
.config-section {
background: white;
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 15px rgba(0,0,0,0.1);
border-left: 4px solid #667eea;
}
.results-section {
background: white;
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 15px rgba(0,0,0,0.1);
border-left: 4px solid #28a745;
}
.tab-content {
background: white;
border-radius: 8px;
padding: 20px;
margin-top: 10px;
}
.feature-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 15px;
margin: 20px 0;
}
.feature-card {
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
padding: 20px;
border-radius: 10px;
border: 1px solid #e0e6ff;
text-align: center;
}
"""
) as app:
# State variables
# Both are written by the analyze handler, read by the export handler and
# reset by the clear handler (see the event wiring below).
current_file_name = gr.State("")
current_ai_analysis = gr.State("")
# Header
gr.HTML("""
<div class="main-header">
<h1>πŸš€ AnalytixPro v2.0</h1>
<p>Advanced AI-Powered Data Analysis & Business Intelligence Platform</p>
<p style="opacity: 0.9; margin-top: 10px;">
✨ Enhanced with Advanced Statistics β€’ 🎯 Multi-format Support β€’ πŸ“Š Interactive Visualizations β€’ πŸ“± Mobile Optimized
</p>
</div>
""")
# Main row: configuration/upload column (scale=1) next to the results column (scale=2).
with gr.Row():
with gr.Column(scale=1, elem_classes=["config-section"]):
gr.Markdown("### βš™οΈ Configuration & Upload")
# type="password" masks the key in the browser.
api_key_input = gr.Textbox(
label="πŸ”‘ Chutes API Key",
placeholder="sk-chutes-your-api-key-here...",
type="password",
lines=1,
info="πŸ”— Get your free API key from chutes.ai"
)
with gr.Group():
# Accepted extensions mirror the analyzer's ``supported_formats`` list.
file_input = gr.File(
label="πŸ“ Upload Data File",
file_types=[".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"],
file_count="single",
elem_classes=["upload-area"]
)
with gr.Row():
analysis_type = gr.Dropdown(
choices=["comprehensive", "quick", "statistical"],
value="comprehensive",
label="🎯 Analysis Type",
info="Choose analysis depth"
)
# Optional row limit; passed to the analyze handler as ``sample_size``.
sample_size = gr.Number(
label="πŸ“Š Sample Size",
placeholder="Leave empty for full dataset",
minimum=100,
maximum=50000,
info="Optional: Limit rows for faster processing"
)
with gr.Row():
analyze_btn = gr.Button("πŸš€ Analyze Data", variant="primary", size="lg")
clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
# Enhanced file information panel
# Filled by update_file_stats whenever the uploaded file changes.
with gr.Group():
gr.Markdown("### πŸ“Š File Information")
file_stats = gr.HTML(
value="<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>πŸ“„ Upload a file to see detailed information...</div>"
)
with gr.Column(scale=2, elem_classes=["results-section"]):
gr.Markdown("### 🎯 Analysis Results")
# Starts with a welcome message; replaced by the analysis response.
# NOTE(review): this value is also passed to export_report as
# ``analysis_text``, so it is non-empty even before any analysis has run.
analysis_output = gr.Markdown(
value="""## πŸ“‹ Welcome to AnalytixPro v2.0!
**πŸš€ Enhanced Features:**
- βœ… **Multi-format Support**: CSV, Excel, JSON, Parquet, TSV
- βœ… **Advanced Statistics**: Correlation, outlier detection, distribution analysis
- βœ… **Interactive Visualizations**: Professional charts and dashboards
- βœ… **AI-Powered Insights**: GPT-powered business intelligence
- βœ… **Export Options**: HTML, Markdown
- βœ… **Batch Processing**: Analyze multiple files at once
- βœ… **Mobile Optimized**: Works on all devices
**πŸ“Š How to Get Started:**
1. Enter your Chutes API key
2. Upload your data file
3. Choose analysis type
4. Click "Analyze Data"
5. Explore results in the tabs below!
*Ready for professional-grade data analysis! 🎯*""",
show_label=False
)
# Enhanced tab interface
with gr.Tabs():
# Q&A tab: both buttons write their answer to ``question_output``.
with gr.Tab("πŸ’¬ Ask Specific Questions", elem_id="questions-tab"):
gr.Markdown("### πŸ” Interactive Data Q&A")
with gr.Row():
question_input = gr.Textbox(
label="❓ What would you like to know about your data?",
placeholder="""Try asking specific questions like:
β€’ What are the top 5 performing segments by revenue?
β€’ Are there any seasonal patterns in the sales data?
β€’ Which customer segments have the highest lifetime value?
β€’ What anomalies or outliers should I be concerned about?
β€’ How do different product categories compare in profitability?
β€’ What trends do you see in the time series data?""",
lines=4
)
with gr.Row():
ask_btn = gr.Button("πŸ” Get AI Answer", variant="primary")
quick_insight_btn = gr.Button("πŸ’‘ Quick Insights", variant="secondary")
question_output = gr.Markdown()
# Preview tab: the slider value is passed to refresh_data_preview as ``rows``.
with gr.Tab("πŸ“Š Data Preview & Statistics"):
gr.Markdown("### πŸ“‹ Dataset Explorer")
with gr.Row():
preview_rows = gr.Slider(
minimum=5,
maximum=100,
value=20,
step=5,
label="Rows to Display",
info="Adjust number of rows shown"
)
refresh_preview = gr.Button("πŸ”„ Refresh Preview", variant="secondary")
data_preview = gr.HTML(
label="Dataset Preview",
value="<div style='text-align: center; padding: 40px; color: #666;'>πŸ“„ Upload and analyze a file to see preview...</div>"
)
# This tab is created hidden (visible=False); ``charts_display`` is still
# an output of the analyze handler.
with gr.Tab("πŸ“ˆ Visualizations & Charts", visible=False):
gr.Markdown("### 🎨 Interactive Data Visualizations")
charts_display = gr.HTML(
value="<div style='text-align: center; padding: 40px; color: #666;'>πŸ“Š Charts will appear here after analysis...</div>"
)
with gr.Tab("πŸ” Technical Summary"):
gr.Markdown("### πŸ“‹ Detailed Technical Analysis")
raw_summary = gr.Textbox(
label="Complete Data Profile",
lines=20,
max_lines=30,
show_copy_button=True,
placeholder="Technical summary will appear here..."
)
with gr.Tab("πŸ’Ύ Export & Reports"):
gr.Markdown("### πŸ“₯ Download Professional Reports")
with gr.Row():
format_choice = gr.Radio(
choices=["HTML", "Markdown"],
value="HTML",
label="πŸ“„ Report Format",
info="Choose your preferred export format"
)
# NOTE(review): no event handler in this file reads ``include_charts``.
include_charts = gr.Checkbox(
label="πŸ“Š Include Charts",
value=True,
info="Include visualizations in report"
)
with gr.Row():
download_btn = gr.Button("πŸ“₯ Generate Report", variant="primary", size="lg")
# NOTE(review): ``batch_export_btn`` has no click handler wired below.
batch_export_btn = gr.Button("πŸ“¦ Batch Export", variant="secondary")
download_status = gr.Textbox(label="πŸ“‹ Export Status", interactive=False)
download_file = gr.File(label="πŸ“„ Download Your Report", visible=True)
with gr.Tab("πŸ”„ Batch Analysis"):
gr.Markdown("### πŸ“ Analyze Multiple Files")
gr.Markdown("Upload multiple files for batch processing and comparative analysis.")
# Batch upload accepts only CSV and Excel, a narrower set than the single-file input.
batch_files = gr.File(
label="πŸ“ Upload Multiple Files",
file_count="multiple",
file_types=[".csv", ".xlsx", ".xls"]
)
batch_analyze_btn = gr.Button("πŸ”„ Batch Analyze", variant="primary")
batch_results = gr.Markdown()
# Disabled placeholder for a dataset-comparison tab (not active):
# with gr.Tab("πŸ“Š Data Comparison"):
# gr.Markdown("### βš–οΈ Compare Datasets")
# gr.Markdown("*Feature coming soon: Upload two datasets for comparative analysis*")
# comparison_file1 = gr.File(label="πŸ“„ First Dataset", file_count="single")
# comparison_file2 = gr.File(label="πŸ“„ Second Dataset", file_count="single")
# compare_btn = gr.Button("βš–οΈ Compare Datasets", variant="primary", interactive=False)
# comparison_results = gr.Markdown(value="*Comparison feature in development*")
# Enhanced helper functions
def update_file_stats(file):
    """Build the HTML card shown in the "File Information" panel.

    Args:
        file: Uploaded file object (its ``name`` attribute is the path on
            disk), or a falsy value when nothing is uploaded.

    Returns:
        An HTML fragment: a placeholder when no file is given, a details card
        for a readable file, or an error box when the file cannot be inspected.
        The file name and error text are HTML-escaped before interpolation.
    """
    from html import escape  # local import keeps this helper self-contained

    if not file:
        return "<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>πŸ“„ No file uploaded</div>"
    try:
        file_size = os.path.getsize(file.name) / (1024 * 1024)
        file_name = os.path.basename(file.name)
        extension = os.path.splitext(file_name)[1].lower()
        # Quick file peek for row estimation
        try:
            if extension in ('.csv', '.tsv'):
                # errors='replace' lets non-UTF-8 files be counted instead of
                # failing with a decode error; only line breaks matter here.
                with open(file.name, 'r', encoding='utf-8', errors='replace') as f:
                    lines = sum(1 for _ in f)
                # Subtract header; an empty file has 0 rows, not -1.
                estimated_rows = max(lines - 1, 0)
            elif extension in ('.xlsx', '.xls'):
                # Counting rows would require parsing the whole workbook, so
                # no estimate is attempted at upload time.
                estimated_rows = "Reading..."
            else:
                estimated_rows = "Unknown"
        except OSError:
            estimated_rows = "Could not estimate"
        return f"""
<div style='padding: 20px; background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); border-radius: 10px; border: 1px solid #b3d9f2;'>
<h4 style='color: #2c3e50; margin-bottom: 15px;'>πŸ“Š File Details</h4>
<div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px;'>
<div><strong>πŸ“„ Name:</strong><br>{escape(file_name)}</div>
<div><strong>πŸ“ Size:</strong><br>{file_size:.2f} MB</div>
<div><strong>πŸ”§ Format:</strong><br>{escape(extension[1:].upper())} File</div>
<div><strong>πŸ“Š Est. Rows:</strong><br>{estimated_rows}</div>
<div><strong>⏰ Uploaded:</strong><br>{datetime.now().strftime('%H:%M:%S')}</div>
<div><strong>βœ… Status:</strong><br>Ready to analyze</div>
</div>
</div>
"""
    except Exception as e:
        # UI boundary: show the problem in the panel rather than raising.
        return f"""
<div style='padding: 15px; background: #f8d7da; border-radius: 8px; border: 1px solid #dc3545;'>
❌ <strong>File Error:</strong> {escape(str(e))}
</div>
"""
def handle_main_analysis(file, api_key, analysis_type, sample_size, progress=gr.Progress()):
    """Run the main analysis and return exactly six values for the UI outputs.

    The values are, in order: response markdown, data summary, preview HTML,
    charts HTML, file name and raw AI analysis.
    """
    result = sync_comprehensive_analysis(file, api_key, "", analysis_type, sample_size, progress)
    # The click handler is wired to six outputs. A shorter result is padded
    # with empty strings; the previous fallback indexed result[1] and
    # result[2] unconditionally and would have raised on such a result.
    padded = tuple(result) + ("",) * 6
    return padded[:6]
def refresh_data_preview(rows):
    """Re-render the preview of the currently loaded data with ``rows`` rows.

    Returns a "no data" placeholder when nothing has been loaded yet.
    """
    loaded = analyzer.current_df
    if loaded is None:
        return "<div style='text-align: center; padding: 40px; color: #666;'>πŸ“„ No data loaded</div>"
    return analyzer.generate_enhanced_preview(loaded, rows)
# Event handlers
# Six outputs, in the order of the 6-tuple returned by handle_main_analysis.
analyze_btn.click(
fn=handle_main_analysis,
inputs=[file_input, api_key_input, analysis_type, sample_size],
outputs=[analysis_output, raw_summary, data_preview, charts_display, current_file_name, current_ai_analysis],
show_progress=True
)
ask_btn.click(
fn=quick_question_analysis,
inputs=[file_input, api_key_input, question_input],
outputs=[question_output],
show_progress=True
)
# Runs a "quick" analysis with a fixed prompt; only element 0 (the markdown
# response) of the result tuple is shown.
quick_insight_btn.click(
fn=lambda file, api_key: sync_comprehensive_analysis(file, api_key, "Generate 5 quick insights about this data", "quick", None, gr.Progress())[0],
inputs=[file_input, api_key_input],
outputs=[question_output],
show_progress=True
)
# Refresh the file information panel whenever the upload changes.
file_input.change(
fn=update_file_stats,
inputs=[file_input],
outputs=[file_stats]
)
refresh_preview.click(
fn=refresh_data_preview,
inputs=[preview_rows],
outputs=[data_preview]
)
# Nine outputs, in the order of the 9-tuple returned by clear_all_data.
# ``charts_display`` and ``file_stats`` are not among them.
clear_btn.click(
fn=clear_all_data,
outputs=[file_input, api_key_input, question_input, analysis_output,
question_output, data_preview, raw_summary, current_file_name, current_ai_analysis]
)
# export_report returns (file, status message), matching the two outputs.
download_btn.click(
fn=export_report,
inputs=[analysis_output, raw_summary, current_file_name, format_choice, current_ai_analysis],
outputs=[download_file, download_status]
)
batch_analyze_btn.click(
fn=batch_analyze_files,
inputs=[batch_files, api_key_input],
outputs=[batch_results],
show_progress=True
)
# Enhanced features section
# Static marketing cards; styled by the feature-grid / feature-card CSS classes.
gr.HTML("""
<div style="margin-top: 30px;">
<h3 style="text-align: center; color: #2c3e50; margin-bottom: 20px;">🌟 Key Features & Capabilities</h3>
<div class="feature-grid">
<div class="feature-card">
<h4>πŸ”§ Advanced File Support</h4>
<p>CSV, Excel, JSON, Parquet, TSV with intelligent type detection</p>
</div>
<div class="feature-card">
<h4>πŸ“Š Statistical Analysis</h4>
<p>Correlation matrices, outlier detection, distribution analysis</p>
</div>
<div class="feature-card">
<h4>πŸ€– AI-Powered Insights</h4>
<p>GPT-powered business intelligence and recommendations</p>
</div>
<div class="feature-card">
<h4>πŸ“ˆ Interactive Charts</h4>
<p>Professional visualizations with hover effects and zoom</p>
</div>
<div class="feature-card">
<h4>πŸ’Ύ Multiple Export Formats</h4>
<p>HTML, Markdown with embedded charts</p>
</div>
<div class="feature-card">
<h4>πŸ”„ Batch Processing</h4>
<p>Analyze multiple files simultaneously for comparison</p>
</div>
</div>
</div>
""")
# Collapsed-by-default help section (open=False) with static usage guidance.
with gr.Accordion("πŸ’‘ Pro Tips", open=False):
gr.Markdown("""
### 🎯 Data Preparation:
- βœ… Use descriptive column names (e.g., "Monthly_Revenue" instead of "Col1")
- βœ… Ensure consistent date formats (YYYY-MM-DD recommended)
- βœ… Remove completely empty rows/columns before upload
- βœ… For large files (>10MB), consider using sample size option
### πŸ” Analysis Optimization:
- **Comprehensive**: Full statistical analysis with AI insights (recommended for business reports)
- **Quick**: Fast overview for initial data exploration
- **Statistical**: Focus on mathematical relationships and patterns
### πŸ“Š Question Examples for Better AI Responses:
- "What factors most strongly correlate with customer churn?"
- "Which time periods show the highest sales performance?"
- "Are there any data quality issues I should address?"
- "What are the key business opportunities in this dataset?"
### πŸ“₯ Export Recommendations:
- **HTML**: Best for sharing interactive reports with stakeholders
- **Markdown**: Great for technical documentation and version control
### ⚑ Performance Notes:
- Files under 5MB: Instant processing
- Files 5-20MB: ~5-10 seconds
- Files 20MB+: Consider sampling for faster results
### πŸ”§ Supported Formats & Limits:
- **CSV/TSV**: Up to 100MB
- **Excel (XLSX/XLS)**: Up to 100MB
- **JSON**: Flat or nested structures
- **Parquet**: High-performance columnar format
### πŸ“ž Support & Contact:
- πŸ“± WhatsApp: +8801719296601
- πŸ“§ Email: https://tinyurl.com/email-for-contact
- πŸ•’ Response Time: Within 24 hours
""")
if __name__ == "__main__":
    # Request queue settings, applied before the server starts.
    queue_settings = {
        "max_size": 20,  # Increased queue size
        "default_concurrency_limit": 5,
        "api_open": False,
    }
    # Server settings: listen on all interfaces on port 7860, no share link,
    # and no docs/redoc routes on the underlying app.
    launch_settings = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
        "quiet": False,
        "favicon_path": None,
        "ssl_verify": True,
        "app_kwargs": {
            "docs_url": None,
            "redoc_url": None,
        },
    }
    app.queue(**queue_settings)
    app.launch(**launch_settings)