|
import gradio as gr |
|
import pandas as pd |
|
import aiohttp |
|
import asyncio |
|
import json |
|
import os |
|
import numpy as np |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
from plotly.subplots import make_subplots |
|
from typing import Optional, Tuple, Dict, Any, List |
|
import logging |
|
from datetime import datetime, timedelta |
|
import re |
|
from jinja2 import Template |
|
import markdown |
|
import zipfile |
|
import io |
|
import base64 |
|
from scipy import stats |
|
import seaborn as sns |
|
import warnings |
|
# Silence every warning category for the whole process.
# NOTE(review): this also hides library deprecation warnings - confirm that
# is intended.
warnings.filterwarnings('ignore')


# Configure the root logger: INFO and above, formatted as
# "timestamp - logger name - level - message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
class AdvancedDataAnalyzer: |
|
def __init__(self): |
|
self.api_base_url = "https://llm.chutes.ai/v1/chat/completions" |
|
self.max_file_size = 100 * 1024 * 1024 |
|
self.conversation_history = [] |
|
self.current_df = None |
|
self.current_charts = None |
|
self.analysis_cache = {} |
|
self.supported_formats = ['.csv', '.xlsx', '.xls', '.json', '.parquet', '.tsv'] |
|
|
|
def validate_api_key(self, api_key: str) -> Tuple[bool, str]:
    """Check offline whether an API key looks plausible.

    The key is stripped of surrounding whitespace first. It is accepted when
    it is at least 10 characters long and either starts with ``sk-``, ``pk-``
    or ``Bearer `` or is longer than 20 characters.

    Returns:
        ``(is_valid, message)``.
    """
    token = api_key.strip() if api_key else ""
    if len(token) < 10:
        return False, "API key must be at least 10 characters long"

    has_known_prefix = token.startswith(('sk-', 'pk-', 'Bearer '))
    if has_known_prefix or len(token) > 20:
        return True, "Valid API key format"
    return False, "API key format appears invalid"
|
|
|
def validate_file(self, file) -> Tuple[bool, str]: |
|
"""Enhanced file validation with better error messages""" |
|
if not file: |
|
return False, "No file uploaded" |
|
|
|
try: |
|
file_size = os.path.getsize(file.name) |
|
if file_size > self.max_file_size: |
|
return False, f"File too large. Maximum size: {self.max_file_size // (1024*1024)}MB" |
|
|
|
if file_size == 0: |
|
return False, "File is empty" |
|
|
|
file_extension = os.path.splitext(file.name)[1].lower() |
|
if file_extension not in self.supported_formats: |
|
return False, f"Unsupported format. Supported: {', '.join(self.supported_formats)}" |
|
|
|
return True, "File validation passed" |
|
|
|
except Exception as e: |
|
return False, f"File validation error: {str(e)}" |
|
|
|
async def analyze_with_chutes(self, api_token: str, data_summary: str, user_question: Optional[str] = None, analysis_type: str = "comprehensive") -> str:
    """Ask the Chutes chat-completions API to analyse a dataset summary.

    Args:
        api_token: API key; surrounding whitespace and an optional leading
            ``Bearer `` prefix are removed before the header is built.
        data_summary: Text description of the dataset embedded in the prompt.
        user_question: Optional question. For ``analysis_type="question"`` it
            is the core of the prompt; otherwise it is appended to the prompt.
        analysis_type: ``"comprehensive"``, ``"quick"`` or ``"question"``;
            unknown values fall back to ``"comprehensive"``.

    Returns:
        The concatenated streamed completion text, or a markdown-formatted
        error message when the request fails, times out or yields no text.
        On success a truncated copy (first 500 characters) is appended to
        ``self.conversation_history``.
    """
    # validate_api_key accepts keys that already start with "Bearer "; drop
    # that prefix so the header does not become "Bearer Bearer <key>".
    token = api_token.strip()
    if token.startswith("Bearer "):
        token = token[len("Bearer "):].strip()

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "User-Agent": "SmartDataAnalyzer/2.0"
    }

    prompts = {
        "comprehensive": f"""You are a senior data scientist with 10+ years of experience. Analyze this dataset comprehensively:

{data_summary}

Provide a thorough analysis with:
1. **Executive Summary**: 3-4 key takeaways for stakeholders
2. **Statistical Insights**: Important numbers, distributions, and what they reveal
3. **Pattern Recognition**: Trends, correlations, seasonality, anomalies
4. **Data Quality Assessment**: Completeness, accuracy, consistency issues
5. **Business Intelligence**: Actionable insights and opportunities
6. **Risk Analysis**: Potential data quality issues or business risks
7. **Recommendations**: Specific, prioritized next steps

Use bullet points, specific numbers, and clear explanations.""",

        "quick": f"""Provide a quick but insightful analysis of this dataset:
{data_summary}

Focus on:
- Top 3 most important findings
- Any obvious patterns or anomalies
- Key business insights
- Quick recommendations

Keep it concise but valuable.""",

        "question": f"""Based on this dataset:
{data_summary}

User's specific question: {user_question}

Provide a detailed, data-driven answer with:
- Direct answer to the question
- Supporting evidence from the data
- Additional related insights
- Specific recommendations
- Follow-up questions to consider"""
    }

    prompt = prompts.get(analysis_type, prompts["comprehensive"])
    if user_question and analysis_type != "question":
        prompt += f"\n\nUser's additional question: {user_question}"

    body = {
        "model": "openai/gpt-oss-20b",
        "messages": [
            {
                "role": "system",
                "content": """You are an expert data scientist and business analyst. Provide clear, actionable insights with specific data points. Use markdown formatting for better readability. Always include:
- Specific numbers and percentages
- Clear section headers
- Bullet points for key insights
- Bold text for important findings
- Recommendations with priority levels"""
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "stream": True,
        "max_tokens": 4000,
        "temperature": 0.3,
        "top_p": 0.9
    }

    try:
        timeout = aiohttp.ClientTimeout(total=45)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(self.api_base_url, headers=headers, json=body) as response:
                # Map well-known failure statuses to friendly messages.
                if response.status == 401:
                    return "β **Authentication Error**: Invalid API key. Please verify your Chutes API token."
                elif response.status == 429:
                    return "β³ **Rate Limit Exceeded**: Too many requests. Please wait 30 seconds and try again."
                elif response.status == 503:
                    return "π§ **Service Unavailable**: API temporarily unavailable. Please try again later."
                elif response.status != 200:
                    error_text = await response.text()
                    return f"β **API Error {response.status}**: {error_text[:200]}"

                # The body is a server-sent-event stream: each payload line
                # looks like "data: <json chunk>" and "data: [DONE]" ends it.
                pieces = []
                async for raw_line in response.content:
                    line = raw_line.decode("utf-8").strip()
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data == "[DONE]":
                        break
                    try:
                        chunk_data = json.loads(data)
                    except json.JSONDecodeError:
                        # Skip malformed chunks instead of aborting the stream.
                        continue
                    if "choices" in chunk_data and len(chunk_data["choices"]) > 0:
                        delta = chunk_data["choices"][0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            pieces.append(content)

                full_response = "".join(pieces)
                if not full_response:
                    return "β οΈ **Empty Response**: No analysis received. Please try again."

                # Keep only a short preview of each answer in the history.
                if len(full_response) > 500:
                    preview = full_response[:500] + "..."
                else:
                    preview = full_response
                self.conversation_history.append({
                    "timestamp": datetime.now(),
                    "question": user_question or "General Analysis",
                    "response": preview
                })

                return full_response

    except asyncio.TimeoutError:
        return "β° **Timeout Error**: Analysis took too long. Try with a smaller file or simpler question."
    except aiohttp.ClientError as e:
        logger.error("HTTP Error: %s", e)
        return "π **Connection Error**: Unable to reach API. Check your internet connection."
    except Exception as e:
        logger.error("Unexpected API Error: %s", e)
        return f"β **Unexpected Error**: {str(e)}"
|
|
|
def process_file(self, file_path: str, sample_size: Optional[int] = None) -> Tuple[pd.DataFrame, str, str]:
    """Load a data file, clean it and build its summary and charts.

    Args:
        file_path: Path to a ``.csv``, ``.tsv``, ``.xlsx``, ``.xls``,
            ``.json`` or ``.parquet`` file.
        sample_size: When set and smaller than the row count, a reproducible
            random sample of that many rows is analysed instead.

    Returns:
        ``(dataframe, data_summary, charts_html)``. The dataframe is also
        stored on ``self.current_df``.

    Raises:
        Exception: Any failure, wrapped as ``"Error processing file: ..."``
            with the original exception attached as the cause.
    """
    try:
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            # Try every encoding/separator pair and take the first parse that
            # yields more than one column. The first successful parse is kept
            # as a fallback so a genuine single-column CSV is not rejected.
            df = None
            single_column_fallback = None
            for encoding in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
                for sep in [',', ';', '\t', '|']:
                    try:
                        candidate = pd.read_csv(file_path, encoding=encoding, sep=sep, low_memory=False)
                    except (UnicodeDecodeError, pd.errors.ParserError):
                        continue
                    if candidate.shape[1] > 1:
                        df = candidate
                        break
                    if single_column_fallback is None:
                        single_column_fallback = candidate
                if df is not None:
                    break
            if df is None:
                df = single_column_fallback
            if df is None:
                raise ValueError("Could not decode CSV file with any supported encoding/separator")

        elif file_extension == '.tsv':
            df = pd.read_csv(file_path, sep='\t', encoding='utf-8')

        elif file_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, engine='openpyxl' if file_extension == '.xlsx' else 'xlrd')

        elif file_extension == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                df = pd.json_normalize(data)
            else:
                try:
                    df = pd.DataFrame(data)
                except ValueError:
                    # e.g. an object of scalar values: flatten it to one row.
                    df = pd.json_normalize(data)

        elif file_extension == '.parquet':
            df = pd.read_parquet(file_path)

        else:
            # Without this branch `df` would be unbound further down.
            raise ValueError(f"Unsupported file format: {file_extension or 'no extension'}")

        # Normalise column labels: strings, trimmed, inner whitespace collapsed.
        df.columns = df.columns.astype(str).str.strip().str.replace(r'\s+', ' ', regex=True)

        # Drop rows and columns that are entirely empty.
        df = df.dropna(how='all').dropna(axis=1, how='all')

        original_size = len(df)
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)
            logger.info("Sampled %s rows from %s total rows", sample_size, original_size)

        df = self.auto_detect_types(df)

        self.current_df = df
        data_summary = self.generate_comprehensive_summary(df, original_size)
        charts_html = self.generate_advanced_visualizations(df)

        return df, data_summary, charts_html

    except Exception as e:
        logger.error("File processing error: %s", e)
        raise Exception(f"Error processing file: {str(e)}") from e
|
|
|
def auto_detect_types(self, df: pd.DataFrame) -> pd.DataFrame:
    """Convert object columns to datetime, numeric or category where they fit.

    For each ``object`` column, in order:

    1. If the column name contains a date-like keyword and every non-null
       value parses as a datetime, the column becomes datetime.
    2. Otherwise, if more than 70% of the rows yield a number after all
       characters other than digits, ``.`` and ``-`` are removed, the column
       becomes numeric.
    3. Otherwise, if it has fewer than 50 distinct values covering less than
       10% of the rows, the column becomes ``category``.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object.
    """
    row_count = len(df)
    if row_count == 0:
        # Every heuristic below divides by the row count.
        return df

    date_keywords = ('date', 'time', 'created', 'updated', 'timestamp')

    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        if any(keyword in str(col).lower() for keyword in date_keywords):
            # errors='ignore' and infer_datetime_format are deprecated in
            # pandas 2.x; coerce instead and accept the result only when no
            # non-null value was lost (the same all-or-nothing outcome).
            try:
                parsed = pd.to_datetime(df[col], errors='coerce')
            except (ValueError, TypeError, OverflowError):
                parsed = None
            if parsed is not None and parsed.notna().sum() == df[col].notna().sum():
                df[col] = parsed
                continue

        try:
            # NOTE(review): this strips every character except digits, '.'
            # and '-', so text such as "A1" is read as 1 - confirm that this
            # aggressive coercion is intended.
            digits_only = df[col].astype(str).str.replace(r'[^\d.-]', '', regex=True)
            numeric_col = pd.to_numeric(digits_only, errors='coerce')
        except (ValueError, TypeError):
            numeric_col = None
        if numeric_col is not None and numeric_col.notna().sum() / row_count > 0.7:
            df[col] = numeric_col
            continue

        try:
            distinct = df[col].nunique()
        except TypeError:
            # Unhashable cells (lists, dicts) cannot be counted; leave as-is.
            continue
        if distinct / row_count < 0.1 and distinct < 50:
            df[col] = df[col].astype('category')

    return df
|
|
|
def generate_comprehensive_summary(self, df: pd.DataFrame, original_size: Optional[int] = None) -> str:
    """Build a markdown report describing the dataset.

    Sections, in order: header (size, memory, completeness), column type
    distribution, missing-value report, numeric statistics (first 8 numeric
    columns), categorical statistics (first 8 object/category columns),
    datetime ranges (first 3 datetime columns), duplicate and correlation
    profiling, and a rendering of the first three rows.

    Args:
        df: Frame to describe.
        original_size: Row count before sampling; reported when it differs
            from ``len(df)``.

    Returns:
        The report as a single newline-joined string.
    """
    def share(part, whole):
        """Return ``part / whole``, or 0.0 when ``whole`` is zero."""
        return part / whole if whole else 0.0

    summary = []
    row_count = len(df)
    total_cells = df.shape[0] * df.shape[1]

    # Header
    summary.append("# π Advanced Dataset Analysis Report")
    summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    summary.append(f"**Dataset Size**: {df.shape[0]:,} rows Γ {df.shape[1]} columns")
    if original_size and original_size != row_count:
        summary.append(f"**Original Size**: {original_size:,} rows (sampled for performance)")

    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    summary.append(f"**Memory Usage**: {memory_usage:.2f} MB")
    summary.append(f"**Data Density**: {(1 - share(df.isnull().sum().sum(), total_cells)):.1%} complete\n")

    # Column types
    type_counts = df.dtypes.value_counts()
    summary.append("## π Column Type Distribution:")
    for dtype, count in type_counts.items():
        percentage = (count / len(df.columns) * 100)
        summary.append(f"- **{dtype}**: {count} columns ({percentage:.1f}%)")

    # Missing values
    missing_data = df.isnull().sum()
    missing_pct = (missing_data / len(df) * 100).round(2)
    missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)

    if len(missing_summary) > 0:
        summary.append("\n## β οΈ Data Quality Issues:")
        total_missing = missing_data.sum()
        summary.append(f"**Total Missing Values**: {total_missing:,} ({share(total_missing, total_cells)*100:.2f}% of all data)")

        for col, count in missing_summary.head(10).items():
            pct = missing_pct[col]
            severity = "π΄ Critical" if pct > 50 else "π‘ Moderate" if pct > 20 else "π’ Minor"
            summary.append(f"- **{col}**: {count:,} missing ({pct}%) - {severity}")
    else:
        summary.append("\n## ✅ Data Quality: Perfect! No missing values detected")

    # Numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        summary.append(f"\n## π Numerical Analysis ({len(numeric_cols)} columns):")

        for col in numeric_cols[:8]:
            column = df[col]
            stats_data = column.describe()
            skewness = stats.skew(column.dropna())

            # Tukey fences: values beyond 1.5 * IQR from the quartiles.
            Q1 = stats_data['25%']
            Q3 = stats_data['75%']
            IQR = Q3 - Q1
            outliers = int(((column < (Q1 - 1.5 * IQR)) | (column > (Q3 + 1.5 * IQR))).sum())

            if abs(skewness) < 0.5:
                distribution = "Normal"
            elif skewness >= 0.5:
                distribution = "Right-skewed"
            elif skewness <= -0.5:
                distribution = "Left-skewed"
            else:
                # NaN skew (e.g. no values or a constant column).
                distribution = "Undetermined"

            summary.append(f"- **{col}**:")
            summary.append(f" - Range: {stats_data['min']:.2f} to {stats_data['max']:.2f}")
            summary.append(f" - Central: ΞΌ={stats_data['mean']:.2f}, median={stats_data['50%']:.2f}")
            summary.append(f" - Spread: Ο={stats_data['std']:.2f}, IQR={IQR:.2f}")
            summary.append(f" - Shape: {distribution} (skew={skewness:.2f})")
            summary.append(f" - Outliers: {outliers} ({share(outliers, row_count)*100:.1f}%)")

    # Categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        summary.append(f"\n## π Categorical Analysis ({len(categorical_cols)} columns):")

        for col in categorical_cols[:8]:
            unique_count = df[col].nunique()
            total_count = len(df[col].dropna())

            # A column with no non-null values has no meaningful ratio.
            cardinality_ratio = share(unique_count, total_count)
            if cardinality_ratio > 0.9:
                cardinality = "π΄ Very High (likely ID field)"
            elif cardinality_ratio > 0.5:
                cardinality = "π‘ High"
            elif cardinality_ratio > 0.1:
                cardinality = "π’ Medium"
            else:
                cardinality = "π΅ Low"

            value_counts = df[col].value_counts()

            summary.append(f"- **{col}**:")
            summary.append(f" - Unique values: {unique_count:,} ({cardinality})")
            if len(value_counts) > 0:
                most_common = value_counts.iloc[0]
                most_common_pct = share(most_common, total_count) * 100
                summary.append(f" - Most frequent: '{value_counts.index[0]}' ({most_common:,} times, {most_common_pct:.1f}%)")
            else:
                summary.append(" - Most frequent: n/a (no non-null values)")

            if len(value_counts) > 1:
                entropy = stats.entropy(value_counts.values)
                summary.append(f" - Diversity index: {entropy:.2f}")

    # Datetime columns
    datetime_cols = df.select_dtypes(include=['datetime64']).columns
    if len(datetime_cols) > 0:
        summary.append(f"\n## 📅 Temporal Analysis ({len(datetime_cols)} columns):")
        for col in datetime_cols[:3]:
            date_range = df[col].max() - df[col].min()
            summary.append(f"- **{col}**: {df[col].min()} to {df[col].max()} (span: {date_range.days} days)")

    # Profiling: duplicates and strong pairwise correlations
    summary.append("\n## π Advanced Data Profiling:")

    duplicate_rows = df.duplicated().sum()
    summary.append(f"- **Duplicate rows**: {duplicate_rows:,} ({share(duplicate_rows, row_count)*100:.2f}%)")

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        names = corr_matrix.columns
        high_corr_pairs = []
        # Upper triangle only, so each pair is reported once.
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    high_corr_pairs.append((names[i], names[j], corr_val))

        if high_corr_pairs:
            summary.append("- **Strong correlations detected**:")
            for col1, col2, corr_val in sorted(high_corr_pairs, key=lambda x: abs(x[2]), reverse=True)[:5]:
                summary.append(f" - {col1} β {col2}: {corr_val:.3f}")

    # Sample rows, numbered by position rather than by index label so the
    # labels stay 1..3 after sampling or with a non-numeric index.
    summary.append("\n## π Enhanced Data Sample (First 3 Rows):")
    sample_df = df.head(3)
    for position, (_, row) in enumerate(sample_df.iterrows(), start=1):
        summary.append(f"\n**Row {position}:**")
        for col, val in row.items():
            if pd.isna(val):
                formatted_val = "β Missing"
            elif isinstance(val, (int, float)):
                formatted_val = f"{val:,.2f}" if isinstance(val, float) else f"{val:,}"
            else:
                formatted_val = str(val)[:50] + ("..." if len(str(val)) > 50 else "")
            summary.append(f" - **{col}**: {formatted_val}")

    return "\n".join(summary)
|
|
|
def generate_advanced_visualizations(self, df: pd.DataFrame) -> str:
    """Render the dataset's charts as a single HTML string.

    Charts are built in this order and appended to one list: missing-data
    bars (only when values are missing), correlation heatmap (two or more
    numeric columns), histogram/box plot per numeric column (first four),
    bar/pie per categorical column (first three, at most 25 distinct
    values), monthly time series (first datetime column against the first
    numeric column), an overview bar chart and a quality-score gauge.

    Args:
        df: Frame to visualise.

    Returns:
        The chart fragments joined by newlines. If any step raises, the
        error is logged and a single error paragraph is returned instead;
        ``self.current_charts`` is only updated on success.
    """
    charts_html = []

    try:
        # 1. Missing data: absolute counts next to percentages.
        missing_data = df.isnull().sum()
        if missing_data.sum() > 0:
            missing_pct = (missing_data / len(df) * 100).round(2)

            fig = make_subplots(
                rows=1, cols=2,
                subplot_titles=("Missing Values Count", "Missing Values Percentage"),
                specs=[[{"secondary_y": False}, {"secondary_y": False}]]
            )

            fig.add_trace(
                go.Bar(x=missing_data.index, y=missing_data.values, name="Count",
                       marker_color='rgb(255, 99, 132)'),
                row=1, col=1
            )

            fig.add_trace(
                go.Bar(x=missing_pct.index, y=missing_pct.values, name="Percentage",
                       marker_color='rgb(255, 159, 64)'),
                row=1, col=2
            )

            fig.update_layout(
                title_text="π Comprehensive Missing Data Analysis",
                title_x=0.5,
                height=500,
                showlegend=False
            )
            fig.update_xaxes(tickangle=-45)

            charts_html.append("<h3>π Data Quality Analysis</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_analysis"))

        # 2. Correlation heatmap. The upper triangle (diagonal included) is
        # masked so each pair is shown once.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            corr_matrix = df[numeric_cols].corr()

            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
            corr_matrix_masked = corr_matrix.mask(mask)

            fig = px.imshow(
                corr_matrix_masked,
                title="π Advanced Correlation Matrix (Lower Triangle)",
                color_continuous_scale='RdBu_r',
                aspect="auto",
                text_auto=True,
                labels=dict(color="Correlation")
            )

            fig.update_layout(
                height=600,
                title_x=0.5,
                font=dict(size=10)
            )

            charts_html.append("<h3>π Statistical Relationships</h3>")
            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_matrix"))

        # 3. Distribution of each of the first four numeric columns:
        # histogram on top, box plot below, mean/median marked on the histogram.
        if len(numeric_cols) > 0:
            charts_html.append("<h3>π Statistical Distributions</h3>")

            for i, col in enumerate(numeric_cols[:4]):

                fig = make_subplots(
                    rows=2, cols=1,
                    subplot_titles=(f"Distribution of {col}", f"Box Plot - {col}"),
                    vertical_spacing=0.12
                )

                fig.add_trace(
                    go.Histogram(x=df[col].dropna(), name="Frequency",
                                 marker_color='rgb(75, 192, 192)', opacity=0.7,
                                 nbinsx=30),
                    row=1, col=1
                )

                fig.add_trace(
                    go.Box(y=df[col].dropna(), name="Distribution",
                           marker_color='rgb(153, 102, 255)'),
                    row=2, col=1
                )

                mean_val = df[col].mean()
                median_val = df[col].median()

                fig.add_vline(x=mean_val, line_dash="dash", line_color="red",
                              annotation_text=f"Mean: {mean_val:.2f}", row=1, col=1)
                fig.add_vline(x=median_val, line_dash="dot", line_color="blue",
                              annotation_text=f"Median: {median_val:.2f}", row=1, col=1)

                fig.update_layout(
                    height=600,
                    title_text=f"π Statistical Analysis: {col}",
                    title_x=0.5,
                    showlegend=False
                )

                charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"distribution_{i}"))

        # 4. Categorical columns: only the first three are considered, and
        # those with more than 25 distinct values are skipped.
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            charts_html.append("<h3>π Categorical Data Insights</h3>")

            for i, col in enumerate(categorical_cols[:3]):
                if df[col].nunique() <= 25:
                    value_counts = df[col].value_counts().head(15)

                    fig = make_subplots(
                        rows=1, cols=2,
                        subplot_titles=(f"Top Values - {col}", f"Distribution - {col}"),
                        specs=[[{"type": "bar"}, {"type": "pie"}]]
                    )

                    # Horizontal bars for the 15 most frequent values.
                    fig.add_trace(
                        go.Bar(x=value_counts.values, y=value_counts.index,
                               orientation='h', name="Count",
                               marker_color='rgb(54, 162, 235)'),
                        row=1, col=1
                    )

                    # Pie restricted to the 10 most frequent values.
                    top_10 = value_counts.head(10)
                    fig.add_trace(
                        go.Pie(labels=top_10.index, values=top_10.values,
                               name="Distribution"),
                        row=1, col=2
                    )

                    fig.update_layout(
                        height=500,
                        title_text=f"π Category Analysis: {col}",
                        title_x=0.5,
                        showlegend=False
                    )

                    charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"categorical_{i}"))

        # 5. Time series: first datetime column against the first numeric
        # column, aggregated per calendar month.
        datetime_cols = df.select_dtypes(include=['datetime64']).columns
        if len(datetime_cols) > 0 and len(numeric_cols) > 0:
            charts_html.append("<h3>β° Temporal Analysis</h3>")

            date_col = datetime_cols[0]
            value_col = numeric_cols[0]

            # Work on a copy so the helper column does not leak into `df`.
            df_temp = df.copy()
            df_temp['month_year'] = df_temp[date_col].dt.to_period('M')
            monthly_data = df_temp.groupby('month_year')[value_col].agg(['mean', 'sum', 'count']).reset_index()
            monthly_data['month_year_str'] = monthly_data['month_year'].astype(str)

            fig = make_subplots(
                rows=2, cols=1,
                subplot_titles=(f"Monthly Trend - {value_col}", f"Monthly Volume - {value_col}"),
                vertical_spacing=0.1
            )

            fig.add_trace(
                go.Scatter(x=monthly_data['month_year_str'], y=monthly_data['mean'],
                           mode='lines+markers', name="Average",
                           line=dict(color='rgb(75, 192, 192)', width=3)),
                row=1, col=1
            )

            fig.add_trace(
                go.Bar(x=monthly_data['month_year_str'], y=monthly_data['sum'],
                       name="Total", marker_color='rgb(153, 102, 255)'),
                row=2, col=1
            )

            fig.update_layout(
                height=600,
                title_text="π Time Series Analysis",
                title_x=0.5,
                showlegend=False
            )
            fig.update_xaxes(tickangle=-45)

            charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="timeseries_analysis"))

        # 6. Overview bar chart of headline dataset metrics.
        summary_data = {
            'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns',
                       'DateTime Columns', 'Missing Values', 'Duplicate Rows', 'Memory (MB)'],
            'Count': [
                len(df),
                len(df.columns),
                len(numeric_cols),
                len(categorical_cols),
                len(datetime_cols),
                df.isnull().sum().sum(),
                df.duplicated().sum(),
                round(df.memory_usage(deep=True).sum() / 1024**2, 2)
            ]
        }

        fig = px.bar(
            summary_data,
            x='Metric',
            y='Count',
            title="π Comprehensive Dataset Overview",
            color='Count',
            color_continuous_scale='Viridis',
            text='Count'
        )
        fig.update_traces(texttemplate='%{text}', textposition='outside')
        fig.update_layout(
            height=500,
            title_x=0.5,
            showlegend=False,
            xaxis_tickangle=-45
        )

        charts_html.append("<h3>π Dataset Dashboard</h3>")
        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_dashboard"))

        # 7. Quality gauge: 100 minus the missing-cell percentage minus ten
        # times the duplicate-row fraction, floored at 0.
        total_cells = df.shape[0] * df.shape[1]
        missing_cells = df.isnull().sum().sum()
        duplicate_penalty = df.duplicated().sum() / len(df) * 10

        quality_score = max(0, 100 - (missing_cells/total_cells*100) - duplicate_penalty)

        fig = go.Figure(go.Indicator(
            mode = "gauge+number+delta",
            value = quality_score,
            domain = {'x': [0, 1], 'y': [0, 1]},
            title = {'text': "π Data Quality Score"},
            delta = {'reference': 95},
            gauge = {
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "yellow"},
                    {'range': [80, 100], 'color': "lightgreen"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90
                }
            }
        ))

        fig.update_layout(height=400, title_x=0.5)
        charts_html.append("<h3>π― Quality Assessment</h3>")
        charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="quality_score"))

        # Keep the individual fragments on the instance.
        self.current_charts = charts_html
        return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"

    except Exception as e:
        # One failing chart discards all charts built so far.
        logger.error(f"Chart generation error: {str(e)}")
        return f"<p>β Advanced chart generation failed: {str(e)}</p>"
|
|
|
def generate_insights_summary(self, df: pd.DataFrame) -> str:
    """Derive rule-based insights about a dataset without calling the API.

    Rules applied: dataset size (more than 100,000 or fewer than 100 rows),
    overall missing-value percentage (above 20% or below 5%), numeric
    columns where more than 10% of rows fall outside the 1.5 * IQR fences,
    and object/category columns whose distinct-value ratio exceeds 0.8.

    Args:
        df: Frame to inspect. A frame with zero rows yields only the
            size-based insight.

    Returns:
        Markdown bullet list, newline-joined, beginning with a heading line.
    """
    insights = ["## π Quick Automated Insights:"]
    row_count = len(df)

    # Dataset size
    if row_count > 100000:
        insights.append("- π **Large Dataset**: This is a substantial dataset that may reveal enterprise-level patterns")
    elif row_count < 100:
        insights.append("- π **Small Dataset**: Consider collecting more data for robust statistical analysis")

    # Overall completeness (skipped when the frame has no cells at all).
    total_cells = df.shape[0] * df.shape[1]
    if total_cells:
        missing_pct = (df.isnull().sum().sum() / total_cells) * 100
        if missing_pct > 20:
            insights.append("- β οΈ **Data Quality Concern**: High percentage of missing data may impact analysis reliability")
        elif missing_pct < 5:
            insights.append("- ✅ **Excellent Data Quality**: Very low missing data percentage")

    # The remaining rules are ratios over the row count.
    if row_count == 0:
        return "\n".join(insights)

    # Columns where more than 10% of rows lie outside the Tukey fences.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    outlier_cols = []
    for col in numeric_cols:
        column = df[col]
        Q1 = column.quantile(0.25)
        Q3 = column.quantile(0.75)
        IQR = Q3 - Q1
        outliers = int(((column < (Q1 - 1.5 * IQR)) | (column > (Q3 + 1.5 * IQR))).sum())
        if outliers / row_count > 0.1:
            outlier_cols.append(col)

    if outlier_cols:
        insights.append(f"- π― **Outlier Detection**: {len(outlier_cols)} columns have significant outliers")

    # Nearly-unique text columns are treated as likely identifiers.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() / row_count > 0.8]
    if high_cardinality_cols:
        insights.append(f"- π **ID Fields Detected**: {len(high_cardinality_cols)} columns appear to be identifier fields")

    return "\n".join(insights)
|
|
|
def export_comprehensive_report(self, analysis_text: str, data_summary: str, file_name: str, format_type: str) -> Tuple[Optional[str], str]:
    """Write the analysis to a timestamped report file.

    Args:
        analysis_text: Analysis body passed to the report generator.
        data_summary: Dataset summary passed to the report generator.
        file_name: Name of the analysed file; its extension-less form
            prefixes the report name (``"data_analysis"`` when empty).
        format_type: ``"HTML"`` produces an HTML report; any other value
            produces a markdown report.

    Returns:
        ``(report_path, status_message)``. On failure the path is ``None``
        and the message describes the error; nothing is raised.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    file_base_name = os.path.splitext(file_name)[0] if file_name else "data_analysis"

    try:
        if format_type == "HTML":
            content = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_comprehensive_report_{timestamp}.html"
            message = f"✅ Comprehensive HTML report generated! File: {filename}"
        else:
            content = self.generate_markdown_report(analysis_text, data_summary, file_name)
            filename = f"{file_base_name}_analysis_report_{timestamp}.md"
            message = f"✅ Markdown report generated! File: {filename}"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)
        return filename, message

    except Exception as e:
        logger.error("Report export error: %s", e)
        return None, f"β Error generating {format_type} report: {str(e)}"
|
|
|
def generate_enhanced_html_report(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str: |
|
"""Generate premium HTML report with advanced styling""" |
|
html_template = """ |
|
<!DOCTYPE html> |
|
<html lang="en"> |
|
<head> |
|
<meta charset="UTF-8"> |
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
<title>Advanced Data Analysis Report</title> |
|
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet"> |
|
<style> |
|
* { |
|
box-sizing: border-box; |
|
margin: 0; |
|
padding: 0; |
|
} |
|
|
|
body { |
|
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif; |
|
line-height: 1.7; |
|
color: #2c3e50; |
|
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); |
|
min-height: 100vh; |
|
} |
|
|
|
.container { |
|
max-width: 1400px; |
|
margin: 0 auto; |
|
padding: 20px; |
|
} |
|
|
|
.header { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
padding: 40px; |
|
border-radius: 15px; |
|
margin-bottom: 30px; |
|
text-align: center; |
|
box-shadow: 0 10px 30px rgba(0,0,0,0.2); |
|
} |
|
|
|
.header h1 { |
|
font-size: 2.5em; |
|
margin-bottom: 10px; |
|
text-shadow: 2px 2px 4px rgba(0,0,0,0.3); |
|
} |
|
|
|
.header p { |
|
font-size: 1.2em; |
|
opacity: 0.9; |
|
} |
|
|
|
.section { |
|
background: white; |
|
padding: 30px; |
|
margin-bottom: 25px; |
|
border-radius: 12px; |
|
box-shadow: 0 5px 20px rgba(0,0,0,0.1); |
|
border-left: 4px solid #667eea; |
|
transition: transform 0.2s ease; |
|
} |
|
|
|
.section:hover { |
|
transform: translateY(-2px); |
|
box-shadow: 0 8px 25px rgba(0,0,0,0.15); |
|
} |
|
|
|
.metadata { |
|
background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); |
|
padding: 20px; |
|
border-radius: 10px; |
|
margin-bottom: 25px; |
|
border: 1px solid #b3d9f2; |
|
display: grid; |
|
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); |
|
gap: 15px; |
|
} |
|
|
|
.metadata-item { |
|
display: flex; |
|
align-items: center; |
|
gap: 8px; |
|
} |
|
|
|
.metadata-item i { |
|
color: #667eea; |
|
font-size: 1.1em; |
|
} |
|
|
|
h1, h2, h3 { |
|
color: #2c3e50; |
|
margin-bottom: 15px; |
|
} |
|
|
|
h2 { |
|
border-bottom: 2px solid #667eea; |
|
padding-bottom: 10px; |
|
display: flex; |
|
align-items: center; |
|
gap: 10px; |
|
} |
|
|
|
h2:before { |
|
content: "π"; |
|
font-size: 1.2em; |
|
} |
|
|
|
.chart-container { |
|
margin: 25px 0; |
|
padding: 20px; |
|
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); |
|
border-radius: 10px; |
|
border: 1px solid #e0e6ff; |
|
} |
|
|
|
.action-buttons { |
|
display: flex; |
|
gap: 15px; |
|
margin: 20px 0; |
|
flex-wrap: wrap; |
|
} |
|
|
|
.btn { |
|
padding: 12px 24px; |
|
border: none; |
|
border-radius: 8px; |
|
cursor: pointer; |
|
font-size: 16px; |
|
font-weight: 600; |
|
transition: all 0.3s ease; |
|
display: flex; |
|
align-items: center; |
|
gap: 8px; |
|
text-decoration: none; |
|
} |
|
|
|
.btn-primary { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
} |
|
|
|
.btn-primary:hover { |
|
transform: translateY(-2px); |
|
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4); |
|
} |
|
|
|
.btn-secondary { |
|
background: #f8f9fa; |
|
color: #495057; |
|
border: 2px solid #dee2e6; |
|
} |
|
|
|
.btn-secondary:hover { |
|
background: #e9ecef; |
|
border-color: #adb5bd; |
|
} |
|
|
|
.footer { |
|
text-align: center; |
|
color: #6c757d; |
|
margin-top: 40px; |
|
padding: 30px; |
|
background: white; |
|
border-radius: 10px; |
|
box-shadow: 0 5px 15px rgba(0,0,0,0.1); |
|
} |
|
|
|
.footer-links { |
|
margin-top: 15px; |
|
display: flex; |
|
justify-content: center; |
|
gap: 20px; |
|
flex-wrap: wrap; |
|
} |
|
|
|
.footer-links a { |
|
color: #667eea; |
|
text-decoration: none; |
|
font-weight: 500; |
|
} |
|
|
|
.footer-links a:hover { |
|
text-decoration: underline; |
|
} |
|
|
|
pre { |
|
background: #f8f9fa; |
|
padding: 20px; |
|
border-radius: 8px; |
|
overflow-x: auto; |
|
white-space: pre-wrap; |
|
font-size: 14px; |
|
border-left: 4px solid #28a745; |
|
font-family: 'Consolas', 'Monaco', monospace; |
|
} |
|
|
|
.analysis-content { |
|
font-size: 16px; |
|
line-height: 1.8; |
|
} |
|
|
|
.analysis-content h1, |
|
.analysis-content h2, |
|
.analysis-content h3 { |
|
margin-top: 25px; |
|
margin-bottom: 15px; |
|
} |
|
|
|
.analysis-content ul, |
|
.analysis-content ol { |
|
margin-left: 20px; |
|
margin-bottom: 15px; |
|
} |
|
|
|
.analysis-content li { |
|
margin-bottom: 5px; |
|
} |
|
|
|
.analysis-content strong { |
|
color: #2c3e50; |
|
font-weight: 700; |
|
} |
|
|
|
.analysis-content code { |
|
background: #f1f3f4; |
|
padding: 2px 6px; |
|
border-radius: 4px; |
|
font-family: 'Consolas', monospace; |
|
} |
|
|
|
.analysis-content blockquote { |
|
border-left: 4px solid #667eea; |
|
padding-left: 20px; |
|
margin: 20px 0; |
|
font-style: italic; |
|
color: #555; |
|
} |
|
|
|
table { |
|
width: 100%; |
|
border-collapse: collapse; |
|
margin: 20px 0; |
|
background: white; |
|
border-radius: 8px; |
|
overflow: hidden; |
|
box-shadow: 0 2px 10px rgba(0,0,0,0.1); |
|
} |
|
|
|
th, td { |
|
padding: 12px 15px; |
|
text-align: left; |
|
border-bottom: 1px solid #e9ecef; |
|
} |
|
|
|
th { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
font-weight: 600; |
|
text-transform: uppercase; |
|
letter-spacing: 0.5px; |
|
} |
|
|
|
tr:hover { |
|
background-color: #f8f9ff; |
|
} |
|
|
|
.highlight-box { |
|
background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%); |
|
border: 1px solid #f39c12; |
|
border-radius: 8px; |
|
padding: 20px; |
|
margin: 20px 0; |
|
} |
|
|
|
.success-box { |
|
background: linear-gradient(135deg, #d4edda 0%, #a8e6cf 100%); |
|
border: 1px solid #28a745; |
|
border-radius: 8px; |
|
padding: 20px; |
|
margin: 20px 0; |
|
} |
|
|
|
.warning-box { |
|
background: linear-gradient(135deg, #f8d7da 0%, #ff7675 100%); |
|
border: 1px solid #dc3545; |
|
border-radius: 8px; |
|
padding: 20px; |
|
margin: 20px 0; |
|
} |
|
|
|
@media print { |
|
.action-buttons, .btn { |
|
display: none !important; |
|
} |
|
body { |
|
background: white; |
|
} |
|
.section, .metadata, .footer { |
|
box-shadow: none; |
|
page-break-inside: avoid; |
|
} |
|
.header { |
|
page-break-after: avoid; |
|
} |
|
} |
|
|
|
@media (max-width: 768px) { |
|
.container { |
|
padding: 10px; |
|
} |
|
.header { |
|
padding: 20px; |
|
} |
|
.header h1 { |
|
font-size: 1.8em; |
|
} |
|
.section { |
|
padding: 20px; |
|
} |
|
.metadata { |
|
grid-template-columns: 1fr; |
|
} |
|
.action-buttons { |
|
flex-direction: column; |
|
} |
|
} |
|
</style> |
|
<script> |
|
function printReport() { |
|
window.print(); |
|
} |
|
|
|
function exportPDF() { |
|
window.print(); |
|
} |
|
|
|
function copyToClipboard(elementId) { |
|
const element = document.getElementById(elementId); |
|
const text = element.textContent; |
|
navigator.clipboard.writeText(text).then(() => { |
|
alert('Content copied to clipboard!'); |
|
}); |
|
} |
|
|
|
// Add smooth scrolling |
|
document.addEventListener('DOMContentLoaded', function() { |
|
const links = document.querySelectorAll('a[href^="#"]'); |
|
links.forEach(link => { |
|
link.addEventListener('click', function(e) { |
|
e.preventDefault(); |
|
const target = document.querySelector(this.getAttribute('href')); |
|
if (target) { |
|
target.scrollIntoView({ behavior: 'smooth' }); |
|
} |
|
}); |
|
}); |
|
}); |
|
</script> |
|
</head> |
|
<body> |
|
<div class="container"> |
|
<div class="header"> |
|
<h1><i class="fas fa-chart-line"></i> Advanced Data Analysis Report</h1> |
|
<p>Comprehensive AI-Powered Business Intelligence Dashboard</p> |
|
</div> |
|
|
|
<div class="metadata"> |
|
<div class="metadata-item"> |
|
<i class="fas fa-file-alt"></i> |
|
<span><strong>File:</strong> {{ file_name }}</span> |
|
</div> |
|
<div class="metadata-item"> |
|
<i class="fas fa-calendar-alt"></i> |
|
<span><strong>Generated:</strong> {{ timestamp }}</span> |
|
</div> |
|
<div class="metadata-item"> |
|
<i class="fas fa-robot"></i> |
|
<span><strong>AI Model:</strong> OpenAI gpt-oss-20b</span> |
|
</div> |
|
<div class="metadata-item"> |
|
<i class="fas fa-shield-alt"></i> |
|
<span><strong>Version:</strong> Smart Analyzer Pro v2.0</span> |
|
</div> |
|
</div> |
|
|
|
<div class="action-buttons"> |
|
<button class="btn btn-primary" onclick="printReport()"> |
|
<i class="fas fa-print"></i> Print as PDF |
|
</button> |
|
<button class="btn btn-secondary" onclick="copyToClipboard('ai-analysis')"> |
|
<i class="fas fa-copy"></i> Copy Analysis |
|
</button> |
|
<button class="btn btn-secondary" onclick="copyToClipboard('technical-summary')"> |
|
<i class="fas fa-code"></i> Copy Technical Data |
|
</button> |
|
</div> |
|
|
|
<div class="section"> |
|
<h2><i class="fas fa-brain"></i> AI-Powered Analysis & Strategic Insights</h2> |
|
<div id="ai-analysis" class="analysis-content">{{ ai_analysis }}</div> |
|
</div> |
|
|
|
<div class="section"> |
|
<h2><i class="fas fa-chart-bar"></i> Interactive Data Visualizations</h2> |
|
<div class="chart-container"> |
|
{{ charts_html }} |
|
</div> |
|
</div> |
|
|
|
<div class="section"> |
|
<h2><i class="fas fa-database"></i> Technical Data Profile</h2> |
|
<pre id="technical-summary">{{ data_summary }}</pre> |
|
</div> |
|
|
|
<div class="footer"> |
|
<div> |
|
<h3><i class="fas fa-star"></i> Report Generated by AnalytixPro v2.0</h3> |
|
<p>Powered by Advanced AI β’ Professional Business Intelligence</p> |
|
</div> |
|
<div class="footer-links"> |
|
<a href="https://wa.me/8801719296601"><i class="fab fa-whatsapp"></i> WhatsApp Support</a> |
|
<a href="https://mail.google.com/mail/?view=cm&fs=1&[email protected]" target="_blank"><i class="fas fa-envelope"></i> Email Support</a> |
|
<a href="https://huggingface.co/shukdevdattaEX"><i class="fas fa-globe"></i> Visit Website</a> |
|
</div> |
|
<p style="margin-top: 15px; font-size: 0.9em; color: #6c757d;"> |
|
Β© 2025 AnalytixPro. Professional data analysis made simple. |
|
</p> |
|
</div> |
|
</div> |
|
</body> |
|
</html> |
|
""" |
|
|
|
template = Template(html_template) |
|
ai_analysis_html = markdown.markdown(analysis_text, extensions=['extra', 'tables', 'toc']) |
|
charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>" |
|
|
|
return template.render( |
|
file_name=file_name, |
|
timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), |
|
ai_analysis=ai_analysis_html, |
|
charts_html=charts_content, |
|
data_summary=data_summary |
|
) |
|
|
|
def generate_pdf_ready_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
    """Return the HTML report that is meant to be printed to PDF.

    This is a straight delegate to ``generate_enhanced_html_report`` with the
    same three arguments; that template carries ``@media print`` rules and a
    "Print as PDF" button that calls ``window.print()``.
    """
    printable_html = self.generate_enhanced_html_report(analysis_text, data_summary, file_name)
    return printable_html
|
|
|
def generate_excel_report(self, analysis_text: str, data_summary: str, filename: str):
    """Write the analysis to a multi-sheet Excel workbook at *filename*.

    Sheets are written in this order:

    * ``Original_Data`` - ``self.current_df`` (only when a frame is loaded)
    * ``Data_Summary`` - one row per line of *data_summary*
    * ``AI_Analysis`` - one row per line of *analysis_text*
    * ``Statistical_Summary`` - ``describe()`` of the numeric columns (only
      when a frame is loaded and it has at least one numeric column)
    """
    frame = self.current_df
    frame_loaded = frame is not None

    def _one_row_per_line(text, column_name):
        # A single-column frame holding the text split on newlines.
        return pd.DataFrame({column_name: text.split('\n')})

    with pd.ExcelWriter(filename, engine='openpyxl') as workbook:
        if frame_loaded:
            frame.to_excel(workbook, sheet_name='Original_Data', index=False)

        _one_row_per_line(data_summary, 'Analysis_Summary').to_excel(
            workbook, sheet_name='Data_Summary', index=False
        )
        _one_row_per_line(analysis_text, 'AI_Analysis').to_excel(
            workbook, sheet_name='AI_Analysis', index=False
        )

        if frame_loaded:
            numeric_columns = frame.select_dtypes(include=[np.number]).columns
            if len(numeric_columns) > 0:
                # Keep the index here: it carries the statistic names.
                frame[numeric_columns].describe().to_excel(
                    workbook, sheet_name='Statistical_Summary'
                )
|
|
|
def generate_markdown_report(self, analysis_text: str, data_summary: str, file_name: str) -> str:
    """Build the Markdown version of the analysis report.

    Args:
        analysis_text: AI analysis, inserted verbatim as Markdown.
        data_summary: Plain-text technical profile; emitted inside a fenced
            ``text`` code block so its content is not interpreted as Markdown.
        file_name: Name shown in the report header.

    Returns:
        The complete Markdown document as a single string.
    """
    # Pick a fence longer than any backtick run inside the summary so the
    # summary can never terminate the code block early.
    longest_backtick_run = max(
        (len(run) for run in re.findall(r"`+", data_summary)), default=0
    )
    fence = "`" * max(3, longest_backtick_run + 1)
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    return f"""# π Advanced Data Analysis Report

**File:** {file_name}
**Generated:** {generated_at}
**Analyzer:** AnalytixPro v2.0
**AI Model:** OpenAI gpt-oss-20b via Chutes API

---

## π Executive Summary & AI Insights

{analysis_text}

---

## π Technical Data Profile

{fence}text
{data_summary}
{fence}

---

## π Support & Contact

- **WhatsApp Support:** +8801719296601
- **Email:** https://tinyurl.com/email-for-contact
- **Documentation:** Available upon request

---

*This report was generated using AnalytixPro v2.0 - Professional data analysis powered by advanced AI technology.*
"""
|
|
|
|
|
# Single module-level analyzer used by the handler functions defined after it
# (they read and reset its current_df / current_charts / conversation_history /
# analysis_cache attributes).
# NOTE(review): because this is one shared instance, that state is presumably
# shared between concurrent UI sessions - confirm this is intended.
analyzer = AdvancedDataAnalyzer()
|
|
|
def _coerce_sample_size(sample_size):
    """Turn a UI-supplied sample size into a positive int, or None if unusable."""
    if sample_size is None or sample_size == "":
        return None
    try:
        # Go through float so values such as 1000.0 or "1000.0" are accepted.
        candidate = int(float(sample_size))
    except (TypeError, ValueError, OverflowError):
        return None
    return candidate if candidate > 0 else None


async def comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Validate the inputs, process the file and run the AI analysis.

    Args:
        file: Uploaded file object; its ``.name`` is passed to the analyzer.
        api_key: API key, checked with ``analyzer.validate_api_key``.
        user_question: Optional question forwarded to the AI call.
        analysis_type: Analysis mode forwarded to the AI call and shown
            (title-cased) in the result.
        sample_size: Optional row limit; ints, floats and numeric strings are
            accepted, anything else (or a non-positive value) means "no limit".
        progress: Progress callback invoked with a fraction and a description.

    Returns:
        A 6-tuple ``(response_markdown, data_summary, data_preview_html,
        charts_html, file_name, ai_analysis)``. On any validation or
        processing failure the tuple is ``(error_message, "", "", "", None, "")``.
    """
    # Wall-clock start, used for the "Processing Time" line in the response.
    started_at = datetime.now()

    progress(0.05, desc="π Validating inputs...")

    if not file:
        return "β Please upload a data file.", "", "", "", None, ""

    is_valid_key, key_msg = analyzer.validate_api_key(api_key)
    if not is_valid_key:
        return f"β API Key Issue: {key_msg}", "", "", "", None, ""

    is_valid_file, file_msg = analyzer.validate_file(file)
    if not is_valid_file:
        return f"β File Issue: {file_msg}", "", "", "", None, ""

    progress(0.15, desc="π Loading and processing file...")

    try:
        sample_size_int = _coerce_sample_size(sample_size)
        df, data_summary, charts_html = analyzer.process_file(file.name, sample_size_int)

        progress(0.40, desc="π Generating visualizations...")

        quick_insights = analyzer.generate_insights_summary(df)

        progress(0.60, desc="π€ AI analysis in progress...")

        ai_analysis = await analyzer.analyze_with_chutes(
            api_key,
            data_summary + "\n" + quick_insights,
            user_question,
            analysis_type
        )

        progress(0.90, desc="β¨ Finalizing results...")

        elapsed_seconds = (datetime.now() - started_at).total_seconds()

        response = f"""# π― Analysis Complete!

## π Key Findings
{ai_analysis}

{quick_insights}

---

**π Analysis Details:**
- **Processed**: {len(df):,} rows Γ {df.shape[1]} columns
- **Analysis Type**: {analysis_type.title()}
- **Processing Time**: {elapsed_seconds:.1f} seconds
- **AI Model**: OpenAI gpt-oss-20b
- **Generated**: {datetime.now().strftime('%H:%M:%S')}

*π‘ Use the tabs below to explore data preview, download reports, or ask specific questions.*
"""

        data_preview_html = analyzer.generate_enhanced_preview(df)

        progress(1.0, desc="✅ Analysis complete!")

        return response, data_summary, data_preview_html, charts_html, file.name, ai_analysis

    except Exception as e:
        # UI boundary: log with traceback and surface the message to the user
        # instead of letting the handler crash.
        logger.exception("Comprehensive analysis error: %s", e)
        return f"β **Analysis Failed**: {str(e)}", "", "", "", None, ""
|
|
|
def sync_comprehensive_analysis(file, api_key, user_question="", analysis_type="comprehensive", sample_size=None, progress=gr.Progress()):
    """Run ``comprehensive_analysis`` to completion and return its result tuple.

    All arguments are forwarded unchanged, in the same order.
    """
    pending = comprehensive_analysis(
        file, api_key, user_question, analysis_type, sample_size, progress
    )
    return asyncio.run(pending)
|
|
|
def quick_question_analysis(file, api_key, question, progress=gr.Progress()):
    """Answer one specific question about the uploaded data.

    Args:
        file: Uploaded file object.
        api_key: API key for the AI call.
        question: The user's question; ``None``, empty or whitespace-only
            input is rejected with a prompt instead of raising.
        progress: Progress callback forwarded to the analysis.

    Returns:
        The response markdown (first element of the analysis result), or a
        prompt asking for a question.
    """
    if not question or not question.strip():
        return "β Please enter a specific question about your data."

    # Same entry point the other handlers use; "question" is the analysis type.
    result = sync_comprehensive_analysis(file, api_key, question, "question", None, progress)
    return result[0]
|
|
|
def generate_enhanced_preview(df: pd.DataFrame, rows: int = 20) -> str:
    """Render a styled HTML preview of *df* plus quick numeric statistics.

    Args:
        df: Frame to preview.
        rows: Number of leading rows to show; coerced to ``int`` so a float
            coming from a UI control does not break ``DataFrame.head``.

    Returns:
        An HTML fragment: a ``<style>`` block, a header with row/column
        counts, a ``describe()`` table for numeric columns (omitted when
        there are none) and the preview table itself.
    """
    rows = int(rows)
    preview_df = df.head(rows)

    stats_html = ""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        stats_df = df[numeric_cols].describe().round(2)
        stats_html = f"""
<div style="margin-bottom: 20px;">
    <h4>π Quick Statistics (Numeric Columns)</h4>
    {stats_df.to_html(classes="table table-striped", table_id="stats-table")}
</div>
"""

    # Cell values come from an uploaded file, so they must be HTML-escaped;
    # rendering them raw would let file content inject markup or scripts.
    preview_html = preview_df.to_html(
        classes="table table-striped table-hover",
        table_id="data-preview-table",
        escape=True
    )

    # Static stylesheet kept out of the f-strings so braces need no doubling.
    preview_css = """
<style>
    .table {
        width: 100%;
        border-collapse: collapse;
        margin: 20px 0;
        font-size: 14px;
        background: white;
        border-radius: 8px;
        overflow: hidden;
        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
    }
    .table th {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 12px 8px;
        text-align: left;
        font-weight: bold;
        position: sticky;
        top: 0;
        z-index: 10;
    }
    .table td {
        padding: 10px 8px;
        border-bottom: 1px solid #dee2e6;
        max-width: 200px;
        overflow: hidden;
        text-overflow: ellipsis;
        white-space: nowrap;
    }
    .table tr:hover {
        background-color: #f8f9ff;
    }
    .table tr:nth-child(even) {
        background-color: #f8f9fa;
    }
    #stats-table {
        font-size: 12px;
    }
    #stats-table th {
        background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
    }
    .preview-header {
        background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
        padding: 15px;
        border-radius: 8px;
        margin-bottom: 15px;
        border-left: 4px solid #667eea;
    }
</style>
"""

    header_html = f"""
<div class="preview-header">
    <h4>π Data Preview - First {rows} Rows</h4>
    <p><strong>Total Rows:</strong> {len(df):,} | <strong>Columns:</strong> {df.shape[1]} | <strong>Showing:</strong> {len(preview_df)} rows</p>
</div>
"""

    return f"{preview_css}{header_html}\n{stats_html}\n{preview_html}\n"
|
|
|
|
|
# Expose the module-level preview function as analyzer.generate_enhanced_preview.
# It is assigned on the instance, not the class, so it stays a plain function:
# callers pass (df[, rows]) and no `self` is supplied.
analyzer.generate_enhanced_preview = generate_enhanced_preview
|
|
|
def clear_all_data():
    """Reset the analyzer's cached state and return blank UI values.

    Returns a 9-tuple: ``None``, six empty strings, ``None``, and a final
    empty string - one value per output component wired to the clear button.
    """
    fresh_state = (
        ("current_df", None),
        ("current_charts", None),
        ("conversation_history", []),
        ("analysis_cache", {}),
    )
    for attribute, empty_value in fresh_state:
        setattr(analyzer, attribute, empty_value)

    return (None,) + ("",) * 6 + (None, "")
|
|
|
def export_report(analysis_text, data_summary, file_name, format_choice, ai_analysis=""):
    """Export the current analysis in the chosen format.

    The AI analysis text is preferred; the displayed analysis text is the
    fallback. Returns a pair taken from the first two elements of
    ``analyzer.export_comprehensive_report``'s result, or ``(None, message)``
    when there is nothing to export.
    """
    content_to_export = ai_analysis or analysis_text
    if not content_to_export:
        return None, "β No analysis data available for download."

    outcome = analyzer.export_comprehensive_report(
        content_to_export, data_summary, file_name, format_choice
    )
    first, second = outcome[0], outcome[1]
    return first, second
|
|
|
def batch_analyze_files(files, api_key, progress=gr.Progress()):
    """Run a "quick" analysis (sample size 1000) on each uploaded file.

    Returns one Markdown section per file, joined by newlines; a file whose
    analysis raises gets an error section instead of aborting the batch.
    """
    if not files:
        return "β No files uploaded for batch analysis."

    total_files = len(files)
    sections = []

    for position, file in enumerate(files, start=1):
        progress(
            position / total_files,
            desc=f"Processing file {position}/{total_files}: {os.path.basename(file.name)}"
        )

        try:
            result = asyncio.run(comprehensive_analysis(file, api_key, "", "quick", 1000, gr.Progress()))
            file_name = os.path.basename(file.name)
            section = f"## π {file_name}\n{result[0]}\n---\n"
        except Exception as e:
            section = f"## β {os.path.basename(file.name)}\nError: {str(e)}\n---\n"
        sections.append(section)

    return "\n".join(sections)
|
|
|
|
|
with gr.Blocks( |
|
title="π AnalytixPro v2.0", |
|
theme=gr.themes.Ocean(), |
|
css=""" |
|
.gradio-container { |
|
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif; |
|
max-width: 1600px; |
|
} |
|
.main-header { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
padding: 30px; |
|
border-radius: 15px; |
|
margin-bottom: 20px; |
|
text-align: center; |
|
} |
|
.upload-area { |
|
border: 2px dashed #667eea; |
|
border-radius: 12px; |
|
padding: 25px; |
|
text-align: center; |
|
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); |
|
transition: all 0.3s ease; |
|
} |
|
.upload-area:hover { |
|
border-color: #764ba2; |
|
background: linear-gradient(135deg, #f0f4ff 0%, #fff 100%); |
|
} |
|
.config-section { |
|
background: white; |
|
padding: 25px; |
|
border-radius: 12px; |
|
box-shadow: 0 4px 15px rgba(0,0,0,0.1); |
|
border-left: 4px solid #667eea; |
|
} |
|
.results-section { |
|
background: white; |
|
padding: 25px; |
|
border-radius: 12px; |
|
box-shadow: 0 4px 15px rgba(0,0,0,0.1); |
|
border-left: 4px solid #28a745; |
|
} |
|
.tab-content { |
|
background: white; |
|
border-radius: 8px; |
|
padding: 20px; |
|
margin-top: 10px; |
|
} |
|
.feature-grid { |
|
display: grid; |
|
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); |
|
gap: 15px; |
|
margin: 20px 0; |
|
} |
|
.feature-card { |
|
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%); |
|
padding: 20px; |
|
border-radius: 10px; |
|
border: 1px solid #e0e6ff; |
|
text-align: center; |
|
} |
|
""" |
|
) as app: |
|
|
|
|
|
current_file_name = gr.State("") |
|
current_ai_analysis = gr.State("") |
|
|
|
|
|
gr.HTML(""" |
|
<div class="main-header"> |
|
<h1>π AnalytixPro v2.0</h1> |
|
<p>Advanced AI-Powered Data Analysis & Business Intelligence Platform</p> |
|
<p style="opacity: 0.9; margin-top: 10px;"> |
|
β¨ Enhanced with Advanced Statistics β’ π― Multi-format Support β’ π Interactive Visualizations β’ π± Mobile Optimized |
|
</p> |
|
</div> |
|
""") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1, elem_classes=["config-section"]): |
|
gr.Markdown("### βοΈ Configuration & Upload") |
|
|
|
api_key_input = gr.Textbox( |
|
label="π Chutes API Key", |
|
placeholder="sk-chutes-your-api-key-here...", |
|
type="password", |
|
lines=1, |
|
info="π Get your free API key from chutes.ai" |
|
) |
|
|
|
with gr.Group(): |
|
file_input = gr.File( |
|
label="π Upload Data File", |
|
file_types=[".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"], |
|
file_count="single", |
|
elem_classes=["upload-area"] |
|
) |
|
|
|
with gr.Row(): |
|
analysis_type = gr.Dropdown( |
|
choices=["comprehensive", "quick", "statistical"], |
|
value="comprehensive", |
|
label="π― Analysis Type", |
|
info="Choose analysis depth" |
|
) |
|
|
|
sample_size = gr.Number( |
|
label="π Sample Size", |
|
placeholder="Leave empty for full dataset", |
|
minimum=100, |
|
maximum=50000, |
|
info="Optional: Limit rows for faster processing" |
|
) |
|
|
|
with gr.Row(): |
|
analyze_btn = gr.Button("π Analyze Data", variant="primary", size="lg") |
|
clear_btn = gr.Button("ποΈ Clear All", variant="secondary") |
|
|
|
|
|
with gr.Group(): |
|
gr.Markdown("### π File Information") |
|
file_stats = gr.HTML( |
|
value="<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>π Upload a file to see detailed information...</div>" |
|
) |
|
|
|
with gr.Column(scale=2, elem_classes=["results-section"]): |
|
gr.Markdown("### π― Analysis Results") |
|
analysis_output = gr.Markdown( |
|
value="""## π Welcome to AnalytixPro v2.0! |
|
|
|
**π Enhanced Features:** |
|
- β
**Multi-format Support**: CSV, Excel, JSON, Parquet, TSV |
|
- β
**Advanced Statistics**: Correlation, outlier detection, distribution analysis |
|
- β
**Interactive Visualizations**: Professional charts and dashboards |
|
- β
**AI-Powered Insights**: GPT-powered business intelligence |
|
- β
**Export Options**: HTML, Markdown |
|
- β
**Batch Processing**: Analyze multiple files at once |
|
- β
**Mobile Optimized**: Works on all devices |
|
|
|
**π How to Get Started:** |
|
1. Enter your Chutes API key |
|
2. Upload your data file |
|
3. Choose analysis type |
|
4. Click "Analyze Data" |
|
5. Explore results in the tabs below! |
|
|
|
*Ready for professional-grade data analysis! π―*""", |
|
show_label=False |
|
) |
|
|
|
|
|
with gr.Tabs(): |
|
with gr.Tab("π¬ Ask Specific Questions", elem_id="questions-tab"): |
|
gr.Markdown("### π Interactive Data Q&A") |
|
with gr.Row(): |
|
question_input = gr.Textbox( |
|
label="β What would you like to know about your data?", |
|
placeholder="""Try asking specific questions like: |
|
β’ What are the top 5 performing segments by revenue? |
|
β’ Are there any seasonal patterns in the sales data? |
|
β’ Which customer segments have the highest lifetime value? |
|
β’ What anomalies or outliers should I be concerned about? |
|
β’ How do different product categories compare in profitability? |
|
β’ What trends do you see in the time series data?""", |
|
lines=4 |
|
) |
|
|
|
with gr.Row(): |
|
ask_btn = gr.Button("π Get AI Answer", variant="primary") |
|
quick_insight_btn = gr.Button("π‘ Quick Insights", variant="secondary") |
|
|
|
question_output = gr.Markdown() |
|
|
|
with gr.Tab("π Data Preview & Statistics"): |
|
gr.Markdown("### π Dataset Explorer") |
|
with gr.Row(): |
|
preview_rows = gr.Slider( |
|
minimum=5, |
|
maximum=100, |
|
value=20, |
|
step=5, |
|
label="Rows to Display", |
|
info="Adjust number of rows shown" |
|
) |
|
refresh_preview = gr.Button("π Refresh Preview", variant="secondary") |
|
|
|
data_preview = gr.HTML( |
|
label="Dataset Preview", |
|
value="<div style='text-align: center; padding: 40px; color: #666;'>π Upload and analyze a file to see preview...</div>" |
|
) |
|
|
|
with gr.Tab("π Visualizations & Charts", visible=False): |
|
gr.Markdown("### π¨ Interactive Data Visualizations") |
|
charts_display = gr.HTML( |
|
value="<div style='text-align: center; padding: 40px; color: #666;'>π Charts will appear here after analysis...</div>" |
|
) |
|
|
|
with gr.Tab("π Technical Summary"): |
|
gr.Markdown("### π Detailed Technical Analysis") |
|
raw_summary = gr.Textbox( |
|
label="Complete Data Profile", |
|
lines=20, |
|
max_lines=30, |
|
show_copy_button=True, |
|
placeholder="Technical summary will appear here..." |
|
) |
|
|
|
with gr.Tab("πΎ Export & Reports"): |
|
gr.Markdown("### π₯ Download Professional Reports") |
|
|
|
with gr.Row(): |
|
format_choice = gr.Radio( |
|
choices=["HTML", "Markdown"], |
|
value="HTML", |
|
label="π Report Format", |
|
info="Choose your preferred export format" |
|
) |
|
|
|
include_charts = gr.Checkbox( |
|
label="π Include Charts", |
|
value=True, |
|
info="Include visualizations in report" |
|
) |
|
|
|
with gr.Row(): |
|
download_btn = gr.Button("π₯ Generate Report", variant="primary", size="lg") |
|
batch_export_btn = gr.Button("π¦ Batch Export", variant="secondary") |
|
|
|
download_status = gr.Textbox(label="π Export Status", interactive=False) |
|
download_file = gr.File(label="π Download Your Report", visible=True) |
|
|
|
with gr.Tab("π Batch Analysis"): |
|
gr.Markdown("### π Analyze Multiple Files") |
|
gr.Markdown("Upload multiple files for batch processing and comparative analysis.") |
|
|
|
batch_files = gr.File( |
|
label="π Upload Multiple Files", |
|
file_count="multiple", |
|
file_types=[".csv", ".xlsx", ".xls"] |
|
) |
|
|
|
batch_analyze_btn = gr.Button("π Batch Analyze", variant="primary") |
|
batch_results = gr.Markdown() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_file_stats(file):
    """Render an HTML card describing the uploaded file.

    Args:
        file: Uploaded file object exposing ``.name`` (a filesystem path),
            or a falsy value when nothing is uploaded.

    Returns:
        An HTML fragment: a placeholder when there is no file, a details
        card (name, size in MB, format, estimated rows, time, status) on
        success, or an error box if the file cannot be inspected.
    """
    import html  # local import: only this handler needs HTML escaping

    if not file:
        return "<div style='padding: 15px; background: #f8f9fa; border-radius: 8px; text-align: center;'>π No file uploaded</div>"

    try:
        file_size = os.path.getsize(file.name) / (1024 * 1024)
        file_name = os.path.basename(file.name)
        file_ext = os.path.splitext(file_name)[1].upper()

        # Row estimation is best-effort; failure must not hide the other details.
        try:
            if file_ext.lower() == '.csv':
                # errors='replace' keeps line counting working for files that
                # are not valid UTF-8.
                with open(file.name, 'r', encoding='utf-8', errors='replace') as f:
                    lines = sum(1 for _ in f)
                # Subtract the header line; an empty file has zero rows, not -1.
                estimated_rows = max(lines - 1, 0)
            elif file_ext.lower() in ['.xlsx', '.xls']:
                # Header-only read: just confirms the workbook can be opened.
                pd.read_excel(file.name, nrows=0)
                estimated_rows = "Reading..."
            else:
                estimated_rows = "Unknown"
        except Exception:
            estimated_rows = "Could not estimate"

        # The file name is user-controlled, so escape it before embedding.
        safe_name = html.escape(file_name)
        safe_format = html.escape(file_ext[1:])

        return f"""
<div style='padding: 20px; background: linear-gradient(135deg, #e8f4f8 0%, #f0f8ff 100%); border-radius: 10px; border: 1px solid #b3d9f2;'>
    <h4 style='color: #2c3e50; margin-bottom: 15px;'>π File Details</h4>
    <div style='display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 10px;'>
        <div><strong>π Name:</strong><br>{safe_name}</div>
        <div><strong>π Size:</strong><br>{file_size:.2f} MB</div>
        <div><strong>π§ Format:</strong><br>{safe_format} File</div>
        <div><strong>π Est. Rows:</strong><br>{estimated_rows}</div>
        <div><strong>β° Uploaded:</strong><br>{datetime.now().strftime('%H:%M:%S')}</div>
        <div><strong>✅ Status:</strong><br>Ready to analyze</div>
    </div>
</div>
"""
    except Exception as e:
        return f"""
<div style='padding: 15px; background: #f8d7da; border-radius: 8px; border: 1px solid #dc3545;'>
    β <strong>File Error:</strong> {html.escape(str(e))}
</div>
"""
|
|
|
def handle_main_analysis(file, api_key, analysis_type, sample_size, progress=gr.Progress()):
    """Run the main analysis and normalise its result to exactly six values.

    The analysis is invoked with an empty user question. A result with six or
    more elements is truncated to its first six; a shorter one keeps its first
    three elements and fills the remaining slots with empty strings.
    """
    result = sync_comprehensive_analysis(file, api_key, "", analysis_type, sample_size, progress)
    element_count = len(result)

    if element_count >= 6:
        return tuple(result[:6])

    fourth = result[3] if element_count > 3 else ""
    fifth = result[4] if element_count > 4 else ""
    return result[0], result[1], result[2], fourth, fifth, ""
|
|
|
def refresh_data_preview(rows):
    """Re-render the preview of the loaded frame with *rows* rows.

    Returns a placeholder fragment when the analyzer holds no frame.
    """
    loaded_frame = analyzer.current_df
    if loaded_frame is None:
        return "<div style='text-align: center; padding: 40px; color: #666;'>π No data loaded</div>"
    return analyzer.generate_enhanced_preview(loaded_frame, rows)
|
|
|
|
|
# Main analysis: the handler's six return values fill, in order, the analysis
# markdown, the technical summary box, the preview HTML, the charts HTML and
# the two gr.State holders (file name, raw AI analysis text).
analyze_btn.click(
    fn=handle_main_analysis,
    inputs=[file_input, api_key_input, analysis_type, sample_size],
    outputs=[analysis_output, raw_summary, data_preview, charts_display, current_file_name, current_ai_analysis],
    show_progress=True
)

# Free-form question: only the response markdown is shown.
ask_btn.click(
    fn=quick_question_analysis,
    inputs=[file_input, api_key_input, question_input],
    outputs=[question_output],
    show_progress=True
)

# Canned "quick insights" prompt; [0] keeps only the response markdown.
# The lambda builds its own gr.Progress() instead of receiving one as an
# argument default like the other handlers.
quick_insight_btn.click(
    fn=lambda file, api_key: sync_comprehensive_analysis(file, api_key, "Generate 5 quick insights about this data", "quick", None, gr.Progress())[0],
    inputs=[file_input, api_key_input],
    outputs=[question_output],
    show_progress=True
)

# Refresh the file details card whenever the uploaded file changes.
file_input.change(
    fn=update_file_stats,
    inputs=[file_input],
    outputs=[file_stats]
)

# Re-render the preview table with the row count chosen on the slider.
refresh_preview.click(
    fn=refresh_data_preview,
    inputs=[preview_rows],
    outputs=[data_preview]
)

# Reset: the nine outputs line up one-to-one with clear_all_data's 9-tuple.
# charts_display and file_stats are not in this list, so they are not reset.
clear_btn.click(
    fn=clear_all_data,
    outputs=[file_input, api_key_input, question_input, analysis_output,
             question_output, data_preview, raw_summary, current_file_name, current_ai_analysis]
)

# Export: returns the value for the download component and a status message.
download_btn.click(
    fn=export_report,
    inputs=[analysis_output, raw_summary, current_file_name, format_choice, current_ai_analysis],
    outputs=[download_file, download_status]
)

# Batch analysis over the files uploaded in the batch tab.
batch_analyze_btn.click(
    fn=batch_analyze_files,
    inputs=[batch_files, api_key_input],
    outputs=[batch_results],
    show_progress=True
)
|
|
|
|
|
gr.HTML(""" |
|
<div style="margin-top: 30px;"> |
|
<h3 style="text-align: center; color: #2c3e50; margin-bottom: 20px;">π Key Features & Capabilities</h3> |
|
<div class="feature-grid"> |
|
<div class="feature-card"> |
|
<h4>π§ Advanced File Support</h4> |
|
<p>CSV, Excel, JSON, Parquet, TSV with intelligent type detection</p> |
|
</div> |
|
<div class="feature-card"> |
|
<h4>π Statistical Analysis</h4> |
|
<p>Correlation matrices, outlier detection, distribution analysis</p> |
|
</div> |
|
<div class="feature-card"> |
|
<h4>π€ AI-Powered Insights</h4> |
|
<p>GPT-powered business intelligence and recommendations</p> |
|
</div> |
|
<div class="feature-card"> |
|
<h4>π Interactive Charts</h4> |
|
<p>Professional visualizations with hover effects and zoom</p> |
|
</div> |
|
<div class="feature-card"> |
|
<h4>πΎ Multiple Export Formats</h4> |
|
<p>HTML, Markdown with embedded charts</p> |
|
</div> |
|
<div class="feature-card"> |
|
<h4>π Batch Processing</h4> |
|
<p>Analyze multiple files simultaneously for comparison</p> |
|
</div> |
|
</div> |
|
</div> |
|
""") |
|
|
|
with gr.Accordion("π‘ Pro Tips", open=False): |
|
gr.Markdown(""" |
|
### π― Data Preparation: |
|
- β
Use descriptive column names (e.g., "Monthly_Revenue" instead of "Col1") |
|
- β
Ensure consistent date formats (YYYY-MM-DD recommended) |
|
- β
Remove completely empty rows/columns before upload |
|
- β
For large files (>10MB), consider using sample size option |
|
|
|
### π Analysis Optimization: |
|
- **Comprehensive**: Full statistical analysis with AI insights (recommended for business reports) |
|
- **Quick**: Fast overview for initial data exploration |
|
- **Statistical**: Focus on mathematical relationships and patterns |
|
|
|
### π Question Examples for Better AI Responses: |
|
- "What factors most strongly correlate with customer churn?" |
|
- "Which time periods show the highest sales performance?" |
|
- "Are there any data quality issues I should address?" |
|
- "What are the key business opportunities in this dataset?" |
|
|
|
### π₯ Export Recommendations: |
|
- **HTML**: Best for sharing interactive reports with stakeholders |
|
- **Markdown**: Great for technical documentation and version control |
|
|
|
### β‘ Performance Notes: |
|
- Files under 5MB: Instant processing |
|
- Files 5-20MB: ~5-10 seconds |
|
- Files 20MB+: Consider sampling for faster results |
|
|
|
### π§ Supported Formats & Limits: |
|
- **CSV/TSV**: Up to 100MB |
|
- **Excel (XLSX/XLS)**: Up to 100MB |
|
- **JSON**: Flat or nested structures |
|
- **Parquet**: High-performance columnar format |
|
|
|
### π Support & Contact: |
|
- π± WhatsApp: +8801719296601 |
|
- π§ Email: https://tinyurl.com/email-for-contact |
|
- π Response Time: Within 24 hours |
|
""") |
|
|
|
if __name__ == "__main__":
    # Request queue limits, applied before the server starts.
    queue_settings = {
        "max_size": 20,
        "default_concurrency_limit": 5,
        "api_open": False,
    }

    # Server options: bind on all interfaces at port 7860, no share link,
    # and no docs / redoc URLs on the underlying app.
    launch_settings = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "show_error": True,
        "quiet": False,
        "favicon_path": None,
        "ssl_verify": True,
        "app_kwargs": {
            "docs_url": None,
            "redoc_url": None,
        },
    }

    app.queue(**queue_settings)
    app.launch(**launch_settings)