"""Streamlit app for automated, conversational analysis of uploaded datasets.

The app lets a user upload a CSV, Excel, or JSON file, prints an automated
text report (missing values, summary statistics, correlations, outliers),
optionally asks a Hugging Face text-generation model for a narrative
summary, and draws charts for a selected column.
"""

import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import streamlit as st
from bs4 import BeautifulSoup
from scipy import stats  # Used for the z-score based outlier detection

# Try importing wordcloud, handle missing package
try:
    from wordcloud import WordCloud
except ImportError:
    WordCloud = None

# Try importing Hugging Face LLM for AI insights.
# AI insights are an optional feature, so a failure here must not take the
# whole app down.
# NOTE(review): OSError/RuntimeError are presumably what `transformers` raises
# when the model weights or a deep-learning backend cannot be loaded — confirm
# against the installed version.
try:
    from transformers import pipeline

    llm_pipeline = pipeline("text-generation", model="facebook/opt-1.3b")  # Can use GPT-like models
except (ImportError, OSError, RuntimeError):
    llm_pipeline = None

# Set Streamlit page config
st.set_page_config(page_title="Conversational Data Analysis", page_icon="📊")
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")


### 📝 FUNCTION TO LOAD DATA ###
def load_data(uploaded_file):
    """Load a CSV, Excel, or JSON file into a Pandas DataFrame.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute whose
            extension selects the parser (as returned by ``st.file_uploader``).

    Returns:
        The loaded DataFrame, or ``None`` when the format is unsupported or
        parsing fails. In both failure cases an error is shown via ``st.error``.
    """
    try:
        # Lower-case so "DATA.CSV" is treated the same as "data.csv".
        file_extension = uploaded_file.name.rsplit(".", 1)[-1].lower()

        if file_extension == "csv":
            df = pd.read_csv(uploaded_file)
        elif file_extension == "xlsx":
            df = pd.read_excel(uploaded_file, engine="openpyxl")
        elif file_extension == "xls":
            # openpyxl cannot read the legacy .xls format; let pandas pick
            # the engine for it.
            df = pd.read_excel(uploaded_file)
        elif file_extension == "json":
            df = pd.json_normalize(json.load(uploaded_file))
        else:
            st.error("❌ Unsupported file format. Use CSV, Excel, or JSON.")
            return None

        st.success(f"✅ Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")
        return df
    except Exception as e:
        # UI boundary: report any parser failure to the user instead of crashing.
        st.error(f"❌ Error loading file: {e}")
        return None


### 🔎 AUTOMATED DATA ANALYSIS ###
def analyze_data(df):
    """Perform automated analysis for trends, anomalies, and insights.

    Args:
        df: DataFrame to analyse.

    Returns:
        A single string with the report sections separated by blank lines:
        missing-value counts (only if any), ``describe()`` summary, a
        correlation matrix (only with two or more numeric columns) and an
        outlier count (only if any value has ``|z| > 3``).
    """
    insights = []
    numeric_df = df.select_dtypes(include=[np.number])

    # 1. Missing Data Analysis
    missing_values = df.isnull().sum()
    missing_report = missing_values[missing_values > 0]
    if not missing_report.empty:
        insights.append(f"🔎 Missing Data Found:\n{missing_report.to_string()}")

    # 2. Summary Statistics
    insights.append(f"📊 Data Summary:\n{df.describe().to_string()}")

    # 3. Correlation Analysis
    # Correlate the numeric subset only: calling corr() on a frame that still
    # holds text columns raises in pandas >= 2.0.
    if numeric_df.shape[1] > 1:
        corr_matrix = numeric_df.corr().round(2)
        insights.append(f"📈 Correlation Matrix:\n{corr_matrix.to_string()}")

    # 4. Outlier Detection
    if not numeric_df.empty:
        values = numeric_df.to_numpy(dtype=float, na_value=np.nan)
        z_scores = np.abs(np.asarray(stats.zscore(values, nan_policy="omit"), dtype=float))
        # NaN z-scores (missing cells, constant columns) compare False here,
        # so they are never counted as outliers.
        outlier_count = int(np.count_nonzero(z_scores > 3))
        if outlier_count > 0:
            insights.append(f"⚠️ Outliers Detected: {outlier_count} extreme values found.")

    return "\n\n".join(insights)


### 📊 FUNCTION FOR CATEGORICAL DATA VISUALIZATION ###
def visualize_categorical_data(df, column, chart_type):
    """Generates bar chart, pie chart, or word cloud for categorical columns.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to plot; missing values are dropped.
        chart_type: One of ``"Bar Chart"``, ``"Pie Chart"``, ``"Word Cloud"``.

    Returns:
        None. The chart is rendered with ``st.pyplot``; problems are reported
        with ``st.error``.
    """
    if column not in df.columns:
        st.error(f"⚠️ Column '{column}' not found!")
        return

    # Check the optional dependency before allocating a figure so the early
    # return does not leave an unclosed figure behind.
    if chart_type == "Word Cloud" and WordCloud is None:
        st.error("⚠️ `wordcloud` is not installed. Run `pip install wordcloud` to enable this feature.")
        return

    data = df[column].dropna()
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        if chart_type == "Bar Chart":
            data.value_counts().plot(kind="bar", color="purple", ax=ax)
            ax.set_title(f"Bar Chart of {column}")
        elif chart_type == "Pie Chart":
            data.value_counts().plot(
                kind="pie", autopct="%1.1f%%", startangle=90, cmap="coolwarm", ax=ax
            )
            ax.set_ylabel("")
            ax.set_title(f"Pie Chart of {column}")
        elif chart_type == "Word Cloud":
            text = " ".join(data.astype(str))
            wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
            ax.imshow(wordcloud, interpolation="bilinear")
            ax.axis("off")
            ax.set_title(f"Word Cloud for {column}")
        else:
            st.error(f"⚠️ Unsupported chart type '{chart_type}'.")
            return

        st.pyplot(fig)
    finally:
        # Streamlit reruns the script on every interaction; close the figure
        # so matplotlib does not accumulate one per rerun.
        plt.close(fig)


### 📈 FUNCTION FOR NUMERICAL DATA VISUALIZATION ###
def visualize_numerical_data(df, column, chart_type):
    """Generates a histogram, boxplot, or line chart for a numerical column.

    Args:
        df: DataFrame holding the data.
        column: Name of the numerical column to plot; missing values are dropped.
        chart_type: One of ``"Histogram"``, ``"Boxplot"``, ``"2D Line Chart"``.

    Returns:
        None. The chart is rendered with ``st.pyplot``; problems are reported
        with ``st.error``.
    """
    if column not in df.columns:
        st.error(f"⚠️ Column '{column}' not found!")
        return

    data = df[column].dropna()
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        if chart_type == "Histogram":
            data.plot(kind="hist", bins=30, color="purple", ax=ax)
            ax.set_xlabel(column)
            ax.set_title(f"Histogram of {column}")
        elif chart_type == "Boxplot":
            data.plot(kind="box", ax=ax)
            ax.set_title(f"Boxplot of {column}")
        elif chart_type == "2D Line Chart":
            data.plot(kind="line", color="purple", ax=ax)
            ax.set_ylabel(column)
            ax.set_title(f"Line Chart of {column}")
        else:
            st.error(f"⚠️ Unsupported chart type '{chart_type}'.")
            return

        st.pyplot(fig)
    finally:
        plt.close(fig)


### 🤖 AI-POWERED INSIGHTS ###
def generate_ai_summary(df):
    """Uses an AI model to generate insights from the data trends.

    Args:
        df: DataFrame to summarise; its ``analyze_data`` report is embedded
            in the prompt.

    Returns:
        The model's generated text, or a warning message when no model
        pipeline is available.
    """
    if llm_pipeline is None:
        return "⚠️ AI insights unavailable. Install `transformers` and load an LLM model."

    prompt = (
        "\nAnalyze the following dataset summary:\n"
        f"{analyze_data(df)}\n\n"
        "Provide insights on trends, anomalies, and patterns in natural language:\n"
    )

    # max_new_tokens bounds only the generated continuation. max_length also
    # counts the prompt, which already contains the full report and would
    # typically use up the whole budget.
    ai_response = llm_pipeline(prompt, max_new_tokens=250, num_return_sequences=1)
    return ai_response[0]["generated_text"]


### 🏁 MAIN FUNCTION ###
def main():
    """Interactive chatbot for conversational data analysis.

    Renders the Streamlit page: action selection, file upload, the automated
    report, the AI summary, and the column/chart pickers.
    """
    st.title("📊 Conversational Data Analysis Chatbot")
    st.write("Upload a dataset and chat about trends, patterns, and anomalies!")

    # Step 1: Ask the user what they want to do
    action = st.radio("What would you like to do?", ["Analyze a Spreadsheet", "Scrape a Website"])

    if action != "Analyze a Spreadsheet":
        # No scraping workflow exists in this script; tell the user instead of
        # showing an empty page.
        st.info("🌐 Website scraping is not available yet.")
        return

    uploaded_file = st.file_uploader(
        "📂 Upload a CSV, Excel, or JSON file", type=["csv", "xlsx", "xls", "json"]
    )
    if not uploaded_file:
        return

    df = load_data(uploaded_file)
    if df is None:
        return

    # Generate full analysis and display insights
    st.subheader("📊 Automated Data Analysis")
    st.text(analyze_data(df))

    # AI-Powered Insights
    st.subheader("🤖 AI Summary of Data Trends")
    st.text(generate_ai_summary(df))

    # Let user interact with data
    data_type = st.radio("What would you like to explore?", ["Numerical Trends", "Categorical Insights"])

    if data_type == "Numerical Trends":
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        if numeric_columns:
            column = st.selectbox("🔢 Select a numerical column:", numeric_columns)
            chart_type = st.selectbox("📊 Choose a chart type:", ["Histogram", "Boxplot", "2D Line Chart"])
            if st.button("Generate Chart"):
                # The categorical plotter knows none of these chart types.
                visualize_numerical_data(df, column, chart_type)
        else:
            st.warning("⚠️ No numerical columns found in the dataset.")

    elif data_type == "Categorical Insights":
        categorical_columns = df.select_dtypes(include=["object", "category"]).columns.tolist()
        if categorical_columns:
            column = st.selectbox("🔠 Select a categorical column:", categorical_columns)
            chart_type = st.radio("📊 Choose a visualization:", ["Bar Chart", "Pie Chart", "Word Cloud"])
            if st.button("Generate Chart"):
                visualize_categorical_data(df, column, chart_type)
        else:
            st.warning("⚠️ No categorical columns found in the dataset.")


if __name__ == "__main__":
    main()