shukdevdattaEX committed on
Commit
dd6870b
·
verified ·
1 Parent(s): e913d54

Update v2.txt

Browse files
Files changed (1) hide show
  1. v2.txt +85 -148
v2.txt CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
3
  import aiohttp
4
  import asyncio
5
  import json
6
- import io
7
  import os
8
  import numpy as np
9
  import plotly.express as px
@@ -12,10 +11,8 @@ from typing import Optional, Tuple, Dict, Any
12
  import logging
13
  from datetime import datetime
14
  import re
15
- import base64
16
- from io import BytesIO
17
- import weasyprint # For PDF generation
18
- from jinja2 import Template # For HTML templating
19
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.INFO)
@@ -86,12 +83,12 @@ Format your response with clear sections and bullet points for readability."""
86
  ],
87
  "stream": True,
88
  "max_tokens": 3000,
89
- "temperature": 0.2, # Very low for consistent analysis
90
  "top_p": 0.9
91
  }
92
 
93
  try:
94
- timeout = aiohttp.ClientTimeout(total=30) # 30 second timeout
95
  async with aiohttp.ClientSession(timeout=timeout) as session:
96
  async with session.post(self.api_base_url, headers=headers, json=body) as response:
97
  if response.status == 401:
@@ -131,9 +128,7 @@ Format your response with clear sections and bullet points for readability."""
131
  try:
132
  file_extension = os.path.splitext(file_path)[1].lower()
133
 
134
- # Read file with better error handling
135
  if file_extension == '.csv':
136
- # Try different encodings
137
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
138
  try:
139
  df = pd.read_csv(file_path, encoding=encoding)
@@ -147,13 +142,8 @@ Format your response with clear sections and bullet points for readability."""
147
  else:
148
  raise ValueError("Unsupported file format. Please upload CSV or Excel files.")
149
 
150
- # Clean column names
151
  df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
152
-
153
- # Store dataframe for visualizations
154
  self.current_df = df
155
-
156
- # Generate enhanced summaries
157
  data_summary = self.generate_enhanced_summary(df)
158
  charts_html = self.generate_visualizations(df)
159
 
@@ -165,23 +155,17 @@ Format your response with clear sections and bullet points for readability."""
165
  def generate_enhanced_summary(self, df: pd.DataFrame) -> str:
166
  """Generate comprehensive data summary with statistical insights"""
167
  summary = []
168
-
169
- # Header with timestamp
170
  summary.append(f"# πŸ“Š Dataset Analysis Report")
171
  summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
172
  summary.append(f"**File Size**: {df.shape[0]:,} rows Γ— {df.shape[1]} columns")
173
-
174
- # Memory usage
175
  memory_usage = df.memory_usage(deep=True).sum() / 1024**2
176
  summary.append(f"**Memory Usage**: {memory_usage:.2f} MB\n")
177
 
178
- # Data types breakdown
179
  type_counts = df.dtypes.value_counts()
180
  summary.append("## πŸ“‹ Column Types:")
181
  for dtype, count in type_counts.items():
182
  summary.append(f"- **{dtype}**: {count} columns")
183
 
184
- # Missing data analysis
185
  missing_data = df.isnull().sum()
186
  missing_pct = (missing_data / len(df) * 100).round(2)
187
  missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
@@ -194,26 +178,23 @@ Format your response with clear sections and bullet points for readability."""
194
  else:
195
  summary.append("\n## βœ… Data Quality: No missing values detected!")
196
 
197
- # Numerical analysis
198
  numeric_cols = df.select_dtypes(include=[np.number]).columns
199
  if len(numeric_cols) > 0:
200
  summary.append(f"\n## πŸ“ˆ Numerical Columns Analysis ({len(numeric_cols)} columns):")
201
- for col in numeric_cols[:10]: # Limit to first 10
202
  stats = df[col].describe()
203
  outliers = len(df[df[col] > (stats['75%'] + 1.5 * (stats['75%'] - stats['25%']))])
204
  summary.append(f"- **{col}**: ΞΌ={stats['mean']:.2f}, Οƒ={stats['std']:.2f}, outliers={outliers}")
205
 
206
- # Categorical analysis
207
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
208
  if len(categorical_cols) > 0:
209
  summary.append(f"\n## πŸ“ Categorical Columns Analysis ({len(categorical_cols)} columns):")
210
- for col in categorical_cols[:10]: # Limit to first 10
211
  unique_count = df[col].nunique()
212
  cardinality = "High" if unique_count > len(df) * 0.9 else "Medium" if unique_count > 10 else "Low"
213
  most_common = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else "N/A"
214
  summary.append(f"- **{col}**: {unique_count:,} unique values ({cardinality} cardinality), Top: '{most_common}'")
215
 
216
- # Sample data with better formatting
217
  summary.append("\n## πŸ” Data Sample (First 3 Rows):")
218
  sample_df = df.head(3)
219
  for idx, row in sample_df.iterrows():
@@ -228,7 +209,6 @@ Format your response with clear sections and bullet points for readability."""
228
  charts_html = []
229
 
230
  try:
231
- # Chart 1: Data completeness analysis
232
  missing_data = df.isnull().sum()
233
  if missing_data.sum() > 0:
234
  fig = px.bar(
@@ -248,7 +228,6 @@ Format your response with clear sections and bullet points for readability."""
248
  charts_html.append(f"<h3>πŸ“Š Data Quality Overview</h3>")
249
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
250
 
251
- # Chart 2: Numerical columns correlation heatmap
252
  numeric_cols = df.select_dtypes(include=[np.number]).columns
253
  if len(numeric_cols) > 1:
254
  corr_matrix = df[numeric_cols].corr()
@@ -263,9 +242,8 @@ Format your response with clear sections and bullet points for readability."""
263
  charts_html.append(f"<h3>πŸ“ˆ Correlation Analysis</h3>")
264
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
265
 
266
- # Chart 3: Distribution plots for numerical columns
267
  if len(numeric_cols) > 0:
268
- for i, col in enumerate(numeric_cols[:3]): # First 3 numeric columns
269
  fig = px.histogram(
270
  df,
271
  x=col,
@@ -278,11 +256,10 @@ Format your response with clear sections and bullet points for readability."""
278
  charts_html.append(f"<h3>πŸ“ˆ Data Distributions</h3>")
279
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
280
 
281
- # Chart 4: Categorical analysis
282
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
283
  if len(categorical_cols) > 0:
284
- for i, col in enumerate(categorical_cols[:2]): # First 2 categorical columns
285
- if df[col].nunique() <= 20: # Only if reasonable number of categories
286
  value_counts = df[col].value_counts().head(10)
287
  fig = px.bar(
288
  x=value_counts.values,
@@ -296,7 +273,6 @@ Format your response with clear sections and bullet points for readability."""
296
  charts_html.append(f"<h3>πŸ“ Categorical Data Analysis</h3>")
297
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
298
 
299
- # Chart 5: Data overview summary
300
  summary_data = {
301
  'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
302
  'Count': [
@@ -320,9 +296,7 @@ Format your response with clear sections and bullet points for readability."""
320
  charts_html.append(f"<h3>πŸ“Š Dataset Overview</h3>")
321
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
322
 
323
- # Store charts for export
324
  self.current_charts = charts_html
325
-
326
  return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
327
 
328
  except Exception as e:
@@ -330,8 +304,7 @@ Format your response with clear sections and bullet points for readability."""
330
  return f"<p>❌ Chart generation failed: {str(e)}</p>"
331
 
332
  def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
333
- """Generate HTML report with embedded charts"""
334
-
335
  html_template = """
336
  <!DOCTYPE html>
337
  <html>
@@ -370,7 +343,11 @@ Format your response with clear sections and bullet points for readability."""
370
  border-radius: 8px;
371
  border-left: 4px solid #667eea;
372
  }
373
- h1, h2, h3 { color: #2c3e50; }
 
 
 
 
374
  .metadata {
375
  background: #e8f4f8;
376
  padding: 15px;
@@ -391,8 +368,60 @@ Format your response with clear sections and bullet points for readability."""
391
  border-radius: 5px;
392
  overflow-x: auto;
393
  white-space: pre-wrap;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  }
395
  </style>
 
 
 
 
 
396
  </head>
397
  <body>
398
  <div class="header">
@@ -403,11 +432,12 @@ Format your response with clear sections and bullet points for readability."""
403
  <div class="metadata">
404
  <strong>πŸ“ File:</strong> {{ file_name }}<br>
405
  <strong>πŸ“… Generated:</strong> {{ timestamp }}<br>
406
- <strong>πŸ€– Model:</strong> OpenAI gpt-oss-20b via Chutes AI
407
  </div>
408
 
409
  <div class="section">
410
  <h2>🎯 AI Analysis & Insights</h2>
 
411
  <div>{{ ai_analysis }}</div>
412
  </div>
413
 
@@ -424,21 +454,15 @@ Format your response with clear sections and bullet points for readability."""
424
  </div>
425
 
426
  <div class="footer">
427
- <p>Report generated by Smart Data Analyzer Pro β€’ Powered by AI</p>
428
- <p>For questions or support, visit chutes.ai</p>
429
  </div>
430
  </body>
431
  </html>
432
  """
433
 
434
  template = Template(html_template)
435
-
436
- # Convert markdown to HTML for AI analysis
437
- ai_analysis_html = analysis_text.replace('\n', '<br>')
438
- ai_analysis_html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', ai_analysis_html)
439
- ai_analysis_html = re.sub(r'## (.*?)\n', r'<h3>\1</h3>', ai_analysis_html)
440
- ai_analysis_html = re.sub(r'# (.*?)\n', r'<h2>\1</h2>', ai_analysis_html)
441
-
442
  charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
443
 
444
  return template.render(
@@ -449,50 +473,37 @@ Format your response with clear sections and bullet points for readability."""
449
  data_summary=data_summary
450
  )
451
 
452
- # Initialize the analyzer
453
  analyzer = EnhancedDataAnalyzer()
454
 
455
  async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
456
- """Enhanced analysis function with progress tracking"""
457
  if not file:
458
  return "❌ Please upload a CSV or Excel file.", "", "", "", None
459
 
460
  if not analyzer.validate_api_key(api_key):
461
  return "❌ Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
462
 
463
- # Validate file
464
  is_valid, validation_msg = analyzer.validate_file(file)
465
  if not is_valid:
466
  return f"❌ {validation_msg}", "", "", "", None
467
 
468
  progress(0.1, desc="πŸ“ Reading file...")
469
-
470
  try:
471
- # Process the uploaded file
472
  df, data_summary, charts_html = analyzer.process_file(file.name)
473
  progress(0.3, desc="πŸ“Š Processing data...")
474
-
475
  progress(0.5, desc="πŸ€– Generating AI insights...")
476
-
477
- # Get AI analysis
478
  ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
479
  progress(0.9, desc="✨ Finalizing results...")
480
 
481
- # Format the complete response
482
  response = f"""# 🎯 Analysis Complete!
483
  {ai_analysis}
484
  ---
485
  *Analysis powered by OpenAI gpt-oss-20b via Chutes β€’ Generated at {datetime.now().strftime('%H:%M:%S')}*
486
  """
487
-
488
- # Generate data preview
489
  data_preview_html = df.head(15).to_html(
490
  classes="table table-striped table-hover",
491
  table_id="data-preview-table",
492
  escape=False
493
  )
494
-
495
- # Add some styling to the preview
496
  styled_preview = f"""
497
  <style>
498
  #data-preview-table {{
@@ -527,17 +538,14 @@ async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
527
  return f"❌ **Error**: {str(e)}", "", "", "", None
528
 
529
  def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
530
- """Synchronous wrapper for the async analyze function"""
531
  return asyncio.run(analyze_data(file, api_key, user_question, progress))
532
 
533
  def clear_all():
534
- """Clear all inputs and outputs"""
535
  analyzer.current_df = None
536
  analyzer.current_charts = None
537
  return None, "", "", "", "", "", "", None
538
 
539
  def download_report(analysis_text, data_summary, file_name, format_choice):
540
- """Generate downloadable report in PDF or HTML format"""
541
  if not analysis_text:
542
  return None, "❌ No analysis data available for download."
543
 
@@ -546,26 +554,13 @@ def download_report(analysis_text, data_summary, file_name, format_choice):
546
 
547
  try:
548
  if format_choice == "HTML":
549
- # Generate HTML report
550
  html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
551
  filename = f"{file_base_name}_analysis_report_{timestamp}.html"
552
-
553
  with open(filename, 'w', encoding='utf-8') as f:
554
  f.write(html_content)
555
-
556
  return filename, f"βœ… HTML report generated successfully! File: {filename}"
557
 
558
- elif format_choice == "PDF":
559
- # Generate PDF report
560
- html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
561
- filename = f"{file_base_name}_analysis_report_{timestamp}.pdf"
562
-
563
- # Convert HTML to PDF using weasyprint
564
- weasyprint.HTML(string=html_content).write_pdf(filename)
565
-
566
- return filename, f"βœ… PDF report generated successfully! File: {filename}"
567
-
568
- else: # Markdown fallback
569
  report = f"""# Data Analysis Report
570
  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
571
  File: {file_name}
@@ -577,14 +572,12 @@ File: {file_name}
577
  filename = f"{file_base_name}_analysis_report_{timestamp}.md"
578
  with open(filename, 'w', encoding='utf-8') as f:
579
  f.write(report)
580
-
581
  return filename, f"βœ… Markdown report generated successfully! File: {filename}"
582
 
583
  except Exception as e:
584
  logger.error(f"Report generation error: {str(e)}")
585
  return None, f"❌ Error generating report: {str(e)}"
586
 
587
- # Create enhanced Gradio interface
588
  with gr.Blocks(
589
  title="πŸš€ Smart Data Analyzer Pro",
590
  theme=gr.themes.Ocean(),
@@ -602,33 +595,20 @@ with gr.Blocks(
602
  text-align: center;
603
  background: #f8f9ff;
604
  }
605
- .charts-container {
606
- max-height: 800px;
607
- overflow-y: auto;
608
- padding: 10px;
609
- background: #fafafa;
610
- border-radius: 8px;
611
- }
612
  """
613
  ) as app:
614
-
615
- # Store file name for downloads
616
  current_file_name = gr.State("")
617
 
618
- # Header
619
  gr.Markdown("""
620
  # πŸš€ Smart Data Analyzer Pro
621
  ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b
622
 
623
- Upload your data files and get instant professional insights, visualizations, and downloadable reports!
624
  """)
625
 
626
- # Main interface
627
  with gr.Row():
628
  with gr.Column(scale=1):
629
- # Configuration section
630
  gr.Markdown("### βš™οΈ Configuration")
631
-
632
  api_key_input = gr.Textbox(
633
  label="πŸ”‘ Chutes API Key",
634
  placeholder="sk-chutes-your-api-key-here...",
@@ -636,19 +616,15 @@ with gr.Blocks(
636
  lines=1,
637
  info="Get your free API key from chutes.ai"
638
  )
639
-
640
  file_input = gr.File(
641
  label="πŸ“ Upload Data File",
642
  file_types=[".csv", ".xlsx", ".xls"],
643
  file_count="single",
644
  elem_classes=["upload-area"]
645
  )
646
-
647
  with gr.Row():
648
  analyze_btn = gr.Button("πŸš€ Analyze Data", variant="primary", size="lg")
649
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
650
-
651
- # Quick stats display
652
  with gr.Group():
653
  gr.Markdown("### πŸ“Š Quick Stats")
654
  file_stats = gr.Textbox(
@@ -659,15 +635,12 @@ with gr.Blocks(
659
  )
660
 
661
  with gr.Column(scale=2):
662
- # Results section
663
  gr.Markdown("### 🎯 Analysis Results")
664
-
665
  analysis_output = gr.Markdown(
666
  value="πŸ“‹ **Ready to analyze your data!**\n\nUpload a CSV or Excel file and click 'Analyze Data' to get started.",
667
  show_label=False
668
  )
669
 
670
- # Advanced features in tabs
671
  with gr.Tabs():
672
  with gr.Tab("πŸ’¬ Ask Questions"):
673
  question_input = gr.Textbox(
@@ -684,14 +657,6 @@ with gr.Blocks(
684
  value="<p>Upload a file to see data preview...</p>"
685
  )
686
 
687
- with gr.Tab("πŸ“ˆ Visualizations"):
688
- charts_output = gr.HTML(
689
- label="Auto-Generated Charts",
690
- value="<div class='charts-container'><p>πŸ“Š Interactive charts will appear here after analysis...</p></div>",
691
- elem_classes=["charts-container"],
692
- visible=False
693
- )
694
-
695
  with gr.Tab("πŸ” Raw Summary"):
696
  raw_summary = gr.Textbox(
697
  label="Detailed Data Summary",
@@ -702,56 +667,47 @@ with gr.Blocks(
702
 
703
  with gr.Tab("πŸ’Ύ Export Reports"):
704
  gr.Markdown("### πŸ“₯ Download Your Analysis Report")
705
-
706
  with gr.Row():
707
  format_choice = gr.Radio(
708
- choices=["HTML", "PDF", "Markdown"],
709
  value="HTML",
710
  label="πŸ“„ Report Format",
711
  info="Choose your preferred download format"
712
  )
713
-
714
  download_btn = gr.Button("πŸ“₯ Generate & Download Report", variant="primary", size="lg")
715
  download_status = gr.Textbox(label="Download Status", interactive=False)
716
  download_file = gr.File(label="πŸ“„ Download Link", visible=True)
717
 
718
- # Event handlers
719
  def update_file_stats(file):
720
  if not file:
721
  return "No file uploaded"
722
-
723
  try:
724
- file_size = os.path.getsize(file.name) / (1024 * 1024) # MB
725
  file_name = os.path.basename(file.name)
726
  return f"πŸ“„ **File**: {file_name}\nπŸ“ **Size**: {file_size:.2f} MB\n⏰ **Uploaded**: {datetime.now().strftime('%H:%M:%S')}"
727
  except:
728
  return "File information unavailable"
729
 
730
  def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
731
- """Handle main analysis and return all outputs including file name"""
732
  result = sync_analyze_data(file, api_key, user_question, progress)
733
- if len(result) == 5: # Check if file name was returned
734
- return result[0], result[1], result[2], result[3], result[4] # analysis, summary, preview, charts, filename
735
  else:
736
- return result[0], result[1], result[2], result[3], "" # fallback without filename
737
 
738
  def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
739
- """Handle question-specific analysis"""
740
  if not question.strip():
741
  return "❓ Please enter a specific question about your data."
742
-
743
  result = sync_analyze_data(file, api_key, question, progress)
744
- return result[0] # Return only the analysis output
745
 
746
- # Main analysis event
747
  analyze_btn.click(
748
  fn=handle_analysis,
749
  inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
750
- outputs=[analysis_output, raw_summary, data_preview, charts_output, current_file_name],
751
  show_progress=True
752
  )
753
 
754
- # Follow-up questions
755
  ask_btn.click(
756
  fn=handle_question_analysis,
757
  inputs=[file_input, api_key_input, question_input],
@@ -759,28 +715,24 @@ with gr.Blocks(
759
  show_progress=True
760
  )
761
 
762
- # File stats update
763
  file_input.change(
764
  fn=update_file_stats,
765
  inputs=[file_input],
766
  outputs=[file_stats]
767
  )
768
 
769
- # Clear functionality
770
  clear_btn.click(
771
  fn=clear_all,
772
  outputs=[file_input, api_key_input, question_input, analysis_output,
773
- question_output, data_preview, charts_output, raw_summary]
774
  )
775
 
776
- # Enhanced download functionality
777
  download_btn.click(
778
  fn=download_report,
779
  inputs=[analysis_output, raw_summary, current_file_name, format_choice],
780
  outputs=[download_file, download_status]
781
  )
782
 
783
- # Footer with usage tips
784
  gr.Markdown("""
785
  ---
786
  ### πŸ’‘ Pro Tips for Better Analysis:
@@ -790,16 +742,8 @@ with gr.Blocks(
790
  - Use descriptive column names
791
  - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"
792
 
793
- **πŸ“Š Visualizations Include:**
794
- - Missing data analysis
795
- - Correlation matrices for numerical data
796
- - Distribution plots and histograms
797
- - Top categories for categorical data
798
- - Dataset overview metrics
799
-
800
  **πŸ“₯ Export Options:**
801
- - **HTML**: Interactive report with embedded charts
802
- - **PDF**: Professional report for presentations
803
  - **Markdown**: Simple text format for documentation
804
 
805
  **⚑ Speed Optimization:**
@@ -810,13 +754,6 @@ with gr.Blocks(
810
  **πŸ”§ Supported Formats:** CSV, XLSX, XLS | **πŸ“ Max Size:** 50MB | **πŸš€ Response Time:** ~3-5 seconds
811
  """)
812
 
813
- def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
814
- """Synchronous wrapper for the async analyze function"""
815
- return asyncio.run(analyze_data(file, api_key, user_question, progress))
816
-
817
- # Launch configuration
818
  if __name__ == "__main__":
819
- app.queue(max_size=10) # Handle multiple users
820
- app.launch(
821
- share=True
822
- )
 
3
  import aiohttp
4
  import asyncio
5
  import json
 
6
  import os
7
  import numpy as np
8
  import plotly.express as px
 
11
  import logging
12
  from datetime import datetime
13
  import re
14
+ from jinja2 import Template
15
+ import markdown # Requires 'markdown' package: install via `pip install markdown`
 
 
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO)
 
83
  ],
84
  "stream": True,
85
  "max_tokens": 3000,
86
+ "temperature": 0.2,
87
  "top_p": 0.9
88
  }
89
 
90
  try:
91
+ timeout = aiohttp.ClientTimeout(total=30)
92
  async with aiohttp.ClientSession(timeout=timeout) as session:
93
  async with session.post(self.api_base_url, headers=headers, json=body) as response:
94
  if response.status == 401:
 
128
  try:
129
  file_extension = os.path.splitext(file_path)[1].lower()
130
 
 
131
  if file_extension == '.csv':
 
132
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
133
  try:
134
  df = pd.read_csv(file_path, encoding=encoding)
 
142
  else:
143
  raise ValueError("Unsupported file format. Please upload CSV or Excel files.")
144
 
 
145
  df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
 
 
146
  self.current_df = df
 
 
147
  data_summary = self.generate_enhanced_summary(df)
148
  charts_html = self.generate_visualizations(df)
149
 
 
155
  def generate_enhanced_summary(self, df: pd.DataFrame) -> str:
156
  """Generate comprehensive data summary with statistical insights"""
157
  summary = []
 
 
158
  summary.append(f"# πŸ“Š Dataset Analysis Report")
159
  summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
160
  summary.append(f"**File Size**: {df.shape[0]:,} rows Γ— {df.shape[1]} columns")
 
 
161
  memory_usage = df.memory_usage(deep=True).sum() / 1024**2
162
  summary.append(f"**Memory Usage**: {memory_usage:.2f} MB\n")
163
 
 
164
  type_counts = df.dtypes.value_counts()
165
  summary.append("## πŸ“‹ Column Types:")
166
  for dtype, count in type_counts.items():
167
  summary.append(f"- **{dtype}**: {count} columns")
168
 
 
169
  missing_data = df.isnull().sum()
170
  missing_pct = (missing_data / len(df) * 100).round(2)
171
  missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
 
178
  else:
179
  summary.append("\n## βœ… Data Quality: No missing values detected!")
180
 
 
181
  numeric_cols = df.select_dtypes(include=[np.number]).columns
182
  if len(numeric_cols) > 0:
183
  summary.append(f"\n## πŸ“ˆ Numerical Columns Analysis ({len(numeric_cols)} columns):")
184
+ for col in numeric_cols[:10]:
185
  stats = df[col].describe()
186
  outliers = len(df[df[col] > (stats['75%'] + 1.5 * (stats['75%'] - stats['25%']))])
187
  summary.append(f"- **{col}**: ΞΌ={stats['mean']:.2f}, Οƒ={stats['std']:.2f}, outliers={outliers}")
188
 
 
189
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
190
  if len(categorical_cols) > 0:
191
  summary.append(f"\n## πŸ“ Categorical Columns Analysis ({len(categorical_cols)} columns):")
192
+ for col in categorical_cols[:10]:
193
  unique_count = df[col].nunique()
194
  cardinality = "High" if unique_count > len(df) * 0.9 else "Medium" if unique_count > 10 else "Low"
195
  most_common = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else "N/A"
196
  summary.append(f"- **{col}**: {unique_count:,} unique values ({cardinality} cardinality), Top: '{most_common}'")
197
 
 
198
  summary.append("\n## πŸ” Data Sample (First 3 Rows):")
199
  sample_df = df.head(3)
200
  for idx, row in sample_df.iterrows():
 
209
  charts_html = []
210
 
211
  try:
 
212
  missing_data = df.isnull().sum()
213
  if missing_data.sum() > 0:
214
  fig = px.bar(
 
228
  charts_html.append(f"<h3>πŸ“Š Data Quality Overview</h3>")
229
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
230
 
 
231
  numeric_cols = df.select_dtypes(include=[np.number]).columns
232
  if len(numeric_cols) > 1:
233
  corr_matrix = df[numeric_cols].corr()
 
242
  charts_html.append(f"<h3>πŸ“ˆ Correlation Analysis</h3>")
243
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
244
 
 
245
  if len(numeric_cols) > 0:
246
+ for i, col in enumerate(numeric_cols[:3]):
247
  fig = px.histogram(
248
  df,
249
  x=col,
 
256
  charts_html.append(f"<h3>πŸ“ˆ Data Distributions</h3>")
257
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
258
 
 
259
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
260
  if len(categorical_cols) > 0:
261
+ for i, col in enumerate(categorical_cols[:2]):
262
+ if df[col].nunique() <= 20:
263
  value_counts = df[col].value_counts().head(10)
264
  fig = px.bar(
265
  x=value_counts.values,
 
273
  charts_html.append(f"<h3>πŸ“ Categorical Data Analysis</h3>")
274
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
275
 
 
276
  summary_data = {
277
  'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
278
  'Count': [
 
296
  charts_html.append(f"<h3>πŸ“Š Dataset Overview</h3>")
297
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
298
 
 
299
  self.current_charts = charts_html
 
300
  return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
301
 
302
  except Exception as e:
 
304
  return f"<p>❌ Chart generation failed: {str(e)}</p>"
305
 
306
  def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
307
+ """Generate HTML report with properly formatted text and print button"""
 
308
  html_template = """
309
  <!DOCTYPE html>
310
  <html>
 
343
  border-radius: 8px;
344
  border-left: 4px solid #667eea;
345
  }
346
+ h1, h2, h3 {
347
+ color: #2c3e50;
348
+ margin-top: 20px;
349
+ margin-bottom: 15px;
350
+ }
351
  .metadata {
352
  background: #e8f4f8;
353
  padding: 15px;
 
368
  border-radius: 5px;
369
  overflow-x: auto;
370
  white-space: pre-wrap;
371
+ font-size: 14px;
372
+ }
373
+ strong {
374
+ color: #2c3e50;
375
+ font-weight: 600;
376
+ }
377
+ table {
378
+ width: 100%;
379
+ border-collapse: collapse;
380
+ margin: 20px 0;
381
+ }
382
+ th, td {
383
+ border: 1px solid #ddd;
384
+ padding: 8px;
385
+ text-align: left;
386
+ }
387
+ th {
388
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
389
+ color: white;
390
+ }
391
+ tr:nth-child(even) {
392
+ background-color: #f2f2f2;
393
+ }
394
+ .print-button {
395
+ background: #667eea;
396
+ color: white;
397
+ padding: 10px 20px;
398
+ border: none;
399
+ border-radius: 5px;
400
+ cursor: pointer;
401
+ font-size: 16px;
402
+ margin: 10px 0;
403
+ display: inline-block;
404
+ }
405
+ .print-button:hover {
406
+ background: #764ba2;
407
+ }
408
+ @media print {
409
+ .print-button {
410
+ display: none;
411
+ }
412
+ body {
413
+ background: white;
414
+ }
415
+ .section, .metadata, .footer {
416
+ box-shadow: none;
417
+ }
418
  }
419
  </style>
420
+ <script>
421
+ function printReport() {
422
+ window.print();
423
+ }
424
+ </script>
425
  </head>
426
  <body>
427
  <div class="header">
 
432
  <div class="metadata">
433
  <strong>πŸ“ File:</strong> {{ file_name }}<br>
434
  <strong>πŸ“… Generated:</strong> {{ timestamp }}<br>
435
+ <strong>πŸ€– Model:</strong> OpenAI gpt-oss-20b
436
  </div>
437
 
438
  <div class="section">
439
  <h2>🎯 AI Analysis & Insights</h2>
440
+ <button class="print-button" onclick="printReport()">πŸ–¨οΈ Print as PDF</button>
441
  <div>{{ ai_analysis }}</div>
442
  </div>
443
 
 
454
  </div>
455
 
456
  <div class="footer">
457
+ <p>Report generated by Smart Data Analyzer Pro β€’ Powered by Smart AI</p>
458
+ <p>For questions or support, contact +8801719296601 (via Whatsapp)</p>
459
  </div>
460
  </body>
461
  </html>
462
  """
463
 
464
  template = Template(html_template)
465
+ ai_analysis_html = markdown.markdown(analysis_text, extensions=['extra', 'tables'])
 
 
 
 
 
 
466
  charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
467
 
468
  return template.render(
 
473
  data_summary=data_summary
474
  )
475
 
 
476
  analyzer = EnhancedDataAnalyzer()
477
 
478
  async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
 
479
  if not file:
480
  return "❌ Please upload a CSV or Excel file.", "", "", "", None
481
 
482
  if not analyzer.validate_api_key(api_key):
483
  return "❌ Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
484
 
 
485
  is_valid, validation_msg = analyzer.validate_file(file)
486
  if not is_valid:
487
  return f"❌ {validation_msg}", "", "", "", None
488
 
489
  progress(0.1, desc="πŸ“ Reading file...")
 
490
  try:
 
491
  df, data_summary, charts_html = analyzer.process_file(file.name)
492
  progress(0.3, desc="πŸ“Š Processing data...")
 
493
  progress(0.5, desc="πŸ€– Generating AI insights...")
 
 
494
  ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
495
  progress(0.9, desc="✨ Finalizing results...")
496
 
 
497
  response = f"""# 🎯 Analysis Complete!
498
  {ai_analysis}
499
  ---
500
  *Analysis powered by OpenAI gpt-oss-20b via Chutes β€’ Generated at {datetime.now().strftime('%H:%M:%S')}*
501
  """
 
 
502
  data_preview_html = df.head(15).to_html(
503
  classes="table table-striped table-hover",
504
  table_id="data-preview-table",
505
  escape=False
506
  )
 
 
507
  styled_preview = f"""
508
  <style>
509
  #data-preview-table {{
 
538
  return f"❌ **Error**: {str(e)}", "", "", "", None
539
 
540
  def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
 
541
  return asyncio.run(analyze_data(file, api_key, user_question, progress))
542
 
543
  def clear_all():
 
544
  analyzer.current_df = None
545
  analyzer.current_charts = None
546
  return None, "", "", "", "", "", "", None
547
 
548
  def download_report(analysis_text, data_summary, file_name, format_choice):
 
549
  if not analysis_text:
550
  return None, "❌ No analysis data available for download."
551
 
 
554
 
555
  try:
556
  if format_choice == "HTML":
 
557
  html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
558
  filename = f"{file_base_name}_analysis_report_{timestamp}.html"
 
559
  with open(filename, 'w', encoding='utf-8') as f:
560
  f.write(html_content)
 
561
  return filename, f"βœ… HTML report generated successfully! File: {filename}"
562
 
563
+ else: # Markdown
 
 
 
 
 
 
 
 
 
 
564
  report = f"""# Data Analysis Report
565
  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
566
  File: {file_name}
 
572
  filename = f"{file_base_name}_analysis_report_{timestamp}.md"
573
  with open(filename, 'w', encoding='utf-8') as f:
574
  f.write(report)
 
575
  return filename, f"βœ… Markdown report generated successfully! File: {filename}"
576
 
577
  except Exception as e:
578
  logger.error(f"Report generation error: {str(e)}")
579
  return None, f"❌ Error generating report: {str(e)}"
580
 
 
581
  with gr.Blocks(
582
  title="πŸš€ Smart Data Analyzer Pro",
583
  theme=gr.themes.Ocean(),
 
595
  text-align: center;
596
  background: #f8f9ff;
597
  }
 
 
 
 
 
 
 
598
  """
599
  ) as app:
 
 
600
  current_file_name = gr.State("")
601
 
 
602
  gr.Markdown("""
603
  # πŸš€ Smart Data Analyzer Pro
604
  ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b
605
 
606
+ Upload your data files and get instant professional insights and downloadable reports!
607
  """)
608
 
 
609
  with gr.Row():
610
  with gr.Column(scale=1):
 
611
  gr.Markdown("### βš™οΈ Configuration")
 
612
  api_key_input = gr.Textbox(
613
  label="πŸ”‘ Chutes API Key",
614
  placeholder="sk-chutes-your-api-key-here...",
 
616
  lines=1,
617
  info="Get your free API key from chutes.ai"
618
  )
 
619
  file_input = gr.File(
620
  label="πŸ“ Upload Data File",
621
  file_types=[".csv", ".xlsx", ".xls"],
622
  file_count="single",
623
  elem_classes=["upload-area"]
624
  )
 
625
  with gr.Row():
626
  analyze_btn = gr.Button("πŸš€ Analyze Data", variant="primary", size="lg")
627
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
 
 
628
  with gr.Group():
629
  gr.Markdown("### πŸ“Š Quick Stats")
630
  file_stats = gr.Textbox(
 
635
  )
636
 
637
  with gr.Column(scale=2):
 
638
  gr.Markdown("### 🎯 Analysis Results")
 
639
  analysis_output = gr.Markdown(
640
  value="πŸ“‹ **Ready to analyze your data!**\n\nUpload a CSV or Excel file and click 'Analyze Data' to get started.",
641
  show_label=False
642
  )
643
 
 
644
  with gr.Tabs():
645
  with gr.Tab("πŸ’¬ Ask Questions"):
646
  question_input = gr.Textbox(
 
657
  value="<p>Upload a file to see data preview...</p>"
658
  )
659
 
 
 
 
 
 
 
 
 
660
  with gr.Tab("πŸ” Raw Summary"):
661
  raw_summary = gr.Textbox(
662
  label="Detailed Data Summary",
 
667
 
668
  with gr.Tab("πŸ’Ύ Export Reports"):
669
  gr.Markdown("### πŸ“₯ Download Your Analysis Report")
 
670
  with gr.Row():
671
  format_choice = gr.Radio(
672
+ choices=["HTML", "Markdown"],
673
  value="HTML",
674
  label="πŸ“„ Report Format",
675
  info="Choose your preferred download format"
676
  )
 
677
  download_btn = gr.Button("πŸ“₯ Generate & Download Report", variant="primary", size="lg")
678
  download_status = gr.Textbox(label="Download Status", interactive=False)
679
  download_file = gr.File(label="πŸ“„ Download Link", visible=True)
680
 
 
681
  def update_file_stats(file):
682
  if not file:
683
  return "No file uploaded"
 
684
  try:
685
+ file_size = os.path.getsize(file.name) / (1024 * 1024)
686
  file_name = os.path.basename(file.name)
687
  return f"πŸ“„ **File**: {file_name}\nπŸ“ **Size**: {file_size:.2f} MB\n⏰ **Uploaded**: {datetime.now().strftime('%H:%M:%S')}"
688
  except:
689
  return "File information unavailable"
690
 
691
  def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
 
692
  result = sync_analyze_data(file, api_key, user_question, progress)
693
+ if len(result) == 5:
694
+ return result[0], result[1], result[2], result[4]
695
  else:
696
+ return result[0], result[1], result[2], ""
697
 
698
  def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
 
699
  if not question.strip():
700
  return "❓ Please enter a specific question about your data."
 
701
  result = sync_analyze_data(file, api_key, question, progress)
702
+ return result[0]
703
 
 
704
  analyze_btn.click(
705
  fn=handle_analysis,
706
  inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
707
+ outputs=[analysis_output, raw_summary, data_preview, current_file_name],
708
  show_progress=True
709
  )
710
 
 
711
  ask_btn.click(
712
  fn=handle_question_analysis,
713
  inputs=[file_input, api_key_input, question_input],
 
715
  show_progress=True
716
  )
717
 
 
718
  file_input.change(
719
  fn=update_file_stats,
720
  inputs=[file_input],
721
  outputs=[file_stats]
722
  )
723
 
 
724
  clear_btn.click(
725
  fn=clear_all,
726
  outputs=[file_input, api_key_input, question_input, analysis_output,
727
+ question_output, data_preview, raw_summary, current_file_name]
728
  )
729
 
 
730
  download_btn.click(
731
  fn=download_report,
732
  inputs=[analysis_output, raw_summary, current_file_name, format_choice],
733
  outputs=[download_file, download_status]
734
  )
735
 
 
736
  gr.Markdown("""
737
  ---
738
  ### πŸ’‘ Pro Tips for Better Analysis:
 
742
  - Use descriptive column names
743
  - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"
744
 
 
 
 
 
 
 
 
745
  **πŸ“₯ Export Options:**
746
+ - **HTML**: Interactive report with embedded charts and print-to-PDF option
 
747
  - **Markdown**: Simple text format for documentation
748
 
749
  **⚑ Speed Optimization:**
 
754
  **πŸ”§ Supported Formats:** CSV, XLSX, XLS | **πŸ“ Max Size:** 50MB | **πŸš€ Response Time:** ~3-5 seconds
755
  """)
756
 
 
 
 
 
 
757
  if __name__ == "__main__":
758
+ app.queue(max_size=10)
759
+ app.launch()