File size: 7,280 Bytes
0cb60c7
 
 
67f471c
 
771365f
 
 
 
947739b
 
0cb60c7
 
c9d2489
67f471c
771365f
947739b
771365f
0cb60c7
947739b
0cb60c7
9a72b36
830b865
 
 
 
acc4e78
276ed24
 
 
 
 
 
 
 
 
 
9a72b36
 
830b865
276ed24
0cb60c7
947739b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771365f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cb60c7
 
 
 
830b865
0cb60c7
276ed24
 
 
 
 
947739b
 
 
771365f
 
 
 
 
 
 
 
 
 
 
 
 
 
0cb60c7
9138597
0cb60c7
947739b
0cb60c7
 
 
771365f
947739b
 
 
 
 
 
 
 
 
 
0cb60c7
947739b
771365f
 
 
 
 
947739b
771365f
 
0cb60c7
 
 
9138597
947739b
771365f
 
 
 
 
 
 
 
 
 
 
 
0cb60c7
 
 
 
 
 
9a72b36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import io
import os
import shutil
import tempfile

import gradio as gr
import pandas as pd
import sweetviz as sv
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from autoviz.AutoViz_Class import AutoViz_Class

class DataAnalyzer:
    """Build exploratory reports and encoding plots for an uploaded DataFrame.

    The instance keeps the most recently analysed DataFrame in ``self.df``
    (set by ``generate_sweetviz_report``) so ``encode_and_visualize`` can
    later plot one of its columns. Scratch files are written under
    ``self.temp_dir``, a directory created with ``tempfile.mkdtemp()``;
    this class never removes that directory itself.
    """

    # Public ``encoder_type`` names -> class names in ``category_encoders``.
    # Stored as strings so the name can be validated before the library is
    # touched and so only the requested encoder is ever instantiated.
    _ENCODER_CLASS_NAMES = {
        'binary': 'BinaryEncoder',
        'onehot': 'OneHotEncoder',
        'catboost': 'CatBoostEncoder',
        'count': 'CountEncoder',
    }

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        self.df = None
        self.AV = AutoViz_Class()

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on ``df`` and return the report as an HTML string.

        Also stores ``df`` on the instance for later use by
        ``encode_and_visualize``.

        Args:
            df: DataFrame to analyse.

        Returns:
            The Sweetviz HTML wrapped in a fixed-height scrollable table cell.
        """
        self.df = df
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")

        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the scratch file even when writing or reading it failed.
            if os.path.exists(report_path):
                os.remove(report_path)

        html_with_table = f"""
        <table width="100%" style="border-collapse: collapse;">
            <tr>
                <td style="padding: 20px; border: 1px solid #ddd;">
                    <div style="height: 800px; overflow: auto;">
                        {html_content}
                    </div>
                </td>
            </tr>
        </table>
        """

        return html_with_table

    def generate_autoviz_report(self, df):
        """Generate an AutoViz report and return the combined HTML content.

        Args:
            df: DataFrame to visualise.

        Returns:
            All HTML charts AutoViz produced, concatenated (in sorted path
            order, each followed by a horizontal rule) and wrapped in a
            scrollable table cell. If anything fails, an
            ``"Error generating AutoViz report: ..."`` string is returned
            instead of raising.
        """
        # Start from an empty scratch directory for AutoViz output.
        viz_temp_dir = os.path.join(self.temp_dir, "autoviz")
        if os.path.exists(viz_temp_dir):
            shutil.rmtree(viz_temp_dir)
        os.makedirs(viz_temp_dir)

        try:
            # The returned DataFrame is not needed; only the saved charts are.
            self.AV.AutoViz(
                filename='',
                sep=',',
                depVar='',
                dfte=df,
                header=0,
                verbose=0,
                lowess=False,
                chart_format='html',
                max_rows_analyzed=150000,
                save_plot_dir=viz_temp_dir
            )

            # NOTE(review): AutoViz appears to write charts into a subfolder
            # of save_plot_dir in some versions - confirm. Walking the tree
            # finds the files either way.
            html_paths = sorted(
                os.path.join(root, name)
                for root, _dirs, names in os.walk(viz_temp_dir)
                for name in names
                if name.endswith('.html')
            )

            chunks = []
            for path in html_paths:
                with open(path, 'r', encoding='utf-8') as f:
                    chunks.append(f.read() + "<br><hr><br>")
            html_content = "".join(chunks)

            # Wrap the content in a scrollable div
            html_with_table = f"""
            <table width="100%" style="border-collapse: collapse;">
                <tr>
                    <td style="padding: 20px; border: 1px solid #ddd;">
                        <div style="height: 800px; overflow: auto;">
                            {html_content}
                        </div>
                    </td>
                </tr>
            </table>
            """

            return html_with_table

        except Exception as e:
            # Best effort: report the failure as text instead of raising.
            return f"Error generating AutoViz report: {e}"
        finally:
            # Cleanup must never mask the result or the error message above.
            shutil.rmtree(viz_temp_dir, ignore_errors=True)

    def encode_and_visualize(self, column_name, encoder_type='binary'):
        """Encode one column, project it to 2-D with UMAP and plot it.

        Args:
            column_name: Column of the stored DataFrame to encode.
            encoder_type: One of ``'binary'``, ``'onehot'``, ``'catboost'``
                or ``'count'``.

        Returns:
            ``None`` when no DataFrame has been stored yet or the column is
            absent; otherwise an ``io.BytesIO`` holding the PNG image,
            positioned at offset 0.

        Raises:
            ValueError: If ``encoder_type`` is not a supported name.
        """
        if self.df is None or column_name not in self.df.columns:
            return None

        try:
            encoder_class_name = self._ENCODER_CLASS_NAMES[encoder_type]
        except KeyError:
            valid = ", ".join(sorted(self._ENCODER_CLASS_NAMES))
            raise ValueError(
                f"Unknown encoder_type {encoder_type!r}; expected one of: {valid}"
            ) from None

        df_subset = self.df[[column_name]].copy()

        # NOTE(review): CatBoostEncoder is a supervised encoder and no target
        # is passed here, so 'catboost' presumably fails in fit - confirm.
        encoder = getattr(ce, encoder_class_name)()
        encoded_df = encoder.fit_transform(df_subset)

        scaled_data = StandardScaler().fit_transform(encoded_df)

        reducer = umap.UMAP(
            n_neighbors=15,
            min_dist=0.1,
            n_components=2,
            random_state=42
        )
        embedding = reducer.fit_transform(scaled_data)

        # Use an explicit figure so it can be closed even if plotting fails.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            scatter = ax.scatter(
                embedding[:, 0],
                embedding[:, 1],
                # Colour each point by the integer code of its category.
                c=pd.factorize(df_subset[column_name])[0],
                cmap='viridis',
                alpha=0.6
            )
            fig.colorbar(scatter, ax=ax)
            ax.set_title(f'UMAP visualization of {column_name}\nusing {encoder_type} encoding')
            ax.set_xlabel('UMAP1')
            ax.set_ylabel('UMAP2')

            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)

        buf.seek(0)
        return buf

def create_interface():
    """Build the Gradio Blocks dashboard and wire up its event handlers.

    The UI has three tabs: a Sweetviz report (which also hosts the CSV
    upload), an AutoViz report, and a categorical-encoding UMAP plot.
    Uploading a file regenerates both reports and refreshes the column
    dropdown; changing either dropdown redraws the plot.

    Returns:
        The assembled ``gr.Blocks`` app (not launched).
    """
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")

        with gr.Tabs():
            with gr.TabItem("Sweetviz Analysis"):
                file_input = gr.File(label="Upload CSV")
                report_html = gr.HTML()

            with gr.TabItem("AutoViz Analysis"):
                autoviz_html = gr.HTML()

            with gr.TabItem("Categorical Analysis"):
                with gr.Row():
                    column_dropdown = gr.Dropdown(
                        label="Select Categorical Column",
                        choices=[],
                        interactive=True
                    )
                    encoder_dropdown = gr.Dropdown(
                        label="Select Encoder",
                        choices=['binary', 'onehot', 'catboost', 'count'],
                        value='binary',
                        interactive=True
                    )
                plot_output = gr.Image(label="UMAP Visualization")

        def process_file(file):
            """Load the uploaded CSV and produce both reports.

            Returns a 3-tuple: Sweetviz HTML, AutoViz HTML, and a Dropdown
            update listing the object/category columns. On failure the
            first element carries the error text.
            """
            if file is None:
                return None, None, gr.Dropdown(choices=[])

            try:
                # Accept either an object exposing ``.name`` or a plain path
                # string, since the upload value's type depends on Gradio
                # version and the File ``type`` setting.
                csv_path = getattr(file, "name", file)
                df = pd.read_csv(csv_path)
                cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

                # Generate both reports
                sweetviz_report = analyzer.generate_sweetviz_report(df)
                autoviz_report = analyzer.generate_autoviz_report(df)

                return (
                    sweetviz_report,
                    autoviz_report,
                    gr.Dropdown(choices=cat_columns)
                )
            except Exception as e:
                return f"Error: {str(e)}", None, gr.Dropdown(choices=[])

        def update_plot(column, encoder_type):
            """Return a value ``gr.Image`` can show for the chosen column/encoder."""
            if column is None:
                return None
            try:
                result = analyzer.encode_and_visualize(column, encoder_type)
            except Exception as e:
                # Surface the reason instead of silently showing nothing.
                gr.Warning(f"Could not generate UMAP plot: {e}")
                return None

            # gr.Image does not accept an in-memory buffer, so persist PNG
            # bytes to a file and hand back its path. Anything that is not
            # a byte buffer (including None) is passed through unchanged.
            if result is None or not hasattr(result, 'getvalue'):
                return result
            plot_path = os.path.join(analyzer.temp_dir, "umap_plot.png")
            with open(plot_path, 'wb') as f:
                f.write(result.getvalue())
            return plot_path

        file_input.change(
            fn=process_file,
            inputs=[file_input],
            outputs=[report_html, autoviz_html, column_dropdown]
        )

        # Both dropdowns redraw the same plot from the current selections.
        column_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output]
        )

        encoder_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output]
        )

    return demo

if __name__ == "__main__":
    # When run as a script, build the dashboard and start the Gradio server.
    # show_error=True is passed through to launch().
    demo = create_interface()
    demo.launch(show_error=True)