lambdaofgod committed on
Commit
01ca586
·
1 Parent(s): b387020

tabs refactor

Browse files
Files changed (5) hide show
  1. app.py +21 -165
  2. gradio_tabs.py +293 -0
  3. graph_visualizations.py +0 -136
  4. task_visualizations.py +5 -6
  5. text_visualization.py +80 -32
app.py CHANGED
@@ -1,12 +1,16 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import logging
4
- import re
5
  from task_visualizations import TaskVisualizations
6
- import plotly.graph_objects as go
7
- from functools import partial
8
- from text_visualization import WordCloudExtractor, EmbeddingVisualizer
9
- from graph_visualizations import graph_tab
 
 
 
 
 
10
 
11
  logging.basicConfig(level=logging.INFO)
12
 
@@ -28,107 +32,6 @@ def load_repo_df(repo_representations_path):
28
  )
29
 
30
 
31
- def display_representations(repo, representation1, representation2):
32
- repo_data = repos_df[repos_df["repo_name"] == repo]
33
- logging.info(f"repo_data: {repo_data}")
34
- text1 = (
35
- repo_data[repo_data["representation"] == representation1]["text"].iloc[0]
36
- if not repo_data[repo_data["representation"] == representation1].empty
37
- else "No data available"
38
- )
39
- text2 = (
40
- repo_data[repo_data["representation"] == representation2]["text"].iloc[0]
41
- if not repo_data[repo_data["representation"] == representation2].empty
42
- else "No data available"
43
- )
44
-
45
- return text1, text2
46
-
47
-
48
- def get_representation_wordclouds(representations, repos_df):
49
- wordclouds = dict()
50
- for representation in representations:
51
- texts = list(repos_df[repos_df["representation"] == representation]["text"])
52
- wordclouds[representation] = WordCloudExtractor().extract_wordcloud_image(texts)
53
- return wordclouds
54
-
55
-
56
- def setup_repository_representations_tab(repos, representation_types):
57
-
58
- wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
59
- gr.Markdown("## Wordclouds")
60
- gr.Gallery(
61
- [
62
- (wordcloud, representation_type)
63
- for representation_type, wordcloud in wordcloud_dict.items()
64
- ],
65
- columns=[3],
66
- rows=[4],
67
- height=300,
68
- )
69
-
70
- gr.Markdown("Select a repository and two representation types to compare them.")
71
- with gr.Row():
72
- repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
73
- representation1 = gr.Dropdown(
74
- choices=representation_types, label="Representation 1", value="readme"
75
- )
76
- representation2 = gr.Dropdown(
77
- choices=representation_types,
78
- label="Representation 2",
79
- value="generated_readme",
80
- )
81
-
82
- with gr.Row():
83
- with gr.Column(
84
- elem_id="column1",
85
- variant="panel",
86
- scale=1,
87
- min_width=300,
88
- ):
89
- text1 = gr.Markdown()
90
- with gr.Column(
91
- elem_id="column2",
92
- variant="panel",
93
- scale=1,
94
- min_width=300,
95
- ):
96
- text2 = gr.Markdown()
97
-
98
- def update_representations(repo, representation1, representation2):
99
- text1_content, text2_content = display_representations(
100
- repo, representation1, representation2
101
- )
102
- return (
103
- f"### Representation 1: {representation1}\n\n{text1_content}",
104
- f"### Representation 2: {representation2}\n\n{text2_content}",
105
- )
106
-
107
- # Initial call to populate textboxes with default values
108
- text1.value, text2.value = update_representations(
109
- repos[0], "readme", "generated_readme"
110
- )
111
-
112
- for component in [repo, representation1, representation2]:
113
- component.change(
114
- fn=update_representations,
115
- inputs=[repo, representation1, representation2],
116
- outputs=[text1, text2],
117
- )
118
-
119
-
120
- def load_embeddings_intro_description():
121
- return """
122
- The following plots show embeddings obtained with MPNet sentence transformer after applying 2d UMAP algorithm for dimensionality reduction.
123
-
124
- In the first scatterplot we display PapersWithCode tasks that are colored by area.
125
- """
126
-
127
-
128
- def load_embeddings_description():
129
- return
130
-
131
-
132
  ## main
133
  repos_df = load_repo_df(AppConfig.repo_representations_path)
134
  repos = list(repos_df["repo_name"].unique())
@@ -145,6 +48,14 @@ display_df["is_task"] = display_df["representation"] == "task"
145
  embedding_visualizer = EmbeddingVisualizer(display_df=display_df)
146
 
147
 
 
 
 
 
 
 
 
 
148
  descriptions = {
149
  "intro": load_embeddings_intro_description(),
150
  "Basic representations": """Now we show the embeddings of tasks and repos, using various texts or representations.
@@ -167,68 +78,13 @@ descriptions = {
167
 
168
  with gr.Blocks() as demo:
169
  with gr.Tab("Explore Dependency Graphs"):
170
- graph_tab()
171
  with gr.Tab("Explore Repository Embeddings"):
172
-
173
- tab_elems = [
174
- gr.Markdown("## Tasks by area"),
175
- gr.Markdown(descriptions["intro"]),
176
- gr.Plot(embedding_visualizer.make_task_area_scatterplot()),
177
- ]
178
-
179
- embedding_plots = embedding_visualizer.make_embedding_plots(
180
- color_col="representation"
181
- )
182
- for plot_name in [
183
- "Basic representations",
184
- "Dependency graph based representations",
185
- "READMEs",
186
- ]:
187
- tab_elems.append(gr.Markdown(f"## {plot_name}"))
188
- if descriptions.get(plot_name):
189
- tab_elems.append(gr.Markdown(descriptions[plot_name]))
190
- tab_elems.append(gr.Plot(embedding_plots[plot_name]))
191
- gr.Column(tab_elems)
192
  with gr.Tab("Explore Repository Representations"):
193
- setup_repository_representations_tab(repos, representation_types)
194
  with gr.Tab("Explore PapersWithCode Tasks"):
 
195
 
196
- gr.Markdown(descriptions["task_counts_description"])
197
-
198
- with gr.Row():
199
- min_task_counts_slider_all = gr.Slider(
200
- minimum=50,
201
- maximum=1000,
202
- value=150,
203
- step=50,
204
- label="Minimum Task Count (All Repositories)",
205
- )
206
- update_button = gr.Button("Update Plots")
207
- min_task_counts_slider_selected = gr.Slider(
208
- minimum=10,
209
- maximum=100,
210
- value=50,
211
- step=10,
212
- label="Minimum Task Count (Selected Repositories)",
213
- )
214
- update_selected_button = gr.Button("Update Plots")
215
-
216
- with gr.Row("Task Counts"):
217
- all_repos_tasks_plot = gr.Plot(label="All Repositories")
218
- selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")
219
-
220
- update_button.click(
221
- fn=partial(task_visualizations.get_tasks_sunburst, which_df="all"),
222
- inputs=[min_task_counts_slider_all],
223
- outputs=[all_repos_tasks_plot],
224
- )
225
-
226
- update_selected_button.click(
227
- fn=partial(task_visualizations.get_tasks_sunburst, which_df="selected"),
228
- inputs=[min_task_counts_slider_selected],
229
- outputs=[selected_repos_tasks_plot],
230
- )
231
-
232
- gr.Plot(embedding_visualizer.make_task_area_scatterplot())
233
 
234
  demo.launch(share=True)
 
1
  import gradio as gr
2
  import pandas as pd
3
  import logging
 
4
  from task_visualizations import TaskVisualizations
5
+ from text_visualization import (
6
+ EmbeddingVisualizer,
7
+ )
8
+ from gradio_tabs import (
9
+ setup_embeddings_tab,
10
+ setup_tasks_tab,
11
+ setup_graph_tab,
12
+ setup_repository_representations_tab,
13
+ )
14
 
15
  logging.basicConfig(level=logging.INFO)
16
 
 
32
  )
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  ## main
36
  repos_df = load_repo_df(AppConfig.repo_representations_path)
37
  repos = list(repos_df["repo_name"].unique())
 
48
  embedding_visualizer = EmbeddingVisualizer(display_df=display_df)
49
 
50
 
51
+ def load_embeddings_intro_description():
52
+ return """
53
+ The following plots show embeddings obtained with MPNet sentence transformer after applying 2d UMAP algorithm for dimensionality reduction.
54
+
55
+ In the first scatterplot we display PapersWithCode tasks that are colored by area.
56
+ """
57
+
58
+
59
  descriptions = {
60
  "intro": load_embeddings_intro_description(),
61
  "Basic representations": """Now we show the embeddings of tasks and repos, using various texts or representations.
 
78
 
79
  with gr.Blocks() as demo:
80
  with gr.Tab("Explore Dependency Graphs"):
81
+ setup_graph_tab()
82
  with gr.Tab("Explore Repository Embeddings"):
83
+ setup_embeddings_tab(descriptions, embedding_visualizer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  with gr.Tab("Explore Repository Representations"):
85
+ setup_repository_representations_tab(repos_df, repos, representation_types)
86
  with gr.Tab("Explore PapersWithCode Tasks"):
87
+ setup_tasks_tab(descriptions, task_visualizations)
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  demo.launch(share=True)
gradio_tabs.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from graph_visualizations import *
2
+ from text_visualization import WordCloudExtractor
3
+ import logging
4
+ from functools import partial
5
+ import gradio as gr
6
+
7
+
8
def display_representations(repos_df, repo, representation1, representation2):
    """Return the texts of two representations of one repository.

    Args:
        repos_df: DataFrame with ``repo_name``, ``representation`` and
            ``text`` columns.
        repo: Value of ``repo_name`` to select.
        representation1: First representation type to look up.
        representation2: Second representation type to look up.

    Returns:
        A ``(text1, text2)`` tuple. Each entry is the first matching ``text``
        value, or ``"No data available"`` when the repository has no row for
        that representation.
    """
    repo_data = repos_df[repos_df["repo_name"] == repo]
    logging.info(f"repo_data: {repo_data}")

    def _first_text(representation):
        # Rows of the selected repository carrying this representation.
        matching_rows = repo_data[repo_data["representation"] == representation]
        if matching_rows.empty:
            return "No data available"
        return matching_rows["text"].iloc[0]

    return _first_text(representation1), _first_text(representation2)
23
+
24
+
25
def get_representation_wordclouds(representations, repos_df):
    """Build one word-cloud image per representation type.

    Args:
        representations: Iterable of representation names to process.
        repos_df: DataFrame with ``representation`` and ``text`` columns.

    Returns:
        Dict mapping each representation name to the result of
        ``WordCloudExtractor().extract_wordcloud_image`` called on the texts
        of that representation.
    """
    return {
        representation: WordCloudExtractor().extract_wordcloud_image(
            list(repos_df[repos_df["representation"] == representation]["text"])
        )
        for representation in representations
    }
31
+
32
+
33
def load_embeddings_description():
    """Placeholder for an embeddings description; currently yields ``None``."""
    return None
35
+
36
+
37
def setup_repository_representations_tab(repos_df, repos, representation_types):
    """Create the widgets of the repository-representations tab.

    Shows a gallery of word clouds (one per representation type) followed by
    a side-by-side comparison of two representations of a chosen repository.

    Args:
        repos_df: DataFrame with ``repo_name``, ``representation`` and
            ``text`` columns.
        repos: Repository names offered in the dropdown; the first one is the
            initial selection.
        representation_types: Representation names offered in both
            representation dropdowns and used for the word clouds.
    """
    wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
    # Gallery entries are (image, caption) pairs.
    gallery_items = [
        (wordcloud, representation_type)
        for representation_type, wordcloud in wordcloud_dict.items()
    ]
    gr.Markdown("## Wordclouds")
    gr.Gallery(gallery_items, columns=[3], rows=[4], height=300)

    gr.Markdown("Select a repository and two representation types to compare them.")
    with gr.Row():
        repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
        representation1 = gr.Dropdown(
            choices=representation_types, label="Representation 1", value="readme"
        )
        representation2 = gr.Dropdown(
            choices=representation_types,
            label="Representation 2",
            value="generated_readme",
        )

    # Both comparison panels share the same column styling.
    panel_options = dict(variant="panel", scale=1, min_width=300)
    with gr.Row():
        with gr.Column(elem_id="column1", **panel_options):
            text1 = gr.Markdown()
        with gr.Column(elem_id="column2", **panel_options):
            text2 = gr.Markdown()

    def update_representations(repo, representation1, representation2):
        """Return the Markdown bodies for the two comparison panels."""
        text1_content, text2_content = display_representations(
            repos_df, repo, representation1, representation2
        )
        first_panel = f"### Representation 1: {representation1}\n\n{text1_content}"
        second_panel = f"### Representation 2: {representation2}\n\n{text2_content}"
        return first_panel, second_panel

    # Initial call to populate textboxes with default values
    initial_first, initial_second = update_representations(
        repos[0], "readme", "generated_readme"
    )
    text1.value = initial_first
    text2.value = initial_second

    # Any selector change refreshes both panels.
    for selector in (repo, representation1, representation2):
        selector.change(
            fn=update_representations,
            inputs=[repo, representation1, representation2],
            outputs=[text1, text2],
        )
99
+
100
+
101
def setup_tasks_tab(descriptions, task_visualizations):
    """Create the widgets of the PapersWithCode tasks tab.

    Two slider/button pairs each refresh their own plot with the result of
    ``task_visualizations.get_tasks_sunburst``: one for ``which_df="all"``,
    one for ``which_df="selected"``.

    Args:
        descriptions: Mapping that provides the ``"task_counts_description"``
            Markdown text.
        task_visualizations: Object exposing
            ``get_tasks_sunburst(<slider value>, which_df=...)``.
    """
    gr.Markdown(descriptions["task_counts_description"])

    with gr.Row():
        min_task_counts_slider_all = gr.Slider(
            minimum=50,
            maximum=1000,
            value=150,
            step=50,
            label="Minimum Task Count (All Repositories)",
        )
        update_button = gr.Button("Update Plots")
        min_task_counts_slider_selected = gr.Slider(
            minimum=10,
            maximum=100,
            value=50,
            step=10,
            label="Minimum Task Count (Selected Repositories)",
        )
        update_selected_button = gr.Button("Update Plots")

    # NOTE(review): the positional "Task Counts" argument is kept as-is;
    # confirm that gr.Row actually uses it.
    with gr.Row("Task Counts"):
        all_repos_tasks_plot = gr.Plot(label="All Repositories")
        selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")

    # (which_df, trigger button, threshold slider, target plot)
    wiring = (
        ("all", update_button, min_task_counts_slider_all, all_repos_tasks_plot),
        (
            "selected",
            update_selected_button,
            min_task_counts_slider_selected,
            selected_repos_tasks_plot,
        ),
    )
    for which_df, button, slider, plot in wiring:
        button.click(
            fn=partial(task_visualizations.get_tasks_sunburst, which_df=which_df),
            inputs=[slider],
            outputs=[plot],
        )
138
+
139
+
140
def setup_embeddings_tab(descriptions, embedding_visualizer):
    """Create the widgets of the repository-embeddings tab.

    Emits a "Tasks by area" section followed by one section per embedding
    plot; each section has a heading, an optional description and the plot.

    Args:
        descriptions: Mapping with an ``"intro"`` entry and optional entries
            keyed by plot name.
        embedding_visualizer: Object exposing ``make_task_area_scatterplot()``
            and ``make_embedding_plots(color_col=...)``; the latter is indexed
            by plot name.
    """
    tab_elems = [
        gr.Markdown("## Tasks by area"),
        gr.Markdown(descriptions["intro"]),
        gr.Plot(embedding_visualizer.make_task_area_scatterplot()),
    ]

    embedding_plots = embedding_visualizer.make_embedding_plots(
        color_col="representation"
    )
    section_titles = (
        "Basic representations",
        "Dependency graph based representations",
        "READMEs",
    )
    for plot_name in section_titles:
        tab_elems.append(gr.Markdown(f"## {plot_name}"))
        # Sections without a (non-empty) description get only heading + plot.
        if descriptions.get(plot_name):
            tab_elems.append(gr.Markdown(descriptions[plot_name]))
        tab_elems.append(gr.Plot(embedding_plots[plot_name]))

    # NOTE(review): the components above are created before this Column and
    # passed positionally; confirm gr.Column actually uses the list.
    gr.Column(tab_elems)
160
+
161
+
162
def setup_graph_tab():
    """Create the widgets of the dependency-graph tab.

    Loads the graphs with ``init_graphs()``, then lays out a repository
    dropdown, a layout dropdown, a fixed pool of edge-type checkboxes, a
    "Visualize Graph" button, a statistics textbox and the plot. Every
    control re-renders the graph through ``visualize_graph``.

    The same default rule is applied to the initially created checkboxes and
    to the ones produced when the repository changes, so ``function-function``
    edges start unchecked in both cases.
    """
    # Size of the checkbox pool; edge types beyond this count are not shown.
    max_edge_types = 8

    gr.Markdown("# Dependency Graph Visualization")
    gr.Markdown("Select a repository to visualize its dependency graph.")
    graphs_dict = init_graphs()
    repo_names = list(graphs_dict.keys())

    def is_checked_by_default(edge_type):
        """Return the initial checkbox state for ``edge_type``."""
        # function-function should be unchecked by default
        return edge_type != "function-function"

    def plot_selected_repo(repo_name, layout_type, *edge_type_checkboxes):
        """Render ``repo_name`` showing only the edge types that are ticked.

        ``edge_type_checkboxes`` holds the checkbox values in pool order; the
        i-th value refers to the i-th available edge type of the repository.
        """
        # Convert checkbox values to selected edge types
        edge_types = (
            get_available_edge_types(graphs_dict[repo_name])
            if repo_name in graphs_dict
            else []
        )
        # zip stops at the shorter sequence, so surplus (hidden) checkbox
        # slots are ignored.
        selected_edge_types = {
            edge_type
            for edge_type, is_selected in zip(edge_types, edge_type_checkboxes)
            if is_selected
        }

        fig, stats = visualize_graph(
            repo_name, graphs_dict, layout_type, selected_edge_types
        )
        return fig, stats

    def update_edge_checkboxes(repo_name):
        """Update edge type checkboxes when repository changes"""
        if repo_name not in graphs_dict:
            return [gr.Checkbox(visible=False)] * max_edge_types

        edge_types = get_available_edge_types(graphs_dict[repo_name])
        checkboxes = []

        # One visible checkbox per edge type; remaining slots are hidden.
        for i in range(max_edge_types):
            if i < len(edge_types):
                edge_type = edge_types[i]
                checkboxes.append(
                    gr.Checkbox(
                        label=edge_type,
                        value=is_checked_by_default(edge_type),
                        visible=True,
                    )
                )
            else:
                checkboxes.append(gr.Checkbox(visible=False))

        return checkboxes

    # Get initial edge types for the first repository
    initial_edge_types = []
    if repo_names:
        initial_edge_types = get_available_edge_types(graphs_dict[repo_names[0]])

    with gr.Row():
        with gr.Column(scale=1):
            repo_dropdown = gr.Dropdown(
                choices=repo_names,
                label="Select Repository",
                value=repo_names[0] if repo_names else None,
            )

            layout_dropdown = gr.Dropdown(
                choices=[
                    ("Spring Layout (Force-directed)", "spring"),
                    ("Circular Layout", "circular"),
                    ("Kamada-Kawai Layout", "kamada_kawai"),
                    ("Fruchterman-Reingold Layout", "fruchterman_reingold"),
                    ("Shell Layout", "shell"),
                    ("Spectral Layout", "spectral"),
                    ("Planar Layout", "planar"),
                ],
                label="Select Layout",
                value="spring",
            )

            gr.Markdown("### Edge Type Filters")
            gr.Markdown("Select which edge types to display:")

            # Create checkboxes for edge types with initial values, using the
            # same default rule as update_edge_checkboxes.
            edge_checkboxes = []
            for i in range(max_edge_types):
                if i < len(initial_edge_types):
                    checkbox = gr.Checkbox(
                        label=initial_edge_types[i],
                        value=is_checked_by_default(initial_edge_types[i]),
                        visible=True,
                    )
                else:
                    checkbox = gr.Checkbox(label=f"Edge Type {i+1}", visible=False)
                edge_checkboxes.append(checkbox)

            visualize_btn = gr.Button("Visualize Graph", variant="primary")

            stats_text = gr.Textbox(
                label="Graph Statistics", lines=6, interactive=False
            )

        with gr.Column(scale=2):
            graph_plot = gr.Plot(label="Interactive Dependency Graph")

    # Set up event handlers
    all_inputs = [repo_dropdown, layout_dropdown] + edge_checkboxes
    plot_outputs = [graph_plot, stats_text]

    visualize_btn.click(
        fn=plot_selected_repo,
        inputs=all_inputs,
        outputs=plot_outputs,
    )

    # Update checkboxes when repository changes
    repo_dropdown.change(
        fn=update_edge_checkboxes,
        inputs=[repo_dropdown],
        outputs=edge_checkboxes,
    )

    # Auto-visualize on dropdown change
    # NOTE(review): this handler and the checkbox update above are registered
    # on the same event; confirm the plot does not read the previous
    # repository's checkbox values.
    repo_dropdown.change(
        fn=plot_selected_repo,
        inputs=all_inputs,
        outputs=plot_outputs,
    )

    # Auto-visualize on layout change
    layout_dropdown.change(
        fn=plot_selected_repo,
        inputs=all_inputs,
        outputs=plot_outputs,
    )

    # Auto-visualize on checkbox changes
    for checkbox in edge_checkboxes:
        checkbox.change(
            fn=plot_selected_repo,
            inputs=all_inputs,
            outputs=plot_outputs,
        )
graph_visualizations.py CHANGED
@@ -1,9 +1,7 @@
1
- import gradio as gr
2
  import pandas as pd
3
  import networkx as nx
4
  import tqdm
5
  import plotly.graph_objects as go
6
- import plotly.express as px
7
  from datasets import load_dataset
8
  import pandas as pd
9
 
@@ -386,137 +384,3 @@ Visible edge types:
386
  """
387
 
388
  return fig, stats
389
-
390
-
391
- def graph_tab():
392
- gr.Markdown("# Dependency Graph Visualization")
393
- gr.Markdown("Select a repository to visualize its dependency graph.")
394
- graphs_dict = init_graphs()
395
- repo_names = list(graphs_dict.keys())
396
-
397
- def plot_selected_repo(repo_name, layout_type, *edge_type_checkboxes):
398
- # Convert checkbox values to selected edge types
399
- edge_types = (
400
- get_available_edge_types(graphs_dict[repo_name])
401
- if repo_name in graphs_dict
402
- else []
403
- )
404
- selected_edge_types = set()
405
- for i, is_selected in enumerate(edge_type_checkboxes):
406
- if is_selected and i < len(edge_types):
407
- selected_edge_types.add(edge_types[i])
408
-
409
- fig, stats = visualize_graph(
410
- repo_name, graphs_dict, layout_type, selected_edge_types
411
- )
412
- return fig, stats
413
-
414
- def update_edge_checkboxes(repo_name):
415
- """Update edge type checkboxes when repository changes"""
416
- if repo_name not in graphs_dict:
417
- return [gr.Checkbox(visible=False)] * 8
418
-
419
- edge_types = get_available_edge_types(graphs_dict[repo_name])
420
- checkboxes = []
421
-
422
- # Create checkboxes for each edge type (up to 8)
423
- for i in range(8):
424
- if i < len(edge_types):
425
- edge_type = edge_types[i]
426
- # function-function should be unchecked by default
427
- default_value = edge_type != "function-function"
428
- checkboxes.append(
429
- gr.Checkbox(label=edge_type, value=default_value, visible=True)
430
- )
431
- else:
432
- checkboxes.append(gr.Checkbox(visible=False))
433
-
434
- return checkboxes
435
-
436
- # Get initial edge types for the first repository
437
- initial_edge_types = []
438
- if repo_names:
439
- initial_edge_types = get_available_edge_types(graphs_dict[repo_names[0]])
440
-
441
- with gr.Row():
442
- with gr.Column(scale=1):
443
- repo_dropdown = gr.Dropdown(
444
- choices=repo_names,
445
- label="Select Repository",
446
- value=repo_names[0] if repo_names else None,
447
- )
448
-
449
- layout_dropdown = gr.Dropdown(
450
- choices=[
451
- ("Spring Layout (Force-directed)", "spring"),
452
- ("Circular Layout", "circular"),
453
- ("Kamada-Kawai Layout", "kamada_kawai"),
454
- ("Fruchterman-Reingold Layout", "fruchterman_reingold"),
455
- ("Shell Layout", "shell"),
456
- ("Spectral Layout", "spectral"),
457
- ("Planar Layout", "planar"),
458
- ],
459
- label="Select Layout",
460
- value="spring",
461
- )
462
-
463
- gr.Markdown("### Edge Type Filters")
464
- gr.Markdown("Select which edge types to display:")
465
-
466
- # Create checkboxes for edge types with initial values
467
- edge_checkboxes = []
468
- for i in range(8): # Support up to 8 edge types
469
- if i < len(initial_edge_types):
470
- checkbox = gr.Checkbox(
471
- label=initial_edge_types[i], value=True, visible=True
472
- )
473
- else:
474
- checkbox = gr.Checkbox(label=f"Edge Type {i+1}", visible=False)
475
- edge_checkboxes.append(checkbox)
476
-
477
- visualize_btn = gr.Button("Visualize Graph", variant="primary")
478
-
479
- stats_text = gr.Textbox(
480
- label="Graph Statistics", lines=6, interactive=False
481
- )
482
-
483
- with gr.Column(scale=2):
484
- graph_plot = gr.Plot(label="Interactive Dependency Graph")
485
-
486
- # Set up event handlers
487
- all_inputs = [repo_dropdown, layout_dropdown] + edge_checkboxes
488
-
489
- visualize_btn.click(
490
- fn=plot_selected_repo,
491
- inputs=all_inputs,
492
- outputs=[graph_plot, stats_text],
493
- )
494
-
495
- # Update checkboxes when repository changes
496
- repo_dropdown.change(
497
- fn=update_edge_checkboxes,
498
- inputs=[repo_dropdown],
499
- outputs=edge_checkboxes,
500
- )
501
-
502
- # Auto-visualize on dropdown change
503
- repo_dropdown.change(
504
- fn=plot_selected_repo,
505
- inputs=all_inputs,
506
- outputs=[graph_plot, stats_text],
507
- )
508
-
509
- # Auto-visualize on layout change
510
- layout_dropdown.change(
511
- fn=plot_selected_repo,
512
- inputs=all_inputs,
513
- outputs=[graph_plot, stats_text],
514
- )
515
-
516
- # Auto-visualize on checkbox changes
517
- for checkbox in edge_checkboxes:
518
- checkbox.change(
519
- fn=plot_selected_repo,
520
- inputs=all_inputs,
521
- outputs=[graph_plot, stats_text],
522
- )
 
 
1
  import pandas as pd
2
  import networkx as nx
3
  import tqdm
4
  import plotly.graph_objects as go
 
5
  from datasets import load_dataset
6
  import pandas as pd
7
 
 
384
  """
385
 
386
  return fig, stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
task_visualizations.py CHANGED
@@ -1,8 +1,5 @@
1
  import pandas as pd
2
- import ast
3
- import json
4
  import plotly.express as px
5
- import plotly.graph_objects as go
6
 
7
 
8
  class TaskVisualizations:
@@ -32,9 +29,11 @@ class TaskVisualizations:
32
  )
33
  print(topk_dict)
34
  sorted_df[by_col] = sorted_df[by_col].apply(
35
- lambda k: k
36
- if k in topk_dict.keys() and topk_dict[k] >= val_threshold
37
- else "other"
 
 
38
  )
39
  sorted_df = sorted_df.groupby(by_col).agg({val_col: sum})
40
  return sorted_df
 
1
  import pandas as pd
 
 
2
  import plotly.express as px
 
3
 
4
 
5
  class TaskVisualizations:
 
29
  )
30
  print(topk_dict)
31
  sorted_df[by_col] = sorted_df[by_col].apply(
32
+ lambda k: (
33
+ k
34
+ if k in topk_dict.keys() and topk_dict[k] >= val_threshold
35
+ else "other"
36
+ )
37
  )
38
  sorted_df = sorted_df.groupby(by_col).agg({val_col: sum})
39
  return sorted_df
text_visualization.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Dict, Any, Iterable
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  import wordcloud
4
  from pydantic import BaseModel, Field
@@ -6,21 +6,28 @@ import numpy as np
6
  import PIL
7
  import plotly.express as px
8
  import pandas as pd
9
- import plotly.graph_objects as go
10
 
11
 
12
  class WordCloudExtractor(BaseModel):
13
  max_words: int = 50
14
  wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
15
- tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})
 
 
16
 
17
  def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
18
- frequencies = self._extract_frequencies(texts, self.max_words, tfidf_params=self.tfidf_params)
19
- wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
 
 
 
 
20
  return wc.to_image()
21
 
22
  @classmethod
23
- def _extract_frequencies(cls, texts, max_words=100, tfidf_params: dict={}) -> Dict[str, float]:
 
 
24
  """
25
  Extract word frequencies from a corpus using TF-IDF vectorization
26
  and generate word cloud frequencies.
@@ -33,10 +40,7 @@ class WordCloudExtractor(BaseModel):
33
  Dictionary of word frequencies suitable for WordCloud
34
  """
35
  # Initialize TF-IDF vectorizer
36
- tfidf = TfidfVectorizer(
37
- max_features=max_words,
38
- **tfidf_params
39
- )
40
 
41
  # Fit and transform the texts
42
  tfidf_matrix = tfidf.fit_transform(texts)
@@ -55,17 +59,21 @@ class WordCloudExtractor(BaseModel):
55
 
56
  class EmbeddingVisualizer(BaseModel):
57
  display_df: pd.DataFrame
58
- plot_kwargs: Dict[str, Any] = Field(default_factory=lambda: dict(
59
- range_x=(3, 16.5),
60
- range_y=(-3, 11),
61
- width=1200,
62
- height=800,
63
- x="x",
64
- y="y",
65
- template="plotly_white",
66
- ))
67
-
68
- def make_embedding_plots(self, color_col=None, hover_data=["name"], filter_df_fn=None):
 
 
 
 
69
  """
70
  plots Plotly scatterplot of UMAP embeddings
71
  """
@@ -74,20 +82,44 @@ class EmbeddingVisualizer(BaseModel):
74
  display_df = filter_df_fn(display_df)
75
 
76
  display_df = display_df.sort_values("representation", ascending=False)
77
- readme_df = display_df[display_df["representation"].isin(["readme", "generated_readme", "task"])]
78
- raw_df = display_df[display_df["representation"].isin(["dependency_signature", "selected_code", "task"])]
79
- dependency_df = display_df[display_df["representation"].isin(["repository_signature", "dependency_signature", "generated_tasks", "task"])]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  plots = [
82
  self._make_task_and_repos_scatterplot(df, hover_data, color_col)
83
  for df in [readme_df, raw_df, dependency_df]
84
  ]
85
- return dict(zip(["READMEs", "Basic representations", "Dependency graph based representations"], plots))
 
 
 
 
 
 
 
 
 
86
 
87
  def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
88
  # Set opacity and symbol based on is_task
89
- df['size'] = df['is_task'].apply(lambda x: 0.25 if x else 0.1)
90
- df['symbol'] = df['is_task'].apply(int)
91
 
92
  combined_fig = px.scatter(
93
  df,
@@ -96,7 +128,7 @@ class EmbeddingVisualizer(BaseModel):
96
  color=color_col,
97
  color_discrete_sequence=px.colors.qualitative.Set1,
98
  opacity=0.5,
99
- **self.plot_kwargs
100
  )
101
  combined_fig.data = combined_fig.data[::-1]
102
 
@@ -104,10 +136,26 @@ class EmbeddingVisualizer(BaseModel):
104
 
105
  def make_task_area_scatterplot(self, n_areas=6):
106
  display_df = self.display_df
107
- displayed_tasks_df = display_df[display_df["representation"] == "task"].sort_values("representation")
108
- displayed_tasks_df = displayed_tasks_df.merge(pd.read_csv("data/paperswithcode_tasks.csv"), left_on="name", right_on="task")
109
- displayed_tasks_df= displayed_tasks_df[displayed_tasks_df["area"].isin(displayed_tasks_df["area"].value_counts().head(n_areas).index)]
110
- tasks_fig = px.scatter(displayed_tasks_df, color="area", hover_data=["name"], opacity=0.7, **self.plot_kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  print("N DISPLAYED TASKS", len(displayed_tasks_df))
112
  return tasks_fig
113
 
 
1
+ from typing import Dict, Any
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  import wordcloud
4
  from pydantic import BaseModel, Field
 
6
  import PIL
7
  import plotly.express as px
8
  import pandas as pd
 
9
 
10
 
11
  class WordCloudExtractor(BaseModel):
12
  max_words: int = 50
13
  wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
14
+ tfidf_params: Dict[str, Any] = Field(
15
+ default_factory=lambda: {"stop_words": "english"}
16
+ )
17
 
18
  def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
19
+ frequencies = self._extract_frequencies(
20
+ texts, self.max_words, tfidf_params=self.tfidf_params
21
+ )
22
+ wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(
23
+ frequencies
24
+ )
25
  return wc.to_image()
26
 
27
  @classmethod
28
+ def _extract_frequencies(
29
+ cls, texts, max_words=100, tfidf_params: dict = {}
30
+ ) -> Dict[str, float]:
31
  """
32
  Extract word frequencies from a corpus using TF-IDF vectorization
33
  and generate word cloud frequencies.
 
40
  Dictionary of word frequencies suitable for WordCloud
41
  """
42
  # Initialize TF-IDF vectorizer
43
+ tfidf = TfidfVectorizer(max_features=max_words, **tfidf_params)
 
 
 
44
 
45
  # Fit and transform the texts
46
  tfidf_matrix = tfidf.fit_transform(texts)
 
59
 
60
  class EmbeddingVisualizer(BaseModel):
61
  display_df: pd.DataFrame
62
+ plot_kwargs: Dict[str, Any] = Field(
63
+ default_factory=lambda: dict(
64
+ range_x=(3, 16.5),
65
+ range_y=(-3, 11),
66
+ width=1200,
67
+ height=800,
68
+ x="x",
69
+ y="y",
70
+ template="plotly_white",
71
+ )
72
+ )
73
+
74
+ def make_embedding_plots(
75
+ self, color_col=None, hover_data=["name"], filter_df_fn=None
76
+ ):
77
  """
78
  plots Plotly scatterplot of UMAP embeddings
79
  """
 
82
  display_df = filter_df_fn(display_df)
83
 
84
  display_df = display_df.sort_values("representation", ascending=False)
85
+ readme_df = display_df[
86
+ display_df["representation"].isin(["readme", "generated_readme", "task"])
87
+ ]
88
+ raw_df = display_df[
89
+ display_df["representation"].isin(
90
+ ["dependency_signature", "selected_code", "task"]
91
+ )
92
+ ]
93
+ dependency_df = display_df[
94
+ display_df["representation"].isin(
95
+ [
96
+ "repository_signature",
97
+ "dependency_signature",
98
+ "generated_tasks",
99
+ "task",
100
+ ]
101
+ )
102
+ ]
103
 
104
  plots = [
105
  self._make_task_and_repos_scatterplot(df, hover_data, color_col)
106
  for df in [readme_df, raw_df, dependency_df]
107
  ]
108
+ return dict(
109
+ zip(
110
+ [
111
+ "READMEs",
112
+ "Basic representations",
113
+ "Dependency graph based representations",
114
+ ],
115
+ plots,
116
+ )
117
+ )
118
 
119
  def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
120
  # Set opacity and symbol based on is_task
121
+ df["size"] = df["is_task"].apply(lambda x: 0.25 if x else 0.1)
122
+ df["symbol"] = df["is_task"].apply(int)
123
 
124
  combined_fig = px.scatter(
125
  df,
 
128
  color=color_col,
129
  color_discrete_sequence=px.colors.qualitative.Set1,
130
  opacity=0.5,
131
+ **self.plot_kwargs,
132
  )
133
  combined_fig.data = combined_fig.data[::-1]
134
 
 
136
 
137
  def make_task_area_scatterplot(self, n_areas=6):
138
  display_df = self.display_df
139
+ displayed_tasks_df = display_df[
140
+ display_df["representation"] == "task"
141
+ ].sort_values("representation")
142
+ displayed_tasks_df = displayed_tasks_df.merge(
143
+ pd.read_csv("data/paperswithcode_tasks.csv"),
144
+ left_on="name",
145
+ right_on="task",
146
+ )
147
+ displayed_tasks_df = displayed_tasks_df[
148
+ displayed_tasks_df["area"].isin(
149
+ displayed_tasks_df["area"].value_counts().head(n_areas).index
150
+ )
151
+ ]
152
+ tasks_fig = px.scatter(
153
+ displayed_tasks_df,
154
+ color="area",
155
+ hover_data=["name"],
156
+ opacity=0.7,
157
+ **self.plot_kwargs,
158
+ )
159
  print("N DISPLAYED TASKS", len(displayed_tasks_df))
160
  return tasks_fig
161