Spaces:

lambdaofgod
/

github_search_visualizations

Sleeping

App Files Files Community

lambdaofgod committed 18 days ago

Commit

01ca586

1 Parent(s): b387020

tabs refactor

Browse files

Files changed (5) hide show

app.py +21 -165
gradio_tabs.py +293 -0
graph_visualizations.py +0 -136
task_visualizations.py +5 -6
text_visualization.py +80 -32

app.py CHANGED Viewed

@@ -1,12 +1,16 @@
 import gradio as gr
 import pandas as pd
 import logging
-import re
 from task_visualizations import TaskVisualizations
-import plotly.graph_objects as go
-from functools import partial
-from text_visualization import WordCloudExtractor, EmbeddingVisualizer
-from graph_visualizations import graph_tab
 logging.basicConfig(level=logging.INFO)
@@ -28,107 +32,6 @@ def load_repo_df(repo_representations_path):
     )
-def display_representations(repo, representation1, representation2):
-    repo_data = repos_df[repos_df["repo_name"] == repo]
-    logging.info(f"repo_data: {repo_data}")
-    text1 = (
-        repo_data[repo_data["representation"] == representation1]["text"].iloc[0]
-        if not repo_data[repo_data["representation"] == representation1].empty
-        else "No data available"
-    )
-    text2 = (
-        repo_data[repo_data["representation"] == representation2]["text"].iloc[0]
-        if not repo_data[repo_data["representation"] == representation2].empty
-        else "No data available"
-    )
-    return text1, text2
-def get_representation_wordclouds(representations, repos_df):
-    wordclouds = dict()
-    for representation in representations:
-        texts = list(repos_df[repos_df["representation"] == representation]["text"])
-        wordclouds[representation] = WordCloudExtractor().extract_wordcloud_image(texts)
-    return wordclouds
-def setup_repository_representations_tab(repos, representation_types):
-    wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
-    gr.Markdown("## Wordclouds")
-    gr.Gallery(
-        [
-            (wordcloud, representation_type)
-            for representation_type, wordcloud in wordcloud_dict.items()
-        ],
-        columns=[3],
-        rows=[4],
-        height=300,
-    )
-    gr.Markdown("Select a repository and two representation types to compare them.")
-    with gr.Row():
-        repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
-        representation1 = gr.Dropdown(
-            choices=representation_types, label="Representation 1", value="readme"
-        )
-        representation2 = gr.Dropdown(
-            choices=representation_types,
-            label="Representation 2",
-            value="generated_readme",
-        )
-    with gr.Row():
-        with gr.Column(
-            elem_id="column1",
-            variant="panel",
-            scale=1,
-            min_width=300,
-        ):
-            text1 = gr.Markdown()
-        with gr.Column(
-            elem_id="column2",
-            variant="panel",
-            scale=1,
-            min_width=300,
-        ):
-            text2 = gr.Markdown()
-    def update_representations(repo, representation1, representation2):
-        text1_content, text2_content = display_representations(
-            repo, representation1, representation2
-        )
-        return (
-            f"### Representation 1: {representation1}\n\n{text1_content}",
-            f"### Representation 2: {representation2}\n\n{text2_content}",
-        )
-    # Initial call to populate textboxes with default values
-    text1.value, text2.value = update_representations(
-        repos[0], "readme", "generated_readme"
-    )
-    for component in [repo, representation1, representation2]:
-        component.change(
-            fn=update_representations,
-            inputs=[repo, representation1, representation2],
-            outputs=[text1, text2],
-        )
-def load_embeddings_intro_description():
-    return """
-    The following plots show embeddings obtained with MPNet sentence transformer after applying 2d UMAP algorithm for dimensionality reduction.
-    In the first scatterplot we display PapersWithCode tasks that are colored by area.
-    """
-def load_embeddings_description():
-    return
 ## main
 repos_df = load_repo_df(AppConfig.repo_representations_path)
 repos = list(repos_df["repo_name"].unique())
@@ -145,6 +48,14 @@ display_df["is_task"] = display_df["representation"] == "task"
 embedding_visualizer = EmbeddingVisualizer(display_df=display_df)
 descriptions = {
     "intro": load_embeddings_intro_description(),
     "Basic representations": """Now we show the embeddings of tasks and repos, using various texts or representations.
@@ -167,68 +78,13 @@ descriptions = {
 with gr.Blocks() as demo:
     with gr.Tab("Explore Dependency Graphs"):
-        graph_tab()
     with gr.Tab("Explore Repository Embeddings"):
-        tab_elems = [
-            gr.Markdown("## Tasks by area"),
-            gr.Markdown(descriptions["intro"]),
-            gr.Plot(embedding_visualizer.make_task_area_scatterplot()),
-        ]
-        embedding_plots = embedding_visualizer.make_embedding_plots(
-            color_col="representation"
-        )
-        for plot_name in [
-            "Basic representations",
-            "Dependency graph based representations",
-            "READMEs",
-        ]:
-            tab_elems.append(gr.Markdown(f"## {plot_name}"))
-            if descriptions.get(plot_name):
-                tab_elems.append(gr.Markdown(descriptions[plot_name]))
-            tab_elems.append(gr.Plot(embedding_plots[plot_name]))
-        gr.Column(tab_elems)
     with gr.Tab("Explore Repository Representations"):
-        setup_repository_representations_tab(repos, representation_types)
     with gr.Tab("Explore PapersWithCode Tasks"):
-        gr.Markdown(descriptions["task_counts_description"])
-        with gr.Row():
-            min_task_counts_slider_all = gr.Slider(
-                minimum=50,
-                maximum=1000,
-                value=150,
-                step=50,
-                label="Minimum Task Count (All Repositories)",
-            )
-            update_button = gr.Button("Update Plots")
-            min_task_counts_slider_selected = gr.Slider(
-                minimum=10,
-                maximum=100,
-                value=50,
-                step=10,
-                label="Minimum Task Count (Selected Repositories)",
-            )
-            update_selected_button = gr.Button("Update Plots")
-        with gr.Row("Task Counts"):
-            all_repos_tasks_plot = gr.Plot(label="All Repositories")
-            selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")
-        update_button.click(
-            fn=partial(task_visualizations.get_tasks_sunburst, which_df="all"),
-            inputs=[min_task_counts_slider_all],
-            outputs=[all_repos_tasks_plot],
-        )
-        update_selected_button.click(
-            fn=partial(task_visualizations.get_tasks_sunburst, which_df="selected"),
-            inputs=[min_task_counts_slider_selected],
-            outputs=[selected_repos_tasks_plot],
-        )
-        gr.Plot(embedding_visualizer.make_task_area_scatterplot())
 demo.launch(share=True)

 import gradio as gr
 import pandas as pd
 import logging
 from task_visualizations import TaskVisualizations
+from text_visualization import (
+    EmbeddingVisualizer,
+)
+from gradio_tabs import (
+    setup_embeddings_tab,
+    setup_tasks_tab,
+    setup_graph_tab,
+    setup_repository_representations_tab,
+)
 logging.basicConfig(level=logging.INFO)
     )
 ## main
 repos_df = load_repo_df(AppConfig.repo_representations_path)
 repos = list(repos_df["repo_name"].unique())
 embedding_visualizer = EmbeddingVisualizer(display_df=display_df)
+def load_embeddings_intro_description() -> str:
+    """Return the introductory text stored under ``descriptions["intro"]``."""
+    return """
+    The following plots show embeddings obtained with MPNet sentence transformer after applying 2d UMAP algorithm for dimensionality reduction.
+    In the first scatterplot we display PapersWithCode tasks that are colored by area.
+    """
 descriptions = {
     "intro": load_embeddings_intro_description(),
     "Basic representations": """Now we show the embeddings of tasks and repos, using various texts or representations.
 with gr.Blocks() as demo:
     with gr.Tab("Explore Dependency Graphs"):
+        setup_graph_tab()
     with gr.Tab("Explore Repository Embeddings"):
+        setup_embeddings_tab(descriptions, embedding_visualizer)
     with gr.Tab("Explore Repository Representations"):
+        setup_repository_representations_tab(repos_df, repos, representation_types)
     with gr.Tab("Explore PapersWithCode Tasks"):
+        setup_tasks_tab(descriptions, task_visualizations)
 demo.launch(share=True)

gradio_tabs.py ADDED Viewed

	@@ -0,0 +1,293 @@

+from graph_visualizations import *
+from text_visualization import WordCloudExtractor
+import logging
+from functools import partial
+import gradio as gr
+def display_representations(repos_df, repo, representation1, representation2):
+    """Look up the texts of two representations of a single repository.
+
+    Args:
+        repos_df: DataFrame indexed here by the "repo_name", "representation"
+            and "text" columns.
+        repo: Value matched against the "repo_name" column.
+        representation1: Value matched against "representation" for the first text.
+        representation2: Value matched against "representation" for the second text.
+
+    Returns:
+        Tuple ``(text1, text2)``. Each element is the "text" of the first
+        matching row, or the string "No data available" when the repository
+        has no row for that representation.
+    """
+    repo_data = repos_df[repos_df["repo_name"] == repo]
+    # Logs the whole filtered frame at INFO level on every call.
+    logging.info(f"repo_data: {repo_data}")
+    # The same boolean filter is evaluated twice per representation: once to
+    # test for emptiness and once to read the first "text" value.
+    text1 = (
+        repo_data[repo_data["representation"] == representation1]["text"].iloc[0]
+        if not repo_data[repo_data["representation"] == representation1].empty
+        else "No data available"
+    )
+    text2 = (
+        repo_data[repo_data["representation"] == representation2]["text"].iloc[0]
+        if not repo_data[repo_data["representation"] == representation2].empty
+        else "No data available"
+    )
+    return text1, text2
+def get_representation_wordclouds(representations, repos_df):
+    """Build one word cloud per representation type.
+
+    Args:
+        representations: Iterable of values to match against the
+            "representation" column.
+        repos_df: DataFrame indexed here by the "representation" and "text" columns.
+
+    Returns:
+        Dict mapping each representation to the result of
+        ``WordCloudExtractor().extract_wordcloud_image`` for that
+        representation's texts.
+    """
+    wordclouds = dict()
+    for representation in representations:
+        # All texts of this representation, across every repository.
+        texts = list(repos_df[repos_df["representation"] == representation]["text"])
+        # A fresh extractor with default settings is created per representation.
+        wordclouds[representation] = WordCloudExtractor().extract_wordcloud_image(texts)
+    return wordclouds
+def load_embeddings_description():
+    """Return None; the body is a bare ``return``."""
+    # NOTE(review): looks like a placeholder or a description string that was
+    # lost -- confirm whether any caller still needs this function.
+    return
+def setup_repository_representations_tab(repos_df, repos, representation_types):
+    """Create the components for comparing two representations of a repository.
+
+    Builds a word-cloud gallery, three dropdowns (repository and two
+    representation types) and two Markdown panels, then registers a change
+    handler on each dropdown that refreshes both panels.
+
+    Args:
+        repos_df: DataFrame passed to ``get_representation_wordclouds`` and
+            ``display_representations``.
+        repos: Sequence of repository names; ``repos[0]`` is the default
+            selection, so this assumes ``repos`` is non-empty.
+        representation_types: Choices offered by both representation dropdowns.
+    """
+    # NOTE(review): presumably must be called inside an active gr.Blocks/gr.Tab
+    # context so the components attach to the page -- confirm against caller.
+    wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
+    gr.Markdown("## Wordclouds")
+    # Gallery entries are (image, caption) pairs, captioned by representation type.
+    gr.Gallery(
+        [
+            (wordcloud, representation_type)
+            for representation_type, wordcloud in wordcloud_dict.items()
+        ],
+        columns=[3],
+        rows=[4],
+        height=300,
+    )
+    gr.Markdown("Select a repository and two representation types to compare them.")
+    with gr.Row():
+        repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
+        # NOTE(review): the defaults "readme" and "generated_readme" are
+        # hard-coded; verify they are always present in representation_types.
+        representation1 = gr.Dropdown(
+            choices=representation_types, label="Representation 1", value="readme"
+        )
+        representation2 = gr.Dropdown(
+            choices=representation_types,
+            label="Representation 2",
+            value="generated_readme",
+        )
+    with gr.Row():
+        with gr.Column(
+            elem_id="column1",
+            variant="panel",
+            scale=1,
+            min_width=300,
+        ):
+            text1 = gr.Markdown()
+        with gr.Column(
+            elem_id="column2",
+            variant="panel",
+            scale=1,
+            min_width=300,
+        ):
+            text2 = gr.Markdown()
+    def update_representations(repo, representation1, representation2):
+        """Return the two Markdown strings (heading plus text) for the panels."""
+        text1_content, text2_content = display_representations(
+            repos_df, repo, representation1, representation2
+        )
+        return (
+            f"### Representation 1: {representation1}\n\n{text1_content}",
+            f"### Representation 2: {representation2}\n\n{text2_content}",
+        )
+    # Initial call to populate both Markdown panels with the default selection
+    # by assigning to the components' ``value`` attributes directly.
+    text1.value, text2.value = update_representations(
+        repos[0], "readme", "generated_readme"
+    )
+    # Any of the three dropdowns changing refreshes both panels.
+    for component in [repo, representation1, representation2]:
+        component.change(
+            fn=update_representations,
+            inputs=[repo, representation1, representation2],
+            outputs=[text1, text2],
+        )
+def setup_tasks_tab(descriptions, task_visualizations):
+    """Create the components for the task-count plots.
+
+    Builds two slider/button pairs and two plot components, and registers a
+    click handler on each button that calls
+    ``task_visualizations.get_tasks_sunburst``.
+
+    Args:
+        descriptions: Mapping that must contain the key
+            "task_counts_description"; its value is shown as Markdown.
+        task_visualizations: Object providing ``get_tasks_sunburst``, called
+            with the slider value and ``which_df`` set to "all" or "selected".
+    """
+    gr.Markdown(descriptions["task_counts_description"])
+    with gr.Row():
+        min_task_counts_slider_all = gr.Slider(
+            minimum=50,
+            maximum=1000,
+            value=150,
+            step=50,
+            label="Minimum Task Count (All Repositories)",
+        )
+        update_button = gr.Button("Update Plots")
+        min_task_counts_slider_selected = gr.Slider(
+            minimum=10,
+            maximum=100,
+            value=50,
+            step=10,
+            label="Minimum Task Count (Selected Repositories)",
+        )
+        update_selected_button = gr.Button("Update Plots")
+    # NOTE(review): a positional string is passed to gr.Row here -- confirm
+    # that the installed Gradio version accepts or uses it.
+    with gr.Row("Task Counts"):
+        # Both plots are created without a value; only the click handlers
+        # below supply figures.
+        all_repos_tasks_plot = gr.Plot(label="All Repositories")
+        selected_repos_tasks_plot = gr.Plot(label="Selected Repositories")
+    # partial() fixes which_df, so the slider value is the only input passed
+    # by the event.
+    update_button.click(
+        fn=partial(task_visualizations.get_tasks_sunburst, which_df="all"),
+        inputs=[min_task_counts_slider_all],
+        outputs=[all_repos_tasks_plot],
+    )
+    update_selected_button.click(
+        fn=partial(task_visualizations.get_tasks_sunburst, which_df="selected"),
+        inputs=[min_task_counts_slider_selected],
+        outputs=[selected_repos_tasks_plot],
+    )
+def setup_embeddings_tab(descriptions, embedding_visualizer):
+    """Create the Markdown headings, descriptions and plots for the embeddings.
+
+    Args:
+        descriptions: Mapping that must contain "intro"; it may also contain an
+            entry per plot name, shown above that plot when present and truthy.
+        embedding_visualizer: Object providing ``make_task_area_scatterplot()``
+            and ``make_embedding_plots(color_col=...)``. The latter's result is
+            indexed by the three plot names listed in the loop below.
+    """
+    tab_elems = [
+        gr.Markdown("## Tasks by area"),
+        gr.Markdown(descriptions["intro"]),
+        gr.Plot(embedding_visualizer.make_task_area_scatterplot()),
+    ]
+    embedding_plots = embedding_visualizer.make_embedding_plots(
+        color_col="representation"
+    )
+    # The list order fixes the order in which the sections are created.
+    for plot_name in [
+        "Basic representations",
+        "Dependency graph based representations",
+        "READMEs",
+    ]:
+        tab_elems.append(gr.Markdown(f"## {plot_name}"))
+        if descriptions.get(plot_name):
+            tab_elems.append(gr.Markdown(descriptions[plot_name]))
+        tab_elems.append(gr.Plot(embedding_plots[plot_name]))
+    # NOTE(review): the components above are already instantiated before this
+    # call; confirm that gr.Column accepts a positional list and what effect
+    # it has on layout.
+    gr.Column(tab_elems)
+def setup_graph_tab():
+    """Create the dependency-graph components and register their event handlers.
+
+    Builds a repository dropdown, a layout dropdown, eight edge-type
+    checkboxes, a button, a statistics textbox and a plot. The graph is
+    redrawn on button click and whenever a dropdown or checkbox changes.
+
+    ``init_graphs``, ``get_available_edge_types`` and ``visualize_graph`` are
+    not defined in this module; presumably they come from the star import of
+    ``graph_visualizations``.
+    """
+    gr.Markdown("# Dependency Graph Visualization")
+    gr.Markdown("Select a repository to visualize its dependency graph.")
+    graphs_dict = init_graphs()
+    repo_names = list(graphs_dict.keys())
+    def plot_selected_repo(repo_name, layout_type, *edge_type_checkboxes):
+        """Translate checkbox states into edge types and draw the graph.
+
+        Checkbox ``i`` stands for ``edge_types[i]`` of the selected
+        repository; checked boxes beyond the number of edge types are ignored.
+        Returns the ``(fig, stats)`` pair produced by ``visualize_graph``.
+        """
+        # Convert checkbox values to selected edge types
+        edge_types = (
+            get_available_edge_types(graphs_dict[repo_name])
+            if repo_name in graphs_dict
+            else []
+        )
+        selected_edge_types = set()
+        for i, is_selected in enumerate(edge_type_checkboxes):
+            if is_selected and i < len(edge_types):
+                selected_edge_types.add(edge_types[i])
+        fig, stats = visualize_graph(
+            repo_name, graphs_dict, layout_type, selected_edge_types
+        )
+        return fig, stats
+    def update_edge_checkboxes(repo_name):
+        """Update edge type checkboxes when repository changes"""
+        if repo_name not in graphs_dict:
+            # List multiplication repeats the same Checkbox object eight times.
+            return [gr.Checkbox(visible=False)] * 8
+        edge_types = get_available_edge_types(graphs_dict[repo_name])
+        checkboxes = []
+        # Create checkboxes for each edge type (up to 8)
+        for i in range(8):
+            if i < len(edge_types):
+                edge_type = edge_types[i]
+                # function-function should be unchecked by default
+                default_value = edge_type != "function-function"
+                checkboxes.append(
+                    gr.Checkbox(label=edge_type, value=default_value, visible=True)
+                )
+            else:
+                checkboxes.append(gr.Checkbox(visible=False))
+        return checkboxes
+    # Get initial edge types for the first repository
+    initial_edge_types = []
+    if repo_names:
+        initial_edge_types = get_available_edge_types(graphs_dict[repo_names[0]])
+    with gr.Row():
+        with gr.Column(scale=1):
+            repo_dropdown = gr.Dropdown(
+                choices=repo_names,
+                label="Select Repository",
+                value=repo_names[0] if repo_names else None,
+            )
+            # Choices are (displayed label, value) pairs; the value is what
+            # plot_selected_repo receives as layout_type.
+            layout_dropdown = gr.Dropdown(
+                choices=[
+                    ("Spring Layout (Force-directed)", "spring"),
+                    ("Circular Layout", "circular"),
+                    ("Kamada-Kawai Layout", "kamada_kawai"),
+                    ("Fruchterman-Reingold Layout", "fruchterman_reingold"),
+                    ("Shell Layout", "shell"),
+                    ("Spectral Layout", "spectral"),
+                    ("Planar Layout", "planar"),
+                ],
+                label="Select Layout",
+                value="spring",
+            )
+            gr.Markdown("### Edge Type Filters")
+            gr.Markdown("Select which edge types to display:")
+            # Create checkboxes for edge types with initial values
+            # NOTE(review): every initial checkbox is created with value=True,
+            # whereas update_edge_checkboxes leaves "function-function"
+            # unchecked -- confirm whether the initial state should match.
+            edge_checkboxes = []
+            for i in range(8):  # Support up to 8 edge types
+                if i < len(initial_edge_types):
+                    checkbox = gr.Checkbox(
+                        label=initial_edge_types[i], value=True, visible=True
+                    )
+                else:
+                    checkbox = gr.Checkbox(label=f"Edge Type {i+1}", visible=False)
+                edge_checkboxes.append(checkbox)
+            visualize_btn = gr.Button("Visualize Graph", variant="primary")
+            stats_text = gr.Textbox(
+                label="Graph Statistics", lines=6, interactive=False
+            )
+        with gr.Column(scale=2):
+            graph_plot = gr.Plot(label="Interactive Dependency Graph")
+    # Set up event handlers
+    # Input order must match plot_selected_repo's signature:
+    # repo, layout, then the checkboxes.
+    all_inputs = [repo_dropdown, layout_dropdown] + edge_checkboxes
+    visualize_btn.click(
+        fn=plot_selected_repo,
+        inputs=all_inputs,
+        outputs=[graph_plot, stats_text],
+    )
+    # Update checkboxes when repository changes
+    repo_dropdown.change(
+        fn=update_edge_checkboxes,
+        inputs=[repo_dropdown],
+        outputs=edge_checkboxes,
+    )
+    # Auto-visualize on dropdown change
+    # NOTE(review): this second handler on the same event reads the checkbox
+    # values; whether it sees the values set by update_edge_checkboxes depends
+    # on how Gradio orders the two handlers -- confirm.
+    repo_dropdown.change(
+        fn=plot_selected_repo,
+        inputs=all_inputs,
+        outputs=[graph_plot, stats_text],
+    )
+    # Auto-visualize on layout change
+    layout_dropdown.change(
+        fn=plot_selected_repo,
+        inputs=all_inputs,
+        outputs=[graph_plot, stats_text],
+    )
+    # Auto-visualize on checkbox changes
+    for checkbox in edge_checkboxes:
+        checkbox.change(
+            fn=plot_selected_repo,
+            inputs=all_inputs,
+            outputs=[graph_plot, stats_text],
+        )

graph_visualizations.py CHANGED Viewed

@@ -1,9 +1,7 @@
-import gradio as gr
 import pandas as pd
 import networkx as nx
 import tqdm
 import plotly.graph_objects as go
-import plotly.express as px
 from datasets import load_dataset
 import pandas as pd
@@ -386,137 +384,3 @@ Visible edge types:
 """
     return fig, stats
-def graph_tab():
-    gr.Markdown("# Dependency Graph Visualization")
-    gr.Markdown("Select a repository to visualize its dependency graph.")
-    graphs_dict = init_graphs()
-    repo_names = list(graphs_dict.keys())
-    def plot_selected_repo(repo_name, layout_type, *edge_type_checkboxes):
-        # Convert checkbox values to selected edge types
-        edge_types = (
-            get_available_edge_types(graphs_dict[repo_name])
-            if repo_name in graphs_dict
-            else []
-        )
-        selected_edge_types = set()
-        for i, is_selected in enumerate(edge_type_checkboxes):
-            if is_selected and i < len(edge_types):
-                selected_edge_types.add(edge_types[i])
-        fig, stats = visualize_graph(
-            repo_name, graphs_dict, layout_type, selected_edge_types
-        )
-        return fig, stats
-    def update_edge_checkboxes(repo_name):
-        """Update edge type checkboxes when repository changes"""
-        if repo_name not in graphs_dict:
-            return [gr.Checkbox(visible=False)] * 8
-        edge_types = get_available_edge_types(graphs_dict[repo_name])
-        checkboxes = []
-        # Create checkboxes for each edge type (up to 8)
-        for i in range(8):
-            if i < len(edge_types):
-                edge_type = edge_types[i]
-                # function-function should be unchecked by default
-                default_value = edge_type != "function-function"
-                checkboxes.append(
-                    gr.Checkbox(label=edge_type, value=default_value, visible=True)
-                )
-            else:
-                checkboxes.append(gr.Checkbox(visible=False))
-        return checkboxes
-    # Get initial edge types for the first repository
-    initial_edge_types = []
-    if repo_names:
-        initial_edge_types = get_available_edge_types(graphs_dict[repo_names[0]])
-    with gr.Row():
-        with gr.Column(scale=1):
-            repo_dropdown = gr.Dropdown(
-                choices=repo_names,
-                label="Select Repository",
-                value=repo_names[0] if repo_names else None,
-            )
-            layout_dropdown = gr.Dropdown(
-                choices=[
-                    ("Spring Layout (Force-directed)", "spring"),
-                    ("Circular Layout", "circular"),
-                    ("Kamada-Kawai Layout", "kamada_kawai"),
-                    ("Fruchterman-Reingold Layout", "fruchterman_reingold"),
-                    ("Shell Layout", "shell"),
-                    ("Spectral Layout", "spectral"),
-                    ("Planar Layout", "planar"),
-                ],
-                label="Select Layout",
-                value="spring",
-            )
-            gr.Markdown("### Edge Type Filters")
-            gr.Markdown("Select which edge types to display:")
-            # Create checkboxes for edge types with initial values
-            edge_checkboxes = []
-            for i in range(8):  # Support up to 8 edge types
-                if i < len(initial_edge_types):
-                    checkbox = gr.Checkbox(
-                        label=initial_edge_types[i], value=True, visible=True
-                    )
-                else:
-                    checkbox = gr.Checkbox(label=f"Edge Type {i+1}", visible=False)
-                edge_checkboxes.append(checkbox)
-            visualize_btn = gr.Button("Visualize Graph", variant="primary")
-            stats_text = gr.Textbox(
-                label="Graph Statistics", lines=6, interactive=False
-            )
-        with gr.Column(scale=2):
-            graph_plot = gr.Plot(label="Interactive Dependency Graph")
-    # Set up event handlers
-    all_inputs = [repo_dropdown, layout_dropdown] + edge_checkboxes
-    visualize_btn.click(
-        fn=plot_selected_repo,
-        inputs=all_inputs,
-        outputs=[graph_plot, stats_text],
-    )
-    # Update checkboxes when repository changes
-    repo_dropdown.change(
-        fn=update_edge_checkboxes,
-        inputs=[repo_dropdown],
-        outputs=edge_checkboxes,
-    )
-    # Auto-visualize on dropdown change
-    repo_dropdown.change(
-        fn=plot_selected_repo,
-        inputs=all_inputs,
-        outputs=[graph_plot, stats_text],
-    )
-    # Auto-visualize on layout change
-    layout_dropdown.change(
-        fn=plot_selected_repo,
-        inputs=all_inputs,
-        outputs=[graph_plot, stats_text],
-    )
-    # Auto-visualize on checkbox changes
-    for checkbox in edge_checkboxes:
-        checkbox.change(
-            fn=plot_selected_repo,
-            inputs=all_inputs,
-            outputs=[graph_plot, stats_text],
-        )

 import pandas as pd
 import networkx as nx
 import tqdm
 import plotly.graph_objects as go
 from datasets import load_dataset
 import pandas as pd
 """
     return fig, stats

task_visualizations.py CHANGED Viewed

@@ -1,8 +1,5 @@
 import pandas as pd
-import ast
-import json
 import plotly.express as px
-import plotly.graph_objects as go
 class TaskVisualizations:
@@ -32,9 +29,11 @@ class TaskVisualizations:
         )
         print(topk_dict)
         sorted_df[by_col] = sorted_df[by_col].apply(
-            lambda k: k
-            if k in topk_dict.keys() and topk_dict[k] >= val_threshold
-            else "other"
         )
         sorted_df = sorted_df.groupby(by_col).agg({val_col: sum})
         return sorted_df

 import pandas as pd
 import plotly.express as px
 class TaskVisualizations:
         )
         print(topk_dict)
         sorted_df[by_col] = sorted_df[by_col].apply(
+            lambda k: (
+                k
+                if k in topk_dict.keys() and topk_dict[k] >= val_threshold
+                else "other"
+            )
         )
         sorted_df = sorted_df.groupby(by_col).agg({val_col: sum})
         return sorted_df

text_visualization.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, Any, Iterable
 from sklearn.feature_extraction.text import TfidfVectorizer
 import wordcloud
 from pydantic import BaseModel, Field
@@ -6,21 +6,28 @@ import numpy as np
 import PIL
 import plotly.express as px
 import pandas as pd
-import plotly.graph_objects as go
 class WordCloudExtractor(BaseModel):
     max_words: int = 50
     wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
-    tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})
     def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
-        frequencies = self._extract_frequencies(texts, self.max_words, tfidf_params=self.tfidf_params)
-        wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
         return wc.to_image()
     @classmethod
-    def _extract_frequencies(cls, texts, max_words=100, tfidf_params: dict={}) -> Dict[str, float]:
         """
         Extract word frequencies from a corpus using TF-IDF vectorization
         and generate word cloud frequencies.
@@ -33,10 +40,7 @@ class WordCloudExtractor(BaseModel):
             Dictionary of word frequencies suitable for WordCloud
         """
         # Initialize TF-IDF vectorizer
-        tfidf = TfidfVectorizer(
-            max_features=max_words,
-            **tfidf_params
-        )
         # Fit and transform the texts
         tfidf_matrix = tfidf.fit_transform(texts)
@@ -55,17 +59,21 @@ class WordCloudExtractor(BaseModel):
 class EmbeddingVisualizer(BaseModel):
     display_df: pd.DataFrame
-    plot_kwargs: Dict[str, Any] = Field(default_factory=lambda: dict(
-        range_x=(3, 16.5),
-        range_y=(-3, 11),
-        width=1200,
-        height=800,
-        x="x",
-        y="y",
-        template="plotly_white",
-    ))
-    def make_embedding_plots(self, color_col=None, hover_data=["name"], filter_df_fn=None):
         """
         plots Plotly scatterplot of UMAP embeddings
         """
@@ -74,20 +82,44 @@ class EmbeddingVisualizer(BaseModel):
             display_df = filter_df_fn(display_df)
         display_df = display_df.sort_values("representation", ascending=False)
-        readme_df = display_df[display_df["representation"].isin(["readme", "generated_readme", "task"])]
-        raw_df = display_df[display_df["representation"].isin(["dependency_signature", "selected_code", "task"])]
-        dependency_df = display_df[display_df["representation"].isin(["repository_signature", "dependency_signature", "generated_tasks", "task"])]
         plots = [
             self._make_task_and_repos_scatterplot(df, hover_data, color_col)
             for df in [readme_df, raw_df, dependency_df]
         ]
-        return dict(zip(["READMEs", "Basic representations", "Dependency graph based representations"], plots))
     def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
         # Set opacity and symbol based on is_task
-        df['size'] = df['is_task'].apply(lambda x: 0.25 if x else 0.1)
-        df['symbol'] = df['is_task'].apply(int)
         combined_fig = px.scatter(
             df,
@@ -96,7 +128,7 @@ class EmbeddingVisualizer(BaseModel):
             color=color_col,
             color_discrete_sequence=px.colors.qualitative.Set1,
             opacity=0.5,
-            **self.plot_kwargs
         )
         combined_fig.data = combined_fig.data[::-1]
@@ -104,10 +136,26 @@ class EmbeddingVisualizer(BaseModel):
     def make_task_area_scatterplot(self, n_areas=6):
         display_df = self.display_df
-        displayed_tasks_df = display_df[display_df["representation"] == "task"].sort_values("representation")
-        displayed_tasks_df = displayed_tasks_df.merge(pd.read_csv("data/paperswithcode_tasks.csv"), left_on="name", right_on="task")
-        displayed_tasks_df= displayed_tasks_df[displayed_tasks_df["area"].isin(displayed_tasks_df["area"].value_counts().head(n_areas).index)]
-        tasks_fig = px.scatter(displayed_tasks_df, color="area", hover_data=["name"], opacity=0.7, **self.plot_kwargs)
         print("N DISPLAYED TASKS", len(displayed_tasks_df))
         return tasks_fig

+from typing import Dict, Any
 from sklearn.feature_extraction.text import TfidfVectorizer
 import wordcloud
 from pydantic import BaseModel, Field
 import PIL
 import plotly.express as px
 import pandas as pd
 class WordCloudExtractor(BaseModel):
     max_words: int = 50
     wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
+    tfidf_params: Dict[str, Any] = Field(
+        default_factory=lambda: {"stop_words": "english"}
+    )
     def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
+        frequencies = self._extract_frequencies(
+            texts, self.max_words, tfidf_params=self.tfidf_params
+        )
+        wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(
+            frequencies
+        )
         return wc.to_image()
     @classmethod
+    def _extract_frequencies(
+        cls, texts, max_words=100, tfidf_params: dict = {}
+    ) -> Dict[str, float]:
         """
         Extract word frequencies from a corpus using TF-IDF vectorization
         and generate word cloud frequencies.
             Dictionary of word frequencies suitable for WordCloud
         """
         # Initialize TF-IDF vectorizer
+        tfidf = TfidfVectorizer(max_features=max_words, **tfidf_params)
         # Fit and transform the texts
         tfidf_matrix = tfidf.fit_transform(texts)
 class EmbeddingVisualizer(BaseModel):
     display_df: pd.DataFrame
+    plot_kwargs: Dict[str, Any] = Field(
+        default_factory=lambda: dict(
+            range_x=(3, 16.5),
+            range_y=(-3, 11),
+            width=1200,
+            height=800,
+            x="x",
+            y="y",
+            template="plotly_white",
+        )
+    )
+    def make_embedding_plots(
+        self, color_col=None, hover_data=["name"], filter_df_fn=None
+    ):
         """
         plots Plotly scatterplot of UMAP embeddings
         """
             display_df = filter_df_fn(display_df)
         display_df = display_df.sort_values("representation", ascending=False)
+        readme_df = display_df[
+            display_df["representation"].isin(["readme", "generated_readme", "task"])
+        ]
+        raw_df = display_df[
+            display_df["representation"].isin(
+                ["dependency_signature", "selected_code", "task"]
+            )
+        ]
+        dependency_df = display_df[
+            display_df["representation"].isin(
+                [
+                    "repository_signature",
+                    "dependency_signature",
+                    "generated_tasks",
+                    "task",
+                ]
+            )
+        ]
         plots = [
             self._make_task_and_repos_scatterplot(df, hover_data, color_col)
             for df in [readme_df, raw_df, dependency_df]
         ]
+        return dict(
+            zip(
+                [
+                    "READMEs",
+                    "Basic representations",
+                    "Dependency graph based representations",
+                ],
+                plots,
+            )
+        )
     def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
         # Set opacity and symbol based on is_task
+        df["size"] = df["is_task"].apply(lambda x: 0.25 if x else 0.1)
+        df["symbol"] = df["is_task"].apply(int)
         combined_fig = px.scatter(
             df,
             color=color_col,
             color_discrete_sequence=px.colors.qualitative.Set1,
             opacity=0.5,
+            **self.plot_kwargs,
         )
         combined_fig.data = combined_fig.data[::-1]
     def make_task_area_scatterplot(self, n_areas=6):
         display_df = self.display_df
+        displayed_tasks_df = display_df[
+            display_df["representation"] == "task"
+        ].sort_values("representation")
+        displayed_tasks_df = displayed_tasks_df.merge(
+            pd.read_csv("data/paperswithcode_tasks.csv"),
+            left_on="name",
+            right_on="task",
+        )
+        displayed_tasks_df = displayed_tasks_df[
+            displayed_tasks_df["area"].isin(
+                displayed_tasks_df["area"].value_counts().head(n_areas).index
+            )
+        ]
+        tasks_fig = px.scatter(
+            displayed_tasks_df,
+            color="area",
+            hover_data=["name"],
+            opacity=0.7,
+            **self.plot_kwargs,
+        )
         print("N DISPLAYED TASKS", len(displayed_tasks_df))
         return tasks_fig