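# NOTE: the next three lines are hosting-page residue kept verbatim as comments;
# they are not part of the program.
# lambdaofgod's picture
# updated embeddings file
# 1c73ae0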
import gradio as gr
import pandas as pd
import logging
from task_visualizations import TaskVisualizations
from text_visualization import (
EmbeddingVisualizer,
)
from gradio_tabs import (
setup_embeddings_tab,
setup_tasks_tab,
setup_graph_tab,
setup_repository_representations_tab,
)
import datasets
# Set the root logger level to INFO so the logging.info calls made later in
# this module are not filtered out.
logging.basicConfig(level=logging.INFO)
class AppConfig:
    """Names of the data files this app reads."""

    # Requested via ``data_files`` from the "lambdaofgod/pwc_github_search"
    # dataset.
    repo_representations_path: str = "repo_representations.parquet"

    # Passed, in this order, to the TaskVisualizations constructor.
    task_counts_path: str = "repos_task_counts.csv"
    selected_task_counts_path: str = "selected_repos_task_counts.csv"
    tasks_path: str = "paperswithcode_tasks.csv"
def load_repo_df(repo_representations_path):
    """Load repository representations and tidy up their ``text`` column.

    The file is fetched from the ``lambdaofgod/pwc_github_search`` dataset
    and its ``train`` split is converted to a pandas DataFrame.

    Args:
        repo_representations_path: Value forwarded as ``data_files`` to
            ``datasets.load_dataset`` (e.g.
            ``AppConfig.repo_representations_path``).

    Returns:
        A new DataFrame equal to the loaded one except that ``text`` has
        self-closing ``<img .../>`` tags removed and the characters ``│``
        and ``⋮`` replaced by newlines.
    """
    # Use the caller-supplied path; previously the argument was ignored and
    # AppConfig.repo_representations_path was always loaded instead.
    data = datasets.load_dataset(
        "lambdaofgod/pwc_github_search",
        data_files=repo_representations_path,
    )["train"].to_pandas()

    cleaned_text = (
        data["text"]
        # Non-greedy so that each tag is removed on its own; a greedy match
        # would also delete any text sitting between two tags on one line.
        .str.replace(r"<img.*?/>", "", regex=True)
        # Literal replacements: spell out regex=False so the result does not
        # depend on the pandas version's default.
        .str.replace("│", "\n", regex=False)
        .str.replace("⋮", "\n", regex=False)
    )
    return data.assign(text=cleaned_text)
## main
# Repository representations: the "train" split of the parquet file named in
# AppConfig, as a pandas DataFrame.
repos_df = datasets.load_dataset(
    path="lambdaofgod/pwc_github_search",
    data_files=AppConfig.repo_representations_path,
)["train"].to_pandas()

# Distinct repository names present in the table.
repos = list(repos_df["repo_name"].unique())
logging.info(f"found {len(repos)} repositories")

# Distinct values of the "representation" column.
representation_types = list(repos_df["representation"].unique())
logging.info(f"representation types: {representation_types}")
# TaskVisualizations receives the three task-related CSV paths from AppConfig.
task_visualizations = TaskVisualizations(
    AppConfig.task_counts_path,
    AppConfig.selected_task_counts_path,
    AppConfig.tasks_path,
)

# Table handed to EmbeddingVisualizer (the file name suggests precomputed 2D
# UMAP coordinates -- TODO confirm against the dataset).
display_df = datasets.load_dataset(
    path="lambdaofgod/pwc_github_search",
    data_files="repo_representations_umap2d.parquet",
)["train"].to_pandas()

# Boolean flag: True for rows whose representation is exactly "task".
display_df["is_task"] = display_df["representation"].eq("task")

embedding_visualizer = EmbeddingVisualizer(display_df=display_df)
def load_embeddings_intro_description() -> str:
    """Return the introductory text for the embeddings section."""
    return (
        "\n"
        "The following plots show embeddings obtained with MPNet sentence transformer after applying 2d UMAP algorithm for dimensionality reduction.\n"
        "In the first scatterplot we display PapersWithCode tasks that are colored by area.\n"
    )
# Text snippets keyed by section name. The whole mapping is passed to
# setup_embeddings_tab and setup_tasks_tab further down; which key each tab
# reads is decided inside those functions (not visible here).
# The values are runtime text: keep them byte-for-byte when editing comments.
descriptions = {
# Introductory text produced by load_embeddings_intro_description().
"intro": load_embeddings_intro_description(),
# Commentary on the embeddings of tasks and repos under various representations.
"Basic representations": """Now we show the embeddings of tasks and repos, using various texts or representations.
The fact that selected code and/or dependency signatures (containing mostly repo's file names) are dissimilar from task names
should not be surprising. For our problem this illustrates the fact that these representations work poorly for retrieval.
""",
# Commentary on generated tasks versus repository signatures.
"Dependency graph based representations": """
Note the difference between embeddings of generated tasks and repository signatures (which contain them)
""",
# Whitespace-only entry -- presumably a placeholder; TODO confirm whether
# README commentary is still to be written.
"READMEs": """
""",
# Text describing the task-count plots and the task-name embeddings.
# NOTE(review): the last sentence of this text is ungrammatical ("MPNet, a
# sentence-transformer model, the embeddings ..."); left untouched here
# because it is user-facing runtime text.
"task_counts_description": """
PapersWithCode tasks are grouped by areas. In the following plots we can see overall distribution of area given tasks, and the distribution in repository subset that will be used for visualization.
Below we can also see embeddings of task names with MPNet after dimensionality reduction with UMAP.
MPNet, a sentence-transformer model, the embeddings visibly separate tasks by area.
""",
}
# Tab title -> zero-argument callable that populates that tab. The order of
# this table is the order in which the tabs are created.
_TAB_BUILDERS = [
    ("Explore Dependency Graphs", lambda: setup_graph_tab()),
    (
        "Explore Repository Embeddings",
        lambda: setup_embeddings_tab(descriptions, embedding_visualizer),
    ),
    (
        "Explore Repository Representations",
        lambda: setup_repository_representations_tab(
            repos_df, repos, representation_types
        ),
    ),
    (
        "Explore PapersWithCode Tasks",
        lambda: setup_tasks_tab(descriptions, task_visualizations),
    ),
]

with gr.Blocks() as demo:
    for _tab_title, _build_tab in _TAB_BUILDERS:
        # Each setup function is called inside its own gr.Tab context.
        with gr.Tab(_tab_title):
            _build_tab()

demo.launch(share=True)