from typing import Dict, Any

from sklearn.feature_extraction.text import TfidfVectorizer
import wordcloud
from pydantic import BaseModel, Field
import numpy as np
import PIL.Image
import plotly.express as px
import pandas as pd
import datasets


class WordCloudExtractor(BaseModel):
    max_words: int = 50
    wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
    tfidf_params: Dict[str, Any] = Field(
        default_factory=lambda: {"stop_words": "english"}
    )

    def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
        frequencies = self._extract_frequencies(
            texts, self.max_words, tfidf_params=self.tfidf_params
        )
        wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(
            frequencies
        )
        return wc.to_image()

    @classmethod
    def _extract_frequencies(
        cls, texts, max_words=100, tfidf_params: dict = {}
    ) -> Dict[str, float]:
        """
        Extract word frequencies from a corpus using TF-IDF vectorization
        and generate word cloud frequencies.

        Args:
            texts: List of text documents
            max_words: Maximum number of words to include
            tfidf_params: Extra keyword arguments passed to TfidfVectorizer

        Returns:
            Dictionary of word frequencies suitable for WordCloud
        """
        # Initialize TF-IDF vectorizer
        tfidf = TfidfVectorizer(max_features=max_words, **tfidf_params)

        # Fit and transform the texts
        tfidf_matrix = tfidf.fit_transform(texts)

        # Get feature names (words)
        feature_names = tfidf.get_feature_names_out()

        # Calculate mean TF-IDF scores across documents
        mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()

        # Create frequency dictionary
        frequencies = dict(zip(feature_names, mean_tfidf))
        return frequencies


class EmbeddingVisualizer(BaseModel):
    display_df: pd.DataFrame
    plot_kwargs: Dict[str, Any] = Field(
        default_factory=lambda: dict(
            range_x=(3, 16.5),
            range_y=(-3, 11),
            width=1200,
            height=800,
            x="x",
            y="y",
            template="plotly_white",
        )
    )

    def make_embedding_plots(
        self, color_col=None, hover_data=["name"], filter_df_fn=None
    ):
        """
        Plot Plotly scatterplots of UMAP embeddings, one per group of
        repository representations, keyed by a human-readable group name.
        """
        display_df = self.display_df
        if filter_df_fn is not None:
            display_df = filter_df_fn(display_df)
        display_df = display_df.sort_values("representation", ascending=False)
        readme_df = display_df[
            display_df["representation"].isin(
                ["readme", "code2doc_generated_readme", "task"]
            )
        ]
        raw_df = display_df[
            display_df["representation"].isin(
                ["dependency_signature", "selected_code", "task"]
            )
        ]
        dependency_df = display_df[
            display_df["representation"].isin(
                [
                    "repository_signature",
                    "dependency_signature",
                    "generated_tasks",
                    "task",
                ]
            )
        ]
        plots = [
            self._make_task_and_repos_scatterplot(df, hover_data, color_col)
            for df in [readme_df, raw_df, dependency_df]
        ]
        return dict(
            zip(
                [
                    "READMEs",
                    "Basic representations",
                    "Dependency graph based representations",
                ],
                plots,
            )
        )

    def _make_task_and_repos_scatterplot(self, df, hover_data, color_col):
        # Work on a copy: df is a filtered view of display_df, and assigning
        # columns to it directly would trigger pandas' SettingWithCopyWarning
        df = df.copy()
        # Derive size and symbol columns from is_task so task points can be
        # styled differently from repository points
        df["size"] = df["is_task"].apply(lambda x: 0.25 if x else 0.1)
        df["symbol"] = df["is_task"].apply(int)
        combined_fig = px.scatter(
            df,
            hover_name="name",
            hover_data=hover_data,
            color=color_col,
            color_discrete_sequence=px.colors.qualitative.Set1,
            opacity=0.5,
            **self.plot_kwargs,
        )
        # Reverse trace order so the first color group is rendered on top
        combined_fig.data = combined_fig.data[::-1]
        return combined_fig

    def make_task_area_scatterplot(self, n_areas=6):
        display_df = self.display_df
        displayed_tasks_df = display_df[
            display_df["representation"] == "task"
        ].sort_values("representation")
        # Join displayed tasks with their PapersWithCode task areas
        pwc_tasks_df = datasets.load_dataset(
            "lambdaofgod/pwc_github_search", data_files="paperswithcode_tasks.csv"
        )["train"].to_pandas()
        displayed_tasks_df = displayed_tasks_df.merge(
            pwc_tasks_df,
            left_on="name",
            right_on="task",
        )
        displayed_tasks_df = displayed_tasks_df[
            displayed_tasks_df["area"].isin(
                displayed_tasks_df["area"].value_counts().head(n_areas).index
            )
        ]
        tasks_fig = px.scatter(
            displayed_tasks_df,
            color="area",
            hover_data=["name"],
            opacity=0.7,
            **self.plot_kwargs,
        )
        print("N DISPLAYED TASKS", len(displayed_tasks_df))
        return tasks_fig

    class Config:
        arbitrary_types_allowed = True
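
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): shows how the
# two classes above fit together. The toy dataframe below only mimics the
# schema EmbeddingVisualizer expects (UMAP coordinates in "x"/"y" plus "name",
# "representation" and "is_task" columns); real coordinates would come from an
# upstream embedding + UMAP step. make_task_area_scatterplot is not demoed
# here because it downloads a dataset from the Hugging Face Hub.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    texts = [
        "neural network for image classification",
        "transformer model for text summarization",
        "graph neural network for link prediction",
    ]
    extractor = WordCloudExtractor(max_words=20)
    extractor.extract_wordcloud_image(texts).save("wordcloud.png")

    toy_df = pd.DataFrame(
        {
            "x": [4.0, 5.0, 6.0],
            "y": [0.0, 1.0, 2.0],
            "name": ["repo_a", "repo_b", "image classification"],
            "representation": ["readme", "readme", "task"],
            "is_task": [False, False, True],
        }
    )
    visualizer = EmbeddingVisualizer(display_df=toy_df)
    figures = visualizer.make_embedding_plots()
    figures["READMEs"].show()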