Spaces:
Runtime error
Runtime error
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
from bertopic import BERTopic | |
from umap import UMAP | |
from Functionalities import NLP_Helper | |
# Visualization | |
import plotly.graph_objects as go | |
class TopicClustering: | |
def __init__(self, keyword_df, text_col, representation_model, sentence_model): | |
self.topic_names = None | |
self.topic_model = None | |
self.embeddings = None | |
self.topic_name_mapping = {} | |
self.keyword_df, self.text_col = keyword_df, text_col | |
self.sentence_model = SentenceTransformer(sentence_model) | |
self.representation_model = NLP_Helper.get_bertopic_representation(representation_model) | |
def topic_cluster_bert(self) -> None: | |
self.embeddings = self.sentence_model.encode(self.keyword_df[self.text_col], show_progress_bar=True) | |
self.topic_model = BERTopic(representation_model=self.representation_model, | |
embedding_model=self.sentence_model, | |
n_gram_range=(1, 3), top_n_words=2) | |
topics, _ = self.topic_model.fit_transform(self.keyword_df[self.text_col]) | |
topic_labels = self.topic_model.generate_topic_labels(nr_words=1, topic_prefix=False) | |
if self.topic_model.get_topic_info()['Topic'].values[0] == -1: | |
topic_labels[0] = 'Unknown' | |
self.topic_model.set_topic_labels(topic_labels) | |
self.keyword_df['Topic'] = topics | |
topic_info = self.topic_model.get_topic_info() | |
topic_info['Name'] = topic_labels | |
self.keyword_df = pd.merge(topic_info, self.keyword_df, on=['Topic']) | |
self.keyword_df.rename(columns={'Name': 'Topic Name'}, inplace=True) | |
self.keyword_df.drop(columns=['CustomName'], inplace=True) | |
self.topic_names = topic_labels | |
def visualize_documents(self, n_neighbors) -> go.Figure: | |
reduced_embeddings = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=0.0, metric='cosine').fit_transform( | |
self.embeddings) | |
fig = self.topic_model.visualize_documents(self.keyword_df[self.text_col], | |
reduced_embeddings=reduced_embeddings, | |
custom_labels=True) | |
return fig | |
def visualize_topic_distribution(self) -> go.Figure: | |
fig = self.topic_model.visualize_barchart(custom_labels=True, top_n_topics=5, n_words=20, | |
title='Topic Distribution') | |
return fig | |
def update_topic_names(self): | |
for k in self.topic_name_mapping: | |
self.keyword_df['Topic Name'][self.keyword_df['Topic Name'] == k] = self.topic_name_mapping[k] | |
self.topic_names = self.topic_name_mapping.values() | |
self.topic_name_mapping = {} | |
def get_df_in_google_ads_format(self, campaign_name): | |
keyword_df_google_ads = pd.DataFrame( | |
columns=['Action', 'Keyword status', 'Campaign', 'Ad group', 'Keyword', 'Match Type']) | |
keyword_df_google_ads['Ad group'] = self.keyword_df['Topic Name'] | |
keyword_df_google_ads['Keyword'] = self.keyword_df[self.text_col] | |
keyword_df_google_ads['Match Type'] = 'Phrase' | |
keyword_df_google_ads['Action'] = 'Add' | |
keyword_df_google_ads['Keyword status'] = 'Enabled' | |
keyword_df_google_ads['Campaign'] = campaign_name | |
return keyword_df_google_ads | |