# Spaces: Runtime error (status text captured along with this file; not part of the module)
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
from bertopic import BERTopic | |
from umap import UMAP | |
from Functionalities import NLP_Helper | |
# Visualization
import plotly.graph_objects as go | |
class TopicClustering:
    """Fit a BERTopic model on one text column of a DataFrame and plot the result.

    Typical use: construct, call ``topic_cluster_bert()``, read the enriched
    ``keyword_df`` and/or call one of the ``visualize_*`` methods.

    Attributes set by ``__init__``:
        keyword_df: the DataFrame holding the texts; replaced by a merged
            frame (topic info + original columns) after fitting.
        text_col: name of the column in ``keyword_df`` that holds the texts.
        sentence_model: ``SentenceTransformer`` built from the constructor
            argument of the same name.
        representation_model: whatever
            ``NLP_Helper.get_bertopic_representation`` returns for the given
            argument (helper is defined elsewhere; presumably a BERTopic
            representation model -- confirm against that helper).
        topic_model: ``None`` until ``topic_cluster_bert()`` has run.
        embeddings: ``None`` until ``topic_cluster_bert()`` has run.
    """

    def __init__(self, keyword_df: pd.DataFrame, text_col, representation_model, sentence_model):
        self.topic_model = None
        self.embeddings = None
        # Documents in the exact order they were embedded/fitted. Kept apart
        # from ``keyword_df`` because the merge in ``topic_cluster_bert``
        # reorders the rows of that frame.
        self._documents = None
        self.keyword_df, self.text_col = keyword_df, text_col
        self.sentence_model = SentenceTransformer(sentence_model)
        self.representation_model = NLP_Helper.get_bertopic_representation(representation_model)

    def _require_fitted(self) -> None:
        """Raise a clear error when a plot is requested before fitting."""
        if self.topic_model is None or self.embeddings is None:
            raise RuntimeError(
                "topic_cluster_bert() must be called before requesting a visualization."
            )

    def topic_cluster_bert(self) -> None:
        """Embed the texts, fit BERTopic, and attach topic info to ``keyword_df``.

        Side effects:
            * ``self.embeddings`` and ``self.topic_model`` are populated.
            * A ``'Topic'`` column is written onto the DataFrame that was
              passed to the constructor (in place).
            * ``self.keyword_df`` is rebound to the inner merge of the topic
              info table with that DataFrame, with ``'Name'`` renamed to
              ``'Topic Name'`` and ``'CustomName'`` dropped. Row order of the
              merged frame follows the topic info table, not the input.
        """
        # A plain list avoids label-based indexing surprises when the column
        # is a Series whose index is not 0..n-1 (e.g. after filtering).
        documents = self.keyword_df[self.text_col].tolist()

        embeddings = self.sentence_model.encode(documents, show_progress_bar=False)
        topic_model = BERTopic(
            representation_model=self.representation_model,
            embedding_model=self.sentence_model,
            n_gram_range=(1, 3),
            top_n_words=2,
        )
        # Hand over the embeddings computed above so the documents are not
        # encoded a second time inside fit_transform.
        topics, _ = topic_model.fit_transform(documents, embeddings=embeddings)

        self.embeddings = embeddings
        self.topic_model = topic_model
        self._documents = documents

        topic_labels = topic_model.generate_topic_labels(nr_words=1, topic_prefix=False)
        # When the first row of the topic table is topic -1, give it a
        # readable label instead of the generated one.
        if topic_model.get_topic_info()['Topic'].values[0] == -1:
            topic_labels[0] = 'Unknown'
        topic_model.set_topic_labels(topic_labels)

        self.keyword_df['Topic'] = topics
        topic_info = topic_model.get_topic_info()
        topic_info['Name'] = topic_labels

        merged = pd.merge(topic_info, self.keyword_df, on=['Topic'])
        merged = merged.rename(columns={'Name': 'Topic Name'})
        self.keyword_df = merged.drop(columns=['CustomName'])

    def visualize_documents(self, n_neighbors) -> "go.Figure":
        """Return BERTopic's document scatter plot on a 2-D UMAP projection.

        Args:
            n_neighbors: forwarded to ``UMAP(n_neighbors=...)``.

        Raises:
            RuntimeError: if ``topic_cluster_bert()`` has not been run yet.
        """
        self._require_fitted()
        reducer = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=0.0, metric='cosine')
        reduced_embeddings = reducer.fit_transform(self.embeddings)
        # Use the fit-order documents: ``self.keyword_df`` was reordered by
        # the merge and would no longer line up row-for-row with the
        # embeddings and the model's per-document topics.
        return self.topic_model.visualize_documents(
            self._documents,
            reduced_embeddings=reduced_embeddings,
            custom_labels=True,
        )

    def visualize_topic_distribution(self) -> "go.Figure":
        """Return BERTopic's bar chart for the top 5 topics, titled 'Topic Distribution'.

        Raises:
            RuntimeError: if ``topic_cluster_bert()`` has not been run yet.
        """
        if self.topic_model is None:
            raise RuntimeError(
                "topic_cluster_bert() must be called before requesting a visualization."
            )
        return self.topic_model.visualize_barchart(
            custom_labels=True,
            top_n_topics=5,
            n_words=20,
            title='Topic Distribution',
        )