try bertopic
Browse files
app.py
CHANGED
@@ -449,7 +449,7 @@ def compute_coherence_value_bertopic(topic_model):
|
|
449 |
|
450 |
return coherence_score
|
451 |
|
452 |
-
def base_bertopic():
|
453 |
df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
|
454 |
global id2word
|
455 |
id2word = Dictionary(df['lemma_tokens'])
|
@@ -472,7 +472,7 @@ def base_bertopic():
|
|
472 |
except:
|
473 |
print('Unable to generate meaningful topics (Base BERTopic model)')
|
474 |
|
475 |
-
def optimized_bertopic():
|
476 |
vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
|
477 |
optimized_topic_model = BERTopic(umap_model=umap_model,
|
478 |
language="multilingual",
|
@@ -505,6 +505,7 @@ def optimized_bertopic():
|
|
505 |
tweets.append(df.loc[index, 'original_tweets'])
|
506 |
print(tweets)
|
507 |
top_tweets.append(tweets)
|
|
|
508 |
|
509 |
global examples
|
510 |
|
@@ -536,8 +537,8 @@ def main(dataset, model, progress=gr.Progress(track_tqdm=True)):
|
|
536 |
print('done lda')
|
537 |
place_data = 'test'
|
538 |
else:
|
539 |
-
base_bertopic()
|
540 |
-
optimized_bertopic()
|
541 |
|
542 |
print('doing topic summarization')
|
543 |
headlines = topic_summarization(top_tweets)
|
|
|
449 |
|
450 |
return coherence_score
|
451 |
|
452 |
+
def base_bertopic(df):
|
453 |
df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
|
454 |
global id2word
|
455 |
id2word = Dictionary(df['lemma_tokens'])
|
|
|
472 |
except:
|
473 |
print('Unable to generate meaningful topics (Base BERTopic model)')
|
474 |
|
475 |
+
def optimized_bertopic(df):
|
476 |
vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
|
477 |
optimized_topic_model = BERTopic(umap_model=umap_model,
|
478 |
language="multilingual",
|
|
|
505 |
tweets.append(df.loc[index, 'original_tweets'])
|
506 |
print(tweets)
|
507 |
top_tweets.append(tweets)
|
508 |
+
return top_tweets
|
509 |
|
510 |
global examples
|
511 |
|
|
|
537 |
print('done lda')
|
538 |
place_data = 'test'
|
539 |
else:
|
540 |
+
base_bertopic(df)
|
541 |
+
top_tweets = optimized_bertopic()
|
542 |
|
543 |
print('doing topic summarization')
|
544 |
headlines = topic_summarization(top_tweets)
|