Spaces:
Running
Running
Commit
·
463e4f9
1
Parent(s):
60bd52d
Update app.py
Browse files
app.py
CHANGED
@@ -26,7 +26,7 @@ form.header("Main Settings")
|
|
26 |
#form.image("https://maartengr.github.io/BERTopic/img/algorithm.png", width = 270)
|
27 |
|
28 |
|
29 |
-
dataset_name = form.text_area("Enter the name of the huggingface
|
30 |
dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value = "")
|
31 |
split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value = "train")
|
32 |
number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value = 200)
|
@@ -43,6 +43,7 @@ form.header("BERTopic Settings")
|
|
43 |
use_topic_reduction = form.selectbox("How do you want to handle topic reduction", ["HDBScan", "Auto", "Manual"])
|
44 |
form.caption("Leave this if you want HDBScan to choose the number of topics (clusters) for you. Set to Auto to have BERTopic prune these topics further, set to Manual to specify the number yourself")
|
45 |
number_of_topics = form.number_input("Enter the number of topics to use if doing Manual topic reduction", value = 3)
|
|
|
46 |
|
47 |
|
48 |
form.header("CounterVectorizer Settings")
|
@@ -103,7 +104,10 @@ def load_and_process_data(path, name, streaming, split_name, number_of_records):
|
|
103 |
|
104 |
|
105 |
hdbscan_model = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples = hdbscan_min_samples, metric=hdbscan_metric, prediction_data=True)
|
106 |
-
|
|
|
|
|
|
|
107 |
vectorizer_model = CountVectorizer(lowercase = cv_lowercase, ngram_range=(cv_ngram_min, cv_ngram_max), analyzer=cv_analyzer, max_df=cv_max_df, min_df=cv_min_df, stop_words="english")
|
108 |
|
109 |
|
|
|
26 |
#form.image("https://maartengr.github.io/BERTopic/img/algorithm.png", width = 270)
|
27 |
|
28 |
|
29 |
+
dataset_name = form.text_area("Enter the name of the huggingface dataset to do analysis of:", value = "Hellisotherpeople/DebateSum")
|
30 |
dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value = "")
|
31 |
split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value = "train")
|
32 |
number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value = 200)
|
|
|
43 |
use_topic_reduction = form.selectbox("How do you want to handle topic reduction", ["HDBScan", "Auto", "Manual"])
|
44 |
form.caption("Leave this if you want HDBScan to choose the number of topics (clusters) for you. Set to Auto to have BERTopic prune these topics further, set to Manual to specify the number yourself")
|
45 |
number_of_topics = form.number_input("Enter the number of topics to use if doing Manual topic reduction", value = 3)
|
46 |
+
use_random_seed = form.checkbox("Do you want to make the results reproducible? This significantly slows down BERTopic", value = False)
|
47 |
|
48 |
|
49 |
form.header("CounterVectorizer Settings")
|
|
|
104 |
|
105 |
|
106 |
hdbscan_model = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples = hdbscan_min_samples, metric=hdbscan_metric, prediction_data=True)
|
107 |
+
if use_random_seed:
|
108 |
+
umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric, random_state = 42)
|
109 |
+
else:
|
110 |
+
umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric)
|
111 |
vectorizer_model = CountVectorizer(lowercase = cv_lowercase, ngram_range=(cv_ngram_min, cv_ngram_max), analyzer=cv_analyzer, max_df=cv_max_df, min_df=cv_min_df, stop_words="english")
|
112 |
|
113 |
|