Hellisotherpeople committed
Commit 463e4f9 · 1 Parent(s): 60bd52d

Update app.py

Files changed (1): app.py (+6 -2)
app.py CHANGED

@@ -26,7 +26,7 @@ form.header("Main Settings")
 #form.image("https://maartengr.github.io/BERTopic/img/algorithm.png", width = 270)
 
 
-dataset_name = form.text_area("Enter the name of the huggingface Dataset to do analysis of:", value = "Hellisotherpeople/DebateSum")
+dataset_name = form.text_area("Enter the name of the huggingface dataset to do analysis of:", value = "Hellisotherpeople/DebateSum")
 dataset_name_2 = form.text_area("Enter the name of the config for the dataset if it has one", value = "")
 split_name = form.text_area("Enter the name of the split of the dataset that you want to use", value = "train")
 number_of_records = form.number_input("Enter the number of documents that you want to analyze from the dataset", value = 200)
@@ -43,6 +43,7 @@ form.header("BERTopic Settings")
 use_topic_reduction = form.selectbox("How do you want to handle topic reduction", ["HDBScan", "Auto", "Manual"])
 form.caption("Leave this if you want HDBScan to choose the number of topics (clusters) for you. Set to Auto to have BERTopic prune these topics further, set to Manual to specify the number yourself")
 number_of_topics = form.number_input("Enter the number of topics to use if doing Manual topic reduction", value = 3)
+use_random_seed = form.checkbox("Do you want to make the results reproducible? This significantly slows down BERTopic", value = False)
 
 
 form.header("CounterVectorizer Settings")
@@ -103,7 +104,10 @@ def load_and_process_data(path, name, streaming, split_name, number_of_records):
 
 
 hdbscan_model = HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples = hdbscan_min_samples, metric=hdbscan_metric, prediction_data=True)
-umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric, random_state = 42)
+if use_random_seed:
+    umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric, random_state = 42)
+else:
+    umap_model = UMAP(n_neighbors=umap_n_neighbors, n_components=umap_n_components, min_dist=umap_min_dist, metric=umap_metric)
 vectorizer_model = CountVectorizer(lowercase = cv_lowercase, ngram_range=(cv_ngram_min, cv_ngram_max), analyzer=cv_analyzer, max_df=cv_max_df, min_df=cv_min_df, stop_words="english")
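
For context, the change gates UMAP's random_state behind the new checkbox. Below is a minimal sketch of the same toggle in isolation, assuming umap-learn is installed; the n_neighbors/n_components/min_dist/metric values are stand-ins for the Streamlit form inputs that app.py reads, not values taken from this commit.

# Minimal sketch of the reproducibility toggle added in this commit.
from umap import UMAP

use_random_seed = False  # mirrors the new form.checkbox, default False

# Stand-in hyperparameters; app.py reads these from its Streamlit form.
umap_kwargs = dict(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine")

if use_random_seed:
    # Fixing random_state makes the embedding deterministic, but umap-learn
    # then falls back to single-threaded execution: the slowdown the
    # checkbox label warns about.
    umap_model = UMAP(random_state=42, **umap_kwargs)
else:
    # No seed: results vary run to run, but UMAP can use its parallel code paths.
    umap_model = UMAP(**umap_kwargs)

The resulting umap_model is presumably then passed to BERTopic (via its umap_model parameter) together with the HDBSCAN and CountVectorizer models, as in the unchanged lines of the diff.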