KevSun committed on
Commit
a9d0211
verified
1 Parent(s): 36e3982

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -34
app.py CHANGED
@@ -8,10 +8,12 @@ import numpy as np
8
  import matplotlib.pyplot as plt
9
  import pandas as pd
10
 
11
- DetectorFactory.seed = 0
 
 
12
 
13
- # Load models for embedding and similarity
14
- multi_embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
15
 
16
  class WordEmbeddingAgent:
17
  def __init__(self, model):
@@ -30,17 +32,17 @@ class SimilarityAgent:
30
  return util.pytorch_cos_sim(embedding1, embedding2).item()
31
 
32
  class TopicModelingAgent:
33
- def __init__(self, n_components=10):
34
  self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)
35
 
36
  def fit_transform(self, texts, lang):
37
- stop_words = 'english' if lang == 'en' else 'spanish'
38
  vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words=stop_words)
39
  dtm = vectorizer.fit_transform(texts)
40
  self.lda_model.fit(dtm)
41
  return self.lda_model.transform(dtm), vectorizer
42
 
43
- def get_topics(self, vectorizer, num_words=10):
44
  topics = {}
45
  for idx, topic in enumerate(self.lda_model.components_):
46
  topics[idx] = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-num_words:]]
@@ -52,6 +54,7 @@ def detect_language(text):
52
  except:
53
  return "unknown"
54
 
 
55
  def tsne_visualization(embeddings, words):
56
  tsne = TSNE(n_components=2, random_state=42)
57
  embeddings_2d = tsne.fit_transform(embeddings)
@@ -72,38 +75,35 @@ def main():
72
  similarity_agent = SimilarityAgent(multi_embedding_model)
73
  topic_modeling_agent = TopicModelingAgent()
74
 
75
- # Tokenize the input text into words
76
  words = user_input.split()
77
 
78
- # Generate Embeddings
79
- embeddings = embedding_agent.get_embeddings(words)
80
- st.write("Word Embeddings Generated.")
81
-
82
- # t-SNE Visualization
83
- tsne_df = tsne_visualization(embeddings, words)
84
- fig, ax = plt.subplots()
85
- ax.scatter(tsne_df['x'], tsne_df['y'])
86
-
87
- for i, word in enumerate(tsne_df['word']):
88
- ax.annotate(word, (tsne_df['x'][i], tsne_df['y'][i]))
89
-
90
- st.pyplot(fig)
91
-
92
- # Topic Modeling
93
- texts = [user_input, "Another text to improve topic modeling."]
94
- topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
95
- topics = topic_modeling_agent.get_topics(vectorizer)
96
- st.write("Topics Extracted:")
97
- for topic, words in topics.items():
98
- st.write(f"Topic {topic}: {', '.join(words)}")
99
-
100
- # Sentence Similarity (example with another text)
101
- text2 = "Otro texto de ejemplo para comparación de similitud." if lang != 'en' else "Another example text for similarity comparison."
102
- similarity_score = similarity_agent.compute_similarity(user_input, text2)
103
- st.write(f"Similarity Score with example text: {similarity_score:.4f}")
104
 
105
  else:
106
  st.warning("Please enter some text to analyze.")
107
 
108
  if __name__ == "__main__":
109
- main()
 
8
  import matplotlib.pyplot as plt
9
  import pandas as pd
10
 
11
+ @st.cache_resource
12
+ def load_model():
13
+ return SentenceTransformer('distiluse-base-multilingual-cased-v1')
14
 
15
+ DetectorFactory.seed = 0
16
+ multi_embedding_model = load_model()
17
 
18
  class WordEmbeddingAgent:
19
  def __init__(self, model):
 
32
  return util.pytorch_cos_sim(embedding1, embedding2).item()
33
 
34
  class TopicModelingAgent:
35
+ def __init__(self, n_components=5):
36
  self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)
37
 
38
  def fit_transform(self, texts, lang):
39
+ stop_words = 'english' if lang == 'en' else None
40
  vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words=stop_words)
41
  dtm = vectorizer.fit_transform(texts)
42
  self.lda_model.fit(dtm)
43
  return self.lda_model.transform(dtm), vectorizer
44
 
45
+ def get_topics(self, vectorizer, num_words=5):
46
  topics = {}
47
  for idx, topic in enumerate(self.lda_model.components_):
48
  topics[idx] = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-num_words:]]
 
54
  except:
55
  return "unknown"
56
 
57
+ @st.cache_data
58
  def tsne_visualization(embeddings, words):
59
  tsne = TSNE(n_components=2, random_state=42)
60
  embeddings_2d = tsne.fit_transform(embeddings)
 
75
  similarity_agent = SimilarityAgent(multi_embedding_model)
76
  topic_modeling_agent = TopicModelingAgent()
77
 
 
78
  words = user_input.split()
79
 
80
+ with st.spinner("Generating word embeddings..."):
81
+ embeddings = embedding_agent.get_embeddings(words)
82
+ st.success("Word Embeddings Generated.")
83
+
84
+ with st.spinner("Creating t-SNE visualization..."):
85
+ tsne_df = tsne_visualization(embeddings, words)
86
+ fig, ax = plt.subplots()
87
+ ax.scatter(tsne_df['x'], tsne_df['y'])
88
+ for i, word in enumerate(tsne_df['word']):
89
+ ax.annotate(word, (tsne_df['x'][i], tsne_df['y'][i]))
90
+ st.pyplot(fig)
91
+
92
+ with st.spinner("Extracting topics..."):
93
+ texts = [user_input, "Another text to improve topic modeling."]
94
+ topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
95
+ topics = topic_modeling_agent.get_topics(vectorizer)
96
+ st.subheader("Topics Extracted:")
97
+ for topic, words in topics.items():
98
+ st.write(f"Topic {topic}: {', '.join(words)}")
99
+
100
+ with st.spinner("Computing similarity..."):
101
+ text2 = "Otro texto de ejemplo para comparación de similitud." if lang != 'en' else "Another example text for similarity comparison."
102
+ similarity_score = similarity_agent.compute_similarity(user_input, text2)
103
+ st.write(f"Similarity Score with example text: {similarity_score:.4f}")
 
 
104
 
105
  else:
106
  st.warning("Please enter some text to analyze.")
107
 
108
  if __name__ == "__main__":
109
+ main()