Spaces:

KevSun
/

LinguisticAnalysis

No application file

App Files Files Community

KevSun committed on Jul 25, 2024

Commit

a9d0211

verified ·

1 Parent(s): 36e3982

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -34

app.py CHANGED Viewed

@@ -8,10 +8,12 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
-DetectorFactory.seed = 0
-# Load models for embedding and similarity
-multi_embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
 class WordEmbeddingAgent:
     def __init__(self, model):
@@ -30,17 +32,17 @@ class SimilarityAgent:
         return util.pytorch_cos_sim(embedding1, embedding2).item()
 class TopicModelingAgent:
-    def __init__(self, n_components=10):
         self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)
     def fit_transform(self, texts, lang):
-        stop_words = 'english' if lang == 'en' else 'spanish'
         vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words=stop_words)
         dtm = vectorizer.fit_transform(texts)
         self.lda_model.fit(dtm)
         return self.lda_model.transform(dtm), vectorizer
-    def get_topics(self, vectorizer, num_words=10):
         topics = {}
         for idx, topic in enumerate(self.lda_model.components_):
             topics[idx] = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-num_words:]]
@@ -52,6 +54,7 @@ def detect_language(text):
     except:
         return "unknown"
 def tsne_visualization(embeddings, words):
     tsne = TSNE(n_components=2, random_state=42)
     embeddings_2d = tsne.fit_transform(embeddings)
@@ -72,38 +75,35 @@ def main():
             similarity_agent = SimilarityAgent(multi_embedding_model)
             topic_modeling_agent = TopicModelingAgent()
-            # Tokenize the input text into words
             words = user_input.split()
-            # Generate Embeddings
-            embeddings = embedding_agent.get_embeddings(words)
-            st.write("Word Embeddings Generated.")
-            # t-SNE Visualization
-            tsne_df = tsne_visualization(embeddings, words)
-            fig, ax = plt.subplots()
-            ax.scatter(tsne_df['x'], tsne_df['y'])
-            for i, word in enumerate(tsne_df['word']):
-                ax.annotate(word, (tsne_df['x'][i], tsne_df['y'][i]))
-            st.pyplot(fig)
-            # Topic Modeling
-            texts = [user_input, "Another text to improve topic modeling."]
-            topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
-            topics = topic_modeling_agent.get_topics(vectorizer)
-            st.write("Topics Extracted:")
-            for topic, words in topics.items():
-                st.write(f"Topic {topic}: {', '.join(words)}")
-            # Sentence Similarity (example with another text)
-            text2 = "Otro texto de ejemplo para comparación de similitud." if lang != 'en' else "Another example text for similarity comparison."
-            similarity_score = similarity_agent.compute_similarity(user_input, text2)
-            st.write(f"Similarity Score with example text: {similarity_score:.4f}")
         else:
             st.warning("Please enter some text to analyze.")
 if __name__ == "__main__":
-    main()

 import matplotlib.pyplot as plt
 import pandas as pd
+@st.cache_resource
+def load_model():
+    return SentenceTransformer('distiluse-base-multilingual-cased-v1')
+DetectorFactory.seed = 0
+multi_embedding_model = load_model()
 class WordEmbeddingAgent:
     def __init__(self, model):
         return util.pytorch_cos_sim(embedding1, embedding2).item()
 class TopicModelingAgent:
+    def __init__(self, n_components=5):
         self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)
     def fit_transform(self, texts, lang):
+        stop_words = 'english' if lang == 'en' else None
         vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words=stop_words)
         dtm = vectorizer.fit_transform(texts)
         self.lda_model.fit(dtm)
         return self.lda_model.transform(dtm), vectorizer
+    def get_topics(self, vectorizer, num_words=5):
         topics = {}
         for idx, topic in enumerate(self.lda_model.components_):
             topics[idx] = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-num_words:]]
     except:
         return "unknown"
+@st.cache_data
 def tsne_visualization(embeddings, words):
     tsne = TSNE(n_components=2, random_state=42)
     embeddings_2d = tsne.fit_transform(embeddings)
             similarity_agent = SimilarityAgent(multi_embedding_model)
             topic_modeling_agent = TopicModelingAgent()
             words = user_input.split()
+            with st.spinner("Generating word embeddings..."):
+                embeddings = embedding_agent.get_embeddings(words)
+            st.success("Word Embeddings Generated.")
+            with st.spinner("Creating t-SNE visualization..."):
+                tsne_df = tsne_visualization(embeddings, words)
+                fig, ax = plt.subplots()
+                ax.scatter(tsne_df['x'], tsne_df['y'])
+                for i, word in enumerate(tsne_df['word']):
+                    ax.annotate(word, (tsne_df['x'][i], tsne_df['y'][i]))
+                st.pyplot(fig)
+            with st.spinner("Extracting topics..."):
+                texts = [user_input, "Another text to improve topic modeling."]
+                topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
+                topics = topic_modeling_agent.get_topics(vectorizer)
+                st.subheader("Topics Extracted:")
+                for topic, words in topics.items():
+                    st.write(f"Topic {topic}: {', '.join(words)}")
+            with st.spinner("Computing similarity..."):
+                text2 = "Otro texto de ejemplo para comparación de similitud." if lang != 'en' else "Another example text for similarity comparison."
+                similarity_score = similarity_agent.compute_similarity(user_input, text2)
+                st.write(f"Similarity Score with example text: {similarity_score:.4f}")
         else:
             st.warning("Please enter some text to analyze.")
 if __name__ == "__main__":
+    main()