KevSun committed on
Commit
ee54fd3
verified
1 Parent(s): 5761cdf

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py CHANGED
@@ -107,4 +107,113 @@ def main():
107
 
108
  if __name__ == "__main__":
109
  main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
 
107
 
108
  if __name__ == "__main__":
109
  main()
110
+ import streamlit as st
111
+ from sentence_transformers import SentenceTransformer, util
112
+ from sklearn.decomposition import LatentDirichletAllocation
113
+ from sklearn.feature_extraction.text import CountVectorizer
114
+ from sklearn.manifold import TSNE
115
+ from langdetect import detect, DetectorFactory
116
+ import numpy as np
117
+ import matplotlib.pyplot as plt
118
+ import pandas as pd
119
+
120
+ # Pin langdetect's seed to 0; per the langdetect docs its detector is otherwise
+ # non-deterministic, so this is what keeps repeated detections consistent.
+ DetectorFactory.seed = 0
120
+
121
+ # Load the multilingual sentence-embedding model once, at import time.
+ # The same instance is handed to both the embedding and the similarity agent in main().
122
+ multi_embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
124
+
125
class WordEmbeddingAgent:
    """Thin wrapper that turns words into embedding vectors.

    The wrapped ``model`` only needs an ``encode`` method; whatever that
    method returns is handed back to the caller unchanged.
    """

    def __init__(self, model):
        # Keep a reference to the encoder; nothing is computed up front.
        self.model = model

    def get_embeddings(self, words):
        """Return the result of ``self.model.encode(words)``."""
        encoder = self.model
        vectors = encoder.encode(words)
        return vectors
131
+
132
class SimilarityAgent:
    """Scores how close two texts are using an embedding model."""

    def __init__(self, model):
        # The model must expose ``encode(text, convert_to_tensor=True)``.
        self.model = model

    def compute_similarity(self, text1, text2):
        """Return the cosine similarity between the two texts' embeddings.

        Both texts are encoded to tensors (``text1`` first, then ``text2``)
        and the single resulting similarity value is unwrapped to a Python
        scalar with ``.item()``.
        """
        first, second = (
            self.model.encode(text, convert_to_tensor=True)
            for text in (text1, text2)
        )
        score = util.pytorch_cos_sim(first, second)
        return score.item()
140
+
141
class TopicModelingAgent:
    """Fits an LDA topic model on raw texts and reports each topic's top words."""

    def __init__(self, n_components=10):
        """Create the underlying LDA model.

        Args:
            n_components: Number of topics to learn.
        """
        self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)

    def fit_transform(self, texts, lang):
        """Vectorize ``texts``, fit the LDA model and return the topic mix.

        Args:
            texts: Iterable of raw document strings.
            lang: Language code of the texts; ``'en'`` enables English
                stop-word removal.

        Returns:
            A ``(document_topic_matrix, vectorizer)`` tuple, where
            ``vectorizer`` is the fitted ``CountVectorizer``.

        Raises:
            ValueError: Propagated from scikit-learn, e.g. when no
                vocabulary is left after stop-word removal.
        """
        texts = list(texts)

        # scikit-learn only ships a built-in stop list for 'english'; any
        # other string (the previous 'spanish') raises ValueError, so
        # non-English input is vectorized without stop-word filtering.
        stop_words = 'english' if lang == 'en' else None

        # min_df=2 together with max_df=0.9 needs at least three documents
        # (0.9 * n_docs must be >= 2). On smaller corpora scikit-learn
        # rejects the combination outright, so disable frequency pruning.
        if len(texts) * 0.9 >= 2:
            min_df, max_df = 2, 0.9
        else:
            min_df, max_df = 1, 1.0

        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
        dtm = vectorizer.fit_transform(texts)
        self.lda_model.fit(dtm)
        return self.lda_model.transform(dtm), vectorizer

    def get_topics(self, vectorizer, num_words=10):
        """Map each topic index to its ``num_words`` highest-weighted terms.

        Args:
            vectorizer: The fitted vectorizer returned by ``fit_transform``.
            num_words: How many terms to list per topic; values <= 0 give
                empty lists.

        Returns:
            Dict of ``topic_index -> list of terms``. Terms are in ascending
            weight order, so the strongest term comes last.
        """
        # Look the vocabulary up once instead of once per emitted word.
        feature_names = vectorizer.get_feature_names_out()
        topics = {}
        for idx, topic in enumerate(self.lda_model.components_):
            # A plain [-0:] slice would return the whole vocabulary.
            top_indices = topic.argsort()[-num_words:] if num_words > 0 else []
            topics[idx] = [feature_names[i] for i in top_indices]
        return topics
157
+
158
def detect_language(text):
    """Return the detected language code of ``text``, or ``"unknown"``.

    Args:
        text: The string to classify.

    Returns:
        The code reported by ``langdetect.detect`` (e.g. ``"en"``), or the
        literal ``"unknown"`` when the input is empty, whitespace-only, not
        a string, or detection fails.
    """
    # Nothing to detect on blank or non-string input; skip the detector.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best-effort by design, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return "unknown"
163
+
164
def tsne_visualization(embeddings, words):
    """Project embeddings to 2-D with t-SNE and pair each point with its word.

    Args:
        embeddings: Array-like with one embedding vector per row.
        words: The labels for the rows, in the same order.

    Returns:
        A ``pandas.DataFrame`` with columns ``x``, ``y`` and ``word``.
        With fewer than two rows there is nothing to project, so any
        point is placed at the origin.
    """
    embeddings = np.asarray(embeddings)
    n_samples = embeddings.shape[0] if embeddings.ndim else 0

    if n_samples < 2:
        # t-SNE needs at least two samples; return a trivial layout.
        coords = np.zeros((n_samples, 2))
    else:
        # scikit-learn requires perplexity < n_samples. Its default of 30
        # therefore fails on short inputs, so cap it while keeping the
        # default for anything longer than 30 rows.
        perplexity = min(30.0, n_samples - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        coords = tsne.fit_transform(embeddings)

    df = pd.DataFrame(coords, columns=['x', 'y'])
    df['word'] = words
    return df
170
+
171
def main():
    """Render the Streamlit page and run the analysis when requested.

    On "Analyze" the entered text goes through language detection,
    per-word embedding with a t-SNE scatter plot, LDA topic extraction and
    a similarity score against a fixed example sentence. A failure in the
    plot or topic stage is reported as a warning and the remaining stages
    still run.
    """
    st.title("Multilingual Text Analysis System")
    user_input = st.text_area("Enter your text here:")

    if not st.button("Analyze"):
        return

    # Whitespace-only input has no words to embed, so treat it as empty.
    if not user_input or not user_input.strip():
        st.warning("Please enter some text to analyze.")
        return

    lang = detect_language(user_input)
    st.write(f"Detected language: {lang}")

    embedding_agent = WordEmbeddingAgent(multi_embedding_model)
    similarity_agent = SimilarityAgent(multi_embedding_model)
    topic_modeling_agent = TopicModelingAgent()

    # Tokenize the input text into whitespace-separated words
    words = user_input.split()

    # Generate Embeddings
    embeddings = embedding_agent.get_embeddings(words)
    st.write("Word Embeddings Generated.")

    # t-SNE Visualization
    try:
        tsne_df = tsne_visualization(embeddings, words)
    except ValueError as err:
        # scikit-learn rejects inputs it cannot project (e.g. too few words).
        st.warning(f"Could not build the t-SNE plot: {err}")
    else:
        fig, ax = plt.subplots()
        ax.scatter(tsne_df['x'], tsne_df['y'])
        for word, x, y in zip(tsne_df['word'], tsne_df['x'], tsne_df['y']):
            ax.annotate(word, (x, y))
        st.pyplot(fig)
        # pyplot keeps every figure alive until closed; without this each
        # rerun of the script would leak one figure.
        plt.close(fig)

    # Topic Modeling
    texts = [user_input, "Another text to improve topic modeling."]
    try:
        _, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
    except ValueError as err:
        # Raised by scikit-learn, e.g. when no vocabulary survives filtering.
        st.warning(f"Could not extract topics: {err}")
    else:
        topics = topic_modeling_agent.get_topics(vectorizer)
        st.write("Topics Extracted:")
        # Distinct loop names so the token list ``words`` is not shadowed.
        for topic_id, topic_words in topics.items():
            st.write(f"Topic {topic_id}: {', '.join(topic_words)}")

    # Sentence Similarity (example with another text)
    if lang != 'en':
        text2 = "Otro texto de ejemplo para comparación de similitud."
    else:
        text2 = "Another example text for similarity comparison."
    similarity_score = similarity_agent.compute_similarity(user_input, text2)
    st.write(f"Similarity Score with example text: {similarity_score:.4f}")
216
+
217
+ # Start the app only when this file is executed as a script, not when imported.
+ if __name__ == "__main__":
218
+ main()
219