KevSun committed on
Commit
3447d6e
verified
1 Parent(s): ee54fd3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -110
app.py CHANGED
@@ -107,113 +107,3 @@ def main():
107
 
108
  if __name__ == "__main__":
109
  main()
110
- import streamlit as st
111
- from sentence_transformers import SentenceTransformer, util
112
- from sklearn.decomposition import LatentDirichletAllocation
113
- from sklearn.feature_extraction.text import CountVectorizer
114
- from sklearn.manifold import TSNE
115
- from langdetect import detect, DetectorFactory
116
- import numpy as np
117
- import matplotlib.pyplot as plt
118
- import pandas as pd
119
-
120
- DetectorFactory.seed = 0
121
-
122
- # Load models for embedding and similarity
123
- multi_embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
124
-
125
- class WordEmbeddingAgent:
126
- def __init__(self, model):
127
- self.model = model
128
-
129
- def get_embeddings(self, words):
130
- return self.model.encode(words)
131
-
132
- class SimilarityAgent:
133
- def __init__(self, model):
134
- self.model = model
135
-
136
- def compute_similarity(self, text1, text2):
137
- embedding1 = self.model.encode(text1, convert_to_tensor=True)
138
- embedding2 = self.model.encode(text2, convert_to_tensor=True)
139
- return util.pytorch_cos_sim(embedding1, embedding2).item()
140
-
141
- class TopicModelingAgent:
142
- def __init__(self, n_components=10):
143
- self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)
144
-
145
- def fit_transform(self, texts, lang):
146
- stop_words = 'english' if lang == 'en' else 'spanish'
147
- vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words=stop_words)
148
- dtm = vectorizer.fit_transform(texts)
149
- self.lda_model.fit(dtm)
150
- return self.lda_model.transform(dtm), vectorizer
151
-
152
- def get_topics(self, vectorizer, num_words=10):
153
- topics = {}
154
- for idx, topic in enumerate(self.lda_model.components_):
155
- topics[idx] = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-num_words:]]
156
- return topics
157
-
158
- def detect_language(text):
159
- try:
160
- return detect(text)
161
- except:
162
- return "unknown"
163
-
164
- def tsne_visualization(embeddings, words):
165
- tsne = TSNE(n_components=2, random_state=42)
166
- embeddings_2d = tsne.fit_transform(embeddings)
167
- df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
168
- df['word'] = words
169
- return df
170
-
171
- def main():
172
- st.title("Multilingual Text Analysis System")
173
- user_input = st.text_area("Enter your text here:")
174
-
175
- if st.button("Analyze"):
176
- if user_input:
177
- lang = detect_language(user_input)
178
- st.write(f"Detected language: {lang}")
179
-
180
- embedding_agent = WordEmbeddingAgent(multi_embedding_model)
181
- similarity_agent = SimilarityAgent(multi_embedding_model)
182
- topic_modeling_agent = TopicModelingAgent()
183
-
184
- # Tokenize the input text into words
185
- words = user_input.split()
186
-
187
- # Generate Embeddings
188
- embeddings = embedding_agent.get_embeddings(words)
189
- st.write("Word Embeddings Generated.")
190
-
191
- # t-SNE Visualization
192
- tsne_df = tsne_visualization(embeddings, words)
193
- fig, ax = plt.subplots()
194
- ax.scatter(tsne_df['x'], tsne_df['y'])
195
-
196
- for i, word in enumerate(tsne_df['word']):
197
- ax.annotate(word, (tsne_df['x'][i], tsne_df['y'][i]))
198
-
199
- st.pyplot(fig)
200
-
201
- # Topic Modeling
202
- texts = [user_input, "Another text to improve topic modeling."]
203
- topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
204
- topics = topic_modeling_agent.get_topics(vectorizer)
205
- st.write("Topics Extracted:")
206
- for topic, words in topics.items():
207
- st.write(f"Topic {topic}: {', '.join(words)}")
208
-
209
- # Sentence Similarity (example with another text)
210
- text2 = "Otro texto de ejemplo para comparación de similitud." if lang != 'en' else "Another example text for similarity comparison."
211
- similarity_score = similarity_agent.compute_similarity(user_input, text2)
212
- st.write(f"Similarity Score with example text: {similarity_score:.4f}")
213
-
214
- else:
215
- st.warning("Please enter some text to analyze.")
216
-
217
- if __name__ == "__main__":
218
- main()
219
-
 
107
 
108
  if __name__ == "__main__":
109
  main()