import streamlit as st
from sentence_transformers import SentenceTransformer, util
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from langdetect import detect, DetectorFactory
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Make language detection deterministic across runs
DetectorFactory.seed = 0

# Load the multilingual model used for both word embeddings and similarity
multi_embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')


class WordEmbeddingAgent:
    def __init__(self, model):
        self.model = model

    def get_embeddings(self, words):
        return self.model.encode(words)


class SimilarityAgent:
    def __init__(self, model):
        self.model = model

    def compute_similarity(self, text1, text2):
        embedding1 = self.model.encode(text1, convert_to_tensor=True)
        embedding2 = self.model.encode(text2, convert_to_tensor=True)
        return util.pytorch_cos_sim(embedding1, embedding2).item()


class TopicModelingAgent:
    def __init__(self, n_components=10):
        self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)

    def fit_transform(self, texts, lang):
        # scikit-learn only ships an English stop-word list; for other
        # languages fall back to no stop-word filtering.
        stop_words = 'english' if lang == 'en' else None
        # min_df=1 so the vocabulary is not emptied out on very small corpora
        vectorizer = CountVectorizer(max_df=0.9, min_df=1, stop_words=stop_words)
        dtm = vectorizer.fit_transform(texts)
        self.lda_model.fit(dtm)
        return self.lda_model.transform(dtm), vectorizer

    def get_topics(self, vectorizer, num_words=10):
        feature_names = vectorizer.get_feature_names_out()
        topics = {}
        for idx, topic in enumerate(self.lda_model.components_):
            topics[idx] = [feature_names[i] for i in topic.argsort()[-num_words:]]
        return topics


def detect_language(text):
    try:
        return detect(text)
    except Exception:
        return "unknown"


def tsne_visualization(embeddings, words):
    # Perplexity must be strictly smaller than the number of samples
    perplexity = min(30, max(1, len(words) - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    embeddings_2d = tsne.fit_transform(embeddings)
    df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
    df['word'] = words
    return df


def main():
    st.title("Multilingual Text Analysis System")
    user_input = st.text_area("Enter your text here:")

    if st.button("Analyze"):
        if user_input:
            lang = detect_language(user_input)
            st.write(f"Detected language: {lang}")

            embedding_agent = WordEmbeddingAgent(multi_embedding_model)
            similarity_agent = SimilarityAgent(multi_embedding_model)
            topic_modeling_agent = TopicModelingAgent()

            # Tokenize the input text into words
            words = user_input.split()

            # Generate embeddings
            embeddings = embedding_agent.get_embeddings(words)
            st.write("Word Embeddings Generated.")

            # t-SNE visualization of the word embeddings
            tsne_df = tsne_visualization(embeddings, words)
            fig, ax = plt.subplots()
            ax.scatter(tsne_df['x'], tsne_df['y'])
            for i, word in enumerate(tsne_df['word']):
                ax.annotate(word, (tsne_df['x'][i], tsne_df['y'][i]))
            st.pyplot(fig)

            # Topic modeling
            texts = [user_input, "Another text to improve topic modeling."]
            topic_distr, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
            topics = topic_modeling_agent.get_topics(vectorizer)
            st.write("Topics Extracted:")
            for topic, topic_words in topics.items():
                st.write(f"Topic {topic}: {', '.join(topic_words)}")

            # Sentence similarity (example with another text)
            text2 = "Otro texto de ejemplo para comparación de similitud." if lang != 'en' else "Another example text for similarity comparison."
            similarity_score = similarity_agent.compute_similarity(user_input, text2)
            st.write(f"Similarity Score with example text: {similarity_score:.4f}")
        else:
            st.warning("Please enter some text to analyze.")


if __name__ == "__main__":
    main()