Update app.py
app.py
CHANGED
@@ -191,103 +191,96 @@ else:
     st.info(f"No '{text_col}' column available for sentiment analysis.")
 
 # --------------------------------------------------------------------------------
-#
-# Use sidebar checkboxes to toggle optional features
+# ------------------------------ Additional Features -----------------------------
 # --------------------------------------------------------------------------------
-st.sidebar.markdown("### Optional Features")
-show_topic_embedding = st.sidebar.checkbox("Topic Embedding Visualization")
-show_ts_genai_summary = st.sidebar.checkbox("GenAI Summary for Time Series")
-show_offline_events = st.sidebar.checkbox("Offline Events (Wikipedia)")
-show_semantic_search = st.sidebar.checkbox("Semantic Search on Posts")
 
-# ---------------------------------------------------------------------
 # (a) Topic Embedding Visualization using LDA + TSNE
-
-if show_topic_embedding:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-else:
-    st.info("No text data available for topic embedding.")
+st.markdown("## Topic Embedding Visualization")
+if text_col in df.columns:
+    texts = df[text_col].dropna().sample(n=min(500, len(df)), random_state=42).tolist()
+    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
+    X = vectorizer.fit_transform(texts)
+    lda = LatentDirichletAllocation(n_components=5, random_state=42)
+    topic_matrix = lda.fit_transform(X)
+    dominant_topic = topic_matrix.argmax(axis=1)
+    tsne_model = TSNE(n_components=2, random_state=42)
+    tsne_values = tsne_model.fit_transform(topic_matrix)
+    tsne_df = pd.DataFrame(tsne_values, columns=["x", "y"])
+    tsne_df["Dominant Topic"] = dominant_topic.astype(str)
+    fig_topics = px.scatter(tsne_df, x="x", y="y", color="Dominant Topic",
+                            title="TSNE Embedding of Topics")
+    st.plotly_chart(fig_topics)
+else:
+    st.info("No text data available for topic embedding.")
 
-# ---------------------------------------------------------------------
 # (b) GenAI Summary for Time Series Plot
-
-if show_ts_genai_summary:
-
+st.markdown("## GenAI Summary for Time Series")
+if timestamp_col in df.columns:
+    time_series = df.groupby(df[timestamp_col].dt.date).size().reset_index(name="count")
     if not time_series.empty:
-        start = time_series["
-        end = time_series["
+        start = time_series["created_utc"].min()
+        end = time_series["created_utc"].max()
         avg_posts = time_series["count"].mean()
         peak = time_series.loc[time_series["count"].idxmax()]
         description = (f"From {start} to {end}, the average number of posts per day was {avg_posts:.1f}. "
-                       f"The highest activity was on {peak['
+                       f"The highest activity was on {peak['created_utc']} with {peak['count']} posts.")
         st.write("Time Series Description:")
         st.write(description)
-
+
+        # Use a smaller, faster summarization model
+        ts_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
         try:
-            ts_summary = ts_summarizer(
+            ts_summary = ts_summarizer(
+                description, max_length=80, min_length=40, do_sample=False
+            )[0]['summary_text']
             st.markdown("**GenAI Summary:**")
             st.write(ts_summary)
         except Exception as e:
             st.error("Error generating time series summary.")
     else:
         st.info("Time series data not available for summarization.")
+else:
+    st.info("No timestamp column available for time series summary.")
 
-#
-
-
-if show_offline_events:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    st.
-
-
-
-
-
-# ---------------------------------------------------------------------
-# (Optional) AI-Generated Summary on Posts (Existing Feature)
-# ---------------------------------------------------------------------
+# (c) Offline Events from Wikipedia
+st.markdown("## Offline Events from Wikipedia")
+wiki_topic = st.text_input("Enter a topic to fetch offline events (e.g., 'Russian invasion of Ukraine'):")
+if wiki_topic:
+    try:
+        wiki_summary = wikipedia.summary(wiki_topic, sentences=5)
+        st.markdown(f"**Wikipedia Summary for '{wiki_topic}':**")
+        st.write(wiki_summary)
+    except Exception as e:
+        st.error("Error retrieving Wikipedia data. Please check the topic name.")
+
+# (d) Semantic Search on Posts using Sentence Transformers
+st.markdown("## Semantic Search on Posts")
+search_query = st.text_input("Enter your semantic search query:")
+if search_query and text_col in df.columns:
+    @st.cache_data
+    def get_post_embeddings(texts):
+        # Use a smaller, faster model
+        model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+        return model.encode(texts, convert_to_tensor=True)
+
+    posts = df[text_col].dropna().tolist()
+    embeddings = get_post_embeddings(posts)
+    model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
+    query_embedding = model.encode(search_query, convert_to_tensor=True)
+    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
+    top_results = cos_scores.topk(5)
+
+    st.markdown("**Top Matching Posts:**")
+    for score, idx in zip(top_results.values, top_results.indices):
+        st.write(f"Score: {score.item():.3f}")
+        st.write(posts[idx])
+        st.write("---")
+
+# (e) AI-Generated Summary of Posts
 st.markdown("## AI-Generated Summary of Posts")
 if text_col in df.columns:
-    summarizer = pipeline("summarization", model="
+    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+
     def generate_summary(text, summarizer, max_chunk_length=1000):
         chunks, current_chunk = [], ""
         for sentence in text.split('. '):
@@ -299,13 +292,18 @@ if text_col in df.columns:
             current_chunk = sentence
         if current_chunk:
             chunks.append(current_chunk.strip())
+
         summaries = []
         for chunk in chunks:
             if len(chunk) > 50:
-                summary_chunk = summarizer(
+                summary_chunk = summarizer(
+                    chunk, max_length=150, min_length=40, do_sample=False
+                )[0]['summary_text']
                 summaries.append(summary_chunk)
         combined_summary = " ".join(summaries)
-        final_summary = summarizer(
+        final_summary = summarizer(
+            combined_summary, max_length=150, min_length=40, do_sample=False
+        )[0]['summary_text']
         return final_summary
 
     sample_text = " ".join(df[text_col].dropna().sample(n=min(10, len(df)), random_state=42).tolist())
@@ -323,5 +321,5 @@ else:
 st.markdown("### End of Dashboard")
 st.markdown("""
 This dashboard is a prototype implementation for analyzing Reddit social media data.
-It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality
+It demonstrates advanced trend analysis, contributor insights, topic embeddings, GenAI summaries, offline event linking, and semantic search functionality **using faster models**.
 """)